summaryrefslogtreecommitdiff
path: root/gmp/mpn/powerpc64
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/powerpc64')
-rw-r--r--gmp/mpn/powerpc64/README166
-rw-r--r--gmp/mpn/powerpc64/aix.m497
-rw-r--r--gmp/mpn/powerpc64/com.asm136
-rw-r--r--gmp/mpn/powerpc64/copyd.asm84
-rw-r--r--gmp/mpn/powerpc64/copyi.asm78
-rw-r--r--gmp/mpn/powerpc64/darwin.m4119
-rw-r--r--gmp/mpn/powerpc64/elf.m4123
-rw-r--r--gmp/mpn/powerpc64/logops_n.asm151
-rw-r--r--gmp/mpn/powerpc64/lshift.asm207
-rw-r--r--gmp/mpn/powerpc64/lshiftc.asm210
-rw-r--r--gmp/mpn/powerpc64/mode32/add_n.asm86
-rw-r--r--gmp/mpn/powerpc64/mode32/addmul_1.asm79
-rw-r--r--gmp/mpn/powerpc64/mode32/mul_1.asm73
-rw-r--r--gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h173
-rw-r--r--gmp/mpn/powerpc64/mode32/sqr_diagonal.asm117
-rw-r--r--gmp/mpn/powerpc64/mode32/sub_n.asm88
-rw-r--r--gmp/mpn/powerpc64/mode32/submul_1.asm82
-rw-r--r--gmp/mpn/powerpc64/mode64/aors_n.asm189
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsmul_1.asm225
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm187
-rw-r--r--gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm132
-rw-r--r--gmp/mpn/powerpc64/mode64/cnd_aors_n.asm196
-rw-r--r--gmp/mpn/powerpc64/mode64/dive_1.asm132
-rw-r--r--gmp/mpn/powerpc64/mode64/divrem_1.asm274
-rw-r--r--gmp/mpn/powerpc64/mode64/divrem_2.asm187
-rw-r--r--gmp/mpn/powerpc64/mode64/gcd_1.asm122
-rw-r--r--gmp/mpn/powerpc64/mode64/gmp-mparam.h82
-rw-r--r--gmp/mpn/powerpc64/mode64/invert_limb.asm88
-rw-r--r--gmp/mpn/powerpc64/mode64/mod_1_1.asm164
-rw-r--r--gmp/mpn/powerpc64/mode64/mod_1_4.asm270
-rw-r--r--gmp/mpn/powerpc64/mode64/mod_34lsub1.asm132
-rw-r--r--gmp/mpn/powerpc64/mode64/mode1o.asm117
-rw-r--r--gmp/mpn/powerpc64/mode64/mul_1.asm168
-rw-r--r--gmp/mpn/powerpc64/mode64/mul_basecase.asm708
-rw-r--r--gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h179
-rw-r--r--gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h208
-rw-r--r--gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h219
-rw-r--r--gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm183
-rw-r--r--gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h160
-rw-r--r--gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm589
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aormul_2.asm135
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aors_n.asm128
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm129
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/gcd_1.asm110
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h243
-rw-r--r--gmp/mpn/powerpc64/mode64/rsh1aors_n.asm172
-rw-r--r--gmp/mpn/powerpc64/mode64/sqr_basecase.asm863
-rw-r--r--gmp/mpn/powerpc64/p6/lshift.asm132
-rw-r--r--gmp/mpn/powerpc64/p6/lshiftc.asm136
-rw-r--r--gmp/mpn/powerpc64/p6/rshift.asm131
-rw-r--r--gmp/mpn/powerpc64/p7/copyd.asm128
-rw-r--r--gmp/mpn/powerpc64/p7/copyi.asm129
-rw-r--r--gmp/mpn/powerpc64/p7/hamdist.asm110
-rw-r--r--gmp/mpn/powerpc64/p7/popcount.asm90
-rw-r--r--gmp/mpn/powerpc64/rshift.asm207
-rw-r--r--gmp/mpn/powerpc64/sec_tabselect.asm147
-rw-r--r--gmp/mpn/powerpc64/umul.asm53
-rw-r--r--gmp/mpn/powerpc64/vmx/popcount.asm230
62 files changed, 10425 insertions, 0 deletions
diff --git a/gmp/mpn/powerpc64/README b/gmp/mpn/powerpc64/README
new file mode 100644
index 0000000000..50dd3995c3
--- /dev/null
+++ b/gmp/mpn/powerpc64/README
@@ -0,0 +1,166 @@
+Copyright 1999-2001, 2003-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+ POWERPC-64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for 64-bit PowerPC chips.
+
+
+CODE ORGANIZATION
+
+ mpn/powerpc64 mode-neutral code
+ mpn/powerpc64/mode32 code for mode32
+ mpn/powerpc64/mode64 code for mode64
+
+
+The mode32 and mode64 sub-directories contain code which is for use in the
+respective chip mode, 32 or 64. The top-level directory is code that's
+unaffected by the mode.
+
+The "adde" instruction is the main difference between mode32 and mode64. It
+operates on either on a 32-bit or 64-bit quantity according to the chip mode.
+Other instructions have an operand size in their opcode and hence don't vary.
+
+
+
+POWER3/PPC630 pipeline information:
+
+Decoding is 4-way + branch and issue is 8-way with some out-of-order
+capability.
+
+Functional units:
+LS1 - ld/st unit 1
+LS2 - ld/st unit 2
+FXU1 - integer unit 1, handles any simple integer instruction
+FXU2 - integer unit 2, handles any simple integer instruction
+FXU3 - integer unit 3, handles integer multiply and divide
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+Memory: Any two memory operations can issue, but memory subsystem
+ can sustain just one store per cycle. No need for data
+ prefetch; the hardware has very sophisticated prefetch logic.
+Simple integer: 2 operations (such as add, rl*)
+Integer multiply: 1 operation every 9th cycle worst case; exact timing depends
+ on 2nd operand's most significant bit position (10 bits per
+ cycle). Multiply unit is not pipelined, only one multiply
+ operation in progress is allowed.
+Integer divide: ?
+Floating-point: Any plain 2 arithmetic instructions (such as fmul, fadd, and
+ fmadd), latency 4 cycles.
+Floating-point divide:
+ ?
+Floating-point square root:
+ ?
+
+POWER3/PPC630 best possible times for the main loops:
+shift: 1.5 cycles limited by integer unit contention.
+ With 63 special loops, one for each shift count, we could
+ reduce the needed integer instructions to 2, which would
+ reduce the best possible time to 1 cycle.
+add/sub: 1.5 cycles, limited by ld/st unit contention.
+mul: 18 cycles (average) unless floating-point operations are used,
+ but that would only help for multiplies of perhaps 10 and more
+ limbs.
+addmul/submul:Same situation as for mul.
+
+
+POWER4/PPC970 and POWER5 pipeline information:
+
+This is a very odd pipeline, it is basically a VLIW masquerading as a plain
+architecture. Its issue rules are not made public, and since it is so weird,
+it is very hard to figure out any useful information from experimentation.
+An example:
+
+ A well-aligned loop with nop's take 3, 4, 6, 7, ... cycles.
+ 3 cycles for 0, 1, 2, 3, 4, 5, 6, 7 nop's
+ 4 cycles for 8, 9, 10, 11, 12, 13, 14, 15 nop's
+ 6 cycles for 16, 17, 18, 19, 20, 21, 22, 23 nop's
+ 7 cycles for 24, 25, 26, 27 nop's
+ 8 cycles for 28, 29, 30, 31 nop's
+ ... continues regularly
+
+
+Functional units:
+LS1 - ld/st unit 1
+LS2 - ld/st unit 2
+FXU1 - integer unit 1, handles any integer instruction
+FXU2 - integer unit 2, handles any integer instruction
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+While this is one integer unit less than POWER3/PPC630, the remaining units
+are more powerful; here they handle multiply and divide.
+
+Memory: 2 ld/st. Stores go to the L2 cache, which can sustain just
+ one store per cycle.
+ L1 load latency: to gregs 3-4 cycles, to fregs 5-6 cycles.
+ Operations that modify the address register might be split
+ to use also an integer issue slot.
+Simple integer: 2 operations every cycle, latency 2.
+Integer multiply: 2 operations every 6th cycle, latency 7 cycles.
+Integer divide: ?
+Floating-point: Any plain 2 arithmetic instructions (such as fmul, fadd, and
+ fmadd), latency 6 cycles.
+Floating-point divide:
+ ?
+Floating-point square root:
+ ?
+
+
+IDEAS
+
+*mul_1: Handling one limb using mulld/mulhdu and two limbs using floating-
+point operations should give performance of about 20 cycles for 3 limbs, or 7
+cycles/limb.
+
+We should probably split the single-limb operand in 32-bit chunks, and the
+multi-limb operand in 16-bit chunks, allowing us to accumulate well in fp
+registers.
+
+Problem is to get 32-bit or 16-bit words to the fp registers. Only 64-bit fp
+memops copies bits without fiddling with them. We might therefore need to
+load to integer registers with zero extension, store as 64 bits into temp
+space, and then load to fp regs. Alternatively, load directly to fp space
+and add well-chosen constants to get cancellation. (Other part after given by
+subsequent subtraction.)
+
+Possible code mix for load-via-intregs variant:
+
+lwz,std,lfd
+fmadd,fmadd,fmul,fmul
+fctidz,stfd,ld,fctidz,stfd,ld
+add,adde
+lwz,std,lfd
+fmadd,fmadd,fmul,fmul
+fctidz,stfd,ld,fctidz,stfd,ld
+add,adde
+srd,sld,add,adde,add,adde
diff --git a/gmp/mpn/powerpc64/aix.m4 b/gmp/mpn/powerpc64/aix.m4
new file mode 100644
index 0000000000..bf6517d69d
--- /dev/null
+++ b/gmp/mpn/powerpc64/aix.m4
@@ -0,0 +1,97 @@
+divert(-1)
+dnl m4 macros for AIX 64-bit assembly.
+
+dnl Copyright 2000-2002, 2005, 2006, 2010, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',
+ `.machine "any"
+ .toc')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl Don't want ELF style .size in the epilogue.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+ .globl $1
+ .globl .$1
+ .csect [DS], 3
+$1:
+ .llong .$1, TOC[tc0], 0
+ .csect .$1[PR], 6
+.$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`')
+
+define(`TOC_ENTRY', `')
+
+define(`LEA',
+m4_assert_numargs(2)
+`define(`TOC_ENTRY',
+` .toc
+..$2: .tc $2[TC], $2')'
+ `ld $1, ..$2(2)')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+
+define(`EXTERN',
+m4_assert_numargs(1)
+` .globl $1')
+
+define(`EXTERN_FUNC',
+m4_assert_numargs(1)
+` .globl .$1')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+` .csect [RO], 3
+ ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+
+define(`CALL',
+ `bl .$1
+ nop')
+
+define(`ASM_END', `TOC_ENTRY')
+
+undefine(`EXTRA_REGISTER')
+
+divert
diff --git a/gmp/mpn/powerpc64/com.asm b/gmp/mpn/powerpc64/com.asm
new file mode 100644
index 0000000000..074b7ff6e4
--- /dev/null
+++ b/gmp/mpn/powerpc64/com.asm
@@ -0,0 +1,136 @@
+dnl PowerPC-64 mpn_com.
+
+dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 1.25
+C POWER5 ?
+C POWER6 1.32
+C POWER7 1.13
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+ASM_START()
+PROLOGUE(mpn_com)
+
+ifdef(`HAVE_ABI_mode32',
+` rldicl n, n, 0,32')
+
+ cmpdi cr0, n, 4
+ blt L(sml)
+
+ addi r10, n, 4
+ srdi r10, r10, 3
+ mtctr r10
+
+ andi. r0, n, 1
+ rlwinm r11, n, 0,30,30
+ rlwinm r12, n, 0,29,29
+ cmpdi cr6, r11, 0
+ cmpdi cr7, r12, 0
+
+ beq cr0, L(xx0)
+L(xx1): ld r6, 0(up)
+ addi up, up, 8
+ nor r6, r6, r6
+ std r6, 0(rp)
+ addi rp, rp, 8
+
+L(xx0): bne cr6, L(x10)
+L(x00): ld r6, 0(r4)
+ ld r7, 8(r4)
+ bne cr7, L(100)
+L(000): addi rp, rp, -32
+ b L(lo0)
+L(100): addi up, up, -32
+ b L(lo4)
+L(x10): ld r8, 0(r4)
+ ld r9, 8(r4)
+ bne cr7, L(110)
+L(010): addi up, up, 16
+ addi rp, rp, -16
+ b L(lo2)
+L(110): addi up, up, -16
+ addi rp, rp, -48
+ b L(lo6)
+
+L(sml): mtctr n
+L(t): ld r6, 0(up)
+ addi up, up, 8
+ nor r6, r6, r6
+ std r6, 0(rp)
+ addi rp, rp, 8
+ bdnz L(t)
+ blr
+
+ ALIGN(32)
+L(top): nor r6, r6, r6
+ nor r7, r7, r7
+ std r6, 0(rp)
+ std r7, 8(rp)
+L(lo2): ld r6, 0(up)
+ ld r7, 8(up)
+ nor r8, r8, r8
+ nor r9, r9, r9
+ std r8, 16(rp)
+ std r9, 24(rp)
+L(lo0): ld r8, 16(up)
+ ld r9, 24(up)
+ nor r6, r6, r6
+ nor r7, r7, r7
+ std r6, 32(rp)
+ std r7, 40(rp)
+L(lo6): ld r6, 32(up)
+ ld r7, 40(up)
+ nor r8, r8, r8
+ nor r9, r9, r9
+ std r8, 48(rp)
+ std r9, 56(rp)
+ addi rp, rp, 64
+L(lo4): ld r8, 48(up)
+ ld r9, 56(up)
+ addi up, up, 64
+ bdnz L(top)
+
+L(end): nor r6, r6, r6
+ nor r7, r7, r7
+ std r6, 0(rp)
+ std r7, 8(rp)
+ nor r8, r8, r8
+ nor r9, r9, r9
+ std r8, 16(rp)
+ std r9, 24(rp)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/copyd.asm b/gmp/mpn/powerpc64/copyd.asm
new file mode 100644
index 0000000000..c6ce9309f1
--- /dev/null
+++ b/gmp/mpn/powerpc64/copyd.asm
@@ -0,0 +1,84 @@
+dnl PowerPC-64 mpn_copyd
+
+dnl Copyright 2004, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ rldic. r0, r5, 3, 59 C r0 = (r5 & 3) << 3; cr0 = (n == 4t)?
+ cmpldi cr6, r0, 16 C cr6 = (n cmp 4t + 2)?
+
+ifdef(`HAVE_ABI_mode32',
+` rldic r6, r5, 3, 32', C byte count corresponding to n
+` rldicr r6, r5, 3, 60') C byte count corresponding to n
+
+ addi r5, r5, 4 C compute...
+ifdef(`HAVE_ABI_mode32',
+` rldicl r5, r5, 62,34', C ...branch count
+` rldicl r5, r5, 62, 2') C ...branch count
+ mtctr r5
+
+ add r4, r4, r6
+ add r3, r3, r6
+ sub r4, r4, r0 C offset up
+ sub r3, r3, r0 C offset rp
+
+ beq cr0, L(L00)
+ blt cr6, L(L01)
+ beq cr6, L(L10)
+ b L(L11)
+
+ ALIGN(16)
+L(oop): ld r6, 24(r4)
+ std r6, 24(r3)
+L(L11): ld r6, 16(r4)
+ std r6, 16(r3)
+L(L10): ld r6, 8(r4)
+ std r6, 8(r3)
+L(L01): ld r6, 0(r4)
+ std r6, 0(r3)
+L(L00): addi r4, r4, -32
+ addi r3, r3, -32
+ bdnz L(oop)
+
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/copyi.asm b/gmp/mpn/powerpc64/copyi.asm
new file mode 100644
index 0000000000..9a86cb21cc
--- /dev/null
+++ b/gmp/mpn/powerpc64/copyi.asm
@@ -0,0 +1,78 @@
+dnl PowerPC-64 mpn_copyi.
+
+dnl Copyright 2004, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ rldic. r0, r5, 3, 59 C r0 = (r5 & 3) << 3; cr0 = (n == 4t)?
+ cmpldi cr6, r0, 16 C cr6 = (n cmp 4t + 2)?
+
+ addi r5, r5, 4 C compute...
+ifdef(`HAVE_ABI_mode32',
+` rldicl r5, r5, 62,34', C ...branch count
+` rldicl r5, r5, 62, 2') C ...branch count
+ mtctr r5
+
+ add r4, r4, r0 C offset up
+ add r3, r3, r0 C offset rp
+
+ beq cr0, L(L00)
+ blt cr6, L(L01)
+ beq cr6, L(L10)
+ b L(L11)
+
+ ALIGN(16)
+L(oop): ld r6, -32(r4)
+ std r6, -32(r3)
+L(L11): ld r6, -24(r4)
+ std r6, -24(r3)
+L(L10): ld r6, -16(r4)
+ std r6, -16(r3)
+L(L01): ld r6, -8(r4)
+ std r6, -8(r3)
+L(L00): addi r4, r4, 32
+ addi r3, r3, 32
+ bdnz L(oop)
+
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/darwin.m4 b/gmp/mpn/powerpc64/darwin.m4
new file mode 100644
index 0000000000..a3180e48fd
--- /dev/null
+++ b/gmp/mpn/powerpc64/darwin.m4
@@ -0,0 +1,119 @@
+divert(-1)
+dnl m4 macros for Mac OS 64-bit assembly.
+
+dnl Copyright 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`DARWIN')
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+ .text
+ .globl $1
+ .align 5
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1))
+
+dnl LEAL -- Load Effective Address Local. This is to be used for symbols
+dnl defined in the same file. It will not work for externally defined
+dnl symbols.
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',
+`
+ mflr r0 C save return address
+ bcl 20, 31, 1f
+1: mflr $1
+ addis $1, $1, ha16($2-1b)
+ la $1, lo16($2-1b)($1)
+ mtlr r0 C restore return address
+',`
+ lis $1, ha16($2)
+ la $1, lo16($2)($1)
+')')
+
+dnl LEA -- Load Effective Address. This is to be used for symbols defined in
+dnl another file. It will not work for locally defined symbols.
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',
+`define(`EPILOGUE_cpu',
+` .non_lazy_symbol_pointer
+`L'$2`'$non_lazy_ptr:
+ .indirect_symbol $2
+ .quad 0
+')
+ mflr r0 C save return address
+ bcl 20, 31, 1f
+1: mflr $1
+ addis $1, $1, ha16(`L'$2`'$non_lazy_ptr-1b)
+ ld $1, lo16(`L'$2`'$non_lazy_ptr-1b)($1)
+ mtlr r0 C restore return address
+',`
+ lis $1, ha16($2)
+ la $1, lo16($2)($1)
+')')
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`EXTERN_FUNC',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+` .const
+ ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+
+define(`CALL',
+ `bl GSYM_PREFIX`'$1')
+
+define(`ASM_END', `dnl')
+
+define(`EXTRA_REGISTER', r2)
+
+divert
diff --git a/gmp/mpn/powerpc64/elf.m4 b/gmp/mpn/powerpc64/elf.m4
new file mode 100644
index 0000000000..ddb5a8ed79
--- /dev/null
+++ b/gmp/mpn/powerpc64/elf.m4
@@ -0,0 +1,123 @@
+divert(-1)
+dnl m4 macros for powerpc64 GNU/Linux assembly.
+
+dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',
+`ifdef(`ELFv2_ABI',
+`
+ .abiversion 2
+')')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+ifdef(`ELFv2_ABI',
+`
+ .globl $1
+ .type $1, @function
+ .section ".text"
+ .align 5
+$1:
+ifelse(`$2',toc,`
+0: addis 2, 12, (.TOC.-0b)@ha
+ addi 2, 2, (.TOC.-0b)@l
+ .localentry $1, .-$1
+',)
+',`
+ .globl $1
+ .globl .$1
+ .section ".opd","aw"
+ .align 3
+$1:
+ .llong .$1, .TOC.@tocbase, 0
+ .size $1, 24
+ .type .$1, @function
+ .section ".text"
+ .align 5
+.$1:
+')')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`ifdef(`ELFv2_ABI',`
+ .size $1, .-$1
+',`
+ .size .$1, .-.$1
+')')
+
+define(`TOC_ENTRY', `')
+
+define(`LEA',
+m4_assert_numargs(2)
+`define(`TOC_ENTRY',
+` .section ".toc", "aw"
+..$2: .tc $2[TC], $2')'
+ `ld $1, ..$2@toc(2)')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`EXTERN_FUNC',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+ .section .rodata
+ ALIGN(ifelse($#,1,2,$2))
+ .type $1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+define(`CALL',
+ `bl GSYM_PREFIX`'$1
+ nop')
+
+define(`ASM_END', `TOC_ENTRY')
+
+undefine(`EXTRA_REGISTER')
+
+divert
diff --git a/gmp/mpn/powerpc64/logops_n.asm b/gmp/mpn/powerpc64/logops_n.asm
new file mode 100644
index 0000000000..2fa6985d7a
--- /dev/null
+++ b/gmp/mpn/powerpc64/logops_n.asm
@@ -0,0 +1,151 @@
+dnl PowerPC-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.75
+C POWER4/PPC970 2.10
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.75
+
+C n POWER3/PPC630 POWER4/PPC970
+C 1 15.00 15.33
+C 2 7.50 7.99
+C 3 5.33 6.00
+C 4 4.50 4.74
+C 5 4.20 4.39
+C 6 3.50 3.99
+C 7 3.14 3.64
+C 8 3.00 3.36
+C 9 3.00 3.36
+C 10 2.70 3.25
+C 11 2.63 3.11
+C 12 2.58 3.00
+C 13 2.61 3.02
+C 14 2.42 2.82
+C 15 2.40 2.79
+C 50 2.08 2.67
+C 100 1.85 2.31
+C 200 1.80 2.18
+C 400 1.77 2.14
+C 1000 1.76 2.10#
+C 2000 1.75# 2.13
+C 4000 2.30 2.57
+C 8000 2.62 2.58
+C 16000 2.52 4.25
+C 32000 2.49 16.25
+C 64000 2.66 18.76
+
+ifdef(`OPERATION_and_n',
+` define(`func',`mpn_and_n')
+ define(`logop', `and')')
+ifdef(`OPERATION_andn_n',
+` define(`func',`mpn_andn_n')
+ define(`logop', `andc')')
+ifdef(`OPERATION_nand_n',
+` define(`func',`mpn_nand_n')
+ define(`logop', `nand')')
+ifdef(`OPERATION_ior_n',
+` define(`func',`mpn_ior_n')
+ define(`logop', `or')')
+ifdef(`OPERATION_iorn_n',
+` define(`func',`mpn_iorn_n')
+ define(`logop', `orc')')
+ifdef(`OPERATION_nior_n',
+` define(`func',`mpn_nior_n')
+ define(`logop', `nor')')
+ifdef(`OPERATION_xor_n',
+` define(`func',`mpn_xor_n')
+ define(`logop', `xor')')
+ifdef(`OPERATION_xnor_n',
+` define(`func',`mpn_xnor_n')
+ define(`logop', `eqv')')
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+ ld r8, 0(r4) C read lowest u limb
+ ld r9, 0(r5) C read lowest v limb
+ addi r6, r6, 3 C compute branch count (1)
+ rldic. r0, r6, 3, 59 C r0 = (n-1 & 3) << 3; cr0 = (n == 4(t+1))?
+ cmpldi cr6, r0, 16 C cr6 = (n cmp 4t + 3)
+
+ifdef(`HAVE_ABI_mode32',
+` rldicl r6, r6, 62,34', C ...branch count
+` rldicl r6, r6, 62, 2') C ...branch count
+ mtctr r6
+
+ ld r6, 0(r4) C read lowest u limb (again)
+ ld r7, 0(r5) C read lowest v limb (again)
+
+ add r5, r5, r0 C offset vp
+ add r4, r4, r0 C offset up
+ add r3, r3, r0 C offset rp
+
+ beq cr0, L(L01)
+ blt cr6, L(L10)
+ beq cr6, L(L11)
+ b L(L00)
+
+L(oop): ld r8, -24(r4)
+ ld r9, -24(r5)
+ logop r10, r6, r7
+ std r10, -32(r3)
+L(L00): ld r6, -16(r4)
+ ld r7, -16(r5)
+ logop r10, r8, r9
+ std r10, -24(r3)
+L(L11): ld r8, -8(r4)
+ ld r9, -8(r5)
+ logop r10, r6, r7
+ std r10, -16(r3)
+L(L10): ld r6, 0(r4)
+ ld r7, 0(r5)
+ logop r10, r8, r9
+ std r10, -8(r3)
+L(L01): addi r5, r5, 32
+ addi r4, r4, 32
+ addi r3, r3, 32
+ bdnz L(oop)
+
+ logop r10, r6, r7
+ std r10, -32(r3)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/lshift.asm b/gmp/mpn/powerpc64/lshift.asm
new file mode 100644
index 0000000000..880944a4ae
--- /dev/null
+++ b/gmp/mpn/powerpc64/lshift.asm
@@ -0,0 +1,207 @@
+dnl PowerPC-64 mpn_lshift -- rp[] = up[] << cnt
+
+dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
+
+C TODO
+C * Try to reduce the number of needed live registers
+C * Micro-optimise header code
+C * Keep in synch with rshift.asm and lshiftc.asm
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ subfic tnc, cnt, 64
+ sldi r7, n, 3 C byte count corresponding to n
+ add up, up, r7 C up = up + n
+ add rp, rp, r7 C rp = rp + n
+ rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
+ cmpdi cr6, r30, 2
+ addi r31, n, 3 C compute count...
+ ld r10, -8(up) C load 1st limb for b00...b11
+ srd retval, r10, tnc
+ifdef(`HAVE_ABI_mode32',
+` rldicl r31, r31, 62,34', C ...branch count
+` srdi r31, r31, 2') C ...for ctr
+ mtctr r31 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ ld r11, -16(up) C load 2nd limb for b10 and b11
+ beq cr6, L(b10)
+
+ ALIGN(16)
+L(b11): sld r8, r10, cnt
+ srd r9, r11, tnc
+ ld u1, -24(up)
+ addi up, up, -24
+ sld r12, r11, cnt
+ srd r7, u1, tnc
+ addi rp, rp, 16
+ bdnz L(gt3)
+
+ or r11, r8, r9
+ sld r8, u1, cnt
+ b L(cj3)
+
+ ALIGN(16)
+L(gt3): ld u0, -8(up)
+ or r11, r8, r9
+ sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -16(up)
+ or r10, r12, r7
+ b L(L11)
+
+ ALIGN(32)
+L(b10): sld r12, r10, cnt
+ addi rp, rp, 24
+ srd r7, r11, tnc
+ bdnz L(gt2)
+
+ sld r8, r11, cnt
+ or r10, r12, r7
+ b L(cj2)
+
+L(gt2): ld u0, -24(up)
+ sld r8, r11, cnt
+ srd r9, u0, tnc
+ ld u1, -32(up)
+ or r10, r12, r7
+ sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -40(up)
+ or r11, r8, r9
+ addi up, up, -16
+ b L(L10)
+
+ ALIGN(16)
+L(b00): ld u1, -16(up)
+ sld r12, r10, cnt
+ srd r7, u1, tnc
+ ld u0, -24(up)
+ sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -32(up)
+ or r10, r12, r7
+ sld r12, u0, cnt
+ srd r7, u1, tnc
+ addi rp, rp, 8
+ bdz L(cj4)
+
+L(gt4): addi up, up, -32
+ ld u0, -8(up)
+ or r11, r8, r9
+ b L(L00)
+
+ ALIGN(16)
+L(b01): bdnz L(gt1)
+ sld r8, r10, cnt
+ std r8, -8(rp)
+ b L(ret)
+
+L(gt1): ld u0, -16(up)
+ sld r8, r10, cnt
+ srd r9, u0, tnc
+ ld u1, -24(up)
+ sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -32(up)
+ or r11, r8, r9
+ sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -40(up)
+ addi up, up, -40
+ or r10, r12, r7
+ bdz L(end)
+
+ ALIGN(32)
+L(top): sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -8(up)
+ std r11, -8(rp)
+ or r11, r8, r9
+L(L00): sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -16(up)
+ std r10, -16(rp)
+ or r10, r12, r7
+L(L11): sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -24(up)
+ std r11, -24(rp)
+ or r11, r8, r9
+L(L10): sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -32(up)
+ addi up, up, -32
+ std r10, -32(rp)
+ addi rp, rp, -32
+ or r10, r12, r7
+ bdnz L(top)
+
+ ALIGN(32)
+L(end): sld r12, u0, cnt
+ srd r7, u1, tnc
+ std r11, -8(rp)
+L(cj4): or r11, r8, r9
+ sld r8, u1, cnt
+ std r10, -16(rp)
+L(cj3): or r10, r12, r7
+ std r11, -24(rp)
+L(cj2): std r10, -32(rp)
+ std r8, -40(rp)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, retval, 32
+ mr r4, retval
+',` mr r3, retval')
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/lshiftc.asm b/gmp/mpn/powerpc64/lshiftc.asm
new file mode 100644
index 0000000000..7cf6a83428
--- /dev/null
+++ b/gmp/mpn/powerpc64/lshiftc.asm
@@ -0,0 +1,210 @@
+dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
+
+dnl Copyright 2003, 2005, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.5
+C POWER7 2.15
+
+C TODO
+C * Try to reduce the number of needed live registers
+C * Micro-optimise header code
+C * Keep in synch with lshift.asm and rshift.asm
+C * Could the long-scheduled std insns be less scheduled?
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ subfic tnc, cnt, 64
+ sldi r7, n, 3 C byte count corresponding to n
+ add up, up, r7 C up = up + n
+ add rp, rp, r7 C rp = rp + n
+ rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
+ cmpdi cr6, r30, 2
+ addi r31, n, 3 C compute count...
+ ld r10, -8(up) C load 1st limb for b00...b11
+ srd retval, r10, tnc
+ srdi r31, r31, 2 C ...for ctr
+ mtctr r31 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ ld r11, -16(up) C load 2nd limb for b10 and b11
+ beq cr6, L(b10)
+
+ ALIGN(16)
+L(b11): sld r8, r10, cnt
+ srd r9, r11, tnc
+ ld u1, -24(up)
+ addi up, up, -24
+ sld r12, r11, cnt
+ srd r7, u1, tnc
+ addi rp, rp, 16
+ bdnz L(gt3)
+
+ nor r11, r8, r9
+ sld r8, u1, cnt
+ nor r8, r8, r8
+ b L(cj3)
+
+ ALIGN(16)
+L(gt3): ld u0, -8(up)
+ nor r11, r8, r9
+ sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -16(up)
+ nor r10, r12, r7
+ b L(L11)
+
+ ALIGN(32)
+L(b10): sld r12, r10, cnt
+ addi rp, rp, 24
+ srd r7, r11, tnc
+ bdnz L(gt2)
+
+ sld r8, r11, cnt
+ nor r10, r12, r7
+ nor r8, r8, r8
+ b L(cj2)
+
+L(gt2): ld u0, -24(up)
+ sld r8, r11, cnt
+ srd r9, u0, tnc
+ ld u1, -32(up)
+ nor r10, r12, r7
+ sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -40(up)
+ nor r11, r8, r9
+ addi up, up, -16
+ b L(L10)
+
+ ALIGN(16)
+L(b00): ld u1, -16(up)
+ sld r12, r10, cnt
+ srd r7, u1, tnc
+ ld u0, -24(up)
+ sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -32(up)
+ nor r10, r12, r7
+ sld r12, u0, cnt
+ srd r7, u1, tnc
+ addi rp, rp, 8
+ bdz L(cj4)
+
+L(gt4): addi up, up, -32
+ ld u0, -8(up)
+ nor r11, r8, r9
+ b L(L00)
+
+ ALIGN(16)
+L(b01): bdnz L(gt1)
+ sld r8, r10, cnt
+ nor r8, r8, r8
+ std r8, -8(rp)
+ b L(ret)
+
+L(gt1): ld u0, -16(up)
+ sld r8, r10, cnt
+ srd r9, u0, tnc
+ ld u1, -24(up)
+ sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -32(up)
+ nor r11, r8, r9
+ sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -40(up)
+ addi up, up, -40
+ nor r10, r12, r7
+ bdz L(end)
+
+ ALIGN(32)
+L(top): sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -8(up)
+ std r11, -8(rp)
+ nor r11, r8, r9
+L(L00): sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -16(up)
+ std r10, -16(rp)
+ nor r10, r12, r7
+L(L11): sld r12, u0, cnt
+ srd r7, u1, tnc
+ ld u0, -24(up)
+ std r11, -24(rp)
+ nor r11, r8, r9
+L(L10): sld r8, u1, cnt
+ srd r9, u0, tnc
+ ld u1, -32(up)
+ addi up, up, -32
+ std r10, -32(rp)
+ addi rp, rp, -32
+ nor r10, r12, r7
+ bdnz L(top)
+
+ ALIGN(32)
+L(end): sld r12, u0, cnt
+ srd r7, u1, tnc
+ std r11, -8(rp)
+L(cj4): nor r11, r8, r9
+ sld r8, u1, cnt
+ std r10, -16(rp)
+ nor r8, r8, r8
+L(cj3): nor r10, r12, r7
+ std r11, -24(rp)
+L(cj2): std r10, -32(rp)
+ std r8, -40(rp)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, retval, 32
+ mr r4, retval
+',` mr r3, retval')
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/add_n.asm b/gmp/mpn/powerpc64/mode32/add_n.asm
new file mode 100644
index 0000000000..1da8087fe1
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode32/add_n.asm
@@ -0,0 +1,86 @@
+dnl PowerPC-64/mode32 mpn_add_n -- Add two limb vectors of the same length > 0
+dnl and store sum in a third limb vector.
+
+dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: ?
+C POWER4/PPC970: 4.25
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ mtctr r6 C copy size into CTR
+ addic r0, r0, 0 C clear cy
+ ld r8, 0(r4) C load least significant s1 limb
+ ld r0, 0(r5) C load least significant s2 limb
+ addi r3, r3, -8 C offset res_ptr, it's updated before it's used
+ bdz L(end) C If done, skip loop
+
+L(oop): ld r9, 8(r4) C load s1 limb
+ ld r10, 8(r5) C load s2 limb
+ adde r7, r0, r8 C add limbs with cy, set cy
+ srdi r6, r0, 32
+ srdi r11, r8, 32
+ adde r6, r6, r11 C add high limb parts, set cy
+ std r7, 8(r3) C store result limb
+ bdz L(exit) C decrement CTR and exit if done
+ ldu r8, 16(r4) C load s1 limb and update s1_ptr
+ ldu r0, 16(r5) C load s2 limb and update s2_ptr
+ adde r7, r10, r9 C add limbs with cy, set cy
+ srdi r6, r10, 32
+ srdi r11, r9, 32
+ adde r6, r6, r11 C add high limb parts, set cy
+ stdu r7, 16(r3) C store result limb and update res_ptr
+ bdnz L(oop) C decrement CTR and loop back
+
+L(end): adde r7, r0, r8
+ srdi r6, r0, 32
+ srdi r11, r8, 32
+ adde r6, r6, r11 C add limbs with cy, set cy
+ std r7, 8(r3) C store ultimate result limb
+ li r3, 0 C load cy into ...
+ addze r4, r3 C ... return value register
+ blr
+L(exit): adde r7, r10, r9
+ srdi r6, r10, 32
+ srdi r11, r9, 32
+ adde r6, r6, r11 C add limbs with cy, set cy
+ std r7, 16(r3)
+ li r3, 0 C load cy into ...
+ addze r4, r3 C ... return value register
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/addmul_1.asm b/gmp/mpn/powerpc64/mode32/addmul_1.asm
new file mode 100644
index 0000000000..bdc39512ac
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode32/addmul_1.asm
@@ -0,0 +1,79 @@
+dnl PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: ?
+C POWER4/PPC970: 12.5
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+C v r6,r7 or r7,r8
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+
+ifdef(`BROKEN_LONGLONG_PARAM',
+` rldimi r8, r7, 32,0 C assemble vlimb from separate 32-bit arguments
+ mr r6, r8
+',`
+ rldimi r7, r6, 32,0 C assemble vlimb from separate 32-bit arguments
+ mr r6, r7
+')
+ li r7, 0 C cy_limb = 0
+ mtctr r5
+ addic r0, r0, 0
+ addi r3, r3, -8
+ addi r4, r4, -8
+
+L(oop): ldu r0, 8(r4)
+ mulld r9, r0, r6
+ adde r12, r9, r7 C add old high limb and new low limb
+ srdi r5, r9, 32
+ srdi r11, r7, 32
+ adde r5, r5, r11 C add high limb parts, set cy
+ mulhdu r7, r0, r6
+ addze r7, r7
+ ld r10, 8(r3)
+ addc r9, r12, r10
+ srdi r5, r12, 32
+ srdi r11, r10, 32
+ adde r5, r5, r11 C add high limb parts, set cy
+ stdu r9, 8(r3)
+ bdnz L(oop)
+
+ addze r4, r7
+ srdi r3, r4, 32
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/mul_1.asm b/gmp/mpn/powerpc64/mode32/mul_1.asm
new file mode 100644
index 0000000000..3a17e98797
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode32/mul_1.asm
@@ -0,0 +1,73 @@
+dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: ?
+C POWER4/PPC970: 10
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+C v r6,r7 or r7,r8
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+
+ifdef(`BROKEN_LONGLONG_PARAM',
+` rldimi r8, r7, 32,0 C assemble vlimb from separate 32-bit arguments
+ mr r6, r8
+',`
+ rldimi r7, r6, 32,0 C assemble vlimb from separate 32-bit arguments
+ mr r6, r7
+')
+ li r7, 0 C cy_limb = 0
+ mtctr r5
+ addic r0, r0, 0
+ addi r3, r3, -8
+ addi r4, r4, -8
+
+L(oop): ldu r0, 8(r4)
+ mulld r9, r0, r6
+ adde r12, r9, r7 C add old high limb and new low limb
+ srdi r5, r9, 32
+ srdi r11, r7, 32
+ adde r5, r5, r11 C add high limb parts, set cy
+ mulhdu r7, r0, r6
+ stdu r12, 8(r3)
+ bdnz L(oop)
+
+ addze r4, r7
+ srdi r3, r4, 32
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h b/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h
new file mode 100644
index 0000000000..a7271381c5
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h
@@ -0,0 +1,173 @@
+/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+/* 1800 MHz PPC970 */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 1
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 6
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 46
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD 12
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 90
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 94
+#define MUL_TOOM6H_THRESHOLD 125
+#define MUL_TOOM8H_THRESHOLD 187
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 61
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 98
+#define SQR_TOOM4_THRESHOLD 136
+#define SQR_TOOM6_THRESHOLD 180
+#define SQR_TOOM8_THRESHOLD 272
+
+#define MULMID_TOOM42_THRESHOLD 34
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
+ { 15, 7}, { 8, 6}, { 17, 7}, { 9, 6}, \
+ { 19, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
+ { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \
+ { 13, 7}, { 29, 8}, { 19, 9}, { 11, 8}, \
+ { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
+ { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \
+ { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \
+ { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 95, 8}, \
+ { 191, 9}, { 99,10}, { 55,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \
+ { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \
+ { 319,11}, { 47,10}, { 95, 9}, { 191, 8}, \
+ { 383,10}, { 103,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 287,11}, { 79,10}, { 159, 9}, { 319, 8}, \
+ { 639,10}, { 175, 9}, { 351, 8}, { 703,11}, \
+ { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \
+ { 207, 9}, { 415,10}, { 223, 9}, { 447,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575, 8}, { 1151,11}, \
+ { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
+ { 351, 9}, { 703,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 207,10}, { 415, 9}, \
+ { 831,11}, { 223,10}, { 447,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD 2688
+
+#define SQR_FFT_MODF_THRESHOLD 212 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 212, 5}, { 13, 6}, { 15, 7}, { 8, 6}, \
+ { 17, 7}, { 9, 6}, { 19, 7}, { 13, 8}, \
+ { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \
+ { 11, 7}, { 23, 8}, { 13, 7}, { 27, 9}, \
+ { 7, 8}, { 21, 9}, { 11, 8}, { 25,10}, \
+ { 7, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 67,10}, { 39, 9}, { 79, 8}, { 159,10}, \
+ { 47, 9}, { 95, 8}, { 191,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \
+ { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \
+ { 319,11}, { 47, 9}, { 191, 8}, { 383,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 143, 9}, { 287, 8}, { 575,11}, \
+ { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \
+ { 175, 9}, { 351, 8}, { 703,10}, { 191, 9}, \
+ { 383, 8}, { 767,10}, { 207, 9}, { 415,11}, \
+ { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
+ { 175,10}, { 351, 9}, { 703, 8}, { 1407,11}, \
+ { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
+ { 415,11}, { 223,10}, { 447,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 102
+#define SQR_FFT_THRESHOLD 1984
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 55
+#define MULLO_MUL_N_THRESHOLD 5240
+
+#define DC_DIV_QR_THRESHOLD 27
+#define DC_DIVAPPR_Q_THRESHOLD 108
+#define DC_BDIV_QR_THRESHOLD 51
+#define DC_BDIV_Q_THRESHOLD 126
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 129
+#define INV_APPR_THRESHOLD 116
+
+#define BINV_NEWTON_THRESHOLD 198
+#define REDC_1_TO_REDC_N_THRESHOLD 51
+
+#define MU_DIV_QR_THRESHOLD 807
+#define MU_DIVAPPR_Q_THRESHOLD 807
+#define MUPI_DIV_QR_THRESHOLD 54
+#define MU_BDIV_QR_THRESHOLD 748
+#define MU_BDIV_Q_THRESHOLD 872
+
+#define POWM_SEC_TABLE 4,35,152,780,2145
+
+#define MATRIX22_STRASSEN_THRESHOLD 11
+#define HGCD_THRESHOLD 104
+#define HGCD_APPR_THRESHOLD 118
+#define HGCD_REDUCE_THRESHOLD 1329
+#define GCD_DC_THRESHOLD 268
+#define GCDEXT_DC_THRESHOLD 241
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 9
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 996
+#define SET_STR_PRECOMPUTE_THRESHOLD 2170
+
+#define FAC_DSC_THRESHOLD 442
+#define FAC_ODD_THRESHOLD 26
diff --git a/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm b/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm
new file mode 100644
index 0000000000..ff5f4b3cfb
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm
@@ -0,0 +1,117 @@
+dnl PowerPC-64 mpn_sqr_diagonal.
+
+dnl Copyright 2001-2003, 2005, 2006, 20010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 18
+C POWER4/PPC970 ?
+C POWER5 7.25
+C POWER6 9.5
+
+C INPUT PARAMETERS
+define(`rp', r3)
+define(`up', r4)
+define(`n', r5)
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ifdef(`HAVE_ABI_mode32',
+` rldicl n, n, 0, 32') C zero extend n
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ addi n, n, 3 C compute count...
+ cmpdi cr6, r0, 2
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r0, 0(up)
+ ld r10, 8(up)
+ ld r12, 16(up)
+ addi rp, rp, -16
+ mulld r7, r0, r0
+ mulhdu r8, r0, r0
+ mulld r9, r10, r10
+ mulhdu r10, r10, r10
+ mulld r11, r12, r12
+ mulhdu r12, r12, r12
+ addi up, up, 24
+ b L(11)
+
+ ALIGN(16)
+L(b01): ld r0, 0(up)
+ addi rp, rp, -48
+ addi up, up, 8
+ mulld r11, r0, r0
+ mulhdu r12, r0, r0
+ b L(01)
+
+ ALIGN(16)
+L(b10): ld r0, 0(up)
+ ld r12, 8(up)
+ addi rp, rp, -32
+ addi up, up, 16
+ mulld r9, r0, r0
+ mulhdu r10, r0, r0
+ mulld r11, r12, r12
+ mulhdu r12, r12, r12
+ b L(10)
+
+ ALIGN(32)
+L(b00):
+L(top): ld r0, 0(up)
+ ld r8, 8(up)
+ ld r10, 16(up)
+ ld r12, 24(up)
+ mulld r5, r0, r0
+ mulhdu r6, r0, r0
+ mulld r7, r8, r8
+ mulhdu r8, r8, r8
+ mulld r9, r10, r10
+ mulhdu r10, r10, r10
+ mulld r11, r12, r12
+ mulhdu r12, r12, r12
+ addi up, up, 32
+ std r5, 0(rp)
+ std r6, 8(rp)
+L(11): std r7, 16(rp)
+ std r8, 24(rp)
+L(10): std r9, 32(rp)
+ std r10, 40(rp)
+L(01): std r11, 48(rp)
+ std r12, 56(rp)
+ addi rp, rp, 64
+ bdnz L(top)
+
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/sub_n.asm b/gmp/mpn/powerpc64/mode32/sub_n.asm
new file mode 100644
index 0000000000..6fdc1d4719
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode32/sub_n.asm
@@ -0,0 +1,88 @@
+dnl PowerPC-64/mode32 mpn_sub_n -- Subtract two limb vectors of the same
+dnl length and store difference in a third limb vector.
+
+dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: ?
+C POWER4/PPC970: 4.25
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ mtctr r6 C copy size into CTR
+ addic r0, r6, -1 C set cy
+ ld r8, 0(r4) C load least significant s1 limb
+ ld r0, 0(r5) C load least significant s2 limb
+ addi r3, r3, -8 C offset res_ptr, it's updated before it's used
+ bdz L(end) C If done, skip loop
+
+L(oop): ld r9, 8(r4) C load s1 limb
+ ld r10, 8(r5) C load s2 limb
+ subfe r7, r0, r8 C subtract limbs with cy, set cy
+ srdi r6, r0, 32
+ srdi r11, r8, 32
+ subfe r6, r6, r11
+ std r7, 8(r3) C store result limb
+ bdz L(exit) C decrement CTR and exit if done
+ ldu r8, 16(r4) C load s1 limb and update s1_ptr
+ ldu r0, 16(r5) C load s2 limb and update s2_ptr
+ subfe r7, r10, r9 C subtract limbs with cy, set cy
+ srdi r6, r10, 32
+ srdi r11, r9, 32
+ subfe r6, r6, r11
+ stdu r7, 16(r3) C store result limb and update res_ptr
+ bdnz L(oop) C decrement CTR and loop back
+
+L(end): subfe r7, r0, r8
+ srdi r6, r0, 32
+ srdi r11, r8, 32
+ subfe r6, r6, r11
+ std r7, 8(r3) C store ultimate result limb
+ subfe r3, r0, r0 C load !cy into ...
+ subfic r4, r3, 0 C ... return value register
+ li r3, 0 C zero extend return value
+ blr
+L(exit): subfe r7, r10, r9
+ srdi r6, r10, 32
+ srdi r11, r9, 32
+ subfe r6, r6, r11
+ std r7, 16(r3)
+ subfe r3, r0, r0 C load !cy into ...
+ subfic r4, r3, 0 C ... return value register
+ li r3, 0 C zero extend return value
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/submul_1.asm b/gmp/mpn/powerpc64/mode32/submul_1.asm
new file mode 100644
index 0000000000..22601c417e
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode32/submul_1.asm
@@ -0,0 +1,82 @@
+dnl PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: ?
+C POWER4/PPC970: 16
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+C v r6,r7 or r7,r8
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+
+ifdef(`BROKEN_LONGLONG_PARAM',
+` rldimi r8, r7, 32,0 C assemble vlimb from separate 32-bit arguments
+ mr r6, r8
+',`
+ rldimi r7, r6, 32,0 C assemble vlimb from separate 32-bit arguments
+ mr r6, r7
+')
+ li r7, 0 C cy_limb = 0
+ mtctr r5
+ addic r0, r0, 0
+ addi r3, r3, -8
+ addi r4, r4, -8
+
+L(oop): ldu r0, 8(r4)
+ mulld r9, r0, r6
+ adde r12, r9, r7 C add old high limb and new low limb
+ srdi r5, r9, 32
+ srdi r11, r7, 32
+ adde r5, r5, r11 C add high limb parts, set cy
+ mulhdu r7, r0, r6
+ addze r7, r7
+ ld r10, 8(r3)
+ subfc r9, r12, r10
+ srdi r5, r12, 32
+ srdi r11, r10, 32
+ subfe r5, r5, r11 C subtract high limb parts, set cy
+ stdu r9, 8(r3)
+ subfe r11, r11, r11 C invert ...
+ addic r11, r11, 1 C ... carry
+ bdnz L(oop)
+
+ addze r4, r7
+ srdi r3, r4, 32
+ blr
+EPILOGUE()
+
diff --git a/gmp/mpn/powerpc64/mode64/aors_n.asm b/gmp/mpn/powerpc64/mode64/aors_n.asm
new file mode 100644
index 0000000000..0e8474fdcc
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/aors_n.asm
@@ -0,0 +1,189 @@
+dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.5
+C POWER4/PPC970 2
+C POWER5 2
+C POWER6 2.63
+C POWER7 2.25-2.87
+
+C This code is a little bit slower for POWER3/PPC630 than the simple code used
+C previously, but it is much faster for POWER4/PPC970. The reason for the
+C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
+C registers.
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+ifdef(`OPERATION_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+ SETCBR(r7)
+ b L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+ CLRCB
+L(ent): std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+
+ rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi r6, r6, 3 C compute count...
+ srdi r6, r6, 2 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r8, 0(r4) C load s1 limb
+ ld r9, 0(r5) C load s2 limb
+ ld r10, 8(r4) C load s1 limb
+ ld r11, 8(r5) C load s2 limb
+ ld r12, 16(r4) C load s1 limb
+ addi r4, r4, 24
+ ld r0, 16(r5) C load s2 limb
+ addi r5, r5, 24
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r29, 0(r3)
+ std r30, 8(r3)
+ std r31, 16(r3)
+ addi r3, r3, 24
+ bdnz L(go)
+ b L(ret)
+
+L(b01): ld r12, 0(r4) C load s1 limb
+ addi r4, r4, 8
+ ld r0, 0(r5) C load s2 limb
+ addi r5, r5, 8
+ ADDSUBC r31, r0, r12 C add
+ std r31, 0(r3)
+ addi r3, r3, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b10): ld r10, 0(r4) C load s1 limb
+ ld r11, 0(r5) C load s2 limb
+ ld r12, 8(r4) C load s1 limb
+ addi r4, r4, 16
+ ld r0, 8(r5) C load s2 limb
+ addi r5, r5, 16
+ ADDSUBC r30, r11, r10 C add
+ ADDSUBC r31, r0, r12 C add
+ std r30, 0(r3)
+ std r31, 8(r3)
+ addi r3, r3, 16
+ bdnz L(go)
+ b L(ret)
+
+L(b00): C INITCY C clear/set cy
+L(go): ld r6, 0(r4) C load s1 limb
+ ld r7, 0(r5) C load s2 limb
+ ld r8, 8(r4) C load s1 limb
+ ld r9, 8(r5) C load s2 limb
+ ld r10, 16(r4) C load s1 limb
+ ld r11, 16(r5) C load s2 limb
+ ld r12, 24(r4) C load s1 limb
+ ld r0, 24(r5) C load s2 limb
+ bdz L(end)
+
+ addi r4, r4, 32
+ addi r5, r5, 32
+
+ ALIGN(16)
+L(top): ADDSUBC r28, r7, r6
+ ld r6, 0(r4) C load s1 limb
+ ld r7, 0(r5) C load s2 limb
+ ADDSUBC r29, r9, r8
+ ld r8, 8(r4) C load s1 limb
+ ld r9, 8(r5) C load s2 limb
+ ADDSUBC r30, r11, r10
+ ld r10, 16(r4) C load s1 limb
+ ld r11, 16(r5) C load s2 limb
+ ADDSUBC r31, r0, r12
+ ld r12, 24(r4) C load s1 limb
+ ld r0, 24(r5) C load s2 limb
+ std r28, 0(r3)
+ addi r4, r4, 32
+ std r29, 8(r3)
+ addi r5, r5, 32
+ std r30, 16(r3)
+ std r31, 24(r3)
+ addi r3, r3, 32
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r7, r6
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r28, 0(r3)
+ std r29, 8(r3)
+ std r30, 16(r3)
+ std r31, 24(r3)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/aorsmul_1.asm b/gmp/mpn/powerpc64/mode64/aorsmul_1.asm
new file mode 100644
index 0000000000..0c12f9b660
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -0,0 +1,225 @@
+dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 1999-2001, 2003-2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8 8.3
+C POWER5 8 8.25
+C POWER6 16.25 16.75
+C POWER7 3.77 4.9
+
+C TODO
+C * Try to reduce the number of needed live registers
+C * Add support for _1c entry points
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vl', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addmul_1)
+ define(func_nc, mpn_addmul_1c) C FIXME: not really supported
+ define(SM, `')
+')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_submul_1)
+ define(func_nc, mpn_submul_1c) C FIXME: not really supported
+ define(SM, `$1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ std r30, -16(r1)
+ cmpdi cr6, r0, 2
+ std r29, -24(r1)
+ addi n, n, 3 C compute count...
+ std r28, -32(r1)
+ srdi n, n, 2 C ...for ctr
+ std r27, -40(r1)
+ mtctr n C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r9, 0(up)
+ ld r28, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r12, r9, r6
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ addi rp, rp, 8
+ ld r9, 8(up)
+ ld r27, 16(up)
+ addi up, up, 24
+SM(` subfe r11, r11, r11 ')
+ b L(bot)
+
+ ALIGN(16)
+L(b00): ld r9, 0(up)
+ ld r27, 8(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ mulld r0, r9, r6
+ mulhdu r5, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r7, r7, r5
+ addze r12, r8
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ ADDSUBC r7, r7, r29
+ std r7, 8(rp)
+ addi rp, rp, 16
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+SM(` subfe r11, r11, r11 ')
+ b L(bot)
+
+ ALIGN(16)
+L(b01): bdnz L(gt1)
+ ld r9, 0(up)
+ ld r11, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r8, r9, r6
+ ADDSUB r0, r0, r11
+ std r0, 0(rp)
+SM(` subfe r11, r11, r11 ')
+SM(` addic r11, r11, 1 ')
+ addze r3, r8
+ blr
+L(gt1): ld r9, 0(up)
+ ld r27, 8(up)
+ mulld r0, r9, r6
+ mulhdu r5, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 16(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r5
+ adde r11, r11, r8
+ addze r12, r10
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ ADDSUBC r7, r7, r29
+ std r7, 8(rp)
+ ADDSUBC r11, r11, r30
+ std r11, 16(rp)
+ addi rp, rp, 24
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+SM(` subfe r11, r11, r11 ')
+ b L(bot)
+
+L(b10): addic r0, r0, 0
+ li r12, 0 C cy_limb = 0
+ ld r9, 0(up)
+ ld r27, 8(up)
+ bdz L(end)
+ addi up, up, 16
+
+ ALIGN(16)
+L(top): mulld r0, r9, r6
+ mulhdu r5, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r5 C 5 7
+ mulld r5, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r5, r5, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ ADDSUB r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ ADDSUBC r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ ADDSUBC r5, r5, r30 C 5 30
+ std r5, 16(rp) C 5
+ ADDSUBC r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+SM(` subfe r11, r11, r11 ')
+ addi rp, rp, 32
+L(bot):
+SM(` addic r11, r11, 1 ')
+ bdnz L(top)
+
+L(end): mulld r0, r9, r6
+ mulhdu r5, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r5
+ addze r8, r8
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ ADDSUBC r7, r7, r29
+ std r7, 8(rp)
+SM(` subfe r11, r11, r11 ')
+SM(` addic r11, r11, 1 ')
+ addze r3, r8
+ ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm b/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
new file mode 100644
index 0000000000..2c5400ab52
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm b/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
new file mode 100644
index 0000000000..447791abb0
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm b/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm
new file mode 100644
index 0000000000..6158f541fc
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm
@@ -0,0 +1,187 @@
+dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C POWER3/PPC630 1.83 (1.5 c/l should be possible)
+C POWER4/PPC970 3 (2.0 c/l should be possible)
+C POWER5 3
+C POWER6 3.5-47
+C POWER7 3
+
+C STATUS
+C * Try combining upx+up, and vpx+vp.
+C * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is
+C greater than the 2nd operand. Yes, this addition is non-commutative wrt
+C performance.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+ifdef(`DO_add', `
+ define(`ADDSUBC', `addc $1, $2, $3')
+ define(`ADDSUBE', `adde $1, $2, $3')
+ define(INITCY, `addic $1, r1, 0')
+ define(RETVAL, `addze r3, $1')
+ define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+ define(`ADDSUBC', `subfc $1, $2, $3')
+ define(`ADDSUBE', `subfe $1, $2, $3')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `subfze r3, $1
+ neg r3, r3')
+ define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+ define(`ADDSUBC', `subfc $1, $3, $2')
+ define(`ADDSUBE', `subfe $1, $3, $2')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `addme r3, $1')
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
+define(`rpx', `r6')
+define(`upx', `r7')
+define(`vpx', `r12')
+
+define(`s0', `r0') define(`s1', `r9')
+define(`u0', `r8')
+define(`v0', `r10') define(`v1', `r11')
+
+
+ASM_START()
+PROLOGUE(func)
+ cmpldi cr0, n, 13
+ bgt L(big)
+
+ mtctr n C copy n in ctr
+ INITCY( r0) C clear cy
+
+ ld v0, 0(vp) C load v limb
+ ld u0, 0(up) C load u limb
+ addi up, up, -8 C update up
+ addi rp, rp, -8 C update rp
+ sldi s1, v0, LSH
+ bdz L(ex1) C If done, skip loop
+
+ ALIGN(16)
+L(lo0): ld v1, 8(vp) C load v limb
+ ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
+ ldu u0, 16(up) C load u limb and update up
+ srdi s0, v0, RSH C shift down previous v limb
+ std s1, 8(rp) C store result limb
+ rldimi s0, v1, LSH, 0 C left shift v limb and merge with prev v limb
+ bdz L(ex0) C decrement ctr and exit if done
+ ldu v0, 16(vp) C load v limb and update vp
+ ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
+ ld u0, 8(up) C load u limb
+ srdi s1, v1, RSH C shift down previous v limb
+ stdu s0, 16(rp) C store result limb and update rp
+ rldimi s1, v0, LSH, 0 C left shift v limb and merge with prev v limb
+ bdnz L(lo0) C decrement ctr and loop back
+
+L(ex1): ADDSUBE(r7, s1, u0)
+ std r7, 8(rp) C store last result limb
+ srdi r0, v0, RSH
+ RETVAL( r0)
+ blr
+L(ex0): ADDSUBE(r7, s0, u0)
+ std r7, 16(rp) C store last result limb
+ srdi r0, v1, RSH
+ RETVAL( r0)
+ blr
+
+
+L(big): rldicl. r0, n, 0,63 C r0 = n & 1, set cr0
+ addi r6, n, -1 C ...for ctr
+ srdi r6, r6, 1 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b0)
+
+L(b1): ld v1, 0(vp)
+ ld u0, 0(up)
+ sldi s1, v1, LSH
+ srdi s0, v1, RSH
+ ld v0, 8(vp)
+ ADDSUBC(s1, s1, u0) C add limbs without cy, set cy
+ addi rpx, rp, -16
+ addi rp, rp, -8
+ sub upx, up, rp
+ sub vpx, vp, rp
+ sub up, up, rpx
+ sub vp, vp, rpx
+ addi up, up, 8
+ addi upx, upx, 16
+ addi vp, vp, 16
+ addi vpx, vpx, 24
+ b L(mid)
+
+L(b0): ld v0, 0(vp)
+ ld u0, 0(up)
+ sldi s0, v0, LSH
+ srdi s1, v0, RSH
+ ld v1, 8(vp)
+ ADDSUBC(s0, s0, u0) C add limbs without cy, set cy
+ addi rpx, rp, -8
+ addi rp, rp, -16
+ sub upx, up, rpx
+ sub vpx, vp, rpx
+ sub up, up, rp
+ sub vp, vp, rp
+ addi up, up, 8
+ addi upx, upx, 16
+ addi vp, vp, 16
+ addi vpx, vpx, 24
+
+ ALIGN(32)
+L(top): ldx u0, rp, up
+ ldx v0, rp, vp
+ rldimi s1, v1, LSH, 0
+ stdu s0, 16(rp)
+ srdi s0, v1, RSH
+ ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
+L(mid): ldx u0, rpx, upx
+ ldx v1, rpx, vpx
+ rldimi s0, v0, LSH, 0
+ stdu s1, 16(rpx)
+ srdi s1, v0, RSH
+ ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
+ bdnz L(top) C decrement CTR and loop back
+
+ ldx u0, rp, up
+ rldimi s1, v1, LSH, 0
+ std s0, 16(rp)
+ srdi s0, v1, RSH
+ ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
+ std s1, 24(rp)
+
+ RETVAL( s0)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm
new file mode 100644
index 0000000000..45cded9715
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -0,0 +1,132 @@
+dnl PPC64 mpn_bdiv_dbm1c.
+
+dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8.25
+C POWER5 8.5 fluctuating as function of n % 3
+C POWER6 15
+C POWER7 4.75
+
+C TODO
+C * Nothing to do...
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`bd', `r6')
+define(`cy', `r7')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+ ld r0, 0(r4)
+
+ rldicl. r12, r5, 0,62
+ cmpldi cr6, r12, 2
+ cmpldi cr7, r5, 4
+ addi r5, r5, 1
+ srwi r5, r5, 2
+ mtctr r5
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+ ALIGN(16)
+L(b11): mulld r5, r0, r6
+ mulhdu r12, r0, r6
+ ld r0, 8(r4)
+ addi r4, r4, -24
+ addi r3, r3, -24
+ b L(3)
+
+ ALIGN(16)
+L(b00): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ addi r4, r4, -16
+ addi r3, r3, -16
+ b L(0)
+
+ ALIGN(16)
+L(b01): mulld r5, r0, r6
+ mulhdu r12, r0, r6
+ addi r3, r3, -8
+ ble cr7, L(e1)
+ ld r0, 8(r4)
+ addi r4, r4, -8
+ b L(1)
+
+ ALIGN(16)
+L(b10): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ ble cr7, L(e2)
+
+ ALIGN(16)
+L(top): subfc r11, r9, r7
+ ld r10, 8(r4)
+ ld r0, 16(r4)
+ subfe r7, r8, r11
+ std r11, 0(r3)
+ mulld r5, r10, r6
+ mulhdu r12, r10, r6
+L(1): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ subfc r11, r5, r7
+ subfe r7, r12, r11
+ std r11, 8(r3)
+L(0): subfc r11, r9, r7
+ ld r10, 24(r4)
+ ld r0, 32(r4)
+ subfe r7, r8, r11
+ std r11, 16(r3)
+ mulld r5, r10, r6
+ mulhdu r12, r10, r6
+L(3): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ subfc r11, r5, r7
+ subfe r7, r12, r11
+ std r11, 24(r3)
+ addi r4, r4, 32
+ addi r3, r3, 32
+ bdnz L(top)
+
+L(e2): ld r10, 8(r4)
+ mulld r5, r10, r6
+ mulhdu r12, r10, r6
+ subfc r11, r9, r7
+ subfe r7, r8, r11
+ std r11, 0(r3)
+L(e1): subfc r11, r5, r7
+ std r11, 8(r3)
+ subfe r3, r12, r11
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm b/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm
new file mode 100644
index 0000000000..24968c1912
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm
@@ -0,0 +1,196 @@
+dnl PowerPC-64 mpn_cnd_add_n/mpn_cnd_sub_n.
+
+dnl Copyright 1999-2001, 2003-2005, 2007, 2011, 2012 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 2.25
+C POWER5 ?
+C POWER6 3
+C POWER7 2
+
+C INPUT PARAMETERS
+define(`cnd', `r3')
+define(`rp', `r4')
+define(`up', `r5')
+define(`vp', `r6')
+define(`n', `r7')
+
+ifdef(`OPERATION_cnd_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_cnd_add_n)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_cnd_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_cnd_sub_n)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ subfic cnd, cnd, 0
+ subfe cnd, cnd, cnd
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r8, 0(up) C load s1 limb
+ ld r9, 0(vp) C load s2 limb
+ ld r10, 8(up) C load s1 limb
+ ld r11, 8(vp) C load s2 limb
+ ld r12, 16(up) C load s1 limb
+ addi up, up, 24
+ ld r0, 16(vp) C load s2 limb
+ addi vp, vp, 24
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r29, 0(rp)
+ std r30, 8(rp)
+ std r31, 16(rp)
+ addi rp, rp, 24
+ bdnz L(go)
+ b L(ret)
+
+L(b01): ld r12, 0(up) C load s1 limb
+ addi up, up, 8
+ ld r0, 0(vp) C load s2 limb
+ addi vp, vp, 8
+ and r0, r0, cnd
+ ADDSUB r31, r0, r12 C add
+ std r31, 0(rp)
+ addi rp, rp, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b10): ld r10, 0(up) C load s1 limb
+ ld r11, 0(vp) C load s2 limb
+ ld r12, 8(up) C load s1 limb
+ addi up, up, 16
+ ld r0, 8(vp) C load s2 limb
+ addi vp, vp, 16
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r30, r11, r10 C add
+ ADDSUBC r31, r0, r12 C add
+ std r30, 0(rp)
+ std r31, 8(rp)
+ addi rp, rp, 16
+ bdnz L(go)
+ b L(ret)
+
+L(b00): CLRCB C clear/set cy
+L(go): ld r7, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdz L(end)
+
+ addi up, up, 32
+ addi vp, vp, 32
+
+L(top): ADDSUBC r28, r27, r7
+ ld r7, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ADDSUBC r29, r9, r8
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ADDSUBC r30, r11, r10
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ADDSUBC r31, r0, r12
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ std r28, 0(rp)
+ addi up, up, 32
+ std r29, 8(rp)
+ addi vp, vp, 32
+ std r30, 16(rp)
+ std r31, 24(rp)
+ addi rp, rp, 32
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r27, r7
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r28, 0(rp)
+ std r29, 8(rp)
+ std r30, 16(rp)
+ std r31, 24(rp)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/dive_1.asm b/gmp/mpn/powerpc64/mode64/dive_1.asm
new file mode 100644
index 0000000000..434dde9145
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/dive_1.asm
@@ -0,0 +1,132 @@
+dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2006, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm unorm
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
+C POWER7 12 12
+
+C TODO
+C * Check if n=1 code is really an improvement. It probably isn't.
+C * Make more similar to mode1o.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`d', `r6')
+
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_divexact_1,toc)
+ addic. n, n, -1
+ ld r12, 0(up)
+ bne cr0, L(2)
+ divdu r0, r12, d
+ std r0, 0(rp)
+ blr
+L(2):
+ rldicl. r0, d, 0, 63
+ li r10, 0
+ bne cr0, L(7)
+ neg r0, d
+ and r0, d, r0
+ cntlzd r0, r0
+ subfic r0, r0, 63
+ rldicl r10, r0, 0, 32
+ srd d, d, r0
+L(7):
+ mtctr n
+ LEA( r5, binvert_limb_table)
+ rldicl r11, d, 63, 57
+ lbzx r0, r5, r11
+ mulld r9, r0, r0
+ sldi r0, r0, 1
+ mulld r9, d, r9
+ subf r0, r9, r0
+ mulld r5, r0, r0
+ sldi r0, r0, 1
+ mulld r5, d, r5
+ subf r0, r5, r0
+ mulld r9, r0, r0
+ sldi r0, r0, 1
+ mulld r9, d, r9
+ subf r7, r9, r0 C r7 = 1/d mod 2^64
+ bne cr0, L(norm)
+ subfic r8, r10, 64 C set carry as side effect
+ li r5, 0
+ srd r11, r12, r10
+
+ ALIGN(16)
+L(loop0):
+ ld r12, 8(up)
+ nop
+ addi up, up, 8
+ sld r0, r12, r8
+ or r11, r11, r0
+ subfe r9, r5, r11
+ srd r11, r12, r10
+ mulld r0, r7, r9
+ mulhdu r5, r0, d
+ std r0, 0(rp)
+ addi rp, rp, 8
+ bdnz L(loop0)
+
+ subfe r0, r5, r11
+ mulld r0, r7, r0
+ std r0, 0(rp)
+ blr
+
+ ALIGN(16)
+L(norm):
+ mulld r11, r12, r7
+ mulhdu r5, r11, d
+ std r11, 0(rp)
+ ALIGN(16)
+L(loop1):
+ ld r9, 8(up)
+ addi up, up, 8
+ subfe r5, r5, r9
+ mulld r11, r7, r5
+ mulhdu r5, r11, d C result not used
+ std r11, 8(rp)
+ addi rp, rp, 8
+ bdnz L(loop1)
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/powerpc64/mode64/divrem_1.asm b/gmp/mpn/powerpc64/mode64/divrem_1.asm
new file mode 100644
index 0000000000..b283877006
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/divrem_1.asm
@@ -0,0 +1,274 @@
+dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.
+
+dnl Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm unorm frac
+C POWER3/PPC630 16-34 16-34 ~11 outdated figures
+C POWER4/PPC970 28 28 19
+C POWER5 29 29 ~19
+C POWER6 49 59 ~42
+C POWER7 24.5 23 ~14
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d = r7
+
+C We use a not very predictable branch in the frac code, therefore the cycle
+C count wobbles somewhat. With the alternative branch-free code, things run
+C considerably slower on POWER4/PPC970 and POWER5.
+
+C Add preinv entry point.
+
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_divrem_1,toc)
+
+ mfcr r12
+ add. r10, r6, r4
+ std r25, -56(r1)
+ mr r25, r4
+ mflr r0
+ std r26, -48(r1)
+ mr r26, r5
+ std r28, -32(r1)
+ mr r28, r6
+ std r29, -24(r1)
+ mr r29, r3
+ li r3, 0
+ std r30, -16(r1)
+ mr r30, r7
+ std r31, -8(r1)
+ li r31, 0
+ std r27, -40(r1)
+ std r0, 16(r1)
+ stw r12, 8(r1)
+ stdu r1, -176(r1)
+ beq- cr0, L(1)
+ cmpdi cr7, r7, 0
+ sldi r0, r10, 3
+ add r11, r0, r29
+ addi r29, r11, -8
+ blt- cr7, L(162)
+ cmpdi cr4, r6, 0
+ beq+ cr4, L(71)
+L(163):
+ sldi r9, r6, 3
+ add r9, r9, r5
+ ld r7, -8(r9)
+ cmpld cr7, r7, r30
+ bge- cr7, L(71)
+ cmpdi cr7, r10, 1
+ li r0, 0
+ mr r31, r7
+ std r0, -8(r11)
+ addi r29, r29, -8
+ mr r3, r7
+ beq- cr7, L(1)
+ addi r28, r6, -1
+ cmpdi cr4, r28, 0
+L(71):
+ cntlzd r27, r30
+ sld r30, r30, r27
+ sld r31, r31, r27
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ beq- cr4, L(110)
+ sldi r9, r28, 3
+ addic. r6, r28, -2
+ add r9, r9, r26
+ subfic r5, r27, 64
+ ld r8, -8(r9)
+ srd r0, r8, r5
+ or r31, r31, r0
+ sld r7, r8, r27
+ blt- cr0, L(154)
+ addi r28, r28, -1
+ mtctr r28
+ sldi r6, r6, 3
+ ALIGN(16)
+L(uloop):
+ ldx r8, r26, r6
+ nop
+ mulld r0, r31, r3
+ mulhdu r10, r31, r3
+ addi r11, r31, 1
+ srd r9, r8, r5
+ addi r6, r6, -8
+ or r9, r7, r9
+ addc r0, r0, r9
+ adde r10, r10, r11
+ mulld r31, r10, r30
+ subf r31, r31, r9
+ subfc r0, r31, r0 C r <= ql
+ subfe r0, r0, r0 C r0 = -(r <= ql)
+ and r9, r30, r0
+ add r31, r31, r9
+ add r10, r0, r10 C qh -= (r >= ql)
+ cmpld cr7, r31, r30
+ bge- cr7, L(164)
+L(123):
+ std r10, 0(r29)
+ addi r29, r29, -8
+ sld r7, r8, r27
+ bdnz L(uloop)
+L(154):
+ addi r11, r31, 1
+ nop
+ mulld r0, r31, r3
+ mulhdu r8, r31, r3
+ addc r0, r0, r7
+ adde r8, r8, r11
+ mulld r31, r8, r30
+ subf r31, r31, r7
+ subfc r0, r0, r31 C r >= ql
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r8, r7, r8 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+ cmpld cr7, r31, r30
+ bge- cr7, L(165)
+L(134):
+ std r8, 0(r29)
+ addi r29, r29, -8
+L(110):
+ addic. r0, r25, -1
+ blt- cr0, L(156)
+ mtctr r25
+ neg r9, r30
+ ALIGN(16)
+L(ufloop):
+ addi r11, r31, 1
+ nop
+ mulld r0, r3, r31
+ mulhdu r10, r3, r31
+ add r10, r10, r11
+ mulld r31, r9, r10
+ifelse(0,1,`
+ subfc r0, r0, r31
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+',`
+ cmpld cr7, r31, r0
+ blt cr7, L(29)
+ add r31, r30, r31
+ addi r10, r10, -1
+L(29):
+')
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(ufloop)
+L(156):
+ srd r3, r31, r27
+L(1):
+ addi r1, r1, 176
+ ld r0, 16(r1)
+ lwz r12, 8(r1)
+ mtlr r0
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ mtcrf 8, r12
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+L(162):
+ cmpdi cr7, r6, 0
+ beq- cr7, L(8)
+ sldi r9, r6, 3
+ addi r29, r29, -8
+ add r9, r9, r5
+ addi r28, r6, -1
+ ld r31, -8(r9)
+ subfc r9, r7, r31
+ li r9, 0
+ adde r9, r9, r9
+ neg r0, r9
+ std r9, -8(r11)
+ and r0, r0, r7
+ subf r31, r0, r31
+L(8):
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ li r27, 0
+ addic. r6, r28, -1
+ blt- cr0, L(110)
+ mtctr r28
+ sldi r6, r6, 3
+ ALIGN(16)
+L(nloop):
+ addi r11, r31, 1
+ ldx r8, r26, r6
+ mulld r0, r31, r3
+ mulhdu r10, r31, r3
+ addi r6, r6, -8
+ addc r0, r0, r8
+ adde r10, r10, r11
+ mulld r31, r10, r30
+ subf r31, r31, r8 C r = nl - qh * d
+ subfc r0, r31, r0 C r <= ql
+ subfe r0, r0, r0 C r0 = -(r <= ql)
+ and r9, r30, r0
+ add r31, r31, r9
+ add r10, r0, r10 C qh -= (r >= ql)
+ cmpld cr7, r31, r30
+ bge- cr7, L(167)
+L(51):
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(nloop)
+ b L(110)
+
+L(164):
+ subf r31, r30, r31
+ addi r10, r10, 1
+ b L(123)
+L(167):
+ subf r31, r30, r31
+ addi r10, r10, 1
+ b L(51)
+L(165):
+ subf r31, r30, r31
+ addi r8, r8, 1
+ b L(134)
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/divrem_2.asm b/gmp/mpn/powerpc64/mode64/divrem_2.asm
new file mode 100644
index 0000000000..73ec23c94d
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/divrem_2.asm
@@ -0,0 +1,187 @@
+dnl PPC-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm frac
+C POWER3/PPC630
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
+C POWER6 30.5 ?
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C dp = r7
+
+
+ifdef(`DARWIN',,`
+define(`r2',`r31')') C FIXME!
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_divrem_2,toc)
+ mflr r0
+ std r23, -72(r1)
+ std r24, -64(r1)
+ std r25, -56(r1)
+ std r26, -48(r1)
+ std r27, -40(r1)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+ std r0, 16(r1)
+ stdu r1, -192(r1)
+ mr r24, r3
+ mr r25, r4
+ sldi r0, r6, 3
+ add r26, r5, r0
+ addi r26, r26, -24
+ ld r30, 8(r7)
+ ld r28, 0(r7)
+ ld r29, 16(r26)
+ ld r31, 8(r26)
+
+ifelse(0,1,`
+ li r23, 0
+ cmpld cr7, r29, r30
+ blt cr7, L(8)
+ bgt cr7, L(9)
+ cmpld cr0, r31, r28
+ blt cr0, L(8)
+L(9): subfc r31, r28, r31
+ subfe r29, r30, r29
+ li r23, 1
+',`
+ li r23, 0
+ cmpld cr7, r29, r30
+ blt cr7, L(8)
+ mfcr r0
+ rlwinm r0, r0, 30, 1
+ subfc r9, r28, r31
+ addze. r0, r0
+ nop
+ beq cr0, L(8)
+ subfc r31, r28, r31
+ subfe r29, r30, r29
+ li r23, 1
+')
+
+L(8):
+ add r27, r25, r6
+ addic. r27, r27, -3
+ blt cr0, L(18)
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ mulld r10, r3, r30
+ mulhdu r0, r3, r28
+ addc r8, r10, r28
+ subfe r11, r1, r1
+ addc r10, r8, r0
+ addze. r11, r11
+ blt cr0, L(91)
+L(40):
+ subfc r10, r30, r10
+ addme. r11, r11
+ addi r3, r3, -1
+ bge cr0, L(40)
+L(91):
+ addi r5, r27, 1
+ mtctr r5
+ sldi r0, r27, 3
+ add r24, r24, r0
+ ALIGN(16)
+L(loop):
+ mulhdu r8, r29, r3
+ mulld r6, r29, r3
+ addc r6, r6, r31
+ adde r8, r8, r29
+ cmpd cr7, r27, r25
+ mulld r0, r30, r8
+ mulhdu r11, r28, r8
+ mulld r10, r28, r8
+ subf r31, r0, r31
+ li r7, 0
+ blt cr7, L(60)
+ ld r7, 0(r26)
+ addi r26, r26, -8
+ nop
+L(60): subfc r7, r28, r7
+ subfe r31, r30, r31
+ subfc r7, r10, r7
+ subfe r4, r11, r31
+ subfc r9, r6, r4
+ subfe r9, r1, r1
+ andc r6, r28, r9
+ andc r0, r30, r9
+ addc r31, r7, r6
+ adde r29, r4, r0
+ subf r8, r9, r8
+ cmpld cr7, r29, r30
+ bge- cr7, L(fix)
+L(bck): std r8, 0(r24)
+ addi r24, r24, -8
+ addi r27, r27, -1
+ bdnz L(loop)
+L(18):
+ std r31, 8(r26)
+ std r29, 16(r26)
+ mr r3, r23
+ addi r1, r1, 192
+ ld r0, 16(r1)
+ mtlr r0
+ ld r23, -72(r1)
+ ld r24, -64(r1)
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+L(fix):
+ mfcr r0
+ rlwinm r0, r0, 30, 1
+ subfc r9, r28, r31
+ addze. r0, r0
+ beq cr0, L(bck)
+ subfc r31, r28, r31
+ subfe r29, r30, r29
+ addi r8, r8, 1
+ b L(bck)
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/gcd_1.asm b/gmp/mpn/powerpc64/mode64/gcd_1.asm
new file mode 100644
index 0000000000..8762bbbef5
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/gcd_1.asm
@@ -0,0 +1,122 @@
+dnl PowerPC-64 mpn_gcd_1.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 ?
+C POWER4/PPC970 8.5
+C POWER5 ?
+C POWER6 10.1
+C POWER7 9.4
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+C INPUT PARAMETERS
+define(`up', `r3')
+define(`n', `r4')
+define(`v0', `r5')
+
+EXTERN_FUNC(mpn_mod_1)
+EXTERN_FUNC(mpn_modexact_1c_odd)
+
+ASM_START()
+PROLOGUE(mpn_gcd_1,toc)
+ mflr r0
+ std r30, -16(r1)
+ std r31, -8(r1)
+ std r0, 16(r1)
+ stdu r1, -128(r1)
+
+ ld r7, 0(up) C U low limb
+ or r0, r5, r7 C x | y
+
+ neg r6, r0
+ and r6, r6, r0
+ cntlzd r31, r6 C common twos
+ subfic r31, r31, 63
+
+ neg r6, r5
+ and r6, r6, r5
+ cntlzd r8, r6
+ subfic r8, r8, 63
+ srd r5, r5, r8
+ mr r30, r5 C v0 saved
+
+ cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD
+ blt L(bmod)
+ CALL( mpn_mod_1)
+ b L(reduced)
+L(bmod):
+ li r6, 0
+ CALL( mpn_modexact_1c_odd)
+L(reduced):
+
+define(`mask', `r0')dnl
+define(`a1', `r4')dnl
+define(`a2', `r5')dnl
+define(`d1', `r6')dnl
+define(`d2', `r7')dnl
+define(`cnt', `r9')dnl
+
+ neg. r6, r3
+ and r6, r6, r3
+ cntlzd cnt, r6
+ subfic cnt, cnt, 63
+ li r12, 63
+ bne L(mid)
+ b L(end)
+
+ ALIGN(16)
+L(top):
+ and a1, r10, mask C d - a
+ andc a2, r11, mask C a - d
+ and d1, r3, mask C a
+ andc d2, r30, mask C d
+ or r3, a1, a2 C new a
+ subf cnt, cnt, r12
+ or r30, d1, d2 C new d
+L(mid): srd r3, r3, cnt
+ sub. r10, r30, r3 C r10 = d - a
+ subc r11, r3, r30 C r11 = a - d
+ neg r8, r10
+ and r8, r8, r10
+ subfe mask, mask, mask
+ cntlzd cnt, r8
+ bne L(top)
+
+L(end): sld r3, r30, r31
+
+ addi r1, r1, 128
+ ld r0, 16(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ mtlr r0
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/gmp-mparam.h
new file mode 100644
index 0000000000..f8305f4720
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/gmp-mparam.h
@@ -0,0 +1,82 @@
+/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1600MHz PPC970 */
+
+/* Generated by tuneup.c, 2009-01-14, gcc 4.0 */
+
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 93
+#define MUL_TOOM44_THRESHOLD 135
+
+#define SQR_BASECASE_THRESHOLD 6
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 74
+#define SQR_TOOM4_THRESHOLD 136
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 44
+#define MULLO_MUL_N_THRESHOLD 234
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 33
+#define POWM_THRESHOLD 89
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 93
+#define GCD_DC_THRESHOLD 237
+#define GCDEXT_DC_THRESHOLD 273
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 6
+#define MOD_1_2_THRESHOLD 9
+#define MOD_1_4_THRESHOLD 23
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 0
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1713
+
+#define MUL_FFT_TABLE { 336, 672, 1856, 2816, 7168, 20480, 81920, 327680, 0 }
+#define MUL_FFT_MODF_THRESHOLD 304
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_TABLE { 272, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD 272
+#define SQR_FFT_THRESHOLD 2688
diff --git a/gmp/mpn/powerpc64/mode64/invert_limb.asm b/gmp/mpn/powerpc64/mode64/invert_limb.asm
new file mode 100644
index 0000000000..dfdba6451e
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/invert_limb.asm
@@ -0,0 +1,88 @@
+dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright 2004-2006, 2008, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb (approximate)
+C POWER3/PPC630 80
+C POWER4/PPC970 86
+C POWER5 86
+C POWER6 170
+C POWER7 66
+
+ASM_START()
+PROLOGUE(mpn_invert_limb,toc)
+ LEAL( r12, approx_tab)
+ srdi r9, r3, 32
+ rlwinm r9, r9, 10, 23, 30 C (d >> 55) & 0x1fe
+ srdi r10, r3, 24 C d >> 24
+ lis r11, 0x1000
+ rldicl r8, r3, 0, 63 C d mod 2
+ addi r10, r10, 1 C d40
+ sldi r11, r11, 32 C 2^60
+ srdi r7, r3, 1 C d/2
+ add r7, r7, r8 C d63 = ceil(d/2)
+ neg r8, r8 C mask = -(d mod 2)
+ lhzx r0, r9, r12
+ mullw r9, r0, r0 C v0*v0
+ sldi r6, r0, 11 C v0 << 11
+ addi r0, r6, -1 C (v0 << 11) - 1
+ mulld r9, r9, r10 C v0*v0*d40
+ srdi r9, r9, 40 C v0*v0*d40 >> 40
+ subf r9, r9, r0 C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
+ mulld r0, r9, r10 C v1*d40
+ sldi r6, r9, 13 C v1 << 13
+ subf r0, r0, r11 C 2^60 - v1*d40
+ mulld r0, r0, r9 C v1 * (2^60 - v1*d40)
+ srdi r0, r0, 47 C v1 * (2^60 - v1*d40) >> 47
+ add r0, r0, r6 C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
+ mulld r11, r0, r7 C v2 * d63
+ srdi r10, r0, 1 C v2 >> 1
+ sldi r9, r0, 31 C v2 << 31
+ and r8, r10, r8 C (v2 >> 1) & mask
+ subf r8, r11, r8 C ((v2 >> 1) & mask) - v2 * d63
+ mulhdu r0, r8, r0 C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63)
+ srdi r0, r0, 1 C p1 >> 1
+ add r0, r0, r9 C v3 = (v2 << 31) + (p1 >> 1)
+ nop
+ mulld r11, r0, r3
+ mulhdu r9, r0, r3
+ addc r10, r11, r3
+ adde r3, r9, r3
+ subf r3, r3, r0
+ blr
+EPILOGUE()
+
+DEF_OBJECT(approx_tab)
+forloop(i,256,512-1,dnl
+` .short eval(0x7fd00/i)
+')dnl
+END_OBJECT(approx_tab)
+ASM_END()
diff --git a/gmp/mpn/powerpc64/mode64/mod_1_1.asm b/gmp/mpn/powerpc64/mode64/mod_1_1.asm
new file mode 100644
index 0000000000..873373054f
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/mod_1_1.asm
@@ -0,0 +1,164 @@
+dnl PowerPC-64 mpn_mod_1_1p
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 17
+C POWER5 16
+C POWER6 30
+C POWER7 10.2
+
+C TODO
+C * Optimise, in particular the cps function. This was compiler-generated and
+C then hand optimised.
+
+C INPUT PARAMETERS
+define(`ap', `r3')
+define(`n', `r4')
+define(`d', `r5')
+define(`cps', `r6')
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_mod_1_1p)
+ sldi r10, r4, 3
+ addi r4, r4, -1
+ add r3, r3, r10
+ ld r0, 16(r6) C B1modb
+ ld r12, 24(r6) C B2modb
+ ld r9, -8(r3)
+ ld r10, -16(r3)
+ mtctr r4
+ mulhdu r8, r9, r0
+ mulld r7, r9, r0
+ addc r11, r7, r10
+ addze r9, r8
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r4, -24(r3)
+ addi r3, r3, -8
+ nop
+ mulld r10, r11, r0
+ mulld r8, r9, r12
+ mulhdu r11, r11, r0
+ mulhdu r9, r9, r12
+ addc r7, r10, r4
+ addze r10, r11
+ addc r11, r8, r7
+ adde r9, r9, r10
+ bdnz L(top)
+
+L(end):
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+` lwz r0, 8(r6)',
+` lwz r0, 12(r6)')
+ ld r3, 0(r6)
+ cmpdi cr7, r0, 0
+ beq- cr7, L(4)
+ subfic r10, r0, 64
+ sld r9, r9, r0
+ srd r10, r11, r10
+ or r9, r10, r9
+L(4): subfc r10, r5, r9
+ subfe r10, r10, r10
+ nand r10, r10, r10
+ sld r11, r11, r0
+ and r10, r10, r5
+ subf r9, r10, r9
+ mulhdu r10, r9, r3
+ mulld r3, r9, r3
+ addi r9, r9, 1
+ addc r8, r3, r11
+ adde r3, r10, r9
+ mulld r3, r3, r5
+ subf r3, r3, r11
+ cmpld cr7, r8, r3
+ bge cr7, L(5) C FIXME: Make branch-less
+ add r3, r3, r5
+L(5): cmpld cr7, r3, r5
+ bge- cr7, L(10)
+ srd r3, r3, r0
+ blr
+
+L(10): subf r3, r5, r3
+ srd r3, r3, r0
+ blr
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps,toc)
+ mflr r0
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+ cntlzd r31, r4
+ std r0, 16(r1)
+ extsw r31, r31
+ mr r29, r3
+ stdu r1, -144(r1)
+ sld r30, r4, r31
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ cmpdi cr7, r31, 0
+ neg r0, r30
+ beq- cr7, L(13)
+ subfic r11, r31, 64
+ li r0, 1
+ neg r9, r30
+ srd r11, r3, r11
+ sld r0, r0, r31
+ or r0, r11, r0
+ mulld r0, r0, r9
+L(13): mulhdu r9, r0, r3
+ mulld r11, r0, r3
+ add r9, r0, r9
+ nor r9, r9, r9
+ mulld r9, r9, r30
+ cmpld cr7, r11, r9
+ bge cr7, L(14)
+ add r9, r9, r30
+L(14): addi r1, r1, 144
+ srd r0, r0, r31
+ std r31, 8(r29)
+ std r3, 0(r29)
+ std r0, 16(r29)
+ ld r0, 16(r1)
+ srd r9, r9, r31
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ std r9, 24(r29)
+ ld r29, -24(r1)
+ mtlr r0
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/mod_1_4.asm b/gmp/mpn/powerpc64/mode64/mod_1_4.asm
new file mode 100644
index 0000000000..0b7d6bf699
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/mod_1_4.asm
@@ -0,0 +1,270 @@
+dnl PowerPC-64 mpn_mod_1s_4p
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 9
+C POWER5 9
+C POWER6 13
+C POWER7 3.5
+
+C TODO
+C * Optimise, in particular the cps function. This was compiler-generated and
+C then hand optimised.
+
+C INPUT PARAMETERS
+define(`ap', `r3')
+define(`n', `r4')
+define(`d', `r5')
+define(`cps', `r6')
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_mod_1s_4p)
+ std r23, -72(r1)
+ ld r23, 48(cps)
+ std r24, -64(r1)
+ std r25, -56(r1)
+ ld r24, 32(cps)
+ ld r25, 24(cps)
+ std r26, -48(r1)
+ std r27, -40(r1)
+ ld r26, 16(cps)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+ ld r30, 40(cps)
+
+ rldicl. r0, n, 0,62
+ sldi r31, n, 3
+ add ap, ap, r31 C make ap point at end of operand
+
+ cmpdi cr7, r0, 2
+ beq cr0, L(b00)
+ blt cr7, L(b01)
+ beq cr7, L(b10)
+
+L(b11): ld r11, -16(ap)
+ ld r9, -8(ap)
+ ld r0, -24(ap)
+ mulhdu r27, r11, r26
+ mulld r8, r11, r26
+ mulhdu r11, r9, r25
+ mulld r9, r9, r25
+ addc r31, r8, r0
+ addze r10, r27
+ addc r0, r9, r31
+ adde r9, r11, r10
+ addi ap, ap, -40
+ b L(6)
+
+ ALIGN(16)
+L(b00): ld r11, -24(ap)
+ ld r10, -16(ap)
+ ld r9, -8(ap)
+ ld r0, -32(ap)
+ mulld r8, r11, r26
+ mulhdu r7, r10, r25
+ mulhdu r27, r11, r26
+ mulhdu r11, r9, r24
+ mulld r10, r10, r25
+ mulld r9, r9, r24
+ addc r31, r8, r0
+ addze r0, r27
+ addc r8, r31, r10
+ adde r10, r0, r7
+ addc r0, r9, r8
+ adde r9, r11, r10
+ addi ap, ap, -48
+ b L(6)
+
+ ALIGN(16)
+L(b01): li r9, 0
+ ld r0, -8(ap)
+ addi ap, ap, -24
+ b L(6)
+
+ ALIGN(16)
+L(b10): ld r9, -8(ap)
+ ld r0, -16(ap)
+ addi ap, ap, -32
+
+ ALIGN(16)
+L(6): addi r10, n, 3
+ srdi r7, r10, 2
+ mtctr r7
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r31, -16(ap)
+ ld r10, -8(ap)
+ ld r11, 8(ap)
+ ld r12, 0(ap)
+ mulld r29, r0, r30 C rl * B4modb
+ mulhdu r0, r0, r30 C rl * B4modb
+ mulhdu r27, r10, r26
+ mulld r10, r10, r26
+ mulhdu r7, r9, r23 C rh * B5modb
+ mulld r9, r9, r23 C rh * B5modb
+ mulhdu r28, r11, r24
+ mulld r11, r11, r24
+ mulhdu r4, r12, r25
+ mulld r12, r12, r25
+ addc r8, r10, r31
+ addze r10, r27
+ addi ap, ap, -32
+ addc r27, r8, r12
+ adde r12, r10, r4
+ addc r11, r27, r11
+ adde r31, r12, r28
+ addc r12, r11, r29
+ adde r4, r31, r0
+ addc r0, r9, r12
+ adde r9, r7, r4
+ bdnz L(top)
+
+L(end):
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+` lwz r3, 8(cps)',
+` lwz r3, 12(cps)')
+ mulld r10, r9, r26
+ mulhdu r9, r9, r26
+ addc r11, r0, r10
+ addze r9, r9
+ ld r10, 0(cps)
+ subfic r8, r3, 64
+ sld r9, r9, r3
+ srd r8, r11, r8
+ sld r11, r11, r3
+ or r9, r8, r9
+ mulld r0, r9, r10
+ mulhdu r10, r9, r10
+ addi r9, r9, 1
+ addc r8, r0, r11
+ adde r0, r10, r9
+ mulld r0, r0, d
+ subf r0, r0, r11
+ cmpld cr7, r8, r0
+ bge cr7, L(9)
+ add r0, r0, d
+L(9): cmpld cr7, r0, d
+ bge- cr7, L(16)
+L(10): srd r3, r0, r3
+ ld r23, -72(r1)
+ ld r24, -64(r1)
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+
+L(16): subf r0, d, r0
+ b L(10)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps,toc)
+ mflr r0
+ std r29, -24(r1)
+ std r30, -16(r1)
+ mr r29, r3
+ std r0, 16(r1)
+ std r31, -8(r1)
+ stdu r1, -144(r1)
+ cntlzd r31, r4
+ sld r30, r4, r31
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ subfic r9, r31, 64
+ li r10, 1
+ sld r10, r10, r31
+ srd r9, r3, r9
+ neg r0, r30
+ or r10, r10, r9
+ mulld r10, r10, r0
+ mulhdu r11, r10, r3
+ nor r11, r11, r11
+ subf r11, r10, r11
+ mulld r11, r11, r30
+ mulld r0, r10, r3
+ cmpld cr7, r0, r11
+ bge cr7, L(18)
+ add r11, r11, r30
+L(18): mulhdu r9, r11, r3
+ add r9, r11, r9
+ nor r9, r9, r9
+ mulld r9, r9, r30
+ mulld r0, r11, r3
+ cmpld cr7, r0, r9
+ bge cr7, L(19)
+ add r9, r9, r30
+L(19): mulhdu r0, r9, r3
+ add r0, r9, r0
+ nor r0, r0, r0
+ mulld r0, r0, r30
+ mulld r8, r9, r3
+ cmpld cr7, r8, r0
+ bge cr7, L(20)
+ add r0, r0, r30
+L(20): mulhdu r8, r0, r3
+ add r8, r0, r8
+ nor r8, r8, r8
+ mulld r8, r8, r30
+ mulld r7, r0, r3
+ cmpld cr7, r7, r8
+ bge cr7, L(21)
+ add r8, r8, r30
+L(21): srd r0, r0, r31
+ addi r1, r1, 144
+ srd r8, r8, r31
+ srd r10, r10, r31
+ srd r11, r11, r31
+ std r0, 40(r29)
+ std r31, 8(r29)
+ srd r9, r9, r31
+ ld r0, 16(r1)
+ ld r30, -16(r1)
+ std r8, 48(r29)
+ std r3, 0(r29)
+ mtlr r0
+ ld r31, -8(r1)
+ std r10, 16(r29)
+ std r11, 24(r29)
+ std r9, 32(r29)
+ ld r29, -24(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm b/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm
new file mode 100644
index 0000000000..c35e0e37a4
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -0,0 +1,132 @@
+dnl PowerPC-64 mpn_mod_34lsub1 -- modulo 2^48-1.
+
+dnl Copyright 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.33
+C POWER4/PPC970 1.5
+C POWER5 1.32
+C POWER6 2.35
+C POWER7 1
+
+C INPUT PARAMETERS
+define(`up',`r3')
+define(`n',`r4')
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+ li r8, 0
+ li r9, 0
+ li r10, 0
+ li r11, 0
+
+ cmpdi cr6, n, 3
+ blt cr6, L(lt3)
+
+ li r0, -0x5556 C 0xFFFFFFFFFFFFAAAA
+ rldimi r0, r0, 16, 32 C 0xFFFFFFFFAAAAAAAA
+ rldimi r0, r0, 32, 63 C 0xAAAAAAAAAAAAAAAB
+ mulhdu r0, r0, n
+ srdi r0, r0, 1 C r0 = [n / 3]
+ mtctr r0
+
+ ld r5, 0(up)
+ ld r6, 8(up)
+ ld r7, 16(up)
+ addi up, up, 24
+ bdz L(end)
+
+ ALIGN(16)
+L(top): addc r8, r8, r5
+ nop
+ ld r5, 0(up)
+ adde r9, r9, r6
+ ld r6, 8(up)
+ adde r10, r10, r7
+ ld r7, 16(up)
+ addi up, up, 48
+ addze r11, r11
+ bdz L(endx)
+ addc r8, r8, r5
+ nop
+ ld r5, -24(up)
+ adde r9, r9, r6
+ ld r6, -16(up)
+ adde r10, r10, r7
+ ld r7, -8(up)
+ addze r11, r11
+ bdnz L(top)
+
+ addi up, up, 24
+L(endx):
+ addi up, up, -24
+
+L(end): addc r8, r8, r5
+ adde r9, r9, r6
+ adde r10, r10, r7
+ addze r11, r11
+
+ sldi r5, r0, 1
+ add r5, r5, r0 C r11 = n / 3 * 3
+ sub n, n, r5 C n = n mod 3
+L(lt3): cmpdi cr6, n, 1
+ blt cr6, L(2)
+
+ ld r5, 0(up)
+ addc r8, r8, r5
+ li r6, 0
+ beq cr6, L(1)
+
+ ld r6, 8(up)
+L(1): adde r9, r9, r6
+ addze r10, r10
+ addze r11, r11
+
+L(2): rldicl r0, r8, 0, 16 C r0 = r8 mod 2^48
+ srdi r3, r8, 48 C r3 = r8 div 2^48
+ rldic r4, r9, 16, 16 C r4 = (r9 mod 2^32) << 16
+ srdi r5, r9, 32 C r5 = r9 div 2^32
+ rldic r6, r10, 32, 16 C r6 = (r10 mod 2^16) << 32
+ srdi r7, r10, 16 C r7 = r10 div 2^16
+
+ add r0, r0, r3
+ add r4, r4, r5
+ add r6, r6, r7
+
+ add r0, r0, r4
+ add r6, r6, r11
+
+ add r3, r0, r6
+ blr
+EPILOGUE()
+
+C |__r10__|__r9___|__r8___|
+C |-----|-----|-----|-----|
diff --git a/gmp/mpn/powerpc64/mode64/mode1o.asm b/gmp/mpn/powerpc64/mode64/mode1o.asm
new file mode 100644
index 0000000000..726339a931
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/mode1o.asm
@@ -0,0 +1,117 @@
+dnl PowerPC-64 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16
+C POWER6 ?
+C POWER7 12
+
+C TODO
+C * Check if n=1 code is really an improvement. It probably isn't.
+C * Make more similar to dive_1.asm.
+
+C INPUT PARAMETERS
+define(`up', `r3')
+define(`n', `r4')
+define(`d', `r5')
+define(`cy', `r6')
+
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_modexact_1c_odd,toc)
+ addic. n, n, -1 C set carry as side effect
+ ld r8, 0(up)
+ bne cr0, L(2)
+ cmpld cr7, r6, r8
+ bge cr7, L(4)
+ subf r8, r6, r8
+ divdu r3, r8, d
+ mulld r3, r3, d
+ subf. r3, r3, r8
+ beqlr cr0
+ subf r3, r3, d
+ blr
+
+L(4): subf r3, r8, r6
+ divdu r8, r3, d
+ mulld r8, r8, d
+ subf r3, r8, r3
+ blr
+
+L(2): LEA( r7, binvert_limb_table)
+ rldicl r9, d, 63, 57
+ mtctr n
+ lbzx r0, r7, r9
+ mulld r7, r0, r0
+ sldi r0, r0, 1
+ mulld r7, d, r7
+ subf r0, r7, r0
+ mulld r9, r0, r0
+ sldi r0, r0, 1
+ mulld r9, d, r9
+ subf r0, r9, r0
+ mulld r7, r0, r0
+ sldi r0, r0, 1
+ mulld r7, d, r7
+ subf r9, r7, r0
+
+ ALIGN(16)
+L(loop):
+ subfe r0, r6, r8
+ ld r8, 8(up)
+ addi up, up, 8
+ mulld r0, r9, r0
+ mulhdu r6, r0, d
+ bdnz L(loop)
+
+ cmpld cr7, d, r8
+ blt cr7, L(10)
+
+ subfe r0, r0, r0
+ subf r6, r0, r6
+ cmpld cr7, r6, r8
+ subf r3, r8, r6
+ bgelr cr7
+ add r3, d, r3
+ blr
+
+L(10): subfe r0, r6, r8
+ mulld r0, r9, r0
+ mulhdu r3, r0, d
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/powerpc64/mode64/mul_1.asm b/gmp/mpn/powerpc64/mode64/mul_1.asm
new file mode 100644
index 0000000000..27a8f8fb4d
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/mul_1.asm
@@ -0,0 +1,168 @@
+dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
+C POWER7 2.9
+
+C TODO
+C * Try to reduce the number of needed live registers (at least r5 and r10
+C could be combined)
+C * Optimize feed-in code, for speed and size.
+C * Clean up r12/r7 usage in feed-in code.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vl', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ mr r12, r7
+ b L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ li r12, 0 C cy_limb = 0
+L(ent): ld r26, 0(up)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addic n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): mr r7, r12
+ mulld r0, r26, r6
+ mulhdu r12, r26, r6
+ addi up, up, 8
+ addc r0, r0, r7
+ std r0, 0(rp)
+ addi rp, rp, 8
+ b L(fic)
+
+L(b00): ld r27, 8(up)
+ addi up, up, 16
+ mulld r0, r26, r6
+ mulhdu r5, r26, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r0, r0, r12
+ adde r7, r7, r5
+ addze r12, r8
+ std r0, 0(rp)
+ std r7, 8(rp)
+ addi rp, rp, 16
+ b L(fic)
+
+ nop C alignment
+L(b01): bdnz L(gt1)
+ mulld r0, r26, r6
+ mulhdu r8, r26, r6
+ addc r0, r0, r12
+ std r0, 0(rp)
+ b L(ret)
+L(gt1): ld r27, 8(up)
+ nop
+ mulld r0, r26, r6
+ mulhdu r5, r26, r6
+ ld r26, 16(up)
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ mulld r9, r26, r6
+ mulhdu r10, r26, r6
+ addc r0, r0, r12
+ adde r7, r7, r5
+ adde r9, r9, r8
+ addze r12, r10
+ std r0, 0(rp)
+ std r7, 8(rp)
+ std r9, 16(rp)
+ addi up, up, 24
+ addi rp, rp, 24
+ b L(fic)
+
+ nop
+L(fic): ld r26, 0(up)
+L(b10): ld r27, 8(up)
+ addi up, up, 16
+ bdz L(end)
+
+L(top): mulld r0, r26, r6
+ mulhdu r5, r26, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r26, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r5
+ mulld r9, r26, r6
+ mulhdu r10, r26, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r26, 16(up)
+ ld r27, 24(up)
+ std r0, 0(rp)
+ adde r9, r9, r8
+ std r7, 8(rp)
+ adde r11, r11, r10
+ std r9, 16(rp)
+ addi up, up, 32
+ std r11, 24(rp)
+
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): mulld r0, r26, r6
+ mulhdu r5, r26, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r5
+ std r0, 0(rp)
+ std r7, 8(rp)
+L(ret): addze r3, r8
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/mul_basecase.asm b/gmp/mpn/powerpc64/mode64/mul_basecase.asm
new file mode 100644
index 0000000000..18731879e4
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/mul_basecase.asm
@@ -0,0 +1,708 @@
+dnl PowerPC-64 mpn_mul_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 24
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0', `r25')
+define(`outer_rp', `r22')
+define(`outer_up', `r23')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+
+C Special code for un <= 2, for efficiency of these important cases,
+C and since it simplifies the default code.
+ cmpdi cr0, un, 2
+ bgt cr0, L(un_gt2)
+ cmpdi cr6, vn, 1
+ ld r7, 0(vp)
+ ld r5, 0(up)
+ mulld r8, r5, r7 C weight 0
+ mulhdu r9, r5, r7 C weight 1
+ std r8, 0(rp)
+ beq cr0, L(2x)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(2x): ld r0, 8(up)
+ mulld r8, r0, r7 C weight 1
+ mulhdu r10, r0, r7 C weight 2
+ addc r9, r9, r8
+ addze r10, r10
+ bne cr6, L(2x2)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ blr
+ ALIGN(16)
+L(2x2): ld r6, 8(vp)
+ nop
+ mulld r8, r5, r6 C weight 1
+ mulhdu r11, r5, r6 C weight 2
+ addc r9, r9, r8
+ std r9, 8(rp)
+ adde r11, r11, r10
+ mulld r12, r0, r6 C weight 2
+ mulhdu r0, r0, r6 C weight 3
+ addze r0, r0
+ addc r11, r11, r12
+ addze r0, r0
+ std r11, 16(rp)
+ std r0, 24(rp)
+ blr
+
+L(un_gt2):
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+ std r22, -80(r1)
+
+ mr outer_rp, rp
+ mr outer_up, up
+
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, 0(up)
+
+ rldicl. r0, un, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi un, un, 1 C compute count...
+ srdi un, un, 2 C ...for ctr
+ mtctr un C copy inner loop count into ctr
+ beq cr0, L(b0)
+ blt cr6, L(b1)
+ beq cr6, L(b2)
+
+
+ ALIGN(16)
+L(b3): mulld r0, r26, v0
+ mulhdu r12, r26, v0
+ addic r0, r0, 0
+ std r0, 0(rp)
+ ld r26, 8(up)
+ ld r27, 16(up)
+ bdz L(end_m_3)
+
+ ALIGN(16)
+L(lo_m_3):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ ld r26, 24(up)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r27, 32(up)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r31
+ mulld r9, r26, v0
+ mulhdu r10, r26, v0
+ ld r26, 40(up)
+ nop
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ld r27, 48(up)
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r10
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(lo_m_3)
+
+ ALIGN(16)
+L(end_m_3):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+
+ adde r0, r0, r12
+ adde r24, r24, r31
+
+ std r0, 8(rp)
+ std r24, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addic. vn, vn, -1
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_3):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 8
+ mr up, outer_up
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, 0(up)
+ ld r28, 0(rp)
+ mulld r0, r26, v0
+ mulhdu r12, r26, v0
+ addc r0, r0, r28
+ std r0, 0(rp)
+ ld r26, 8(up)
+ ld r27, 16(up)
+ bdz L(end_3)
+
+ ALIGN(16) C registers dying
+L(lo_3):
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 24(up) C
+ ld r28, 8(rp) C
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ ld r27, 32(up) C
+ ld r29, 16(rp) C
+ adde r0, r0, r12 C 0 12
+ adde r24, r24, r10 C 24 10
+ mulld r9, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 40(up) C
+ ld r30, 24(rp) C
+ mulld r11, r27, v0 C
+ mulhdu r12, r27, v0 C 27
+ ld r27, 48(up) C
+ ld r31, 32(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 8(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, 16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, 24(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 32(rp) C 11
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ bdnz L(lo_3) C
+
+ ALIGN(16)
+L(end_3):
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ ld r28, 8(rp)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r29, 16(rp)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r10
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 8(rp)
+ adde r24, r24, r29
+ std r24, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+
+ addic. vn, vn, -1
+ bne L(outer_lo_3)
+ b L(ret)
+
+
+ ALIGN(16)
+L(b0): ld r27, 8(up)
+ addi up, up, 8
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ addc r24, r24, r10
+ addze r12, r8
+ std r0, 0(rp)
+ std r24, 8(rp)
+ addi rp, rp, 8
+ ld r26, 8(up)
+ ld r27, 16(up)
+ bdz L(end_m_0)
+
+ ALIGN(16)
+L(lo_m_0):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ ld r26, 24(up)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r27, 32(up)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r31
+ mulld r9, r26, v0
+ mulhdu r10, r26, v0
+ ld r26, 40(up)
+ nop
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ld r27, 48(up)
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r10
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(lo_m_0)
+
+ ALIGN(16)
+L(end_m_0):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+
+ adde r0, r0, r12
+ adde r24, r24, r31
+
+ std r0, 8(rp)
+ addze r8, r8
+ std r24, 16(rp)
+ addic. vn, vn, -1
+ std r8, 24(rp)
+ nop
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_0):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 16
+ addi up, outer_up, 8
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, -8(up)
+ ld r27, 0(up)
+ ld r28, -8(rp)
+ ld r29, 0(rp)
+ nop
+ nop
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ addc r24, r24, r10
+ addze r12, r8
+ addc r0, r0, r28
+ std r0, -8(rp)
+ adde r24, r24, r29
+ std r24, 0(rp)
+ ld r26, 8(up)
+ ld r27, 16(up)
+ bdz L(end_0)
+
+ ALIGN(16) C registers dying
+L(lo_0):
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 24(up) C
+ ld r28, 8(rp) C
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ ld r27, 32(up) C
+ ld r29, 16(rp) C
+ adde r0, r0, r12 C 0 12
+ adde r24, r24, r10 C 24 10
+ mulld r9, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 40(up) C
+ ld r30, 24(rp) C
+ mulld r11, r27, v0 C
+ mulhdu r12, r27, v0 C 27
+ ld r27, 48(up) C
+ ld r31, 32(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 8(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, 16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, 24(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 32(rp) C 11
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ bdnz L(lo_0) C
+
+ ALIGN(16)
+L(end_0):
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ ld r28, 8(rp)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r29, 16(rp)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r10
+ addze r8, r8
+ addic. vn, vn, -1
+ addc r0, r0, r28
+ std r0, 8(rp)
+ adde r24, r24, r29
+ std r24, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ bne L(outer_lo_0)
+ b L(ret)
+
+
+ ALIGN(16)
+L(b1): ld r27, 8(up)
+ nop
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ ld r26, 16(up)
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ mulld r9, r26, v0
+ mulhdu r10, r26, v0
+ addc r24, r24, r31
+ adde r9, r9, r8
+ addze r12, r10
+ std r0, 0(rp)
+ std r24, 8(rp)
+ std r9, 16(rp)
+ addi up, up, 16
+ addi rp, rp, 16
+ ld r26, 8(up)
+ ld r27, 16(up)
+ bdz L(end_m_1)
+
+ ALIGN(16)
+L(lo_m_1):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ ld r26, 24(up)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r27, 32(up)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r31
+ mulld r9, r26, v0
+ mulhdu r10, r26, v0
+ ld r26, 40(up)
+ nop
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ld r27, 48(up)
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r10
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(lo_m_1)
+
+ ALIGN(16)
+L(end_m_1):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+
+ adde r0, r0, r12
+ adde r24, r24, r31
+
+ std r0, 8(rp)
+ addze r8, r8
+ std r24, 16(rp)
+ addic. vn, vn, -1
+ std r8, 24(rp)
+ nop
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_1):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 24
+ addi up, outer_up, 16
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, -16(up)
+ ld r27, -8(up)
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ ld r26, 0(up)
+ ld r28, -16(rp)
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r29, -8(rp)
+ ld r30, 0(rp)
+ mulld r9, r26, v0
+ mulhdu r10, r26, v0
+ addc r24, r24, r31
+ adde r9, r9, r8
+ addze r12, r10
+ addc r0, r0, r28
+ std r0, -16(rp)
+ adde r24, r24, r29
+ std r24, -8(rp)
+ adde r9, r9, r30
+ std r9, 0(rp)
+ ld r26, 8(up)
+ ld r27, 16(up)
+ bdz L(end_1)
+
+ ALIGN(16) C registers dying
+L(lo_1):
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 24(up) C
+ ld r28, 8(rp) C
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ ld r27, 32(up) C
+ ld r29, 16(rp) C
+ adde r0, r0, r12 C 0 12
+ adde r24, r24, r10 C 24 10
+ mulld r9, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 40(up) C
+ ld r30, 24(rp) C
+ mulld r11, r27, v0 C
+ mulhdu r12, r27, v0 C 27
+ ld r27, 48(up) C
+ ld r31, 32(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 8(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, 16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, 24(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 32(rp) C 11
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ bdnz L(lo_1) C
+
+ ALIGN(16)
+L(end_1):
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ ld r28, 8(rp)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r29, 16(rp)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r10
+ addze r8, r8
+ addic. vn, vn, -1
+ addc r0, r0, r28
+ std r0, 8(rp)
+ adde r24, r24, r29
+ std r24, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ bne L(outer_lo_1)
+ b L(ret)
+
+
+ ALIGN(16)
+L(b2): ld r27, 8(up)
+ addi up, up, -8
+ addi rp, rp, -8
+ li r12, 0
+ addic r12, r12, 0
+
+ ALIGN(16)
+L(lo_m_2):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ ld r26, 24(up)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r27, 32(up)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r31
+ mulld r9, r26, v0
+ mulhdu r10, r26, v0
+ ld r26, 40(up)
+ nop
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ld r27, 48(up)
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r10
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+
+ addi rp, rp, 32
+ bdnz L(lo_m_2)
+
+ ALIGN(16)
+L(end_m_2):
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+
+ adde r0, r0, r12
+ adde r24, r24, r31
+
+ std r0, 8(rp)
+ addze r8, r8
+ std r24, 16(rp)
+ addic. vn, vn, -1
+ std r8, 24(rp)
+ nop
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_2):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 0
+ addi up, outer_up, -8
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, 8(up)
+ ld r27, 16(up)
+ li r12, 0
+ addic r12, r12, 0
+
+ ALIGN(16) C registers dying
+L(lo_2):
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 24(up) C
+ ld r28, 8(rp) C
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ ld r27, 32(up) C
+ ld r29, 16(rp) C
+ adde r0, r0, r12 C 0 12
+ adde r24, r24, r10 C 24 10
+ mulld r9, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ ld r26, 40(up) C
+ ld r30, 24(rp) C
+ mulld r11, r27, v0 C
+ mulhdu r12, r27, v0 C 27
+ ld r27, 48(up) C
+ ld r31, 32(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 8(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, 16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, 24(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 32(rp) C 11
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ bdnz L(lo_2) C
+
+ ALIGN(16)
+L(end_2):
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ ld r28, 8(rp)
+ nop
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ ld r29, 16(rp)
+ nop
+ adde r0, r0, r12
+ adde r24, r24, r10
+ addze r8, r8
+ addic. vn, vn, -1
+ addc r0, r0, r28
+ std r0, 8(rp)
+ adde r24, r24, r29
+ std r24, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ bne L(outer_lo_2)
+ b L(ret)
+
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ ld r25, -56(r1)
+ ld r24, -64(r1)
+ ld r23, -72(r1)
+ ld r22, -80(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h
new file mode 100644
index 0000000000..61a437b6e6
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h
@@ -0,0 +1,179 @@
+/* POWER3/PowerPC630 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define MUL_TOOM22_THRESHOLD 10
+#define MUL_TOOM33_THRESHOLD 33
+#define MUL_TOOM44_THRESHOLD 46
+#define MUL_TOOM6H_THRESHOLD 77
+#define MUL_TOOM8H_THRESHOLD 139
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 47
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 14
+#define SQR_TOOM3_THRESHOLD 45
+#define SQR_TOOM4_THRESHOLD 64
+#define SQR_TOOM6_THRESHOLD 85
+#define SQR_TOOM8_THRESHOLD 139
+
+#define MULMID_TOOM42_THRESHOLD 22
+
+#define MULMOD_BNM1_THRESHOLD 8
+#define SQRMOD_BNM1_THRESHOLD 10
+
+#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 220, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \
+ { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \
+ { 7, 7}, { 15, 8}, { 13, 9}, { 7, 8}, \
+ { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
+ { 15, 8}, { 33, 9}, { 23,10}, { 15, 9}, \
+ { 35, 8}, { 71,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 79,10}, { 55,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 71, 9}, { 143, 8}, { 287,10}, \
+ { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \
+ { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \
+ { 351,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
+ { 175,10}, { 351,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 223,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 287,10}, \
+ { 575, 9}, { 1151,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 767,12}, { 223,11}, { 447,10}, { 895,13}, \
+ { 127,12}, { 255,11}, { 511,12}, { 287,11}, \
+ { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,11}, { 895,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 120
+#define MUL_FFT_THRESHOLD 2688
+
+#define SQR_FFT_MODF_THRESHOLD 188 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 188, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \
+ { 13, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \
+ { 9, 7}, { 19, 8}, { 13, 9}, { 7, 8}, \
+ { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
+ { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \
+ { 23,10}, { 15, 9}, { 39,10}, { 23,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79, 8}, { 159,10}, { 47, 9}, { 95, 8}, \
+ { 191,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \
+ { 79, 9}, { 159,11}, { 47,10}, { 95, 9}, \
+ { 191,12}, { 31,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511,10}, { 143, 9}, { 287,11}, \
+ { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \
+ { 175,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
+ { 175,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,11}, { 223,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 767,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 511,12}, { 287,11}, { 575,10}, { 1151,12}, \
+ { 319,11}, { 639,12}, { 351,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 447,11}, { 895,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 118
+#define SQR_FFT_THRESHOLD 1728
+
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 27
+#define MULLO_MUL_N_THRESHOLD 2511
+
+#define DC_DIV_QR_THRESHOLD 23
+#define DC_DIVAPPR_Q_THRESHOLD 87
+#define DC_BDIV_QR_THRESHOLD 27
+#define DC_BDIV_Q_THRESHOLD 60
+
+#define INV_MULMOD_BNM1_THRESHOLD 27
+#define INV_NEWTON_THRESHOLD 91
+#define INV_APPR_THRESHOLD 91
+
+#define BINV_NEWTON_THRESHOLD 115
+#define REDC_1_TO_REDC_N_THRESHOLD 31
+
+#define MU_DIV_QR_THRESHOLD 551
+#define MU_DIVAPPR_Q_THRESHOLD 551
+#define MUPI_DIV_QR_THRESHOLD 42
+#define MU_BDIV_QR_THRESHOLD 483
+#define MU_BDIV_Q_THRESHOLD 492
+
+#define POWM_SEC_TABLE 2,23,140,556,713,746
+
+#define MATRIX22_STRASSEN_THRESHOLD 8
+#define HGCD_THRESHOLD 56
+#define HGCD_APPR_THRESHOLD 51
+#define HGCD_REDUCE_THRESHOLD 688
+#define GCD_DC_THRESHOLD 333
+#define GCDEXT_DC_THRESHOLD 126
+#define JACOBI_BASE_METHOD 1
+
+#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 375
+#define SET_STR_PRECOMPUTE_THRESHOLD 812
+
+#define FAC_DSC_THRESHOLD 351
+#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h
new file mode 100644
index 0000000000..d909b292bb
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h
@@ -0,0 +1,208 @@
+/* POWER4/PowerPC970 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008-2010, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1800 MHz PPC970 */
+/* FFT tuning limit = 10000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.0 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 34
+
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 53
+#define MUL_TOOM44_THRESHOLD 136
+#define MUL_TOOM6H_THRESHOLD 197
+#define MUL_TOOM8H_THRESHOLD 296
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 79
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 85
+#define SQR_TOOM4_THRESHOLD 142
+#define SQR_TOOM6_THRESHOLD 270
+#define SQR_TOOM8_THRESHOLD 430
+
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 380, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 13, 5}, { 28, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 63, 9}, { 127,10}, { 87,11}, \
+ { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 135, 9}, { 271,11}, { 79,10}, \
+ { 159, 9}, { 319,10}, { 167, 9}, { 335,11}, \
+ { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \
+ { 207, 9}, { 415,11}, { 111,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303, 9}, { 607,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,10}, { 351,12}, { 95,11}, \
+ { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
+ { 415, 9}, { 831,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,10}, { 607,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,10}, \
+ { 895,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,13}, { 703,14}, { 383,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1151,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 165
+#define MUL_FFT_THRESHOLD 9088
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 13, 5}, { 28, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 14, 6}, { 29, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95, 9}, \
+ { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 135, 9}, \
+ { 271, 8}, { 543,11}, { 79,10}, { 159, 9}, \
+ { 319, 8}, { 639,10}, { 175, 9}, { 351,11}, \
+ { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \
+ { 207, 9}, { 415,11}, { 111,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
+ { 351,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,11}, { 207,10}, { 415, 9}, { 831,11}, \
+ { 223,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,10}, { 607,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,10}, { 895,11}, { 479,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 703,13}, \
+ { 383,12}, { 927,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \
+ { 1279,13}, { 703,14}, { 383,13}, { 895,12}, \
+ { 1791,15}, { 255,14}, { 511,13}, { 1023,12}, \
+ { 2047,13}, { 1087,12}, { 2175,13}, { 1151,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 162
+#define SQR_FFT_THRESHOLD 6272
+
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 44
+#define MULLO_MUL_N_THRESHOLD 18087
+
+#define DC_DIV_QR_THRESHOLD 42
+#define DC_DIVAPPR_Q_THRESHOLD 167
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 110
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 181
+#define INV_APPR_THRESHOLD 173
+
+#define BINV_NEWTON_THRESHOLD 214
+#define REDC_1_TO_REDC_N_THRESHOLD 56
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 1017
+#define MUPI_DIV_QR_THRESHOLD 92
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 1017
+
+#define POWM_SEC_TABLE 2,22,87,579,1925
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 109
+#define HGCD_APPR_THRESHOLD 115
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 318
+#define GCDEXT_DC_THRESHOLD 242
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_DC_THRESHOLD 802
+#define SET_STR_PRECOMPUTE_THRESHOLD 1712
+
+#define FAC_DSC_THRESHOLD 507
+#define FAC_ODD_THRESHOLD 25
diff --git a/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h
new file mode 100644
index 0000000000..15b009c357
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -0,0 +1,219 @@
+/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* POWER5 (friggms.hpc.ntnu.no) */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 40
+
+#define MUL_TOOM22_THRESHOLD 21
+#define MUL_TOOM33_THRESHOLD 24
+#define MUL_TOOM44_THRESHOLD 70
+#define MUL_TOOM6H_THRESHOLD 262
+#define MUL_TOOM8H_THRESHOLD 393
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 81
+#define SQR_TOOM4_THRESHOLD 142
+#define SQR_TOOM6_THRESHOLD 189
+#define SQR_TOOM8_THRESHOLD 284
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 319,12}, \
+ { 95,11}, { 191,10}, { 383,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575, 9}, { 1151,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
+ { 287,11}, { 575,10}, { 1151,12}, { 319,11}, \
+ { 639,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,10}, { 4863,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,13}, { 959,12}, \
+ { 1919,11}, { 3839,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,12}, { 2431,11}, \
+ { 4863,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,12}, { 2815,13}, { 1471,12}, { 2943,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \
+ { 895,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3327,14}, { 1919,13}, { 3839,16}, \
+ { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,12}, { 11775,15}, { 1535,14}, \
+ { 3327,15}, { 1791,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 208
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 272, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
+ { 19, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \
+ { 31,10}, { 71, 9}, { 143,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 143,11}, { 79,10}, { 159, 9}, { 319,10}, \
+ { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319,11}, { 175,10}, { 351,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,10}, { 959,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 543,12}, { 287,11}, { 575,12}, \
+ { 319,11}, { 639,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,11}, { 895,12}, { 479,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \
+ { 543,11}, { 1087,12}, { 575,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \
+ { 959,12}, { 1919,15}, { 255,14}, { 511,13}, \
+ { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \
+ { 1215,14}, { 639,13}, { 1407,12}, { 2815,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1663,13}, { 3327,14}, { 1919,13}, \
+ { 3839,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \
+ { 11775,15}, { 1535,14}, { 3327,15}, { 1791,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 190
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 6
+#define MULLO_DC_THRESHOLD 60
+#define MULLO_MUL_N_THRESHOLD 7463
+
+#define DC_DIV_QR_THRESHOLD 58
+#define DC_DIVAPPR_Q_THRESHOLD 232
+#define DC_BDIV_QR_THRESHOLD 78
+#define DC_BDIV_Q_THRESHOLD 238
+
+#define INV_MULMOD_BNM1_THRESHOLD 92
+#define INV_NEWTON_THRESHOLD 155
+#define INV_APPR_THRESHOLD 157
+
+#define BINV_NEWTON_THRESHOLD 155
+#define REDC_1_TO_REDC_N_THRESHOLD 61
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 79
+#define MU_BDIV_QR_THRESHOLD 823
+#define MU_BDIV_Q_THRESHOLD 942
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 74
+#define HGCD_APPR_THRESHOLD 155
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 351
+#define GCDEXT_DC_THRESHOLD 288
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1585
+
+#define FAC_DSC_THRESHOLD 662
+#define FAC_ODD_THRESHOLD 28
diff --git a/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm
new file mode 100644
index 0000000000..5a85f84f4a
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm
@@ -0,0 +1,183 @@
+dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 ? ?
+C POWER4/PPC970 ? ?
+C POWER5 ? ?
+C POWER6 12.25 12.8
+C POWER7 ? ?
+
+C TODO
+C * Reduce register usage.
+C * Schedule function entry code.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addmul_1)
+ define(func_nc, mpn_addmul_1c) C FIXME: not really supported
+ define(AM, `$1')
+ define(SM, `')
+ define(CLRRSC, `addic $1, r0, 0')
+')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_submul_1)
+ define(func_nc, mpn_submul_1c) C FIXME: not really supported
+ define(AM, `')
+ define(SM, `$1')
+ define(CLRRSC, `subfc $1, r0, r0')
+')
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy loop count into ctr
+ beq cr0, L(b0)
+ blt cr6, L(b1)
+ beq cr6, L(b2)
+
+L(b3): ld r8, 0(up)
+ ld r7, 8(up)
+ ld r27, 16(up)
+ addi up, up, 16
+ addi rp, rp, 16
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r29, -16(rp)
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r5, r5, r29
+ b L(l3)
+
+L(b2): ld r7, 0(up)
+ ld r27, 8(up)
+ addi up, up, 8
+ addi rp, rp, 8
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r11, r11, r7
+ addze r12, r27
+ ADDSUB r9, r9, r30
+ b L(l2)
+
+L(b1): ld r27, 0(up)
+ ld r31, 0(rp)
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ADDSUB r11, r11, r31
+ b L(l1)
+
+L(b0): addi up, up, -8
+ addi rp, rp, -8
+ CLRRSC( r12) C clear r12 and clr/set cy
+
+ ALIGN(32)
+L(top):
+SM(` subfe r11, r0, r0') C complement...
+SM(` addic r11, r11, 1') C ...carry flag
+ ld r10, 8(up)
+ ld r8, 16(up)
+ ld r7, 24(up)
+ ld r27, 32(up)
+ addi up, up, 32
+ addi rp, rp, 32
+ mulld r0, r10, v0
+ mulhdu r10, r10, v0
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r28, -24(rp)
+ adde r0, r0, r12
+ ld r29, -16(rp)
+ adde r5, r5, r10
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ adde r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r0, r0, r28
+ std r0, -24(rp)
+ ADDSUBC r5, r5, r29
+L(l3): std r5, -16(rp)
+ ADDSUBC r9, r9, r30
+L(l2): std r9, -8(rp)
+ ADDSUBC r11, r11, r31
+L(l1): std r11, 0(rp)
+ bdnz L(top)
+
+AM(` addze r3, r12')
+SM(` subfe r11, r0, r0') C complement...
+ ld r31, -8(r1)
+SM(` subf r3, r11, r12')
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h
new file mode 100644
index 0000000000..c7e2f894ad
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -0,0 +1,160 @@
+/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3500 MHz POWER6 (kolga.bibsys.no) */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 50
+#define MUL_TOOM44_THRESHOLD 106
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 339
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 49
+#define SQR_TOOM4_THRESHOLD 130
+#define SQR_TOOM6_THRESHOLD 226
+#define SQR_TOOM8_THRESHOLD 272
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 14
+
+#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
+ { 33, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \
+ { 31,10}, { 71,11}, { 47,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
+ { 135, 9}, { 271,11}, { 79, 9}, { 319, 8}, \
+ { 639,10}, { 175,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207,12}, { 63,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \
+ { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 79
+#define MUL_FFT_THRESHOLD 3520
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 280, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 47,11}, { 31,10}, { 71, 9}, \
+ { 143,11}, { 47,12}, { 31,11}, { 63, 9}, \
+ { 255, 8}, { 511, 9}, { 271,10}, { 143,11}, \
+ { 79,10}, { 159, 9}, { 319,10}, { 175, 9}, \
+ { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511, 8}, { 1023,10}, { 271, 9}, { 543,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 80
+#define SQR_FFT_THRESHOLD 2752
+
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 2995
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 200
+#define DC_BDIV_QR_THRESHOLD 70
+#define DC_BDIV_Q_THRESHOLD 168
+
+#define INV_MULMOD_BNM1_THRESHOLD 53
+#define INV_NEWTON_THRESHOLD 170
+#define INV_APPR_THRESHOLD 166
+
+#define BINV_NEWTON_THRESHOLD 220
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 942
+#define MUPI_DIV_QR_THRESHOLD 57
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define POWM_SEC_TABLE 4,26,216,804,1731
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 106
+#define HGCD_APPR_THRESHOLD 109
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 492
+#define GCDEXT_DC_THRESHOLD 327
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 537
+#define SET_STR_PRECOMPUTE_THRESHOLD 1576
+
+#define FAC_DSC_THRESHOLD 426
+#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm b/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm
new file mode 100644
index 0000000000..3d32b46c35
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -0,0 +1,589 @@
+dnl PowerPC-64 mpn_mul_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 12.25
+
+C TODO
+C * Reduce register usage. At least 4 register less can be used.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C * The bdz insns for b1 and b2 will never branch,
+C * Align things better, perhaps by moving things like pointer updates from
+C before to after loops.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0', `r25')
+define(`outer_rp', `r22')
+define(`outer_up', `r23')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+
+C Special code for un <= 2, for efficiency of these important cases,
+C and since it simplifies the default code.
+ cmpdi cr0, un, 2
+ bgt cr0, L(un_gt2)
+ cmpdi cr6, vn, 1
+ ld r7, 0(vp)
+ ld r5, 0(up)
+ mulld r8, r5, r7 C weight 0
+ mulhdu r9, r5, r7 C weight 1
+ std r8, 0(rp)
+ beq cr0, L(2x)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(2x): ld r0, 8(up)
+ mulld r8, r0, r7 C weight 1
+ mulhdu r10, r0, r7 C weight 2
+ addc r9, r9, r8
+ addze r10, r10
+ bne cr6, L(2x2)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ blr
+ ALIGN(16)
+L(2x2): ld r6, 8(vp)
+ nop
+ mulld r8, r5, r6 C weight 1
+ mulhdu r11, r5, r6 C weight 2
+ mulld r12, r0, r6 C weight 2
+ mulhdu r0, r0, r6 C weight 3
+ addc r9, r9, r8
+ std r9, 8(rp)
+ adde r11, r11, r10
+ addze r0, r0
+ addc r11, r11, r12
+ addze r0, r0
+ std r11, 16(rp)
+ std r0, 24(rp)
+ blr
+
+L(un_gt2):
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+ std r22, -80(r1)
+ std r21, -88(r1)
+ std r20, -96(r1)
+
+ mr outer_rp, rp
+ mr outer_up, up
+
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, 0(up)
+
+ rldicl. r0, un, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi un, un, 4 C compute count...
+ srdi un, un, 2 C ...for ctr
+ mtctr un C copy inner loop count into ctr
+ beq cr0, L(b0)
+ blt cr6, L(b1)
+ beq cr6, L(b2)
+
+
+ ALIGN(16)
+L(b3):
+ ld r27, 8(up)
+ ld r20, 16(up)
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ mulld r9, r20, v0
+ mulhdu r10, r20, v0
+ addc r24, r24, r31
+ adde r9, r9, r8
+ addze r12, r10
+ std r0, 0(rp)
+ std r24, 8(rp)
+ std r9, 16(rp)
+ addi up, up, 16
+ addi rp, rp, 16
+ bdz L(end_m_3)
+
+ ALIGN(32)
+L(lo_m_3):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up)
+ ld r21, 32(up)
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ mulld r9, r20, v0
+ mulhdu r27, r20, v0
+ mulld r11, r21, v0
+ mulhdu r26, r21, v0
+ adde r0, r0, r12
+ adde r24, r24, r31
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r27
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ mr r12, r26
+ bdnz L(lo_m_3)
+
+ ALIGN(16)
+L(end_m_3):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_3):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 24
+ addi up, outer_up, 16
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, -16(up)
+ ld r27, -8(up)
+ ld r20, 0(up)
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ mulld r9, r20, v0
+ mulhdu r10, r20, v0
+ ld r28, -16(rp)
+ ld r29, -8(rp)
+ ld r30, 0(rp)
+ addc r24, r24, r31
+ adde r9, r9, r8
+ addze r12, r10
+ addc r0, r0, r28
+ std r0, -16(rp)
+ adde r24, r24, r29
+ std r24, -8(rp)
+ adde r9, r9, r30
+ std r9, 0(rp)
+ bdz L(end_3)
+
+ ALIGN(32) C registers dying
+L(lo_3):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up) C
+ ld r21, 32(up) C
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ mulld r9, r20, v0 C
+ mulhdu r27, r20, v0 C 26
+ mulld r11, r21, v0 C
+ mulhdu r26, r21, v0 C 27
+ ld r28, -24(rp) C
+ adde r0, r0, r12 C 0 12
+ ld r29, -16(rp) C
+ adde r24, r24, r10 C 24 10
+ ld r30, -8(rp) C
+ ld r31, 0(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r27 C 27 11
+ addze r12, r26 C 26
+ addc r0, r0, r28 C 0 28
+ std r0, -24(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, -16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, -8(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 0(rp) C 11
+ bdnz L(lo_3) C
+
+ ALIGN(16)
+L(end_3):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ bne L(outer_lo_3)
+ b L(ret)
+
+
+ ALIGN(16)
+L(b1):
+ mulld r0, r26, v0
+ mulhdu r12, r26, v0
+ addic r0, r0, 0
+ std r0, 0(rp)
+ bdz L(end_m_1)
+
+ ALIGN(16)
+L(lo_m_1):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up)
+ ld r21, 32(up)
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ mulld r9, r20, v0
+ mulhdu r27, r20, v0
+ mulld r11, r21, v0
+ mulhdu r26, r21, v0
+ adde r0, r0, r12
+ adde r24, r24, r31
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r27
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ mr r12, r26
+ bdnz L(lo_m_1)
+
+ ALIGN(16)
+L(end_m_1):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_1):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 8
+ mr up, outer_up
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, 0(up)
+ ld r28, 0(rp)
+ mulld r0, r26, v0
+ mulhdu r12, r26, v0
+ addc r0, r0, r28
+ std r0, 0(rp)
+ bdz L(end_1)
+
+ ALIGN(32) C registers dying
+L(lo_1):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up) C
+ ld r21, 32(up) C
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ mulld r9, r20, v0 C
+ mulhdu r27, r20, v0 C 26
+ mulld r11, r21, v0 C
+ mulhdu r26, r21, v0 C 27
+ ld r28, -24(rp) C
+ adde r0, r0, r12 C 0 12
+ ld r29, -16(rp) C
+ adde r24, r24, r10 C 24 10
+ ld r30, -8(rp) C
+ ld r31, 0(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r27 C 27 11
+ addze r12, r26 C 26
+ addc r0, r0, r28 C 0 28
+ std r0, -24(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, -16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, -8(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 0(rp) C 11
+ bdnz L(lo_1) C
+
+ ALIGN(16)
+L(end_1):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ bne L(outer_lo_1)
+ b L(ret)
+
+
+ ALIGN(16)
+L(b0):
+ addi up, up, -8
+ addi rp, rp, -8
+ li r12, 0
+ addic r12, r12, 0
+ bdz L(end_m_0)
+
+ ALIGN(16)
+L(lo_m_0):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up)
+ ld r21, 32(up)
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ mulld r9, r20, v0
+ mulhdu r27, r20, v0
+ mulld r11, r21, v0
+ mulhdu r26, r21, v0
+ adde r0, r0, r12
+ adde r24, r24, r31
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r27
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ mr r12, r26
+ bdnz L(lo_m_0)
+
+ ALIGN(16)
+L(end_m_0):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_0):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 0
+ addi up, outer_up, -8
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ li r12, 0
+ addic r12, r12, 0
+ bdz L(end_0)
+
+ ALIGN(32) C registers dying
+L(lo_0):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up) C
+ ld r21, 32(up) C
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ mulld r9, r20, v0 C
+ mulhdu r27, r20, v0 C 26
+ mulld r11, r21, v0 C
+ mulhdu r26, r21, v0 C 27
+ ld r28, -24(rp) C
+ adde r0, r0, r12 C 0 12
+ ld r29, -16(rp) C
+ adde r24, r24, r10 C 24 10
+ ld r30, -8(rp) C
+ ld r31, 0(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r27 C 27 11
+ addze r12, r26 C 26
+ addc r0, r0, r28 C 0 28
+ std r0, -24(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, -16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, -8(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 0(rp) C 11
+ bdnz L(lo_0) C
+
+ ALIGN(16)
+L(end_0):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ bne L(outer_lo_0)
+ b L(ret)
+
+
+ ALIGN(16)
+L(b2): ld r27, 8(up)
+ addi up, up, 8
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ addc r24, r24, r10
+ addze r12, r8
+ std r0, 0(rp)
+ std r24, 8(rp)
+ addi rp, rp, 8
+ bdz L(end_m_2)
+
+ ALIGN(16)
+L(lo_m_2):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up)
+ ld r21, 32(up)
+ mulld r0, r26, v0
+ mulhdu r31, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ mulld r9, r20, v0
+ mulhdu r27, r20, v0
+ mulld r11, r21, v0
+ mulhdu r26, r21, v0
+ adde r0, r0, r12
+ adde r24, r24, r31
+ std r0, 8(rp)
+ adde r9, r9, r8
+ std r24, 16(rp)
+ adde r11, r11, r27
+ std r9, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ mr r12, r26
+ bdnz L(lo_m_2)
+
+ ALIGN(16)
+L(end_m_2):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ beq L(ret)
+
+ ALIGN(16)
+L(outer_lo_2):
+ mtctr un C copy inner loop count into ctr
+ addi rp, outer_rp, 16
+ addi up, outer_up, 8
+ addi outer_rp, outer_rp, 8
+ ld v0, 0(vp) C new v limb
+ addi vp, vp, 8
+ ld r26, -8(up)
+ ld r27, 0(up)
+ ld r28, -8(rp)
+ ld r29, 0(rp)
+ mulld r0, r26, v0
+ mulhdu r10, r26, v0
+ mulld r24, r27, v0
+ mulhdu r8, r27, v0
+ addc r24, r24, r10
+ addze r12, r8
+ addc r0, r0, r28
+ std r0, -8(rp)
+ adde r24, r24, r29
+ std r24, 0(rp)
+ bdz L(end_2)
+
+ ALIGN(16) C registers dying
+L(lo_2):
+ ld r26, 8(up)
+ ld r27, 16(up)
+ ld r20, 24(up) C
+ ld r21, 32(up) C
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ mulld r0, r26, v0 C
+ mulhdu r10, r26, v0 C 26
+ mulld r24, r27, v0 C
+ mulhdu r8, r27, v0 C 27
+ mulld r9, r20, v0 C
+ mulhdu r27, r20, v0 C 26
+ mulld r11, r21, v0 C
+ mulhdu r26, r21, v0 C 27
+ ld r28, -24(rp) C
+ adde r0, r0, r12 C 0 12
+ ld r29, -16(rp) C
+ adde r24, r24, r10 C 24 10
+ ld r30, -8(rp) C
+ ld r31, 0(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r27 C 27 11
+ addze r12, r26 C 26
+ addc r0, r0, r28 C 0 28
+ std r0, -24(rp) C 0
+ adde r24, r24, r29 C 7 29
+ std r24, -16(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, -8(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 0(rp) C 11
+ bdnz L(lo_2) C
+
+ ALIGN(16)
+L(end_2):
+ addze r12, r12
+ addic. vn, vn, -1
+ std r12, 8(rp)
+ bne L(outer_lo_2)
+C b L(ret)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ ld r25, -56(r1)
+ ld r24, -64(r1)
+ ld r23, -72(r1)
+ ld r22, -80(r1)
+ ld r21, -88(r1)
+ ld r20, -96(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm b/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm
new file mode 100644
index 0000000000..8731e01a89
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm
@@ -0,0 +1,135 @@
+dnl PowerPC-64 mpn_mul_2 and mpn_addmul_2.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C mul_2 addmul_2
+C POWER3/PPC630 ? ?
+C POWER4/PPC970 ? ?
+C POWER5 ? ?
+C POWER6 ? ?
+C POWER7-SMT4 3 3
+C POWER7-SMT2 ? ?
+C POWER7-SMT1 ? ?
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vp', `r6')
+
+define(`cy0', `r10')
+ifdef(`EXTRA_REGISTER',
+` define(`cy1', EXTRA_REGISTER)',
+` define(`cy1', `r31')')
+
+ifdef(`OPERATION_mul_2',`
+ define(`AM', `')
+ define(`ADDX', `addc')
+ define(`func', `mpn_mul_2')
+')
+ifdef(`OPERATION_addmul_2',`
+ define(`AM', `$1')
+ define(`ADDX', `adde')
+ define(`func', `mpn_addmul_2')
+')
+
+MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)
+
+ASM_START()
+PROLOGUE(func)
+
+ifdef(`EXTRA_REGISTER',,`
+ std r31, -8(r1)
+')
+ andi. r12, n, 1
+ addi r0, n, 1
+ srdi r0, r0, 1
+ mtctr r0
+ ld r11, 0(vp) C v0
+ li cy0, 0
+ ld r12, 8(vp) C v1
+ li cy1, 0
+ ld r5, 0(up)
+ beq L(lo0)
+ addi up, up, -8
+ addi rp, rp, -8
+ b L(lo1)
+
+ ALIGN(32)
+L(top):
+AM(` ld r0, -8(rp)')
+ ld r5, 0(up)
+AM(` addc r6, r6, r0')
+ ADDX r7, r7, r8
+ addze r9, r9
+ addc r6, r6, cy0
+ adde cy0, r7, cy1
+ std r6, -8(rp)
+ addze cy1, r9
+L(lo0): mulld r6, r11, r5 C v0 * u[i] weight 0
+ mulhdu r7, r11, r5 C v0 * u[i] weight 1
+ mulld r8, r12, r5 C v1 * u[i] weight 1
+ mulhdu r9, r12, r5 C v1 * u[i] weight 2
+AM(` ld r0, 0(rp)')
+ ld r5, 8(up)
+AM(` addc r6, r6, r0')
+ ADDX r7, r7, r8
+ addze r9, r9
+ addc r6, r6, cy0
+ adde cy0, r7, cy1
+ std r6, 0(rp)
+ addze cy1, r9
+L(lo1): mulld r6, r11, r5 C v0 * u[i] weight 0
+ mulhdu r7, r11, r5 C v0 * u[i] weight 1
+ addi up, up, 16
+ addi rp, rp, 16
+ mulld r8, r12, r5 C v1 * u[i] weight 1
+ mulhdu r9, r12, r5 C v1 * u[i] weight 2
+ bdnz L(top)
+
+L(end):
+AM(` ld r0, -8(rp)')
+AM(` addc r6, r6, r0')
+ ADDX r7, r7, r8
+ addze r9, r9
+ addc r6, r6, cy0
+ std r6, -8(rp)
+ adde cy0, r7, cy1
+ addze cy1, r9
+ std cy0, 0(rp)
+ mr r3, cy1
+
+ifdef(`EXTRA_REGISTER',,`
+ ld r31, -8(r1)
+')
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/aors_n.asm b/gmp/mpn/powerpc64/mode64/p7/aors_n.asm
new file mode 100644
index 0000000000..857c701dec
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p7/aors_n.asm
@@ -0,0 +1,128 @@
+dnl PowerPC-64 mpn_add_n, mpn_sub_n optimised for POWER7.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 ?
+C POWER7 2.18
+
+C This is a tad bit slower than the cnd_aors_n.asm code, which is of course an
+C anomaly.
+
+ifdef(`OPERATION_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+ASM_START()
+PROLOGUE(func_nc)
+ SETCBR(r7)
+ b L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+ CLRCB
+L(ent):
+ andi. r7, n, 1
+ beq L(bx0)
+
+L(bx1): ld r7, 0(up)
+ ld r9, 0(vp)
+ ADDSUBC r11, r9, r7
+ std r11, 0(rp)
+ cmpldi cr6, n, 1
+ beq cr6, L(end)
+ addi up, up, 8
+ addi vp, vp, 8
+ addi rp, rp, 8
+
+L(bx0): addi r0, n, 2 C compute branch...
+ srdi r0, r0, 2 C ...count
+ mtctr r0
+
+ andi. r7, n, 2
+ bne L(mid)
+
+ addi up, up, 16
+ addi vp, vp, 16
+ addi rp, rp, 16
+
+ ALIGN(32)
+L(top): ld r6, -16(up)
+ ld r7, -8(up)
+ ld r8, -16(vp)
+ ld r9, -8(vp)
+ ADDSUBC r10, r8, r6
+ ADDSUBC r11, r9, r7
+ std r10, -16(rp)
+ std r11, -8(rp)
+L(mid): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r8, 0(vp)
+ ld r9, 8(vp)
+ ADDSUBC r10, r8, r6
+ ADDSUBC r11, r9, r7
+ std r10, 0(rp)
+ std r11, 8(rp)
+ addi up, up, 32
+ addi vp, vp, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm b/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
new file mode 100644
index 0000000000..ddf5fd84b1
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm b/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
new file mode 100644
index 0000000000..3f9d88d6ca
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm b/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
new file mode 100644
index 0000000000..525120262f
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
@@ -0,0 +1,129 @@
+dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 ?
+C POWER7 2.5
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+ifdef(`DO_add', `
+ define(`ADDSUBC', `addc $1, $2, $3')
+ define(`ADDSUBE', `adde $1, $2, $3')
+ define(INITCY, `addic $1, r1, 0')
+ define(RETVAL, `addze r3, $1')
+ define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+ define(`ADDSUBC', `subfc $1, $2, $3')
+ define(`ADDSUBE', `subfe $1, $2, $3')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `subfze r3, $1
+ neg r3, r3')
+ define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+ define(`ADDSUBC', `subfc $1, $3, $2')
+ define(`ADDSUBE', `subfe $1, $3, $2')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `addme r3, $1')
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
+define(`s0', `r0') define(`s1', `r9')
+define(`u0', `r6') define(`u1', `r7')
+define(`v0', `r10') define(`v1', `r11')
+
+
+ASM_START()
+PROLOGUE(func)
+ rldic r7, n, 3, 59
+ add up, up, r7
+ add vp, vp, r7
+ add rp, rp, r7
+
+ifdef(`DO_add', `
+ addic r0, n, 3 C set cy flag as side effect
+',`
+ subfc r0, r0, r0 C set cy flag
+ addi r0, n, 3
+')
+ srdi r0, r0, 2
+ mtctr r0
+
+ andi. r0, n, 1
+ beq L(bx0)
+
+L(bx1): andi. r0, n, 2
+ li s0, 0
+ bne L(lo3)
+ b L(lo1)
+
+L(bx0): andi. r0, n, 2
+ li s1, 0
+ bne L(lo2)
+
+ ALIGN(32)
+L(top): addi rp, rp, 32
+ ld v0, 0(vp)
+ addi vp, vp, 32
+ rldimi s1, v0, LSH, 0
+ ld u0, 0(up)
+ addi up, up, 32
+ srdi s0, v0, RSH
+ ADDSUBE(s1, s1, u0)
+ std s1, -32(rp)
+L(lo3): ld v1, -24(vp)
+ rldimi s0, v1, LSH, 0
+ ld u1, -24(up)
+ srdi s1, v1, RSH
+ ADDSUBE(s0, s0, u1)
+ std s0, -24(rp)
+L(lo2): ld v0, -16(vp)
+ rldimi s1, v0, LSH, 0
+ ld u0, -16(up)
+ srdi s0, v0, RSH
+ ADDSUBE(s1, s1, u0)
+ std s1, -16(rp)
+L(lo1): ld v1, -8(vp)
+ rldimi s0, v1, LSH, 0
+ ld u1, -8(up)
+ srdi s1, v1, RSH
+ ADDSUBE(s0, s0, u1)
+ std s0, -8(rp)
+ bdnz L(top) C decrement CTR and loop back
+
+ RETVAL( s1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/gcd_1.asm b/gmp/mpn/powerpc64/mode64/p7/gcd_1.asm
new file mode 100644
index 0000000000..47cb40bdc5
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p7/gcd_1.asm
@@ -0,0 +1,110 @@
+dnl PowerPC-64 mpn_gcd_1.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 7.6
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+C INPUT PARAMETERS
+define(`up', `r3')
+define(`n', `r4')
+define(`v0', `r5')
+
+EXTERN_FUNC(mpn_mod_1)
+EXTERN_FUNC(mpn_modexact_1c_odd)
+
+ASM_START()
+PROLOGUE(mpn_gcd_1,toc)
+ mflr r0
+ std r30, -16(r1)
+ std r31, -8(r1)
+ std r0, 16(r1)
+ stdu r1, -128(r1)
+
+ ld r7, 0(up) C U low limb
+ or r0, r5, r7 C x | y
+
+ neg r6, r0
+ and r6, r6, r0
+ cntlzd r31, r6 C common twos
+ subfic r31, r31, 63
+
+ neg r6, r5
+ and r6, r6, r5
+ cntlzd r8, r6
+ subfic r8, r8, 63
+ srd r5, r5, r8
+ mr r30, r5 C v0 saved
+
+ cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD
+ blt L(bmod)
+ CALL( mpn_mod_1)
+ b L(reduced)
+L(bmod):
+ li r6, 0
+ CALL( mpn_modexact_1c_odd)
+L(reduced):
+
+define(`cnt', `r9')dnl
+
+ neg. r6, r3
+ and r6, r6, r3
+ cntlzd cnt, r6
+ li r12, 63
+ bne L(mid)
+ b L(end)
+
+ ALIGN(16)
+L(top): isel r30, r3, r30, 29 C y = min(x,y)
+ isel r3, r10, r11, 29 C x = |y - x|
+L(mid): subf cnt, cnt, r12 C cnt = 63-cnt
+ srd r3, r3, cnt
+ subf r10, r3, r30 C r10 = y - x
+ subf r11, r30, r3 C r11 = x - y
+ cmpld cr7, r30, r3
+ and r8, r11, r10 C isolate lsb
+ cntlzd cnt, r8
+ bne cr7, L(top)
+
+L(end): sld r3, r30, r31
+
+ addi r1, r1, 128
+ ld r0, 16(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ mtlr r0
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h
new file mode 100644
index 0000000000..7e719e8aac
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -0,0 +1,243 @@
+/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009-2011, 2013, 2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3700 MHz POWER7/SMT4 (gcc111.fsffrance.org) */
+/* FFT tuning limit = 40000000 */
+/* Generated by tuneup.c, 2014-03-13, gcc 4.8 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 28
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 72
+#define MUL_TOOM44_THRESHOLD 200
+#define MUL_TOOM6H_THRESHOLD 298
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 138
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 124
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 36
+#define SQR_TOOM3_THRESHOLD 109
+#define SQR_TOOM4_THRESHOLD 196
+#define SQR_TOOM6_THRESHOLD 414
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 20
+
+#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 33, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 43,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703, 9}, { 1407,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \
+ { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
+ { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
+ { 351,11}, { 703,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \
+ { 895,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \
+ { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 959,11}, \
+ { 1919,14}, { 255,13}, { 511,12}, { 1087,11}, \
+ { 2175,13}, { 575,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \
+ { 1407,11}, { 2815,14}, { 383,13}, { 767,12}, \
+ { 1599,13}, { 831,12}, { 1663,13}, { 895,12}, \
+ { 1791,13}, { 959,12}, { 1919,11}, { 3839,14}, \
+ { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,11}, { 4863,14}, \
+ { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,12}, \
+ { 2943,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \
+ { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1919,13}, { 3839,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
+ { 2431,13}, { 4863,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 231
+#define MUL_FFT_THRESHOLD 4288
+
+#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 368, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \
+ { 639,12}, { 95,11}, { 191,10}, { 383,11}, \
+ { 207,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 575, 9}, { 1151,11}, { 303,10}, \
+ { 607,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,10}, \
+ { 1663,12}, { 447,11}, { 895,12}, { 479,14}, \
+ { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
+ { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \
+ { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 703,11}, { 1407,10}, { 2815,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
+ { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \
+ { 1791,12}, { 959,11}, { 1919,10}, { 3839,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \
+ { 2687,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1599,13}, { 831,12}, { 1663,13}, \
+ { 895,12}, { 1791,13}, { 959,12}, { 1919,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
+ { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \
+ { 1407,12}, { 2815,13}, { 1471,14}, { 767,13}, \
+ { 1663,12}, { 3327,13}, { 1727,14}, { 895,13}, \
+ { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3455,14}, { 1791,13}, { 3583,14}, \
+ { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4479,14}, { 2303,13}, { 4607,14}, \
+ { 2431,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 230
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 34
+#define MULLO_MUL_N_THRESHOLD 9174
+
+#define DC_DIV_QR_THRESHOLD 33
+#define DC_DIVAPPR_Q_THRESHOLD 126
+#define DC_BDIV_QR_THRESHOLD 63
+#define DC_BDIV_Q_THRESHOLD 152
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 155
+#define INV_APPR_THRESHOLD 125
+
+#define BINV_NEWTON_THRESHOLD 294
+#define REDC_1_TO_REDC_2_THRESHOLD 17
+#define REDC_2_TO_REDC_N_THRESHOLD 115
+
+#define MU_DIV_QR_THRESHOLD 1334
+#define MU_DIVAPPR_Q_THRESHOLD 1334
+#define MUPI_DIV_QR_THRESHOLD 54
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 1,14,62,642,960
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 126
+#define HGCD_APPR_THRESHOLD 184
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 440
+#define GCDEXT_DC_THRESHOLD 386
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 1655
+#define SET_STR_PRECOMPUTE_THRESHOLD 3417
+
+#define FAC_DSC_THRESHOLD 1138
+#define FAC_ODD_THRESHOLD 27
diff --git a/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm b/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm
new file mode 100644
index 0000000000..7f7734bcef
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm
@@ -0,0 +1,172 @@
+dnl PowerPC-64 mpn_rsh1add_n, mpn_rsh1sub_n
+
+dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 2.9
+C POWER5 ?
+C POWER6 3.5
+C POWER7 2.25
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(`ADDSUBC', `addc')
+ define(`ADDSUBE', `adde')
+ define(INITCY, `addic $1, r1, 0')
+ define(`func', mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(`ADDSUBC', `subfc')
+ define(`ADDSUBE', `subfe')
+ define(INITCY, `addic $1, r1, -1')
+ define(`func', mpn_rsh1sub_n)')
+
+define(`s0', `r9')
+define(`s1', `r7')
+define(`x0', `r0')
+define(`x1', `r12')
+define(`u0', `r8')
+define(`v0', `r10')
+
+
+ASM_START()
+PROLOGUE(func)
+ ld u0, 0(up)
+ ld v0, 0(vp)
+
+ cmpdi cr6, n, 2
+
+ addi r0, n, 1
+ srdi r0, r0, 2
+ mtctr r0 C copy size to count register
+
+ andi. r0, n, 1
+ bne cr0, L(bx1)
+
+L(bx0): ADDSUBC x1, v0, u0
+ ld u0, 8(up)
+ ld v0, 8(vp)
+ ADDSUBE x0, v0, u0
+ ble cr6, L(n2)
+ ld u0, 16(up)
+ ld v0, 16(vp)
+ srdi s0, x1, 1
+ rldicl r11, x1, 0, 63 C return value
+ ADDSUBE x1, v0, u0
+ andi. n, n, 2
+ bne cr0, L(b10)
+L(b00): addi rp, rp, -24
+ b L(lo0)
+L(b10): addi up, up, 16
+ addi vp, vp, 16
+ addi rp, rp, -8
+ b L(lo2)
+
+ ALIGN(16)
+L(bx1): ADDSUBC x0, v0, u0
+ ble cr6, L(n1)
+ ld u0, 8(up)
+ ld v0, 8(vp)
+ ADDSUBE x1, v0, u0
+ ld u0, 16(up)
+ ld v0, 16(vp)
+ srdi s1, x0, 1
+ rldicl r11, x0, 0, 63 C return value
+ ADDSUBE x0, v0, u0
+ andi. n, n, 2
+ bne cr0, L(b11)
+L(b01): addi up, up, 8
+ addi vp, vp, 8
+ addi rp, rp, -16
+ b L(lo1)
+L(b11): addi up, up, 24
+ addi vp, vp, 24
+ bdz L(end)
+
+ ALIGN(32)
+L(top): ld u0, 0(up)
+ ld v0, 0(vp)
+ srdi s0, x1, 1
+ rldimi s1, x1, 63, 0
+ std s1, 0(rp)
+ ADDSUBE x1, v0, u0
+L(lo2): ld u0, 8(up)
+ ld v0, 8(vp)
+ srdi s1, x0, 1
+ rldimi s0, x0, 63, 0
+ std s0, 8(rp)
+ ADDSUBE x0, v0, u0
+L(lo1): ld u0, 16(up)
+ ld v0, 16(vp)
+ srdi s0, x1, 1
+ rldimi s1, x1, 63, 0
+ std s1, 16(rp)
+ ADDSUBE x1, v0, u0
+L(lo0): ld u0, 24(up)
+ ld v0, 24(vp)
+ srdi s1, x0, 1
+ rldimi s0, x0, 63, 0
+ std s0, 24(rp)
+ ADDSUBE x0, v0, u0
+ addi up, up, 32
+ addi vp, vp, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): srdi s0, x1, 1
+ rldimi s1, x1, 63, 0
+ std s1, 0(rp)
+L(cj2): srdi s1, x0, 1
+ rldimi s0, x0, 63, 0
+ std s0, 8(rp)
+L(cj1): ADDSUBE x1, x1, x1 C pseudo-depends on x1
+ rldimi s1, x1, 63, 0
+ std s1, 16(rp)
+ mr r3, r11
+ blr
+
+L(n1): srdi s1, x0, 1
+ rldicl r11, x0, 0, 63 C return value
+ ADDSUBE x1, x1, x1 C pseudo-depends on x1
+ rldimi s1, x1, 63, 0
+ std s1, 0(rp)
+ mr r3, r11
+ blr
+
+L(n2): addi rp, rp, -8
+ srdi s0, x1, 1
+ rldicl r11, x1, 0, 63 C return value
+ b L(cj2)
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/sqr_basecase.asm b/gmp/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 0000000000..e76bb8878d
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,863 @@
+dnl PowerPC-64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 16.25
+C POWER7 3.77
+
+C NOTES
+C * This is very crude, cleanup!
+C * Try to reduce the number of needed live registers.
+C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The
+C cost will be more live registers.
+C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C size a lot and speed things up perhaps 25%.
+C * Use computed goto in order to compress the code.
+C * Implement a larger final corner.
+C * Schedule callee-saves register saves into other insns. This could save
+C about 5 cycles/call. (We cannot analogously optimise the restores, since
+C the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C * Should the alternating std/adde sequences be split? Some pipelines handle
+C adde poorly, and might sequentialise all these instructions.
+C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C adjacent integer multiply insns. Except for the multiply insns, the code
+C was not carefully optimised for POWER6 or any other CPU.
+C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved', `r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+ cmpdi cr0, n, 2
+ bge cr0, L(ge2)
+ ld r5, 0(up) C n = 1
+ nop
+ mulld r8, r5, r5 C weight 0
+ mulhdu r9, r5, r5 C weight 1
+ std r8, 0(rp)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(ge2): bgt cr0, L(gt2)
+ ld r0, 0(up) C n = 2
+ nop
+ mulld r8, r0, r0 C u0 * u0
+ mulhdu r9, r0, r0 C u0 * u0
+ ld r6, 8(up)
+ mulld r10, r6, r6 C u1 * u1
+ mulhdu r11, r6, r6 C u1 * u1
+ mulld r4, r6, r0 C u1 * u0
+ mulhdu r5, r6, r0 C u1 * u0
+ addc r4, r4, r4
+ adde r5, r5, r5
+ addze r11, r11
+ addc r9, r9, r4
+ adde r10, r10, r5
+ addze r11, r11
+ std r8, 0(rp)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ std r11, 24(rp)
+ blr
+
+ ALIGN(16)
+L(gt2): std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+ std r22, -80(r1)
+ std r21, -88(r1)
+
+ mr rp_saved, rp
+ mr up_saved, up
+ mr n_saved, n
+ mr rp_outer, rp
+ mr up_outer, up
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addic r7, n, 2 C compute count...
+ srdi r7, r7, 2 C ...for ctr
+ mtctr r7 C copy count into ctr
+ beq- cr0, L(b0)
+ blt- cr6, L(b1)
+ beq- cr6, L(b2)
+
+L(b3): ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ addi up, up, 24
+ li r12, 0 C carry limb
+ bdz L(em3)
+
+ ALIGN(16)
+L(tm3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm3)
+
+L(em3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop)
+
+L(b0): ld r6, 0(up)
+ ld r27, 8(up)
+ mulld r7, r27, r6
+ mulhdu r12, r27, r6
+ std r7, 8(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(em0)
+
+ ALIGN(16)
+L(tm0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm0)
+
+L(em0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_2)
+
+L(b1): ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r12, r27, r6
+ addc r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(em1)
+
+ ALIGN(16)
+L(tm1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm1)
+
+L(em1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_3)
+
+L(b2): addi r7, r7, -1 C FIXME
+ mtctr r7 C FIXME
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10
+ std r0, 8(rp)
+ std r7, 16(rp)
+ std r11, 24(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(em2)
+
+ ALIGN(16)
+L(tm2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm2)
+
+L(em2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_0)
+
+
+L(outer_loop):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ bdz L(outer_end)
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ adde r11, r11, r30
+ std r11, 16(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(ea1)
+
+ ALIGN(16)
+L(ta1): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta1)
+
+L(ea1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+L(outer_loop_ent_0):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r0, r0, r28
+ adde r7, r7, r26
+ addze r12, r8
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(ea0)
+
+ ALIGN(16)
+L(ta0): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta0)
+
+L(ea0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+L(outer_loop_ent_3):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r28, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r12, r9, r6
+ addc r0, r0, r28
+ std r0, 0(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(ea3)
+
+ ALIGN(16)
+L(ta3): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta3)
+
+L(ea3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+
+L(outer_loop_ent_2):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ addic r0, r0, 0
+ li r12, 0 C cy_limb = 0
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ bdz L(ea2)
+ addi up, up, 24
+
+ ALIGN(16)
+L(ta2): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta2)
+
+L(ea2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+ b L(outer_loop)
+
+L(outer_end):
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r11, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r8, r9, r6
+ addc r0, r0, r11
+ std r0, 0(rp)
+ addze r8, r8
+ std r8, 8(rp)
+
+define(`rp', `rp_saved')
+define(`up', `r5')
+define(`n', `r6')
+define(`climb', `r0')
+
+ addi r4, rp_saved, 8
+ mr r5, up_saved
+ mr r6, n_saved
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 2 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C put loop count into ctr
+ beq cr0, L(xb0)
+ blt cr6, L(xb1)
+ beq cr6, L(xb2)
+
+L(xb3): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ addi up, up, 24
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ addze climb, r29
+ addc r10, r10, r25
+ adde r11, r11, r26
+ adde r6, r6, r27
+ adde r7, r7, r28
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ std r6, 24(rp)
+ std r7, 32(rp)
+ addi rp, rp, 40
+ bdnz L(top)
+ b L(end)
+
+L(xb2): ld r6, 0(up)
+ ld r7, 8(up)
+ addi up, up, 16
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ addze climb, r27
+ addc r10, r10, r25
+ adde r11, r11, r26
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ addi rp, rp, 24
+ bdnz L(top)
+ b L(end)
+
+L(xb0): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ ld r12, 40(rp)
+ ld r23, 48(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze climb, r31
+ std r24, 0(rp)
+ addc r10, r10, r25
+ std r10, 8(rp)
+ adde r11, r11, r26
+ std r11, 16(rp)
+ adde r6, r6, r27
+ std r6, 24(rp)
+ adde r7, r7, r28
+ std r7, 32(rp)
+ adde r12, r12, r29
+ std r12, 40(rp)
+ adde r23, r23, r30
+ std r23, 48(rp)
+ addi rp, rp, 56
+ bdnz L(top)
+ b L(end)
+
+L(xb1): ld r6, 0(up)
+ addi up, up, 8
+ mulld r24, r6, r6
+ mulhdu climb, r6, r6
+ std r24, 0(rp)
+ addic rp, rp, 8 C clear carry as side-effect
+
+ ALIGN(32)
+L(top): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r8, 0(rp)
+ ld r9, 8(rp)
+ adde r8, r8, r8
+ adde r9, r9, r9
+ ld r10, 16(rp)
+ ld r11, 24(rp)
+ adde r10, r10, r10
+ adde r11, r11, r11
+ ld r6, 32(rp)
+ ld r7, 40(rp)
+ adde r6, r6, r6
+ adde r7, r7, r7
+ ld r12, 48(rp)
+ ld r23, 56(rp)
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze r31, r31
+ addc r8, r8, climb
+ std r8, 0(rp)
+ adde r9, r9, r24
+ std r9, 8(rp)
+ adde r10, r10, r25
+ std r10, 16(rp)
+ adde r11, r11, r26
+ std r11, 24(rp)
+ adde r6, r6, r27
+ std r6, 32(rp)
+ adde r7, r7, r28
+ std r7, 40(rp)
+ adde r12, r12, r29
+ std r12, 48(rp)
+ adde r23, r23, r30
+ std r23, 56(rp)
+ mr climb, r31
+ addi rp, rp, 64
+ bdnz L(top)
+
+L(end): addze climb, climb
+ std climb, 0(rp)
+
+ ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ ld r25, -56(r1)
+ ld r24, -64(r1)
+ ld r23, -72(r1)
+ ld r22, -80(r1)
+ ld r21, -88(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/p6/lshift.asm b/gmp/mpn/powerpc64/p6/lshift.asm
new file mode 100644
index 0000000000..1a200fb346
--- /dev/null
+++ b/gmp/mpn/powerpc64/p6/lshift.asm
@@ -0,0 +1,132 @@
+dnl PowerPC-64 mpn_lshift -- rp[] = up[] << cnt
+
+dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 4
+
+C TODO
+C * Micro-optimise header code
+C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4236
+C bytes, 4-way code would become about 50% larger.
+
+C INPUT PARAMETERS
+define(`rp_param', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`retval',`r3')
+define(`rp', `r7')
+
+ASM_START()
+PROLOGUE(mpn_lshift,toc)
+
+ifdef(`HAVE_ABI_mode32',`
+ rldicl n, n, 0,32 C FIXME: avoid this zero extend
+')
+ mflr r12
+ sldi r8, n, 3
+ sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block
+ LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1)
+ add up, up, r8 C make up point at end of up[]
+ add r11, r11, r10 C address of L(oN) for N = cnt
+ srdi r10, n, 1
+ add rp, rp_param, r8 C make rp point at end of rp[]
+ subfic tnc, cnt, 64
+ rlwinm. r8, n, 0,31,31 C extract bit 0
+ mtctr r10
+ beq L(evn)
+
+L(odd): ld r9, -8(up)
+ cmpdi cr0, n, 1 C n = 1?
+ beq L(1)
+ ld r8, -16(up)
+ addi r11, r11, -84 C L(o1) - L(e1) - 64
+ mtlr r11
+ srd r3, r9, tnc C retval
+ addi up, up, 8
+ addi rp, rp, -8
+ blr C branch to L(oN)
+
+L(evn): ld r8, -8(up)
+ ld r9, -16(up)
+ addi r11, r11, -64
+ mtlr r11
+ srd r3, r8, tnc C retval
+ blr C branch to L(eN)
+
+L(1): srd r3, r9, tnc C retval
+ sld r8, r9, cnt
+ std r8, -8(rp)
+ mtlr r12
+ifdef(`HAVE_ABI_mode32',
+` mr r4, r3
+ srdi r3, r3, 32
+')
+ blr
+
+
+define(SHIFT,`
+L(lo$1):ld r8, -24(up)
+ std r11, -8(rp)
+ addi rp, rp, -16
+L(o$1): srdi r10, r8, eval(64-$1)
+ rldimi r10, r9, $1, 0
+ ld r9, -32(up)
+ addi up, up, -16
+ std r10, 0(rp)
+L(e$1): srdi r11, r9, eval(64-$1)
+ rldimi r11, r8, $1, 0
+ bdnz L(lo$1)
+ std r11, -8(rp)
+ sldi r10, r9, $1
+ b L(com)
+ nop
+ nop
+')
+
+ ALIGN(64)
+forloop(`i',1,63,`SHIFT(i)')
+
+L(com): std r10, -16(rp)
+ mtlr r12
+ifdef(`HAVE_ABI_mode32',
+` mr r4, r3
+ srdi r3, r3, 32
+')
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/powerpc64/p6/lshiftc.asm b/gmp/mpn/powerpc64/p6/lshiftc.asm
new file mode 100644
index 0000000000..e4b3caaab8
--- /dev/null
+++ b/gmp/mpn/powerpc64/p6/lshiftc.asm
@@ -0,0 +1,136 @@
+dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
+
+dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 4
+
+C TODO
+C * Micro-optimise header code
+C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4236
+C bytes, 4-way code would become about 50% larger.
+
+C INPUT PARAMETERS
+define(`rp_param', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`retval',`r3')
+define(`rp', `r7')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc,toc)
+
+ifdef(`HAVE_ABI_mode32',`
+ rldicl n, n, 0,32 C FIXME: avoid this zero extend
+')
+ mflr r12
+ sldi r8, n, 3
+ sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block
+ LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1)
+ add up, up, r8 C make up point at end of up[]
+ add r11, r11, r10 C address of L(oN) for N = cnt
+ srdi r10, n, 1
+ add rp, rp_param, r8 C make rp point at end of rp[]
+ subfic tnc, cnt, 64
+ rlwinm. r8, n, 0,31,31 C extract bit 0
+ mtctr r10
+ beq L(evn)
+
+L(odd): ld r9, -8(up)
+ cmpdi cr0, n, 1 C n = 1?
+ beq L(1)
+ ld r8, -16(up)
+ addi r11, r11, -88 C L(o1) - L(e1) - 64
+ mtlr r11
+ srd r3, r9, tnc C retval
+ addi up, up, 8
+ addi rp, rp, -8
+ blr C branch to L(oN)
+
+L(evn): ld r8, -8(up)
+ ld r9, -16(up)
+ addi r11, r11, -64
+ mtlr r11
+ srd r3, r8, tnc C retval
+ blr C branch to L(eN)
+
+L(1): srd r3, r9, tnc C retval
+ sld r8, r9, cnt
+ nor r8, r8, r8
+ std r8, -8(rp)
+ mtlr r12
+ifdef(`HAVE_ABI_mode32',
+` mr r4, r3
+ srdi r3, r3, 32
+')
+ blr
+
+
+define(SHIFT,`
+L(lo$1):ld r8, -24(up)
+ nor r11, r11, r11
+ std r11, -8(rp)
+ addi rp, rp, -16
+L(o$1): srdi r10, r8, eval(64-$1)
+ rldimi r10, r9, $1, 0
+ ld r9, -32(up)
+ addi up, up, -16
+ nor r10, r10, r10
+ std r10, 0(rp)
+L(e$1): srdi r11, r9, eval(64-$1)
+ rldimi r11, r8, $1, 0
+ bdnz L(lo$1)
+ sldi r10, r9, $1
+ b L(com)
+ nop
+')
+
+ ALIGN(64)
+forloop(`i',1,63,`SHIFT(i)')
+
+L(com): nor r11, r11, r11
+ nor r10, r10, r10
+ std r11, -8(rp)
+ std r10, -16(rp)
+ mtlr r12
+ifdef(`HAVE_ABI_mode32',
+` mr r4, r3
+ srdi r3, r3, 32
+')
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/powerpc64/p6/rshift.asm b/gmp/mpn/powerpc64/p6/rshift.asm
new file mode 100644
index 0000000000..9e848c1fc7
--- /dev/null
+++ b/gmp/mpn/powerpc64/p6/rshift.asm
@@ -0,0 +1,131 @@
+dnl PowerPC-64 mpn_rshift -- rp[] = up[] << cnt
+
+dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2
+C POWER6 3.5 (mysteriously 3.0 for cnt=1)
+
+C TODO
+C * Micro-optimise header code
+C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4248
+C bytes, 4-way code would become about 50% larger.
+
+C INPUT PARAMETERS
+define(`rp_param', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`retval',`r3')
+define(`rp', `r7')
+
+ASM_START()
+PROLOGUE(mpn_rshift,toc)
+
+ifdef(`HAVE_ABI_mode32',`
+ rldicl n, n, 0,32 C FIXME: avoid this zero extend
+')
+ mflr r12
+ LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1)
+ sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block
+ add r11, r11, r10 C address of L(oN) for N = cnt
+ srdi r10, n, 1
+ mr rp, rp_param
+ subfic tnc, cnt, 64
+ rlwinm. r8, n, 0,31,31 C extract bit 0
+ mtctr r10
+ beq L(evn)
+
+L(odd): ld r9, 0(up)
+ cmpdi cr0, n, 1 C n = 1?
+ beq L(1)
+ ld r8, 8(up)
+ addi r11, r11, -84 C L(o1) - L(e1) - 64
+ mtlr r11
+ sld r3, r9, tnc C retval
+ addi up, up, 8
+ addi rp, rp, 8
+ blr C branch to L(oN)
+
+L(evn): ld r8, 0(up)
+ ld r9, 8(up)
+ addi r11, r11, -64
+ mtlr r11
+ sld r3, r8, tnc C retval
+ addi up, up, 16
+ blr C branch to L(eN)
+
+L(1): sld r3, r9, tnc C retval
+ srd r8, r9, cnt
+ std r8, 0(rp)
+ mtlr r12
+ifdef(`HAVE_ABI_mode32',
+` mr r4, r3
+ srdi r3, r3, 32
+')
+ blr
+
+
+define(SHIFT,`
+L(lo$1):ld r8, 0(up)
+ std r11, 0(rp)
+ addi rp, rp, 16
+L(o$1): srdi r10, r9, $1
+ rldimi r10, r8, eval(64-$1), 0
+ ld r9, 8(up)
+ addi up, up, 16
+ std r10, -8(rp)
+L(e$1): srdi r11, r8, $1
+ rldimi r11, r9, eval(64-$1), 0
+ bdnz L(lo$1)
+ std r11, 0(rp)
+ srdi r10, r9, $1
+ b L(com)
+ nop
+ nop
+')
+
+ ALIGN(64)
+forloop(`i',1,63,`SHIFT(i)')
+
+L(com): std r10, 8(rp)
+ mtlr r12
+ifdef(`HAVE_ABI_mode32',
+` mr r4, r3
+ srdi r3, r3, 32
+')
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/powerpc64/p7/copyd.asm b/gmp/mpn/powerpc64/p7/copyd.asm
new file mode 100644
index 0000000000..f04ca586e8
--- /dev/null
+++ b/gmp/mpn/powerpc64/p7/copyd.asm
@@ -0,0 +1,128 @@
+dnl PowerPC-64 mpn_copyd.
+
+dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 1.25
+C POWER7 1.09
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+
+ifdef(`HAVE_ABI_mode32',
+` rldicl n, n, 0,32')
+
+ sldi r0, n, 3
+ add up, up, r0 C point at u[] end
+ add rp, rp, r0 C point at r[] end
+
+ cmpdi cr0, n, 4
+ blt L(sml)
+
+ addi r10, n, 4
+ srdi r10, r10, 3
+ mtctr r10
+
+ andi. r0, n, 1
+ rlwinm r11, n, 0,30,30
+ rlwinm r12, n, 0,29,29
+ cmpdi cr6, r11, 0
+ cmpdi cr7, r12, 0
+
+ beq cr0, L(xx0)
+L(xx1): ld r6, -8(up)
+ addi up, up, -8
+ std r6, -8(rp)
+ addi rp, rp, -8
+
+L(xx0): bne cr6, L(x10)
+L(x00): ld r6, -8(up)
+ ld r7, -16(up)
+ bne cr7, L(100)
+L(000): addi rp, rp, 32
+ b L(lo0)
+L(100): addi up, up, 32
+ b L(lo4)
+L(x10): ld r8, -8(up)
+ ld r9, -16(up)
+ bne cr7, L(110)
+L(010): addi up, up, -16
+ addi rp, rp, 16
+ b L(lo2)
+L(110): addi up, up, 16
+ addi rp, rp, 48
+ b L(lo6)
+
+L(sml): cmpdi cr0, n, 0
+ beqlr- cr0
+ mtctr n
+L(t): ld r6, -8(up)
+ addi up, up, -8
+ std r6, -8(rp)
+ addi rp, rp, -8
+ bdnz L(t)
+ blr
+
+ ALIGN(32)
+L(top): std r6, -8(rp)
+ std r7, -16(rp)
+L(lo2): ld r6, -8(up)
+ ld r7, -16(up)
+ std r8, -24(rp)
+ std r9, -32(rp)
+L(lo0): ld r8, -24(up)
+ ld r9, -32(up)
+ std r6, -40(rp)
+ std r7, -48(rp)
+L(lo6): ld r6, -40(up)
+ ld r7, -48(up)
+ std r8, -56(rp)
+ std r9, -64(rp)
+ addi rp, rp, -64
+L(lo4): ld r8, -56(up)
+ ld r9, -64(up)
+ addi up, up, -64
+ bdnz L(top)
+
+L(end): std r6, -8(rp)
+ std r7, -16(rp)
+ std r8, -24(rp)
+ std r9, -32(rp)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/p7/copyi.asm b/gmp/mpn/powerpc64/p7/copyi.asm
new file mode 100644
index 0000000000..854cf9f809
--- /dev/null
+++ b/gmp/mpn/powerpc64/p7/copyi.asm
@@ -0,0 +1,129 @@
+dnl PowerPC-64 mpn_copyi.
+
+dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 1.25
+C POWER7 1.09
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+C TODO
+C * Try rolling the two loop leading std to the end, allowing the code to
+C handle also n = 2.
+C * Consider using 4 pointers, schedule ptr update early wrt use.
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+
+ifdef(`HAVE_ABI_mode32',
+` rldicl n, n, 0,32')
+
+ cmpdi cr0, n, 4
+ blt L(sml)
+
+ addi r10, n, 4
+ srdi r10, r10, 3
+ mtctr r10
+
+ andi. r0, n, 1
+ rlwinm r11, n, 0,30,30
+ rlwinm r12, n, 0,29,29
+ cmpdi cr6, r11, 0
+ cmpdi cr7, r12, 0
+
+ beq cr0, L(xx0)
+L(xx1): ld r6, 0(up)
+ addi up, up, 8
+ std r6, 0(rp)
+ addi rp, rp, 8
+
+L(xx0): bne cr6, L(x10)
+L(x00): ld r6, 0(up)
+ ld r7, 8(up)
+ bne cr7, L(100)
+L(000): addi rp, rp, -32
+ b L(lo0)
+L(100): addi up, up, -32
+ b L(lo4)
+L(x10): ld r8, 0(up)
+ ld r9, 8(up)
+ bne cr7, L(110)
+L(010): addi up, up, 16
+ addi rp, rp, -16
+ b L(lo2)
+L(110): addi up, up, -16
+ addi rp, rp, -48
+ b L(lo6)
+
+L(sml): cmpdi cr0, n, 0
+ beqlr- cr0
+ mtctr n
+L(t): ld r6, 0(up)
+ addi up, up, 8
+ std r6, 0(rp)
+ addi rp, rp, 8
+ bdnz L(t)
+ blr
+
+ ALIGN(32)
+L(top): std r6, 0(rp)
+ std r7, 8(rp)
+L(lo2): ld r6, 0(up)
+ ld r7, 8(up)
+ std r8, 16(rp)
+ std r9, 24(rp)
+L(lo0): ld r8, 16(up)
+ ld r9, 24(up)
+ std r6, 32(rp)
+ std r7, 40(rp)
+L(lo6): ld r6, 32(up)
+ ld r7, 40(up)
+ std r8, 48(rp)
+ std r9, 56(rp)
+ addi rp, rp, 64
+L(lo4): ld r8, 48(up)
+ ld r9, 56(up)
+ addi up, up, 64
+ bdnz L(top)
+
+L(end): std r6, 0(rp)
+ std r7, 8(rp)
+ std r8, 16(rp)
+ std r9, 24(rp)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/p7/hamdist.asm b/gmp/mpn/powerpc64/p7/hamdist.asm
new file mode 100644
index 0000000000..5af98946f7
--- /dev/null
+++ b/gmp/mpn/powerpc64/p7/hamdist.asm
@@ -0,0 +1,110 @@
+dnl PowerPC-64 mpn_hamdist.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 2.87
+
+define(`up', r3)
+define(`vp', r4)
+define(`n', r5)
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+ std r30, -16(r1)
+ std r31, -8(r1)
+
+ addi r0, n, 1
+ifdef(`HAVE_ABI_mode32',
+` rldicl r0, r0, 63,33', C ...branch count
+` srdi r0, r0, 1') C ...for ctr
+ mtctr r0
+
+ andi. r0, n, 1
+
+ li r0, 0
+ li r12, 0
+
+ beq L(evn)
+
+L(odd): ld r6, 0(up)
+ addi up, up, 8
+ ld r8, 0(vp)
+ addi vp, vp, 8
+ xor r10, r6, r8
+ popcntd r0, r10
+ bdz L(e1)
+
+L(evn): ld r6, 0(up)
+ ld r8, 0(vp)
+ ld r7, 8(up)
+ ld r9, 8(vp)
+ xor r10, r6, r8
+ addi up, up, 16
+ addi vp, vp, 16
+ li r30, 0
+ li r31, 0
+ bdz L(end)
+
+ nop
+ nop
+C ALIGN(16)
+L(top): add r0, r0, r30
+ ld r6, 0(up)
+ ld r8, 0(vp)
+ xor r11, r7, r9
+ popcntd r30, r10
+ add r12, r12, r31
+ ld r7, 8(up)
+ ld r9, 8(vp)
+ xor r10, r6, r8
+ popcntd r31, r11
+ addi up, up, 16
+ addi vp, vp, 16
+ bdnz L(top)
+
+L(end): add r0, r0, r30
+ xor r11, r7, r9
+ popcntd r30, r10
+ add r12, r12, r31
+ popcntd r31, r11
+
+ add r0, r0, r30
+ add r12, r12, r31
+L(e1): add r3, r0, r12
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/p7/popcount.asm b/gmp/mpn/powerpc64/p7/popcount.asm
new file mode 100644
index 0000000000..eac72a6493
--- /dev/null
+++ b/gmp/mpn/powerpc64/p7/popcount.asm
@@ -0,0 +1,90 @@
+dnl PowerPC-64 mpn_popcount.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 2
+
+define(`up', r3)
+define(`n', r4)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+ addi r0, n, 1
+ifdef(`HAVE_ABI_mode32',
+` rldicl r0, r0, 63,33', C ...branch count
+` srdi r0, r0, 1') C ...for ctr
+ mtctr r0
+
+ andi. r0, n, 1
+
+ li r0, 0
+ li r12, 0
+ beq L(evn)
+
+L(odd): ld r4, 0(up)
+ addi up, up, 8
+ popcntd r0, r4
+ bdz L(e1)
+
+L(evn): ld r4, 0(up)
+ ld r5, 8(up)
+ popcntd r8, r4
+ popcntd r9, r5
+ bdz L(e2)
+
+ ld r4, 16(up)
+ ld r5, 24(up)
+ bdz L(e4)
+ addi up, up, 32
+
+L(top): add r0, r0, r8
+ popcntd r8, r4
+ ld r4, 0(up)
+ add r12, r12, r9
+ popcntd r9, r5
+ ld r5, 8(up)
+ addi up, up, 16
+ bdnz L(top)
+
+L(e4): add r0, r0, r8
+ popcntd r8, r4
+ add r12, r12, r9
+ popcntd r9, r5
+L(e2): add r0, r0, r8
+ add r12, r12, r9
+L(e1): add r3, r0, r12
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/rshift.asm b/gmp/mpn/powerpc64/rshift.asm
new file mode 100644
index 0000000000..7654a16ae8
--- /dev/null
+++ b/gmp/mpn/powerpc64/rshift.asm
@@ -0,0 +1,207 @@
+dnl PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt
+
+dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
+
+C TODO
+C * Try to reduce the number of needed live registers
+C * Micro-optimise header code
+C * Keep in synch with lshift.asm and lshiftc.asm
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ subfic tnc, cnt, 64
+C sldi r30, n, 3 C byte count corresponding to n
+C add rp, rp, r30 C rp = rp + n
+C add up, up, r30 C up = up + n
+ rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
+ cmpdi cr6, r30, 2
+ addi r31, n, 3 C compute count...
+ ld r10, 0(up) C load 1st limb for b00...b11
+ sld retval, r10, tnc
+ifdef(`HAVE_ABI_mode32',
+` rldicl r31, r31, 62,34', C ...branch count
+` srdi r31, r31, 2') C ...for ctr
+ mtctr r31 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ ld r11, 8(up) C load 2nd limb for b10 and b11
+ beq cr6, L(b10)
+
+ ALIGN(16)
+L(b11): srd r8, r10, cnt
+ sld r9, r11, tnc
+ ld u1, 16(up)
+ addi up, up, 24
+ srd r12, r11, cnt
+ sld r7, u1, tnc
+ addi rp, rp, -16
+ bdnz L(gt3)
+
+ or r11, r8, r9
+ srd r8, u1, cnt
+ b L(cj3)
+
+ ALIGN(16)
+L(gt3): ld u0, 0(up)
+ or r11, r8, r9
+ srd r8, u1, cnt
+ sld r9, u0, tnc
+ ld u1, 8(up)
+ or r10, r12, r7
+ b L(L11)
+
+ ALIGN(32)
+L(b10): srd r12, r10, cnt
+ addi rp, rp, -24
+ sld r7, r11, tnc
+ bdnz L(gt2)
+
+ srd r8, r11, cnt
+ or r10, r12, r7
+ b L(cj2)
+
+L(gt2): ld u0, 16(up)
+ srd r8, r11, cnt
+ sld r9, u0, tnc
+ ld u1, 24(up)
+ or r10, r12, r7
+ srd r12, u0, cnt
+ sld r7, u1, tnc
+ ld u0, 32(up)
+ or r11, r8, r9
+ addi up, up, 16
+ b L(L10)
+
+ ALIGN(16)
+L(b00): ld u1, 8(up)
+ srd r12, r10, cnt
+ sld r7, u1, tnc
+ ld u0, 16(up)
+ srd r8, u1, cnt
+ sld r9, u0, tnc
+ ld u1, 24(up)
+ or r10, r12, r7
+ srd r12, u0, cnt
+ sld r7, u1, tnc
+ addi rp, rp, -8
+ bdz L(cj4)
+
+L(gt4): addi up, up, 32
+ ld u0, 0(up)
+ or r11, r8, r9
+ b L(L00)
+
+ ALIGN(16)
+L(b01): bdnz L(gt1)
+ srd r8, r10, cnt
+ std r8, 0(rp)
+ b L(ret)
+
+L(gt1): ld u0, 8(up)
+ srd r8, r10, cnt
+ sld r9, u0, tnc
+ ld u1, 16(up)
+ srd r12, u0, cnt
+ sld r7, u1, tnc
+ ld u0, 24(up)
+ or r11, r8, r9
+ srd r8, u1, cnt
+ sld r9, u0, tnc
+ ld u1, 32(up)
+ addi up, up, 40
+ or r10, r12, r7
+ bdz L(end)
+
+ ALIGN(32)
+L(top): srd r12, u0, cnt
+ sld r7, u1, tnc
+ ld u0, 0(up)
+ std r11, 0(rp)
+ or r11, r8, r9
+L(L00): srd r8, u1, cnt
+ sld r9, u0, tnc
+ ld u1, 8(up)
+ std r10, 8(rp)
+ or r10, r12, r7
+L(L11): srd r12, u0, cnt
+ sld r7, u1, tnc
+ ld u0, 16(up)
+ std r11, 16(rp)
+ or r11, r8, r9
+L(L10): srd r8, u1, cnt
+ sld r9, u0, tnc
+ ld u1, 24(up)
+ addi up, up, 32
+ std r10, 24(rp)
+ addi rp, rp, 32
+ or r10, r12, r7
+ bdnz L(top)
+
+ ALIGN(32)
+L(end): srd r12, u0, cnt
+ sld r7, u1, tnc
+ std r11, 0(rp)
+L(cj4): or r11, r8, r9
+ srd r8, u1, cnt
+ std r10, 8(rp)
+L(cj3): or r10, r12, r7
+ std r11, 16(rp)
+L(cj2): std r10, 24(rp)
+ std r8, 32(rp)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, retval, 32
+ mr r4, retval
+',` mr r3, retval')
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/sec_tabselect.asm b/gmp/mpn/powerpc64/sec_tabselect.asm
new file mode 100644
index 0000000000..085577ca9b
--- /dev/null
+++ b/gmp/mpn/powerpc64/sec_tabselect.asm
@@ -0,0 +1,147 @@
+dnl PowerPC-64 mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.75
+C POWER4/PPC970 2.0
+C POWER5 ?
+C POWER6 5.0
+C POWER7 1.75
+
+define(`rp', `r3')
+define(`tp', `r4')
+define(`n', `r5')
+define(`nents', `r6')
+define(`which', `r7')
+
+define(`i', `r8')
+define(`j', `r9')
+define(`stride', `r12')
+define(`mask', `r11')
+
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+ addic. j, n, -4 C outer loop induction variable
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ sldi stride, n, 3
+
+ blt cr0, L(outer_end)
+L(outer_top):
+ mtctr nents
+ mr r10, tp
+ li r28, 0
+ li r29, 0
+ li r30, 0
+ li r31, 0
+ addic. j, j, -4 C outer loop induction variable
+ mr i, which
+
+ ALIGN(16)
+L(top): addic i, i, -1 C set carry iff i != 0
+ subfe mask, mask, mask
+ ld r0, 0(tp)
+ ld r27, 8(tp)
+ and r0, r0, mask
+ and r27, r27, mask
+ or r28, r28, r0
+ or r29, r29, r27
+ ld r0, 16(tp)
+ ld r27, 24(tp)
+ and r0, r0, mask
+ and r27, r27, mask
+ or r30, r30, r0
+ or r31, r31, r27
+ add tp, tp, stride
+ bdnz L(top)
+
+ std r28, 0(rp)
+ std r29, 8(rp)
+ std r30, 16(rp)
+ std r31, 24(rp)
+ addi tp, r10, 32
+ addi rp, rp, 32
+ bge cr0, L(outer_top)
+L(outer_end):
+
+ rldicl. r0, n, 63, 63
+ beq cr0, L(b0x)
+L(b1x): mtctr nents
+ mr r10, tp
+ li r28, 0
+ li r29, 0
+ mr i, which
+ ALIGN(16)
+L(tp2): addic i, i, -1
+ subfe mask, mask, mask
+ ld r0, 0(tp)
+ ld r27, 8(tp)
+ and r0, r0, mask
+ and r27, r27, mask
+ or r28, r28, r0
+ or r29, r29, r27
+ add tp, tp, stride
+ bdnz L(tp2)
+ std r28, 0(rp)
+ std r29, 8(rp)
+ addi tp, r10, 16
+ addi rp, rp, 16
+
+L(b0x): rldicl. r0, n, 0, 63
+ beq cr0, L(b00)
+L(b01): mtctr nents
+ mr r10, tp
+ li r28, 0
+ mr i, which
+ ALIGN(16)
+L(tp1): addic i, i, -1
+ subfe mask, mask, mask
+ ld r0, 0(tp)
+ and r0, r0, mask
+ or r28, r28, r0
+ add tp, tp, stride
+ bdnz L(tp1)
+ std r28, 0(rp)
+
+L(b00): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/umul.asm b/gmp/mpn/powerpc64/umul.asm
new file mode 100644
index 0000000000..7fcc72f18f
--- /dev/null
+++ b/gmp/mpn/powerpc64/umul.asm
@@ -0,0 +1,53 @@
+dnl PowerPC-64 umul_ppmm -- support for longlong.h
+
+dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+
+ C r3 lowptr
+ C r4 m1
+ C r5 m2
+
+ mulld r0, r4, r5
+ mulhdu r4, r4, r5
+ std r0, 0(r3)
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, r4, 32
+',` mr r3, r4
+')
+ blr
+
+EPILOGUE(mpn_umul_ppmm)
diff --git a/gmp/mpn/powerpc64/vmx/popcount.asm b/gmp/mpn/powerpc64/vmx/popcount.asm
new file mode 100644
index 0000000000..b95fb88b1a
--- /dev/null
+++ b/gmp/mpn/powerpc64/vmx/popcount.asm
@@ -0,0 +1,230 @@
+dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
+
+dnl Copyright 2006, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 7400,7410 (G4): ?
+C 744x,745x (G4+): 1.125
+C 970 (G5): 2.25
+
+C TODO
+C * Rewrite the awkward huge n outer loop code.
+C * Two lvx, two vperm, and two vxor could make us a similar hamdist.
+C * Compress cnsts table in 64-bit mode, only half the values are needed.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+define(`OPERATION_popcount')
+
+define(`ap', `r3')
+define(`n', `r4')
+
+define(`rtab', `v10')
+define(`cnt4', `v11')
+
+ifelse(GMP_LIMB_BITS,32,`
+ define(`LIMB32',` $1')
+ define(`LIMB64',`')
+',`
+ define(`LIMB32',`')
+ define(`LIMB64',` $1')
+')
+
+C The inner loop handles up to 2^34 bits, i.e., 2^31 64-limbs, due to overflow
+C in vsum4ubs. For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
+define(`LIMBS_PER_CHUNK', 0x1000)
+define(`LIMBS_CHUNK_THRES', 0x1001)
+
+ASM_START()
+PROLOGUE(mpn_popcount,toc)
+ mfspr r10, 256
+ oris r0, r10, 0xfffc C Set VRSAVE bit 0-13
+ mtspr 256, r0
+
+ifdef(`HAVE_ABI_mode32',
+` rldicl n, n, 0, 32') C zero extend n
+
+C Load various constants into vector registers
+ LEAL( r11, cnsts)
+ li r12, 16
+ vspltisb cnt4, 4 C 0x0404...04 used as shift count
+
+ li r7, 160
+ lvx rtab, 0, r11
+
+LIMB64(`lis r0, LIMBS_CHUNK_THRES ')
+LIMB64(`cmpd cr7, n, r0 ')
+
+ lvx v0, 0, ap
+ addi r7, r11, 80
+ rlwinm r6, ap, 2,26,29
+ lvx v8, r7, r6
+ vand v0, v0, v8
+
+LIMB32(`rlwinm r8, ap, 30,30,31 ')
+LIMB64(`rlwinm r8, ap, 29,31,31 ')
+ add n, n, r8 C compensate n for rounded down `ap'
+
+ vxor v1, v1, v1
+ li r8, 0 C grand total count
+
+ vxor v12, v12, v12 C zero total count
+ vxor v13, v13, v13 C zero total count
+
+ addic. n, n, -LIMBS_PER_VR
+ ble L(sum)
+
+ addic. n, n, -LIMBS_PER_VR
+ ble L(lsum)
+
+C For 64-bit machines, handle huge n that would overflow vsum4ubs
+LIMB64(`ble cr7, L(small) ')
+LIMB64(`addis r9, n, -LIMBS_PER_CHUNK ') C remaining n
+LIMB64(`lis n, LIMBS_PER_CHUNK ')
+
+ ALIGN(16)
+L(small):
+LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
+LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
+ addi r7, r7, 1
+ mtctr r7 C copy n to count register
+ b L(ent)
+
+ ALIGN(16)
+L(top):
+ lvx v0, 0, ap
+L(ent): lvx v1, r12, ap
+ addi ap, ap, 32
+ vsrb v8, v0, cnt4
+ vsrb v9, v1, cnt4
+ vperm v2, rtab, rtab, v0
+ vperm v3, rtab, rtab, v8
+ vperm v4, rtab, rtab, v1
+ vperm v5, rtab, rtab, v9
+ vaddubm v6, v2, v3
+ vaddubm v7, v4, v5
+ vsum4ubs v12, v6, v12
+ vsum4ubs v13, v7, v13
+ bdnz L(top)
+
+ andi. n, n, eval(LIMBS_PER_2VR-1)
+ beq L(rt)
+
+ lvx v0, 0, ap
+ vxor v1, v1, v1
+ cmpwi n, LIMBS_PER_VR
+ ble L(sum)
+L(lsum):
+ vor v1, v0, v0
+ lvx v0, r12, ap
+L(sum):
+LIMB32(`rlwinm r6, n, 4,26,27 ')
+LIMB64(`rlwinm r6, n, 5,26,26 ')
+ addi r7, r11, 16
+ lvx v8, r7, r6
+ vand v0, v0, v8
+ vsrb v8, v0, cnt4
+ vsrb v9, v1, cnt4
+ vperm v2, rtab, rtab, v0
+ vperm v3, rtab, rtab, v8
+ vperm v4, rtab, rtab, v1
+ vperm v5, rtab, rtab, v9
+ vaddubm v6, v2, v3
+ vaddubm v7, v4, v5
+ vsum4ubs v12, v6, v12
+ vsum4ubs v13, v7, v13
+
+ ALIGN(16)
+L(rt): vadduwm v3, v12, v13
+ li r7, -16 C FIXME: does all ppc32 and ppc64 ABIs
+ stvx v3, r7, r1 C FIXME: ...support storing below sp?
+
+ lwz r7, -16(r1)
+ add r8, r8, r7
+ lwz r7, -12(r1)
+ add r8, r8, r7
+ lwz r7, -8(r1)
+ add r8, r8, r7
+ lwz r7, -4(r1)
+ add r8, r8, r7
+
+C Handle outer loop for huge n. We inherit cr7 and r0 from above.
+LIMB64(`ble cr7, L(ret)
+ vxor v12, v12, v12 C zero total count
+ vxor v13, v13, v13 C zero total count
+ mr n, r9
+ cmpd cr7, n, r0
+ ble cr7, L(2)
+ addis r9, n, -LIMBS_PER_CHUNK C remaining n
+ lis n, LIMBS_PER_CHUNK
+L(2): srdi r7, n, 2 C loop count corresponding to n
+ mtctr r7 C copy n to count register
+ b L(top)
+')
+
+ ALIGN(16)
+L(ret): mr r3, r8
+ mtspr 256, r10
+ blr
+EPILOGUE()
+
+DEF_OBJECT(cnsts,16)
+C Counts for vperm
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+C Masks for high end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(cnsts)
+ASM_END()