Add gmp, mpc and mpfr sourcesbaserock/pedroalvarez/4.9.1

author: Pedro Alvarez <pedro.alvarez@codethink.co.uk> 2014-12-22 00:55:04 +0000
committer: Pedro Alvarez <pedro.alvarez@codethink.co.uk> 2014-12-22 00:56:42 +0000
commit: 54eea31d0053620bab65153ab39d61e5575aaf1b (patch)
tree: 5f97c96dffdb6b27df36795689abfb9086011585 /gmp/mpn/arm
parent: c16297b7cfb0c1708f1d84b5d0f90be0844d07ce (diff)
download: gcc-tarball-54eea31d0053620bab65153ab39d61e5575aaf1b.tar.gz
61 files changed, 8374 insertions, 0 deletions
diff --git a/gmp/mpn/arm/README b/gmp/mpn/arm/README
new file mode 100644
index 0000000000..598baa3f2e
--- /dev/null
+++ b/gmp/mpn/arm/README
@@ -0,0 +1,35 @@
+Copyright 2002, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions for ARM processors.  It has been
+optimised for Cortex-A9, but the code in the top-level directory should run
+on all ARM processors at architecture level v4 or later.
diff --git a/gmp/mpn/arm/aors_n.asm b/gmp/mpn/arm/aors_n.asm
new file mode 100644
index 0000000000..fdad9f7ba6
--- /dev/null
+++ b/gmp/mpn/arm/aors_n.asm
@@ -0,0 +1,112 @@
+dnl  ARM mpn_add_n and mpn_sub_n
+
+dnl  Contributed to the GNU project by Robert Harley.
+
+dnl  Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.5	slightly fluctuating
+C Cortex-A15	 2.25
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`CLRCY',	`cmn	r0, #0')
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`func',	mpn_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`CLRCY',	`cmp	r0, r0')
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`func',	mpn_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	ldr	r12, [sp, #0]
+	stmfd	sp!, { r8, r9, lr }
+	SETCY(	r12)
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(func)
+	stmfd	sp!, { r8, r9, lr }
+	CLRCY(	r12)
+L(ent):	tst	n, #1
+	beq	L(skip1)
+	ldr	r12, [up], #4
+	ldr	lr, [vp], #4
+	ADDSUBC	r12, r12, lr
+	str	r12, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	up!, { r8, r9 }
+	ldmia	vp!, { r12, lr }
+	ADDSUBC	r8, r8, r12
+	ADDSUBC	r9, r9, lr
+	stmia	rp!, { r8, r9 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+L(top):	ldmia	up!, { r4, r5, r6, r7 }
+	ldmia	vp!, { r8, r9, r12, lr }
+	ADDSUBC	r4, r4, r8
+	sub	n, n, #4
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r12
+	ADDSUBC	r7, r7, lr
+	stmia	rp!, { r4, r5, r6, r7 }
+	teq	n, #0
+	bne	L(top)
+
+	ldmfd	sp!, { r4, r5, r6, r7 }
+
+L(rtn):	RETVAL
+	ldmfd	sp!, { r8, r9, pc }
+EPILOGUE()
diff --git a/gmp/mpn/arm/aorslsh1_n.asm b/gmp/mpn/arm/aorslsh1_n.asm
new file mode 100644
index 0000000000..1cbd4ba1af
--- /dev/null
+++ b/gmp/mpn/arm/aorslsh1_n.asm
@@ -0,0 +1,167 @@
+dnl  ARM mpn_addlsh1_n and mpn_sublsh1_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	      addlsh1_n       sublsh1_n
+C	     cycles/limb     cycles/limb
+C StrongARM	 ?		 ?
+C XScale	 ?		 ?
+C Cortex-A7	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3.12		 3.7
+C Cortex-A15	 ?		 ?
+
+C TODO
+C  * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1.
+C    The sublsh1_n code could surely be tweaked, its REVCY slows down things
+C    very much.  If two insns are really needed, it might help to separate them
+C    for better micro-parallelism.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_addlsh1_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, $1, #2')
+  define(`SAVECY',	`sbc	$1, $2, #0')
+  define(`RESTCY',	`cmn	$1, #1')
+  define(`REVCY',	`')
+  define(`INICYR',	`mov	$1, #0')
+  define(`r10r11',	`r11')
+  define(`func',	mpn_addlsh1_n)
+  define(`func_nc',	mpn_addlsh1_nc)')
+ifdef(`OPERATION_sublsh1_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`adc	r0, $1, #1')
+  define(`SAVECY',	`sbc	$1, $1, $1')
+  define(`RESTCY',	`cmn	$1, #1')
+  define(`REVCY',	`sbc	$1, $1, $1
+			cmn	$1, #1')
+  define(`INICYR',	`mvn	$1, #0')
+  define(`r10r11',	`r10')
+  define(`func',	mpn_sublsh1_n)
+  define(`func_nc',	mpn_sublsh1_nc)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r10r11, r14}
+
+ifdef(`OPERATION_addlsh1_n', `
+	mvn	r11, #0
+')
+	INICYR(	r14)
+	subs	n, n, #3
+	blt	L(le2)			C carry clear on branch path
+
+	cmn	r0, #0			C clear carry
+	ldmia	vp!, {r8, r9, r10}
+	b	L(mid)
+
+L(top):	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	ldmia	vp!, {r8, r9, r10}
+	stmia	rp!, {r4, r5, r6}
+	REVCY(r14)
+	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	adcs	r10, r10, r10
+	ldmia	up!, {r4, r5, r6}
+	SAVECY(	r14, r11)
+	subs	n, n, #3
+	blt	L(exi)
+	RESTCY(	r12)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	ldmia	vp!, {r8, r9, r10}
+	stmia	rp!, {r4, r5, r6}
+	REVCY(r12)
+L(mid):	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	adcs	r10, r10, r10
+	ldmia	up!, {r4, r5, r6}
+	SAVECY(	r12, r11)
+	subs	n, n, #3
+	bge	L(top)
+
+	mov	r7, r12			C swap alternating...
+	mov	r12, r14		C ...carry-save...
+	mov	r14, r7			C ...registers
+
+L(exi):	RESTCY(	r12)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	stmia	rp!, {r4, r5, r6}
+
+	REVCY(r12)
+L(le2):	tst	n, #1			C n = {-1,-2,-3} map to [2], [1], [0]
+	beq	L(e1)
+
+L(e02):	tst	n, #2
+	beq	L(rt0)
+	ldm	vp, {r8, r9}
+	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	ldm	up, {r4, r5}
+	SAVECY(	r12, r11)
+	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	stm	rp, {r4, r5}
+	b	L(rt1)
+
+L(e1):	ldr	r8, [vp]
+	adcs	r8, r8, r8
+	ldr	r4, [up]
+	SAVECY(	r12, r11)
+	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	str	r4, [rp]
+
+L(rt1):	mov	r14, r12
+	REVCY(r12)
+L(rt0):	RETVAL(	r14)
+	pop	{r4-r10r11, r14}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/aorsmul_1.asm b/gmp/mpn/arm/aorsmul_1.asm
new file mode 100644
index 0000000000..b02fbb3b2a
--- /dev/null
+++ b/gmp/mpn/arm/aorsmul_1.asm
@@ -0,0 +1,135 @@
+dnl  ARM mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:     ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25
+C Cortex-A15	 4
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`vl', `r3')
+define(`rl', `r12')
+define(`ul', `r6')
+define(`r',  `lr')
+
+ifdef(`OPERATION_addmul_1', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`CLRRCY',	`mov	$1, #0
+			adds	r0, r0, #0')
+  define(`RETVAL',	`adc	r0, r4, #0')
+  define(`func',	mpn_addmul_1)')
+ifdef(`OPERATION_submul_1', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`CLRRCY',	`subs	$1, r0, r0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			sub	r0, $1, r0')
+  define(`func',	mpn_submul_1)')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	stmfd	sp!, { r4-r6, lr }
+	CLRRCY(	r4)
+	tst	n, #1
+	beq	L(skip1)
+	ldr	ul, [up], #4
+	ldr	rl, [rp, #0]
+	umull	r5, r4, ul, vl
+	ADDSUB	r, rl, r5
+	str	r, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldr	ul, [up], #4
+	ldr	rl, [rp, #0]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r4
+	ldr	rl, [rp, #4]
+	mov	r4, #0
+	umlal	r5, r4, ul, vl
+	str	r, [rp], #4
+	ADDSUBC	r, rl, r5
+	str	r, [rp], #4
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+
+	ldr	ul, [up], #4
+	ldr	rl, [rp, #0]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	b	L(in)
+
+L(top):	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r5
+	ldr	rl, [rp, #4]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	str	r, [rp], #4
+L(in):	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r4
+	ldr	rl, [rp, #4]
+	mov	r4, #0
+	umlal	r5, r4, ul, vl
+	str	r, [rp], #4
+	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r5
+	ldr	rl, [rp, #4]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	str	r, [rp], #4
+	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r4
+	ldr	rl, [rp, #4]
+	mov	r4, #0
+	umlal	r5, r4, ul, vl
+	sub	n, n, #4
+	tst	n, n
+	str	r, [rp], #4
+	bne	L(top)
+
+	ADDSUBC	r, rl, r5
+	str	r, [rp]
+
+L(rtn):	RETVAL(	r4)
+	ldmfd	sp!, { r4-r6, pc }
+EPILOGUE()
diff --git a/gmp/mpn/arm/arm-defs.m4 b/gmp/mpn/arm/arm-defs.m4
new file mode 100644
index 0000000000..6ca964a245
--- /dev/null
+++ b/gmp/mpn/arm/arm-defs.m4
@@ -0,0 +1,91 @@
+divert(-1)
+
+dnl  m4 macros for ARM assembler.
+
+dnl  Copyright 2001, 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Standard commenting is with @, the default m4 # is for constants and we
+dnl  don't want to disable macro expansions in or after them.
+
+changecom(@&*$)
+
+
+dnl  APCS register names.
+
+deflit(a1,r0)
+deflit(a2,r1)
+deflit(a3,r2)
+deflit(a4,r3)
+deflit(v1,r4)
+deflit(v2,r5)
+deflit(v3,r6)
+deflit(v4,r7)
+deflit(v5,r8)
+deflit(v6,r9)
+deflit(sb,r9)
+deflit(v7,r10)
+deflit(sl,r10)
+deflit(fp,r11)
+deflit(ip,r12)
+deflit(sp,r13)
+deflit(lr,r14)
+deflit(pc,r15)
+
+
+define(`lea_list', `')
+define(`lea_num',0)
+
+dnl  LEA(reg,gmp_symbol)
+dnl
+dnl  Load the address of gmp_symbol into a register.  The gmp_symbol must be
+dnl  either local or protected/hidden, since we assume it has a fixed distance
+dnl  from the point of use.
+
+define(`LEA',`dnl
+ldr	$1, L(ptr`'lea_num)
+ifdef(`PIC',dnl
+`dnl
+L(bas`'lea_num):dnl
+	add	$1, $1, pc`'dnl
+	m4append(`lea_list',`
+L(ptr'lea_num`):	.word	GSYM_PREFIX`'$2-L(bas'lea_num`)-8')
+	define(`lea_num', eval(lea_num+1))dnl
+',`dnl
+	m4append(`lea_list',`
+L(ptr'lea_num`):	.word	GSYM_PREFIX`'$2')
+	define(`lea_num', eval(lea_num+1))dnl
+')dnl
+')
+
+define(`EPILOGUE_cpu',
+`lea_list
+	SIZE(`$1',.-`$1')')
+
+divert
diff --git a/gmp/mpn/arm/bdiv_dbm1c.asm b/gmp/mpn/arm/bdiv_dbm1c.asm
new file mode 100644
index 0000000000..ec3de50e8e
--- /dev/null
+++ b/gmp/mpn/arm/bdiv_dbm1c.asm
@@ -0,0 +1,113 @@
+dnl  ARM mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.25
+C Cortex-A15	 2.5
+
+C TODO
+C  * Try using umlal or umaal.
+C  * Try using ldm/stm.
+
+define(`qp',	  `r0')
+define(`up',	  `r1')
+define(`n',	  `r2')
+define(`bd',	  `r3')
+define(`cy',	  `sp,#0')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	push	{r4, r5, r6, r7, r8}
+	ldr	r4, [up], #4
+	ldr	r5, [sp, #20]
+	ands	r12, n, #3
+	beq	L(fi0)
+	cmp	r12, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	umull	r8, r12, r4, bd
+	ldr	r4, [up], #4
+	b	L(lo3)
+
+L(fi0):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	subs	n, n, #1
+	umull	r8, r12, r4, bd
+	bls	L(wd1)
+	ldr	r4, [up], #4
+	b	L(lo1)
+
+L(fi2):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+L(top):	ldr	r4, [up], #4
+	subs	r5, r5, r6
+	str	r5, [qp], #4
+	sbc	r5, r5, r7
+L(lo1):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	subs	r5, r5, r8
+	str	r5, [qp], #4
+	sbc	r5, r5, r12
+L(lo0):	umull	r8, r12, r4, bd
+	ldr	r4, [up], #4
+	subs	r5, r5, r6
+	str	r5, [qp], #4
+	sbc	r5, r5, r7
+L(lo3):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	subs	r5, r5, r8
+	str	r5, [qp], #4
+	sbc	r5, r5, r12
+L(lo2):	subs	n, n, #4
+	umull	r8, r12, r4, bd
+	bhi	L(top)
+
+L(wd2):	subs	r5, r5, r6
+	str	r5, [qp], #4
+	sbc	r5, r5, r7
+L(wd1):	subs	r5, r5, r8
+	str	r5, [qp]
+	sbc	r0, r5, r12
+	pop	{r4, r5, r6, r7, r8}
+	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/cnd_aors_n.asm b/gmp/mpn/arm/cnd_aors_n.asm
new file mode 100644
index 0000000000..e8eb60983a
--- /dev/null
+++ b/gmp/mpn/arm/cnd_aors_n.asm
@@ -0,0 +1,134 @@
+dnl  ARM mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3
+C Cortex-A15	 2.5
+
+define(`cnd',	`r0')
+define(`rp',	`r1')
+define(`up',	`r2')
+define(`vp',	`r3')
+
+define(`n',	`r12')
+
+
+ifdef(`OPERATION_cnd_add_n', `
+	define(`ADDSUB',      adds)
+	define(`ADDSUBC',      adcs)
+	define(`INITCY',      `cmn	r0, #0')
+	define(`RETVAL',      `adc	r0, n, #0')
+	define(func,	      mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+	define(`ADDSUB',      subs)
+	define(`ADDSUBC',      sbcs)
+	define(`INITCY',      `cmp	r0, #0')
+	define(`RETVAL',      `adc	r0, n, #0
+			      rsb	r0, r0, #1')
+	define(func,	      mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r11}
+	ldr	n, [sp, #32]
+
+	cmp	cnd, #1
+	sbc	cnd, cnd, cnd		C conditionally set to 0xffffffff
+
+	INITCY				C really only needed for n = 0 (mod 4)
+
+	ands	r4, n, #3
+	beq	L(top)
+	cmp	r4, #2
+	bcc	L(b1)
+	beq	L(b2)
+
+L(b3):	ldm	vp!, {r4,r5,r6}
+	ldm	up!, {r8,r9,r10}
+	bic	r4, r4, cnd
+	bic	r5, r5, cnd
+	bic	r6, r6, cnd
+	ADDSUB	r8, r8, r4
+	ADDSUBC	r9, r9, r5
+	ADDSUBC	r10, r10, r6
+	stm	rp!, {r8,r9,r10}
+	sub	n, n, #3
+	teq	n, #0
+	bne	L(top)
+	b	L(end)
+
+L(b2):	ldm	vp!, {r4,r5}
+	ldm	up!, {r8,r9}
+	bic	r4, r4, cnd
+	bic	r5, r5, cnd
+	ADDSUB	r8, r8, r4
+	ADDSUBC	r9, r9, r5
+	stm	rp!, {r8,r9}
+	sub	n, n, #2
+	teq	n, #0
+	bne	L(top)
+	b	L(end)
+
+L(b1):	ldr	r4, [vp], #4
+	ldr	r8, [up], #4
+	bic	r4, r4, cnd
+	ADDSUB	r8, r8, r4
+	str	r8, [rp], #4
+	sub	n, n, #1
+	teq	n, #0
+	beq	L(end)
+
+L(top):	ldm	vp!, {r4,r5,r6,r7}
+	ldm	up!, {r8,r9,r10,r11}
+	bic	r4, r4, cnd
+	bic	r5, r5, cnd
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	ADDSUBC	r8, r8, r4
+	ADDSUBC	r9, r9, r5
+	ADDSUBC	r10, r10, r6
+	ADDSUBC	r11, r11, r7
+	sub	n, n, #4
+	stm	rp!, {r8,r9,r10,r11}
+	teq	n, #0
+	bne	L(top)
+
+L(end):	RETVAL
+	pop	{r4-r11}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/com.asm b/gmp/mpn/arm/com.asm
new file mode 100644
index 0000000000..42f8e3cbbe
--- /dev/null
+++ b/gmp/mpn/arm/com.asm
@@ -0,0 +1,75 @@
+dnl  ARM mpn_com.
+
+dnl  Copyright 2003, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.0
+C Cortex-A15	 1.75
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r3, [up], #4
+	mvn	r3, r3
+	str	r3, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	up!, { r3, r12 }		C load 2 limbs
+	mvn	r3, r3
+	mvn	r12, r12
+	stmia	rp!, { r3, r12 }		C store 2 limbs
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	stmfd	sp!, { r7, r8, r9 }		C save regs on stack
+
+L(top):	ldmia	up!, { r3, r8, r9, r12 }	C load 4 limbs
+	subs	n, n, #4
+	mvn	r3, r3
+	mvn	r8, r8
+	mvn	r9, r9
+	mvn	r12, r12
+	stmia	rp!, { r3, r8, r9, r12 }	C store 4 limbs
+	bne	L(top)
+
+	ldmfd	sp!, { r7, r8, r9 }		C restore regs from stack
+L(rtn):	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/copyd.asm b/gmp/mpn/arm/copyd.asm
new file mode 100644
index 0000000000..3ea2035099
--- /dev/null
+++ b/gmp/mpn/arm/copyd.asm
@@ -0,0 +1,84 @@
+dnl  ARM mpn_copyd.
+
+dnl  Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
+
+dnl  Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.25-1.5
+C Cortex-A15	 1.25
+
+C TODO
+C  * Consider wider unrolling.  Analogous 8-way code runs 10% faster on both A9
+C    and A15.  But it probably slows things down for 8 <= n < a few dozen.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	mov	r12, n, lsl #2
+	sub	r12, r12, #4
+	add	rp, rp, r12
+	add	up, up, r12
+
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r3, [up], #-4
+	str	r3, [rp], #-4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmda	up!, { r3,r12 }
+	stmda	rp!, { r3,r12 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+
+	push	{ r4-r5 }
+	subs	n, n, #4
+	ldmda	up!, { r3,r4,r5,r12 }
+	beq	L(end)
+
+L(top):	subs	n, n, #4
+	stmda	rp!, { r3,r4,r5,r12 }
+	ldmda	up!, { r3,r4,r5,r12 }
+	bne	L(top)
+
+L(end):	stmda	rp, { r3,r4,r5,r12 }
+	pop	{ r4-r5 }
+L(rtn):	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/copyi.asm b/gmp/mpn/arm/copyi.asm
new file mode 100644
index 0000000000..fa454702c1
--- /dev/null
+++ b/gmp/mpn/arm/copyi.asm
@@ -0,0 +1,79 @@
+dnl  ARM mpn_copyi.
+
+dnl  Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
+
+dnl  Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.25-1.5
+C Cortex-A15	 1.25
+
+C TODO
+C  * Consider wider unrolling.  Analogous 8-way code runs 10% faster on both A9
+C    and A15.  But it probably slows things down for 8 <= n < a few dozen.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r3, [up], #4
+	str	r3, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	up!, { r3,r12 }
+	stmia	rp!, { r3,r12 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+
+	push	{ r4-r5 }
+	subs	n, n, #4
+	ldmia	up!, { r3,r4,r5,r12 }
+	beq	L(end)
+
+L(top):	subs	n, n, #4
+	stmia	rp!, { r3,r4,r5,r12 }
+	ldmia	up!, { r3,r4,r5,r12 }
+	bne	L(top)
+
+L(end):	stm	rp, { r3,r4,r5,r12 }
+	pop	{ r4-r5 }
+L(rtn):	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/dive_1.asm b/gmp/mpn/arm/dive_1.asm
new file mode 100644
index 0000000000..a695e47c77
--- /dev/null
+++ b/gmp/mpn/arm/dive_1.asm
@@ -0,0 +1,151 @@
+dnl  ARM v4 mpn_modexact_1c_odd
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb       cycles/limb
+C               norm    unorm    modexact_1c_odd
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	10	12
+C Cortex-A15	 9	 9
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	-
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`d',  `r3')
+
+define(`cy', `r7')
+define(`cnt', `r6')
+define(`tnc', `r8')
+
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+	tst	d, #1
+	push	{r4-r9}
+	mov	cnt, #0
+	bne	L(inv)
+
+C count trailing zeros
+	movs	r4, d, lsl #16
+	moveq	d, d, lsr #16
+	moveq	cnt, #16
+	tst	d, #0xff
+	moveq	d, d, lsr #8
+	addeq	cnt, cnt, #8
+	LEA(	r4, ctz_tab)
+	and	r5, d, #0xff
+	ldrb	r4, [r4, r5]
+	mov	d, d, lsr r4
+	add	cnt, cnt, r4
+
+C binvert limb
+L(inv):	LEA(	r4, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, lsl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, lsl #1	C r4 = inverse
+
+	tst	cnt, cnt
+	ldr	r5, [up], #4		C up[0]
+	mov	cy, #0
+	bne	L(unnorm)
+
+L(norm):
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(end)
+
+	ALIGN(16)
+L(top):	sbcs	cy, r5, cy
+	ldr	r5, [up], #4
+	sub	n, n, #1
+	mul	r9, r4, cy
+	tst	n, n
+	umull	r12, cy, d, r9
+	str	r9, [rp], #4
+	bne	L(top)
+
+L(end):	sbc	cy, r5, cy
+	mul	r9, r4, cy
+	str	r9, [rp]
+	pop	{r4-r9}
+	bx	r14
+
+L(unnorm):
+	rsb	tnc, cnt, #32
+	mov	r5, r5, lsr cnt
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r5, r12, lsl tnc
+	mov	r5, r12, lsr cnt
+	sbcs	cy, r9, cy		C critical path ->cy->cy->
+	sub	n, n, #1
+	mul	r9, r4, cy		C critical path ->cy->r9->
+	tst	n, n
+	umull	r12, cy, d, r9		C critical path ->r9->cy->
+	str	r9, [rp], #4
+	bne	L(tpu)
+
+L(edu):	sbc	cy, r5, cy
+	mul	r9, r4, cy
+	str	r9, [rp]
+	pop	{r4-r9}
+	bx	r14
+EPILOGUE()
+
+	.section .rodata
+ctz_tab:
+	.byte	8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
diff --git a/gmp/mpn/arm/gmp-mparam.h b/gmp/mpn/arm/gmp-mparam.h
new file mode 100644
index 0000000000..87eec3a149
--- /dev/null
+++ b/gmp/mpn/arm/gmp-mparam.h
@@ -0,0 +1,127 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1193MHz ARM (gcc55.fsffrance.org) */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         56
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     71
+#define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIVREM_2_THRESHOLD                   0  /* preinv always */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           41
+
+#define MUL_TOOM22_THRESHOLD                36
+#define MUL_TOOM33_THRESHOLD               125
+#define MUL_TOOM44_THRESHOLD               193
+#define MUL_TOOM6H_THRESHOLD               303
+#define MUL_TOOM8H_THRESHOLD               418
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     125
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     176
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+
+#define SQR_BASECASE_THRESHOLD              12
+#define SQR_TOOM2_THRESHOLD                 78
+#define SQR_TOOM3_THRESHOLD                137
+#define SQR_TOOM4_THRESHOLD                212
+#define SQR_TOOM6_THRESHOLD                306
+#define SQR_TOOM8_THRESHOLD                422
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               26
+
+#define MUL_FFT_MODF_THRESHOLD             436  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    436, 5}, {     27, 6}, {     28, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {    256, 9}, {    512,10}, {   1024,11}, {   2048,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 28
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    404, 5}, {     13, 4}, {     27, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {    512,10}, \
+    {   1024,11}, {   2048,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 26
+#define SQR_FFT_THRESHOLD                 3776
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 137
+#define MULLO_MUL_N_THRESHOLD            11479
+
+#define DC_DIV_QR_THRESHOLD                150
+#define DC_DIVAPPR_Q_THRESHOLD             494
+#define DC_BDIV_QR_THRESHOLD               148
+#define DC_BDIV_Q_THRESHOLD                345
+
+#define INV_MULMOD_BNM1_THRESHOLD           70
+#define INV_NEWTON_THRESHOLD               474
+#define INV_APPR_THRESHOLD                 478
+
+#define BINV_NEWTON_THRESHOLD              542
+#define REDC_1_TO_REDC_N_THRESHOLD         117
+
+#define MU_DIV_QR_THRESHOLD               2089
+#define MU_DIVAPPR_Q_THRESHOLD            2172
+#define MUPI_DIV_QR_THRESHOLD              225
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               2089
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     197
+#define GCD_DC_THRESHOLD                   902
+#define GCDEXT_DC_THRESHOLD                650
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                20
+#define GET_STR_PRECOMPUTE_THRESHOLD        39
+#define SET_STR_DC_THRESHOLD              1045
+#define SET_STR_PRECOMPUTE_THRESHOLD      2147
diff --git a/gmp/mpn/arm/invert_limb.asm b/gmp/mpn/arm/invert_limb.asm
new file mode 100644
index 0000000000..d4c3afe2da
--- /dev/null
+++ b/gmp/mpn/arm/invert_limb.asm
@@ -0,0 +1,93 @@
+dnl  ARM mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Copyright 2001, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_invert_limb)
+	LEA(	r2, approx_tab-512)
+	mov	r3, r0, lsr #23
+	mov	r3, r3, asl #1
+	ldrh	r3, [r3, r2]
+	mov	r1, r3, asl #17
+	mul	r12, r3, r3
+	umull	r3, r2, r12, r0
+	sub	r1, r1, r2, asl #1
+	umull	r3, r2, r1, r1
+	umull	r12, r3, r0, r3
+	umull	r2, r12, r0, r2
+	adds	r2, r2, r3
+	adc	r12, r12, #0
+	rsb	r1, r12, r1
+	mvn	r2, r2, lsr #30
+	add	r2, r2, r1, asl #2
+	umull	r12, r3, r0, r2
+	adds	r1, r12, r0
+	adc	r3, r3, r0
+	rsb	r0, r3, r2
+	bx	lr
+EPILOGUE()
+
+	.section .rodata
+	ALIGN(2)
+approx_tab:
+	.short    0xffc0,0xfec0,0xfdc0,0xfcc0,0xfbc0,0xfac0,0xfa00,0xf900
+	.short    0xf800,0xf700,0xf640,0xf540,0xf440,0xf380,0xf280,0xf180
+	.short    0xf0c0,0xefc0,0xef00,0xee00,0xed40,0xec40,0xeb80,0xeac0
+	.short    0xe9c0,0xe900,0xe840,0xe740,0xe680,0xe5c0,0xe500,0xe400
+	.short    0xe340,0xe280,0xe1c0,0xe100,0xe040,0xdf80,0xdec0,0xde00
+	.short    0xdd40,0xdc80,0xdbc0,0xdb00,0xda40,0xd980,0xd8c0,0xd800
+	.short    0xd740,0xd680,0xd600,0xd540,0xd480,0xd3c0,0xd340,0xd280
+	.short    0xd1c0,0xd140,0xd080,0xcfc0,0xcf40,0xce80,0xcdc0,0xcd40
+	.short    0xcc80,0xcc00,0xcb40,0xcac0,0xca00,0xc980,0xc8c0,0xc840
+	.short    0xc780,0xc700,0xc640,0xc5c0,0xc540,0xc480,0xc400,0xc380
+	.short    0xc2c0,0xc240,0xc1c0,0xc100,0xc080,0xc000,0xbf80,0xbec0
+	.short    0xbe40,0xbdc0,0xbd40,0xbc80,0xbc00,0xbb80,0xbb00,0xba80
+	.short    0xba00,0xb980,0xb900,0xb840,0xb7c0,0xb740,0xb6c0,0xb640
+	.short    0xb5c0,0xb540,0xb4c0,0xb440,0xb3c0,0xb340,0xb2c0,0xb240
+	.short    0xb1c0,0xb140,0xb0c0,0xb080,0xb000,0xaf80,0xaf00,0xae80
+	.short    0xae00,0xad80,0xad40,0xacc0,0xac40,0xabc0,0xab40,0xaac0
+	.short    0xaa80,0xaa00,0xa980,0xa900,0xa8c0,0xa840,0xa7c0,0xa740
+	.short    0xa700,0xa680,0xa600,0xa5c0,0xa540,0xa4c0,0xa480,0xa400
+	.short    0xa380,0xa340,0xa2c0,0xa240,0xa200,0xa180,0xa140,0xa0c0
+	.short    0xa080,0xa000,0x9f80,0x9f40,0x9ec0,0x9e80,0x9e00,0x9dc0
+	.short    0x9d40,0x9d00,0x9c80,0x9c40,0x9bc0,0x9b80,0x9b00,0x9ac0
+	.short    0x9a40,0x9a00,0x9980,0x9940,0x98c0,0x9880,0x9840,0x97c0
+	.short    0x9780,0x9700,0x96c0,0x9680,0x9600,0x95c0,0x9580,0x9500
+	.short    0x94c0,0x9440,0x9400,0x93c0,0x9340,0x9300,0x92c0,0x9240
+	.short    0x9200,0x91c0,0x9180,0x9100,0x90c0,0x9080,0x9000,0x8fc0
+	.short    0x8f80,0x8f40,0x8ec0,0x8e80,0x8e40,0x8e00,0x8d80,0x8d40
+	.short    0x8d00,0x8cc0,0x8c80,0x8c00,0x8bc0,0x8b80,0x8b40,0x8b00
+	.short    0x8a80,0x8a40,0x8a00,0x89c0,0x8980,0x8940,0x88c0,0x8880
+	.short    0x8840,0x8800,0x87c0,0x8780,0x8740,0x8700,0x8680,0x8640
+	.short    0x8600,0x85c0,0x8580,0x8540,0x8500,0x84c0,0x8480,0x8440
+	.short    0x8400,0x8380,0x8340,0x8300,0x82c0,0x8280,0x8240,0x8200
+	.short    0x81c0,0x8180,0x8140,0x8100,0x80c0,0x8080,0x8040,0x8000
+ASM_END()
diff --git a/gmp/mpn/arm/logops_n.asm b/gmp/mpn/arm/logops_n.asm
new file mode 100644
index 0000000000..5a61683fc2
--- /dev/null
+++ b/gmp/mpn/arm/logops_n.asm
@@ -0,0 +1,139 @@
+dnl  ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb             cycles/limb
+C          and andn ior xor         nand iorn nior xnor
+C StrongARM	 ?			 ?
+C XScale	 ?			 ?
+C Cortex-A7	 ?			 ?
+C Cortex-A8	 ?			 ?
+C Cortex-A9	2.5-2.72		2.75-3
+C Cortex-A15	2.25			2.75
+
+C TODO
+C  * It seems that 2.25 c/l and 2.75 c/l is possible for A9.
+C  * Debug popping issue, see comment below.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+define(`POSTOP')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',    `mpn_and_n')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',    `mpn_andn_n')
+  define(`LOGOP',   `bic	$1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',    `mpn_nand_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',    `mpn_ior_n')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',    `mpn_iorn_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `bic	$1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',    `mpn_nior_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',    `mpn_xor_n')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',    `mpn_xnor_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{ r8, r9, r10 }
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r10, [vp], #4
+	ldr	r12, [up], #4
+	LOGOP(	r12, r12, r10)
+	POSTOP(	r12)
+	str	r12, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	vp!, { r10, r12 }
+	ldmia	up!, { r8, r9 }
+	LOGOP(	r8, r8, r10)
+	LOGOP(	r9, r9, r12)
+	POSTOP(	r8)
+	POSTOP(	r9)
+	stmia	rp!, { r8, r9 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	push	{ r4, r5, r6, r7 }
+
+	ldmia	vp!, { r8, r9, r10, r12 }
+	b	L(mid)
+
+L(top):	ldmia	vp!, { r8, r9, r10, r12 }
+	POSTOP(	r4)
+	POSTOP(	r5)
+	POSTOP(	r6)
+	POSTOP(	r7)
+	stmia	rp!, { r4, r5, r6, r7 }
+L(mid):	sub	n, n, #4
+	ldmia	up!, { r4, r5, r6, r7 }
+	teq	n, #0
+	LOGOP(	r4, r4, r8)
+	LOGOP(	r5, r5, r9)
+	LOGOP(	r6, r6, r10)
+	LOGOP(	r7, r7, r12)
+	bne	L(top)
+
+	POSTOP(	r4)
+	POSTOP(	r5)
+	POSTOP(	r6)
+	POSTOP(	r7)
+	stmia	rp!, { r4, r5, r6, r7 }
+
+	pop	{ r4, r5, r6, r7 }	C popping r8-r10 here strangely fails
+
+L(rtn):	pop	{ r8, r9, r10 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/lshift.asm b/gmp/mpn/arm/lshift.asm
new file mode 100644
index 0000000000..9f777eb4dd
--- /dev/null
+++ b/gmp/mpn/arm/lshift.asm
@@ -0,0 +1,88 @@
+dnl  ARM mpn_lshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.5
+C Cortex-A15	 ?
+
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	add	up, up, n, lsl #2
+	push	{r4, r6, r7, r8}
+	ldr	r4, [up, #-4]!
+	add	rp, rp, n, lsl #2
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsl cnt
+	tst	n, #1
+	beq	L(evn)			C n even
+
+L(odd):	subs	n, n, #2
+	bcc	L(1)			C n = 1
+	ldr	r8, [up, #-4]!
+	b	L(mid)
+
+L(evn):	ldr	r6, [up, #-4]!
+	subs	n, n, #2
+	beq	L(end)
+
+L(top):	ldr	r8, [up, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(mid):	ldr	r6, [up, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r8, lsl cnt
+	subs	n, n, #2
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(1):	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/lshiftc.asm b/gmp/mpn/arm/lshiftc.asm
new file mode 100644
index 0000000000..5f3d6e3f5b
--- /dev/null
+++ b/gmp/mpn/arm/lshiftc.asm
@@ -0,0 +1,95 @@
+dnl  ARM mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.0
+C Cortex-A15	 ?
+
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	add	up, up, n, lsl #2
+	push	{r4, r6, r7, r8}
+	ldr	r4, [up, #-4]!
+	add	rp, rp, n, lsl #2
+	rsb	tnc, cnt, #32
+	mvn	r6, r4
+
+	mov	r7, r6, lsl cnt
+	tst	n, #1
+	beq	L(evn)			C n even
+
+L(odd):	subs	n, n, #2
+	bcc	L(1)			C n = 1
+	ldr	r8, [up, #-4]!
+	mvn	r8, r8
+	b	L(mid)
+
+L(evn):	ldr	r6, [up, #-4]!
+	mvn	r6, r6
+	subs	n, n, #2
+	beq	L(end)
+
+L(top):	ldr	r8, [up, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r8, r8
+	mov	r7, r6, lsl cnt
+L(mid):	ldr	r6, [up, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r6, r6
+	mov	r7, r8, lsl cnt
+	subs	n, n, #2
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(1):	mvn	r6, #0
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/mod_34lsub1.asm b/gmp/mpn/arm/mod_34lsub1.asm
new file mode 100644
index 0000000000..ba3c06d8db
--- /dev/null
+++ b/gmp/mpn/arm/mod_34lsub1.asm
@@ -0,0 +1,121 @@
+dnl  ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.33
+C Cortex-A15	 1.33
+
+define(`ap',	r0)
+define(`n',	r1)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C  * Write cleverer summation code.
+C  * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l.
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	push	{ r4, r5, r6, r7 }
+
+	subs	n, n, #3
+	mov	r7, #0
+	blt	L(le2)			C n <= 2
+
+	ldmia	ap!, { r2, r3, r12 }
+	subs	n, n, #3
+	blt	L(sum)			C n <= 5
+	cmn	r0, #0			C clear carry
+	sub	n, n, #3
+	b	L(mid)
+
+L(top):	adcs	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+L(mid):	ldmia	ap!, { r4, r5, r6 }
+	tst	n, n
+	sub	n, n, #3
+	bpl	L(top)
+
+	add	n, n, #3
+
+	adcs	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	movcs	r7, #1			C r7 <= 1
+
+L(sum):	cmn	n, #2
+	movlo	r4, #0
+	ldrhs	r4, [ap], #4
+	movls	r5, #0
+	ldrhi	r5, [ap], #4
+
+	adds	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, #0
+	adc	r7, r7, #0		C r7 <= 2
+
+L(sum2):
+	bic	r0, r2, #0xff000000
+	add	r0, r0, r2, lsr #24
+	add	r0, r0, r7
+
+	mov	r7, r3, lsl #8
+	bic	r1, r7, #0xff000000
+	add	r0, r0, r1
+	add	r0, r0, r3, lsr #16
+
+	mov	r7, r12, lsl #16
+	bic	r1, r7, #0xff000000
+	add	r0, r0, r1
+	add	r0, r0, r12, lsr #8
+
+	pop	{ r4, r5, r6, r7 }
+	bx	lr
+
+L(le2):	cmn	n, #1
+	bne	L(1)
+	ldmia	ap!, { r2, r3 }
+	mov	r12, #0
+	b	L(sum2)
+L(1):	ldr	r2, [ap]
+	bic	r0, r2, #0xff000000
+	add	r0, r0, r2, lsr #24
+	pop	{ r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/mode1o.asm b/gmp/mpn/arm/mode1o.asm
new file mode 100644
index 0000000000..5e0f78fc8f
--- /dev/null
+++ b/gmp/mpn/arm/mode1o.asm
@@ -0,0 +1,92 @@
+dnl  ARM mpn_modexact_1c_odd
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	10
+C Cortex-A15	 9
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	-
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`up', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cy', `r3')
+
+	.protected	binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+	stmfd	sp!, {r4, r5}
+
+	LEA(	r4, binvert_limb_table)
+
+	ldr	r5, [up], #4		C up[0]
+
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, asl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, asl #1	C r4 = inverse
+
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(end)
+
+L(top):	sbcs	cy, r5, cy
+	ldr	r5, [up], #4
+	sub	n, n, #1
+	mul	r12, r4, cy
+	tst	n, n
+	umull	r12, cy, d, r12
+	bne	L(top)
+
+L(end):	sbcs	cy, r5, cy
+	mul	r12, r4, cy
+	umull	r12, r0, d, r12
+	addcc	r0, r0, #1
+
+	ldmfd	sp!, {r4, r5}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/mul_1.asm b/gmp/mpn/arm/mul_1.asm
new file mode 100644
index 0000000000..f7bc1bc386
--- /dev/null
+++ b/gmp/mpn/arm/mul_1.asm
@@ -0,0 +1,94 @@
+dnl  ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+dnl  in a second limb vector.
+dnl  Contributed by Robert Harley.
+
+dnl  Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	6-8
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.75
+C Cortex-A15	 ?
+
+C We should rewrite this along the lines of addmul_1.asm.  That should save a
+C cycle on StrongARM, and several cycles on XScale.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n',`r2')
+define(`vl',`r3')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stmfd	sp!, { r8, r9, lr }
+	ands	r12, n, #1
+	beq	L(skip1)
+	ldr	lr, [up], #4
+	umull	r9, r12, lr, vl
+	str	r9, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	mov	r8, r12
+	ldmia	up!, { r12, lr }
+	mov	r9, #0
+	umlal	r8, r9, r12, vl
+	mov	r12, #0
+	umlal	r9, r12, lr, vl
+	stmia	rp!, { r8, r9 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	stmfd	sp!, { r6, r7 }
+
+L(top):	mov	r6, r12
+	ldmia	up!, { r8, r9, r12, lr }
+	ldr	r7, [rp, #12]			C cache allocate
+	mov	r7, #0
+	umlal	r6, r7, r8, vl
+	mov	r8, #0
+	umlal	r7, r8, r9, vl
+	mov	r9, #0
+	umlal	r8, r9, r12, vl
+	mov	r12, #0
+	umlal	r9, r12, lr, vl
+	subs	n, n, #4
+	stmia	rp!, { r6, r7, r8, r9 }
+	bne	L(top)
+
+	ldmfd	sp!, { r6, r7 }
+
+L(rtn):	mov	r0, r12
+	ldmfd	sp!, { r8, r9, pc }
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/README b/gmp/mpn/arm/neon/README
new file mode 100644
index 0000000000..79e3b48ee6
--- /dev/null
+++ b/gmp/mpn/arm/neon/README
@@ -0,0 +1,2 @@
+This directory contains Neon code which runs and is efficient on all
+ARM CPUs which support Neon.
diff --git a/gmp/mpn/arm/neon/hamdist.asm b/gmp/mpn/arm/neon/hamdist.asm
new file mode 100644
index 0000000000..232089647d
--- /dev/null
+++ b/gmp/mpn/arm/neon/hamdist.asm
@@ -0,0 +1,194 @@
+dnl  ARM Neon mpn_hamdist -- mpn bit hamming distance.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.89
+C Cortex-A15	 0.95
+
+C TODO
+C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
+C    64-bits-at-a-time, which will mess up in big-endian mode.  Except not for
+C    popcount. Except perhaps also for popcount for the edge loads.)
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`bp', r1)
+define(`n',  r2)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
+C (8*2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but which
+C can be represented as a 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+	cmp	n, #chunksize
+	bhi	L(gt16k)
+
+L(lt16k):
+	vmov.i64   q8, #0		C clear summation register
+	vmov.i64   q9, #0		C clear summation register
+
+	tst	   n, #1
+	beq	   L(xxx0)
+	vmov.i64   d0, #0
+	vmov.i64   d20, #0
+	sub	   n, n, #1
+	vld1.32   {d0[0]}, [ap]!	C load 1 limb
+	vld1.32   {d20[0]}, [bp]!	C load 1 limb
+	veor	   d0, d0, d20
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24		C d16/q8 = 0; could just splat
+
+L(xxx0):tst	   n, #2
+	beq	   L(xx00)
+	sub	   n, n, #2
+	vld1.32    {d0}, [ap]!		C load 2 limbs
+	vld1.32    {d20}, [bp]!		C load 2 limbs
+	veor	   d0, d0, d20
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24
+
+L(xx00):tst	   n, #4
+	beq	   L(x000)
+	sub	   n, n, #4
+	vld1.32    {q0}, [ap]!		C load 4 limbs
+	vld1.32    {q10}, [bp]!		C load 4 limbs
+	veor	   q0, q0, q10
+	vcnt.8	   q12, q0
+	vpadal.u8  q8, q12
+
+L(x000):tst	   n, #8
+	beq	   L(0000)
+
+	subs	   n, n, #8
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
+	bls	   L(sum)
+
+L(gt8):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
+	veor	   q0, q0, q10
+	veor	   q1, q1, q11
+	sub	   n, n, #8
+	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	b	   L(mid)
+
+L(0000):subs	   n, n, #16
+	blo	   L(e0)
+
+	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
+	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
+	veor	   q2, q2, q14
+	veor	   q3, q3, q15
+	vcnt.8	   q12, q2
+	vcnt.8	   q13, q3
+	subs	   n, n, #16
+	blo	   L(end)
+
+L(top):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
+	veor	   q0, q0, q10
+	veor	   q1, q1, q11
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q0
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q1
+L(mid):	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
+	veor	   q2, q2, q14
+	veor	   q3, q3, q15
+	subs	   n, n, #16
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q2
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q3
+	bhs	   L(top)
+
+L(end):	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+L(sum):	veor	   q0, q0, q10
+	veor	   q1, q1, q11
+	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+	vadd.i16   q8, q8, q9
+					C we have 8 16-bit counts
+L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
+	vpaddl.u32 q8, q8		C we have 2 64-bit counts
+	vmov.32    r0, d16[0]
+	vmov.32    r1, d17[0]
+	add	   r0, r0, r1
+	bx	lr
+
+C Code for large count.  Splits operand and calls above code.
+define(`ap2', r5)
+define(`bp2', r6)
+L(gt16k):
+	push	{r4,r5,r6,r14}
+	mov	ap2, ap
+	mov	bp2, bp
+	mov	r3, n			C full count
+	mov	r4, #0			C total sum
+
+1:	mov	n, #chunksize		C count for this invocation
+	bl	L(lt16k)		C could jump deep inside code
+	add	ap2, ap2, #chunksize*4	C point at next chunk
+	add	bp2, bp2, #chunksize*4	C point at next chunk
+	add	r4, r4, r0
+	mov	ap, ap2			C put chunk pointer in place for call
+	mov	bp, bp2			C put chunk pointer in place for call
+	sub	r3, r3, #chunksize
+	cmp	r3, #chunksize
+	bhi	1b
+
+	mov	n, r3			C count for final invocation
+	bl	L(lt16k)
+	add	r0, r4, r0
+	pop	{r4,r5,r6,pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/lorrshift.asm b/gmp/mpn/arm/neon/lorrshift.asm
new file mode 100644
index 0000000000..3d6253fd49
--- /dev/null
+++ b/gmp/mpn/arm/neon/lorrshift.asm
@@ -0,0 +1,279 @@
+dnl  ARM Neon mpn_lshift and mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C StrongARM	 -		 -
+C XScale	 -		 -
+C Cortex-A7	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3		 3				Y
+C Cortex-A15	 1.5		 1.5				Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses.  All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands.  Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
+C    which might make it tricky.
+C  * Clean up and simplify.
+C  * Consider sharing most of the code for lshift and rshift, since the feed-in code,
+C    the loop, and most of the wind-down code are identical.
+C  * Replace the basecase code with code using 'extension' registers.
+C  * Optimise.  It is not clear that this loop insn permutation is optimal for
+C    either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp',  `r0')
+define(`ap',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+
+ifdef(`OPERATION_lshift',`
+	define(`IFLSH', `$1')
+	define(`IFRSH', `')
+	define(`X',`0')
+	define(`Y',`1')
+	define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+	define(`IFLSH', `')
+	define(`IFRSH', `$1')
+	define(`X',`1')
+	define(`Y',`0')
+	define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(func)
+IFLSH(`	mov	r12, n, lsl #2	')
+IFLSH(`	add	rp, rp, r12	')
+IFLSH(`	add	ap, ap, r12	')
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)
+
+ifdef(`OPERATION_lshift',`
+	vdup.32	d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	d7, r3
+	mov	r12, #-8')		C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+	rsb	r3, r3, #0		C right shift count is negative
+	vdup.32	d6, r3
+	add	r3, r3, #64		C left shift count is positive
+	vdup.32	d7, r3
+	mov	r12, #8')		C rshift pointer update offset
+
+IFLSH(`	sub	ap, ap, #8	')
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
+IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
+	vshl.u64 d4, d19, d6
+	sub	n, n, #1		C first limb handled
+IFLSH(`	sub	 rp, rp, #4	')
+	vst1.32	 {d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
+	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(`	sub	rp, rp, #8	')
+	subs	n, n, #6
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d5, d19, d6
+	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	 L(mid)
+
+L(2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vld1.32	 {d16}, [ap], r12
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+L(mid):	vld1.32	 {d17}, [ap], r12
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	 n, #1
+	beq	 L(evn)
+
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	b	 L(cj1)
+
+L(evn):	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d16, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+	vorr	 d2, d5, d0
+	b	 L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	 {d17}, [ap], r12
+L(cj1):	veor	 d16, d16, d16
+IFLSH(`	add	 ap, ap, #4	')
+	vld1.32	 {d16[Y]}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+IFLSH(`	add	 rp, rp, #4	')
+	vst1.32	 {d5[Y]}, [rp]
+	vmov.32	 r0, d18[X]
+	bx	lr
+
+L(l2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vshl.u64 d1, d16, d7
+	vshl.u64 d16, d16, d6
+	vorr	 d2, d4, d1
+L(cj2):	vst1.32	 {d2}, [rp:64], r12
+	vst1.32	 {d16}, [rp]
+	vmov.32	 r0, d18[X]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+ifdef(`OPERATION_lshift',`
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 3
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+')
+ifdef(`OPERATION_rshift',`
+	ldr	r4, [ap]
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsr cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+
+L(tp):	ldr	r8, [ap, #4]!
+	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(md):	ldr	r6, [ap, #4]!
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+
+L(ed):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(ed1):	str	r7, [rp], #4
+	mov	r0, r4, lsl tnc
+')
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/lshiftc.asm b/gmp/mpn/arm/neon/lshiftc.asm
new file mode 100644
index 0000000000..9e4096256d
--- /dev/null
+++ b/gmp/mpn/arm/neon/lshiftc.asm
@@ -0,0 +1,257 @@
+dnl  ARM Neon mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C StrongARM	 -		 -
+C XScale	 -		 -
+C Cortex-A7	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3.5		 3.5				Y
+C Cortex-A15	 1.75		 1.75				Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses.  All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands.  Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
+C    which might make it tricky.
+C  * Clean up and simplify.
+C  * Consider sharing most of the code for lshift and rshift, since the feed-in
+C    code, the loop, and most of the wind-down code are identical.
+C  * Replace the basecase code with code using 'extension' registers.
+C  * Optimise.  It is not clear that this loop insn permutation is optimal for
+C    either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp',  `r0')
+define(`ap',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+
+	define(`IFLSH', `$1')
+	define(`IFRSH', `')
+	define(`X',`0')
+	define(`Y',`1')
+	define(`func',`mpn_lshiftc')
+define(`OPERATION_lshiftc',1)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+IFLSH(`	mov	r12, n, lsl #2	')
+IFLSH(`	add	rp, rp, r12	')
+IFLSH(`	add	ap, ap, r12	')
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)
+
+ifdef(`OPERATION_lshiftc',`
+	vdup.32	d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	d7, r3
+	mov	r12, #-8')		C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+	rsb	r3, r3, #0		C right shift count is negative
+	vdup.32	d6, r3
+	add	r3, r3, #64		C left shift count is positive
+	vdup.32	d7, r3
+	mov	r12, #8')		C rshift pointer update offset
+
+IFLSH(`	sub	ap, ap, #8	')
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+	vmvn	 d19, d19
+IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
+IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
+	vshl.u64 d4, d19, d6
+	sub	n, n, #1		C first limb handled
+IFLSH(`	sub	 rp, rp, #4	')
+	vst1.32	 {d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
+	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(`	sub	rp, rp, #8	')
+	subs	n, n, #6
+	vmvn	 d19, d19
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d5, d19, d6
+	vmvn	 d17, d17
+	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	 L(mid)
+
+L(2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vmvn	 d17, d17
+	vld1.32	 {d16}, [ap], r12
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+L(mid):	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	 n, #1
+	beq	 L(evn)
+
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	b	 L(cj1)
+
+L(evn):	vmvn	 d17, d17
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+	vmvn.u8	 d17, #0
+	vorr	 d2, d5, d0
+	vshl.u64 d0, d17, d7
+	vorr	 d3, d4, d0
+	b	 L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	 {d17}, [ap], r12
+L(cj1):	vmov.u8	 d16, #0
+IFLSH(`	add	 ap, ap, #4	')
+	vmvn	 d17, d17
+	vld1.32	 {d16[Y]}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vmvn	 d16, d16
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+IFLSH(`	add	 rp, rp, #4	')
+	vst1.32	 {d5[Y]}, [rp]
+	vmov.32	 r0, d18[X]
+	bx	lr
+
+L(l2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vmvn.u8	 d17, #0
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vorr	 d3, d5, d0
+L(cj2):	vst1.32	 {d2}, [rp:64], r12
+	vst1.32	 {d3}, [rp]
+	vmov.32	 r0, d18[X]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+	mvn	r6, r4
+
+	mov	r7, r6, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	mvn	r8, r8
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	mvn	r6, r6
+	subs	n, n, #2
+	beq	L(ed)			C n = 3
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r8, r8
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r6, r6
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	mvn	r6, #0
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/popcount.asm b/gmp/mpn/arm/neon/popcount.asm
new file mode 100644
index 0000000000..2f8f9afc8d
--- /dev/null
+++ b/gmp/mpn/arm/neon/popcount.asm
@@ -0,0 +1,166 @@
+dnl  ARM Neon mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.125
+C Cortex-A15	 0.56
+
+C TODO
+C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
+C    64-bits-at-a-time, which will mess up in big-endian mode.  Except not for
+C    popcount. Except perhaps also for popcount for the edge loads.)
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`n',  r1)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
+C (8*2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but which
+C can be represented as a 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+	cmp	n, #chunksize
+	bhi	L(gt16k)
+
+L(lt16k):
+	vmov.i64   q8, #0		C clear summation register
+	vmov.i64   q9, #0		C clear summation register
+
+	tst	   n, #1
+	beq	   L(xxx0)
+	vmov.i64   d0, #0
+	sub	   n, n, #1
+	vld1.32   {d0[0]}, [ap]!	C load 1 limb
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24		C d16/q8 = 0; could just splat
+
+L(xxx0):tst	   n, #2
+	beq	   L(xx00)
+	sub	   n, n, #2
+	vld1.32    {d0}, [ap]!		C load 2 limbs
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24
+
+L(xx00):tst	   n, #4
+	beq	   L(x000)
+	sub	   n, n, #4
+	vld1.32    {q0}, [ap]!		C load 4 limbs
+	vcnt.8	   q12, q0
+	vpadal.u8  q8, q12
+
+L(x000):tst	   n, #8
+	beq	   L(0000)
+
+	subs	   n, n, #8
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	bls	   L(sum)
+
+L(gt8):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	sub	   n, n, #8
+	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	b	   L(mid)
+
+L(0000):subs	   n, n, #16
+	blo	   L(e0)
+
+	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vcnt.8	   q12, q2
+	vcnt.8	   q13, q3
+	subs	   n, n, #16
+	blo	   L(end)
+
+L(top):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q0
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q1
+L(mid):	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	subs	   n, n, #16
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q2
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q3
+	bhs	   L(top)
+
+L(end):	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+L(sum):	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+	vadd.i16   q8, q8, q9
+					C we have 8 16-bit counts
+L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
+	vpaddl.u32 q8, q8		C we have 2 64-bit counts
+	vmov.32    r0, d16[0]
+	vmov.32    r1, d17[0]
+	add	   r0, r0, r1
+	bx	lr
+
+C Code for large count.  Splits operand and calls above code.
+define(`ap2', r2)			C caller-saves reg not used above
+L(gt16k):
+	push	{r4,r14}
+	mov	ap2, ap
+	mov	r3, n			C full count
+	mov	r4, #0			C total sum
+
+1:	mov	n, #chunksize		C count for this invocation
+	bl	L(lt16k)		C could jump deep inside code
+	add	ap2, ap2, #chunksize*4	C point at next chunk
+	add	r4, r4, r0
+	mov	ap, ap2			C put chunk pointer in place for call
+	sub	r3, r3, #chunksize
+	cmp	r3, #chunksize
+	bhi	1b
+
+	mov	n, r3			C count for final invocation
+	bl	L(lt16k)
+	add	r0, r4, r0
+	pop	{r4,pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/neon/sec_tabselect.asm b/gmp/mpn/arm/neon/sec_tabselect.asm
new file mode 100644
index 0000000000..69fceb0063
--- /dev/null
+++ b/gmp/mpn/arm/neon/sec_tabselect.asm
@@ -0,0 +1,140 @@
+dnl  ARM Neon mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.15
+C Cortex-A15	 0.65
+
+define(`rp',     `r0')
+define(`tp',     `r1')
+define(`n',      `r2')
+define(`nents',  `r3')
+C define(`which',  on stack)
+
+define(`i',      `r4')
+define(`j',      `r5')
+
+define(`maskq',  `q10')
+define(`maskd',  `d20')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	push	{r4-r5}
+
+	add	  r4, sp, #8
+	vld1.32	  {d30[], d31[]}, [r4]	C 4 `which' copies
+	vmov.i32  q14, #1		C 4 copies of 1
+
+	subs	j, n, #8
+	bmi	L(outer_end)
+
+L(outer_top):
+	mov	  i, nents
+	mov	  r12, tp		C preserve tp
+	veor	  q13, q13, q13		C 4 counter copies
+	veor	  q2, q2, q2
+	veor	  q3, q3, q3
+	ALIGN(16)
+L(top):	vceq.i32  maskq, q13, q15	C compare idx copies to `which' copies
+	vld1.32	  {q0,q1}, [tp]
+	vadd.i32  q13, q13, q14
+	vbit	  q2, q0, maskq
+	vbit	  q3, q1, maskq
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(top)
+	vst1.32	  {q2,q3}, [rp]!
+	add	  tp, r12, #32		C restore tp, point to next slice
+	subs	  j, j, #8
+	bpl	  L(outer_top)
+L(outer_end):
+
+	tst	  n, #4
+	beq	  L(b0xx)
+L(b1xx):mov	  i, nents
+	mov	  r12, tp
+	veor	  q13, q13, q13
+	veor	  q2, q2, q2
+	ALIGN(16)
+L(tp4):	vceq.i32  maskq, q13, q15
+	vld1.32	  {q0}, [tp]
+	vadd.i32  q13, q13, q14
+	vbit	  q2, q0, maskq
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp4)
+	vst1.32	  {q2}, [rp]!
+	add	  tp, r12, #16
+
+L(b0xx):tst	  n, #2
+	beq	  L(b00x)
+L(b01x):mov	  i, nents
+	mov	  r12, tp
+	veor	  d26, d26, d26
+	veor	  d4, d4, d4
+	ALIGN(16)
+L(tp2):	vceq.i32  maskd, d26, d30
+	vld1.32	  {d0}, [tp]
+	vadd.i32  d26, d26, d28
+	vbit	  d4, d0, maskd
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp2)
+	vst1.32	  {d4}, [rp]!
+	add	  tp, r12, #8
+
+L(b00x):tst	  n, #1
+	beq	  L(b000)
+L(b001):mov	  i, nents
+	mov	  r12, tp
+	veor	  d26, d26, d26
+	veor	  d4, d4, d4
+	ALIGN(16)
+L(tp1):	vceq.i32  maskd, d26, d30
+	vld1.32	  {d0[0]}, [tp]
+	vadd.i32  d26, d26, d28
+	vbit	  d4, d0, maskd
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp1)
+	vst1.32	  {d4[0]}, [rp]
+
+L(b000):pop	{r4-r5}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/rsh1aors_n.asm b/gmp/mpn/arm/rsh1aors_n.asm
new file mode 100644
index 0000000000..95c1f79ad9
--- /dev/null
+++ b/gmp/mpn/arm/rsh1aors_n.asm
@@ -0,0 +1,124 @@
+dnl  ARM mpn_rsh1add_n and mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	3.64-3.7
+C Cortex-A15	 2.5
+
+C TODO
+C  * Not optimised.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`RSTCY',	`cmn	$1, $1')
+  define(`func',	mpn_rsh1add_n)
+  define(`func_nc',	mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`RSTCY',
+	`mvn	$2, #0x80000000
+	cmp	$2, $1')
+  define(`func',	mpn_rsh1sub_n)
+  define(`func_nc',	mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r11}
+	ldr	r4, [up], #4
+	ldr	r8, [vp], #4
+	ADDSUB	r4, r4, r8
+	movs	r12, r7, rrx
+	and	r11, r4, #1	C return value
+	subs	n, n, #4
+	blo	L(end)
+
+L(top):	ldmia	up!, {r5,r6,r7}
+	ldmia	vp!, {r8,r9,r10}
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	ADDSUBC	r6, r6, r9
+	ADDSUBC	r7, r7, r10
+	movs	r12, r7, rrx
+	movs	r6, r6, rrx
+	movs	r5, r5, rrx
+	movs	r4, r4, rrx
+	subs	n, n, #3
+	stmia	rp!, {r4,r5,r6}
+	mov	r4, r7
+	bhs	L(top)
+
+L(end):	cmn	n, #2
+	bls	L(e2)
+	ldm	up, {r5,r6}
+	ldm	vp, {r8,r9}
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	ADDSUBC	r6, r6, r9
+	movs	r12, r6, rrx
+	movs	r5, r5, rrx
+	movs	r4, r4, rrx
+	stmia	rp!, {r4,r5}
+	mov	r4, r6
+	b	L(e1)
+
+L(e2):	bne	L(e1)
+	ldr	r5, [up, #0]
+	ldr	r8, [vp, #0]
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	movs	r12, r5, rrx
+	movs	r4, r4, rrx
+	str	r4, [rp], #4
+	mov	r4, r5
+
+L(e1):	RSTCY(	r12, r1)
+	mov	r4, r4, rrx
+	str	r4, [rp, #0]
+	mov	r0, r11
+	pop	{r4-r11}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/rshift.asm b/gmp/mpn/arm/rshift.asm
new file mode 100644
index 0000000000..84728d038a
--- /dev/null
+++ b/gmp/mpn/arm/rshift.asm
@@ -0,0 +1,86 @@
+dnl  ARM mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.5
+C Cortex-A15	 ?
+
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	push	{r4, r6, r7, r8}
+	ldr	r4, [up]
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsr cnt
+	tst	n, #1
+	beq	L(evn)			C n even
+
+L(odd):	subs	n, n, #2
+	bcc	L(1)			C n = 1
+	ldr	r8, [up, #4]!
+	b	L(mid)
+
+L(evn):	ldr	r6, [up, #4]!
+	subs	n, n, #2
+	beq	L(end)
+
+L(top):	ldr	r8, [up, #4]!
+	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(mid):	ldr	r6, [up, #4]!
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+	subs	n, n, #2
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(1):	str	r7, [rp]
+	mov	r0, r4, lsl tnc
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/sec_tabselect.asm b/gmp/mpn/arm/sec_tabselect.asm
new file mode 100644
index 0000000000..8cf937a091
--- /dev/null
+++ b/gmp/mpn/arm/sec_tabselect.asm
@@ -0,0 +1,131 @@
+dnl  ARM mpn_sec_tabselect
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.33
+C Cortex-A15	 2.2
+
+C TODO
+C  * Consider using special code for small nents, either swapping the inner and
+C    outer loops, or providing a few completely unrolling the inner loops.
+
+define(`rp',    `r0')
+define(`tp',    `r1')
+define(`n',     `r2')
+define(`nents', `r3')
+C      which  on stack
+
+define(`i',     `r11')
+define(`j',     `r12')
+define(`c',     `r14')
+define(`mask',  `r7')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	push	{r4-r11, r14}
+
+	subs	j, n, #3
+	bmi	L(outer_end)
+L(outer_top):
+	ldr	c, [sp, #36]
+	mov	i, nents
+	push	{tp}
+
+	mov	r8, #0
+	mov	r9, #0
+	mov	r10, #0
+
+L(top):	subs	c, c, #1
+	ldm	tp, {r4,r5,r6}
+	sbc	mask, mask, mask
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	and	r5, r5, mask
+	and	r6, r6, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	orr	r10, r10, r6
+	bge	L(top)
+
+	stmia	rp!, {r8,r9,r10}
+	pop	{tp}
+	add	tp, tp, #12
+	subs	j, j, #3
+	bpl	L(outer_top)
+L(outer_end):
+
+	cmp	j, #-1
+	bne	L(n2)
+
+	ldr	c, [sp, #36]
+	mov	i, nents
+	mov	r8, #0
+	mov	r9, #0
+L(tp2):	subs	c, c, #1
+	sbc	mask, mask, mask
+	ldm	tp, {r4,r5}
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	and	r5, r5, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	bge	L(tp2)
+	stmia	rp, {r8,r9}
+	pop	{r4-r11, r14}
+	bx	lr
+
+L(n2):	cmp	j, #-2
+	bne	L(n1)
+
+	ldr	c, [sp, #36]
+	mov	i, nents
+	mov	r8, #0
+L(tp1):	subs	c, c, #1
+	sbc	mask, mask, mask
+	ldr	r4, [tp]
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	orr	r8, r8, r4
+	bge	L(tp1)
+	str	r8, [rp]
+L(n1):	pop	{r4-r11, r14}
+	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/udiv.asm b/gmp/mpn/arm/udiv.asm
new file mode 100644
index 0000000000..8d441c74ed
--- /dev/null
+++ b/gmp/mpn/arm/udiv.asm
@@ -0,0 +1,104 @@
+dnl  ARM mpn_udiv_qrnnd -- divide a two limb dividend and a one limb divisor.
+dnl  Return quotient and store remainder through a supplied pointer.
+
+dnl  Copyright 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rem_ptr',`r0')
+define(`n1',`r1')
+define(`n0',`r2')
+define(`d',`r3')
+
+C divstep -- develop one quotient bit.  Dividend in $1$2, divisor in $3.
+C Quotient bit is shifted into $2.
+define(`divstep',
+       `adcs	$2, $2, $2
+	adc	$1, $1, $1
+	cmp	$1, $3
+	subcs	$1, $1, $3')
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	mov	r12, #8			C loop counter for both loops below
+	cmp	d, #0x80000000		C check divisor msb and clear carry
+	bcs	L(_large_divisor)
+
+L(oop):	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	sub	r12, r12, #1
+	teq	r12, #0
+	bne	L(oop)
+
+	str	n1, [rem_ptr]		C store remainder
+	adc	r0, n0, n0		C quotient: add last carry from divstep
+	bx	lr
+
+L(_large_divisor):
+	stmfd	sp!, { r8, lr }
+
+	and	r8, n0, #1		C save lsb of dividend
+	mov	lr, n1, lsl #31
+	orrs	n0, lr, n0, lsr #1	C n0 = lo(n1n0 >> 1)
+	mov	n1, n1, lsr #1		C n1 = hi(n1n0 >> 1)
+
+	and	lr, d, #1		C save lsb of divisor
+	movs	d, d, lsr #1		C d = floor(orig_d / 2)
+	adc	d, d, #0		C d = ceil(orig_d / 2)
+
+L(oop2):
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	sub	r12, r12, #1
+	teq	r12, #0
+	bne	L(oop2)
+
+	adc	n0, n0, n0		C shift and add last carry from divstep
+	add	n1, r8, n1, lsl #1	C shift in omitted dividend lsb
+	tst	lr, lr			C test saved divisor lsb
+	beq	L(_even_divisor)
+
+	rsb	d, lr, d, lsl #1	C restore orig d value
+	adds	n1, n1, n0		C fix remainder for omitted divisor lsb
+	addcs	n0, n0, #1		C adjust quotient if rem. fix carried
+	subcs	n1, n1, d		C adjust remainder accordingly
+	cmp	n1, d			C remainder >= divisor?
+	subcs	n1, n1, d		C adjust remainder
+	addcs	n0, n0, #1		C adjust quotient
+
+L(_even_divisor):
+	str	n1, [rem_ptr]		C store remainder
+	mov	r0, n0			C quotient
+	ldmfd	sp!, { r8, pc }
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/gmp/mpn/arm/v5/gcd_1.asm b/gmp/mpn/arm/v5/gcd_1.asm
new file mode 100644
index 0000000000..169d154bf0
--- /dev/null
+++ b/gmp/mpn/arm/v5/gcd_1.asm
@@ -0,0 +1,120 @@
+dnl  ARM v5 mpn_gcd_1.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjörn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/bit (approx)
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.9
+C Cortex-A15	 ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+C TODO
+C  * Optimise inner-loop better.
+
+C Threshold of when to call bmod when U is one limb.  Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 6)
+
+C INPUT PARAMETERS
+define(`up',    `r0')
+define(`n',     `r1')
+define(`v0',    `r2')
+
+ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
+  `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_1)
+	push	{r4, r7, lr}
+	ldr	r3, [up]	C U low limb
+
+	orr	r3, r3, v0
+	rsb	r4, r3, #0
+	and	r4, r4, r3
+	clz	r4, r4		C min(ctz(u0),ctz(v0))
+	rsb	r4, r4, #31
+
+	rsb	r12, v0, #0
+	and	r12, r12, v0
+	clz	r12, r12
+	rsb	r12, r12, #31
+	mov	v0, v0, lsr r12
+
+	mov	r7, v0
+
+	cmp	n, #1
+	bne	L(nby1)
+
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+	ldr	r3, [up]
+	cmp	v0, r3, lsr #BMOD_THRES_LOG2
+	bhi	L(red1)
+
+L(bmod):mov	r3, #0		C carry argument
+	bl	mpn_modexact_1c_odd
+	b	L(red0)
+
+L(nby1):cmp	n, #BMOD_1_TO_MOD_1_THRESHOLD
+	blo	L(bmod)
+
+	bl	mpn_mod_1
+
+L(red0):mov	r3, r0
+L(red1):rsbs	r12, r3, #0
+	and	r12, r12, r3
+	clz	r12, r12
+	rsb	r12, r12, #31
+	bne	L(mid)
+	b	L(end)
+
+	ALIGN(8)
+L(top):	rsb	r12, r12, #31
+	movcc	r3, r1		C if x-y < 0
+	movcc	r7, r0		C use x,y-x
+L(mid):	mov	r3, r3, lsr r12	C
+	mov	r0, r3		C
+	sub	r1, r7, r3	C
+	rsbs	r3, r7, r3	C
+	and	r12, r1, r3	C
+	clz	r12, r12	C
+	bne	L(top)		C
+
+L(end):	mov	r0, r7, lsl r4
+	pop	{r4, r7, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v5/mod_1_1.asm b/gmp/mpn/arm/v5/mod_1_1.asm
new file mode 100644
index 0000000000..3cf0cd7763
--- /dev/null
+++ b/gmp/mpn/arm/v5/mod_1_1.asm
@@ -0,0 +1,129 @@
+dnl  ARM mpn_mod_1_1p
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 7
+C Cortex-A15	 6
+
+define(`ap', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1_1p)
+	push	{r4-r10}
+	add	r0, r0, r1, asl #2
+	ldr	r5, [r0, #-4]!
+	ldr	r12, [r0, #-4]!
+	subs	r1, r1, #2
+	ble	L(4)
+	ldr	r8, [r3, #12]
+	mov	r4, r12
+	mov	r10, r5
+	umull	r7, r5, r10, r8
+	sub	r1, r1, #1
+	b	L(mid)
+
+L(top):	adds	r12, r6, r7
+	adcs	r10, r4, r5
+	sub	r1, r1, #1
+	mov	r6, #0
+	movcs	r6, r8
+	umull	r7, r5, r10, r8
+	adds	r4, r12, r6
+	subcs	r4, r4, r2
+L(mid):	ldr	r6, [r0, #-4]!
+	teq	r1, #0
+	bne	L(top)
+
+	adds	r12, r6, r7
+	adcs	r5, r4, r5
+	subcs	r5, r5, r2
+L(4):	ldr	r1, [r3, #4]
+	cmp	r1, #0
+	beq	L(7)
+	ldr	r4, [r3, #8]
+	umull	r0, r6, r5, r4
+	adds	r12, r0, r12
+	addcs	r6, r6, #1
+	rsb	r0, r1, #32
+	mov	r0, r12, lsr r0
+	orr	r5, r0, r6, asl r1
+	mov	r12, r12, asl r1
+	b	L(8)
+L(7):	cmp	r5, r2
+	subcs	r5, r5, r2
+L(8):	ldr	r0, [r3, #0]
+	umull	r4, r3, r5, r0
+	add	r5, r5, #1
+	adds	r0, r4, r12
+	adc	r5, r3, r5
+	mul	r5, r2, r5
+	sub	r12, r12, r5
+	cmp	r12, r0
+	addhi	r12, r12, r2
+	cmp	r2, r12
+	subls	r12, r12, r2
+	mov	r0, r12, lsr r1
+	pop	{r4-r10}
+	bx	r14
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+	stmfd	sp!, {r4, r5, r6, r14}
+	mov	r5, r0
+	clz	r4, r1
+	mov	r0, r1, asl r4
+	rsb	r6, r0, #0
+	bl	mpn_invert_limb
+	str	r0, [r5, #0]
+	str	r4, [r5, #4]
+	cmp	r4, #0
+	beq	L(2)
+	rsb	r1, r4, #32
+	mov	r3, #1
+	mov	r3, r3, asl r4
+	orr	r3, r3, r0, lsr r1
+	mul	r3, r6, r3
+	mov	r4, r3, lsr r4
+	str	r4, [r5, #8]
+L(2):	mul	r0, r6, r0
+	str	r0, [r5, #12]
+	ldmfd	sp!, {r4, r5, r6, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v5/mod_1_2.asm b/gmp/mpn/arm/v5/mod_1_2.asm
new file mode 100644
index 0000000000..aa26ecb21c
--- /dev/null
+++ b/gmp/mpn/arm/v5/mod_1_2.asm
@@ -0,0 +1,156 @@
+dnl  ARM mpn_mod_1s_2p
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.25
+C Cortex-A15	 3
+
+define(`ap', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1s_2p)
+	push	{r4-r10}
+	tst	n, #1
+	add	r7, r3, #8
+	ldmia	r7, {r7, r8, r12}	C load B1, B2, B3
+	add	ap, ap, n, lsl #2	C put ap at operand end
+	beq	L(evn)
+
+L(odd):	subs	n, n, #1
+	beq	L(1)
+	ldmdb	ap!, {r4,r6,r9}
+	mov	r10, #0
+	umlal	r4, r10, r6, r7
+	umlal	r4, r10, r9, r8
+	b	L(com)
+
+L(evn):	ldmdb	ap!, {r4,r10}
+L(com):	subs	n, n, #2
+	ble	L(end)
+	ldmdb	ap!, {r5,r6}
+	b	L(mid)
+
+L(top):	mov	r9, #0
+	umlal	r5, r9, r6, r7		C B1
+	umlal	r5, r9, r4, r8		C B2
+	ldmdb	ap!, {r4,r6}
+	umlal	r5, r9, r10, r12	C B3
+	ble	L(xit)
+	mov	r10, #0
+	umlal	r4, r10, r6, r7		C B1
+	umlal	r4, r10, r5, r8		C B2
+	ldmdb	ap!, {r5,r6}
+	umlal	r4, r10, r9, r12	C B3
+L(mid):	subs	n, n, #4
+	bge	L(top)
+
+	mov	r9, #0
+	umlal	r5, r9, r6, r7		C B1
+	umlal	r5, r9, r4, r8		C B2
+	umlal	r5, r9, r10, r12	C B3
+	mov	r4, r5
+
+L(end):	movge	   r9, r10		C executed iff coming via xit
+	ldr	r6, [r3, #4]		C cps[1] = cnt
+	mov	r5, #0
+	umlal	r4, r5, r9, r7
+	mov	r7, r5, lsl r6
+L(x):	rsb	r1, r6, #32
+	orr	r8, r7, r4, lsr r1
+	mov	r9, r4, lsl r6
+	ldr	r5, [r3, #0]
+	add	r0, r8, #1
+	umull	r12, r1, r8, r5
+	adds	r4, r12, r9
+	adc	r1, r1, r0
+	mul	r5, r2, r1
+	sub	r9, r9, r5
+	cmp	r9, r4
+	addhi	r9, r9, r2
+	cmp	r2, r9
+	subls	r9, r9, r2
+	mov	r0, r9, lsr r6
+	pop	{r4-r10}
+	bx	r14
+
+L(xit):	mov	r10, #0
+	umlal	r4, r10, r6, r7		C B1
+	umlal	r4, r10, r5, r8		C B2
+	umlal	r4, r10, r9, r12	C B3
+	b	L(end)
+
+L(1):	ldr	r6, [r3, #4]		C cps[1] = cnt
+	ldr	r4, [ap, #-4]		C ap[0]
+	mov	r7, #0
+	b	L(x)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_2p_cps)
+	push	{r4-r8, r14}
+	clz	r4, r1
+	mov	r5, r1, lsl r4		C b <<= cnt
+	mov	r6, r0			C r6 = cps
+	mov	r0, r5
+	bl	mpn_invert_limb
+	rsb	r3, r4, #32
+	mov	r3, r0, lsr r3
+	mov	r2, #1
+	orr	r3, r3, r2, lsl r4
+	rsb	r1, r5, #0
+	mul	r2, r1, r3
+	umull	r3, r12, r2, r0
+	add	r12, r2, r12
+	mvn	r12, r12
+	mul	r1, r5, r12
+	cmp	r1, r3
+	addhi	r1, r1, r5
+	umull	r12, r7, r1, r0
+	add	r7, r1, r7
+	mvn	r7, r7
+	mul	r3, r5, r7
+	cmp	r3, r12
+	addhi	r3, r3, r5
+	mov	r5, r2, lsr r4
+	mov	r7, r1, lsr r4
+	mov	r8, r3, lsr r4
+	stmia	r6, {r0,r4,r5,r7,r8}	C fill cps
+	pop	{r4-r8, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_1.asm b/gmp/mpn/arm/v6/addmul_1.asm
new file mode 100644
index 0000000000..57019e4b2b
--- /dev/null
+++ b/gmp/mpn/arm/v6/addmul_1.asm
@@ -0,0 +1,111 @@
+dnl  ARM mpn_addmul_1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.25
+C Cortex-A15	 4
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+	ands	r6, n, #3
+	mov	r12, #0
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #4
+	ldr	r7, [rp], #4
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4
+	ldr	r6, [rp], #8
+	subs	n, n, #1
+	beq	L(1)
+	ldr	r5, [up], #4
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #4
+	ldr	r7, [rp], #12
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r6, [rp, #-8]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0
+	ldr	r7, [rp, #-4]
+	ldr	r4, [up], #4
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	ldr	r7, [rp, #4]
+	ldr	r4, [up], #4
+	str	r6, [rp], #16
+L(lo2):	umaal	r7, r12, r5, v0
+	subs	n, n, #4
+	bhi	L(top)
+
+	ldr	r6, [rp, #-8]
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	mov	r0, r12
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_2.asm b/gmp/mpn/arm/v6/addmul_2.asm
new file mode 100644
index 0000000000..69817ce340
--- /dev/null
+++ b/gmp/mpn/arm/v6/addmul_2.asm
@@ -0,0 +1,138 @@
+dnl  ARM mpn_addmul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.25
+C Cortex-A15	 2.5
+
+C This is believed to be optimal for A15 for any unrolling, and optimal for A9
+C for 4-way unrolling.  Using separate pointer update instructions is necessary
+C for optimal A9 speed.
+
+C TODO:
+C  * Start the first multiply or multiplies directly at function entry.
+
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	push	{ r4, r5, r6, r7, r8, r9 }
+
+	ldm	vp, { v0, v1 }
+	mov	cya, #0
+	mov	cyb, #0
+
+	tst	n, #1
+	beq	L(evn)
+
+L(odd):	ldr	r5, [rp, #0]
+	ldr	u0, [up, #0]
+	ldr	r4, [rp, #4]
+	tst	n, #2
+	beq	L(fi1)
+L(fi3):	sub	up, up, #12
+	sub	rp, rp, #12
+	b	L(lo3)
+L(fi1):	sub	n, n, #1
+	sub	up, up, #4
+	sub	rp, rp, #4
+	b	L(lo1)
+
+L(evn):	ldr	r4, [rp, #0]
+	ldr	u1, [up, #0]
+	ldr	r5, [rp, #4]
+	tst	n, #2
+	bne	L(fi2)
+L(fi0):	sub	up, up, #8
+	sub	rp, rp, #8
+	b	L(lo0)
+L(fi2):	subs	n, n, #2
+	bls	L(end)
+
+	ALIGN(16)
+L(top):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #0]
+	ldr	r4, [rp, #8]
+	umaal	r5, cyb, u1, v1
+L(lo1):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #4]
+	ldr	r5, [rp, #12]
+	umaal	r4, cyb, u0, v1
+L(lo0):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #8]
+	ldr	r4, [rp, #16]
+	umaal	r5, cyb, u1, v1
+L(lo3):	ldr	u1, [up, #16]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #12]
+	ldr	r5, [rp, #20]
+	add	rp, rp, #16
+	umaal	r4, cyb, u0, v1
+	add	up, up, #16
+	subs	n, n, #4
+	bhi	L(top)
+
+L(end):	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #4]
+	str	cya, [rp, #8]
+	mov	r0, cyb
+
+	pop	{ r4, r5, r6, r7, r8, r9 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_3.asm b/gmp/mpn/arm/v6/addmul_3.asm
new file mode 100644
index 0000000000..046543020f
--- /dev/null
+++ b/gmp/mpn/arm/v6/addmul_3.asm
@@ -0,0 +1,187 @@
+dnl  ARM mpn_addmul_3.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.125
+C Cortex-A15	 2
+
+C TODO
+C  * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
+C    avoiding the current multiply.
+C  * Start the first multiply or multiplies early.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r4')  define(`v1',`r5')  define(`v2',`r6')
+define(`u0',`r3')  define(`u1',`r14')
+define(`w0',`r7')  define(`w1',`r8')  define(`w2',`r9')
+define(`cy0',`r10')  define(`cy1',`r11') define(`cy2',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+	push	{ r4-r11, r14 }
+
+	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
+	ldm	vp, { v0,v1,v2 }
+	mov	cy0, #0
+	mov	cy1, #0
+	mov	cy2, #0
+
+C Tricky n mod 6
+	mul	w0, w0, n		C n * 3^{-1} mod 2^32
+	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
+	sub	n, n, #3
+ifdef(`PIC',`
+	add	pc, pc, w0, ror $28
+	nop
+	b	L(b0)
+	b	L(b2)
+	b	L(b4)
+	.word	0xe7f000f0	C udf
+	b	L(b3)
+	b	L(b5)
+	b	L(b1)
+',`
+	ldr	pc, [pc, w0, ror $28]
+	nop
+	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
+')
+
+L(b5):	add	up, up, #-8
+	ldr	w1, [rp, #0]
+	ldr	w2, [rp, #4]
+	ldr	u1, [up, #8]
+	b	L(lo5)
+
+L(b4):	add	rp, rp, #-4
+	add	up, up, #-12
+	ldr	w2, [rp, #4]
+	ldr	w0, [rp, #8]
+	ldr	u0, [up, #12]
+	b	L(lo4)
+
+L(b3):	add	rp, rp, #-8
+	add	up, up, #-16
+	ldr	w0, [rp, #8]
+	ldr	w1, [rp, #12]
+	ldr	u1, [up, #16]
+	b	L(lo3)
+
+L(b1):	add	rp, rp, #8
+	ldr	w2, [rp, #-8]
+	ldr	w0, [rp, #-4]
+	ldr	u1, [up, #0]
+	b	L(lo1)
+
+L(b0):	add	rp, rp, #4
+	add	up, up, #-4
+	ldr	w0, [rp, #-4]
+	ldr	w1, [rp, #0]
+	ldr	u0, [up, #4]
+	b	L(lo0)
+
+L(b2):	add	rp, rp, #12
+	add	up, up, #4
+	ldr	w1, [rp, #-12]
+	ldr	w2, [rp, #-8]
+	ldr	u0, [up, #-4]
+
+	ALIGN(16)
+L(top):	ldr	w0, [rp, #-4]
+	umaal	w1, cy0, u0, v0
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1
+	str	w1, [rp, #-12]
+	umaal	w0, cy2, u0, v2
+L(lo1):	ldr	w1, [rp, #0]
+	umaal	w2, cy0, u1, v0
+	ldr	u0, [up, #4]
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	w1, cy2, u1, v2
+L(lo0):	ldr	w2, [rp, #4]
+	umaal	w0, cy0, u0, v0
+	ldr	u1, [up, #8]
+	umaal	w1, cy1, u0, v1
+	str	w0, [rp, #-4]
+	umaal	w2, cy2, u0, v2
+L(lo5):	ldr	w0, [rp, #8]
+	umaal	w1, cy0, u1, v0
+	ldr	u0, [up, #12]
+	umaal	w2, cy1, u1, v1
+	str	w1, [rp, #0]
+	umaal	w0, cy2, u1, v2
+L(lo4):	ldr	w1, [rp, #12]
+	umaal	w2, cy0, u0, v0
+	ldr	u1, [up, #16]
+	umaal	w0, cy1, u0, v1
+	str	w2, [rp, #4]
+	umaal	w1, cy2, u0, v2
+L(lo3):	ldr	w2, [rp, #16]
+	umaal	w0, cy0, u1, v0
+	ldr	u0, [up, #20]
+	umaal	w1, cy1, u1, v1
+	str	w0, [rp, #8]
+	umaal	w2, cy2, u1, v2
+L(lo2):	subs	n, n, #6
+	add	up, up, #24
+	add	rp, rp, #24
+	bge	L(top)
+
+L(end):	umaal	w1, cy0, u0, v0
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1
+	str	w1, [rp, #-12]
+	mov	w0, #0
+	umaal	w0, cy2, u0, v2
+	umaal	w2, cy0, u1, v0
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	cy1, cy2, u1, v2
+	adds	w0, w0, cy0
+	str	w0, [rp, #-4]
+	adcs	w1, cy1, #0
+	str	w1, [rp, #0]
+	adc	r0, cy2, #0
+
+	pop	{ r4-r11, pc }
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/dive_1.asm b/gmp/mpn/arm/v6/dive_1.asm
new file mode 100644
index 0000000000..92de81473f
--- /dev/null
+++ b/gmp/mpn/arm/v6/dive_1.asm
@@ -0,0 +1,149 @@
+dnl  ARM v6 mpn_divexact_1
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb       cycles/limb
+C               norm    unorm    modexact_1c_odd
+C StrongARM	 -	 -
+C XScale	 -	 -
+C Cortex-A7	 ?	 ?
+C Cortex-A8	 ?	 ?
+C Cortex-A9	 9	10		 9
+C Cortex-A15	 7	 7		 7
+
+C Architecture requirements:
+C v5	-
+C v5t	clz
+C v5te	-
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`d',  `r3')
+
+define(`cy',  `r7')
+define(`cnt', `r6')
+define(`tnc', `r10')
+
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+	push	{r4,r5,r6,r7,r8,r9}
+
+	tst	d, #1
+
+	rsb	r4, d, #0
+	and	r4, r4, d
+	clz	r4, r4
+	rsb	cnt, r4, #31		C count_trailing_zeros
+	mov	d, d, lsr cnt
+
+C binvert limb
+	LEA(	r4, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, lsl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, lsl #1	C r4 = inverse
+
+	ldr	r5, [up], #4		C up[0]
+	mov	cy, #0
+	rsb	r8, r4, #0		C r8 = -inverse
+	beq	L(unnorm)
+
+L(norm):
+	subs	n, n, #1
+	mul	r5, r5, r4
+	beq	L(end)
+
+	ALIGN(16)
+L(top):	ldr	r9, [up], #4
+	mov	r12, #0
+	str	r5, [rp], #4
+	umaal	r12, cy, r5, d
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5
+	subs	n, n, #1
+	bne	L(top)
+
+L(end):	str	r5, [rp]
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(unnorm):
+	push	{r10,r11}
+	rsb	tnc, cnt, #32
+	mov	r11, r5, lsr cnt
+	subs	n, n, #1
+	beq	L(edx)
+
+	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc
+	mov	r11, r12, lsr cnt
+	mul	r5, r9, r4
+	subs	n, n, #1
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc
+	mov	r11, r12, lsr cnt
+	mov	r12, #0
+	str	r5, [rp], #4
+	umaal	r12, cy, r5, d
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5
+	subs	n, n, #1
+	bne	L(tpu)
+
+L(edu):	str	r5, [rp], #4
+	mov	r12, #0
+	umaal	r12, cy, r5, d
+	mul	r5, r11, r4
+	mla	r5, cy, r8, r5
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(edx):	mul	r5, r11, r4
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/gmp-mparam.h b/gmp/mpn/arm/v6/gmp-mparam.h
new file mode 100644
index 0000000000..c9c6851769
--- /dev/null
+++ b/gmp/mpn/arm/v6/gmp-mparam.h
@@ -0,0 +1,157 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 700MHz ARM11 (raspberry pi) */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     29
+#define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           33
+
+#define MUL_TOOM22_THRESHOLD                36
+#define MUL_TOOM33_THRESHOLD               117
+#define MUL_TOOM44_THRESHOLD               462
+#define MUL_TOOM6H_THRESHOLD                 0  /* always */
+#define MUL_TOOM8H_THRESHOLD               620
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     130
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     573
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     209
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     209
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     305
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 50
+#define SQR_TOOM3_THRESHOLD                181
+#define SQR_TOOM4_THRESHOLD                686
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                915
+
+#define MULMID_TOOM42_THRESHOLD             72
+
+#define MULMOD_BNM1_THRESHOLD               25
+#define SQRMOD_BNM1_THRESHOLD               30
+
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    476, 5}, {     21, 6}, {     11, 5}, {     25, 6}, \
+    {     13, 5}, {     27, 6}, {     25, 7}, {     13, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     51, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 63
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             464  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    464, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     35, 7}, {     71, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     71, 9}, {     39, 8}, \
+    {     83, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 61
+#define SQR_FFT_THRESHOLD                 3776
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  67
+#define MULLO_MUL_N_THRESHOLD             8907
+
+#define DC_DIV_QR_THRESHOLD                 40
+#define DC_DIVAPPR_Q_THRESHOLD             156
+#define DC_BDIV_QR_THRESHOLD                71
+#define DC_BDIV_Q_THRESHOLD                208
+
+#define INV_MULMOD_BNM1_THRESHOLD           70
+#define INV_NEWTON_THRESHOLD               151
+#define INV_APPR_THRESHOLD                 150
+
+#define BINV_NEWTON_THRESHOLD              375
+#define REDC_1_TO_REDC_2_THRESHOLD           5
+#define REDC_2_TO_REDC_N_THRESHOLD         134
+
+#define MU_DIV_QR_THRESHOLD               2130
+#define MU_DIVAPPR_Q_THRESHOLD            2130
+#define MUPI_DIV_QR_THRESHOLD               80
+#define MU_BDIV_QR_THRESHOLD              1787
+#define MU_BDIV_Q_THRESHOLD               2130
+
+#define POWM_SEC_TABLE  7,32,460,1705
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD_THRESHOLD                      85
+#define HGCD_APPR_THRESHOLD                119
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   333
+#define GCDEXT_DC_THRESHOLD                309
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                21
+#define GET_STR_PRECOMPUTE_THRESHOLD        41
+#define SET_STR_DC_THRESHOLD               527
+#define SET_STR_PRECOMPUTE_THRESHOLD      1323
+
+#define FAC_DSC_THRESHOLD                  414
+#define FAC_ODD_THRESHOLD                  154
diff --git a/gmp/mpn/arm/v6/mode1o.asm b/gmp/mpn/arm/v6/mode1o.asm
new file mode 100644
index 0000000000..a2f77a6bf5
--- /dev/null
+++ b/gmp/mpn/arm/v6/mode1o.asm
@@ -0,0 +1,95 @@
+dnl  ARM v6 mpn_modexact_1c_odd
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 9
+C Cortex-A15	 7
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	smulbb
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`up', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cy', `r3')
+
+	.protected	binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+	stmfd	sp!, {r4, r5, r6, r7}
+
+	LEA(	r4, binvert_limb_table)
+
+	ldr	r6, [up], #4		C up[0]
+
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	smulbb	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, asl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, asl #1	C r4 = inverse
+
+	subs	n, n, #1
+	sub	r6, r6, cy
+	mul	r6, r6, r4
+	beq	L(end)
+
+	rsb	r5, r4, #0		C r5 = -inverse
+
+L(top):	ldr	r7, [up], #4
+	mov	r12, #0
+	umaal	r12, cy, r6, d
+	mul	r6, r7, r4
+	mla	r6, cy, r5, r6
+	subs	n, n, #1
+	bne	L(top)
+
+L(end):	mov	r12, #0
+	umaal	r12, cy, r6, d
+	mov	r0, cy
+
+	ldmfd	sp!, {r4, r5, r6, r7}
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/mul_1.asm b/gmp/mpn/arm/v6/mul_1.asm
new file mode 100644
index 0000000000..0fcc0e46d9
--- /dev/null
+++ b/gmp/mpn/arm/v6/mul_1.asm
@@ -0,0 +1,114 @@
+dnl  ARM mpn_mul_1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.25
+C Cortex-A15	 4
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+	ands	r6, n, #3
+	mov	r12, #0
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #4
+	mov	r6, #0
+	ldr	r5, [up], #4
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #4
+	add	rp, rp, #4
+	mov	r7, #0
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4
+	mov	r6, #0
+	add	rp, rp, #8
+	subs	n, n, #1
+	beq	L(1)
+	ldr	r5, [up], #4
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #4
+	add	rp, rp, #12
+	mov	r7, #0
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	mov	r6, #0
+	ldr	r5, [up], #4
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0
+	mov	r7, #0
+	ldr	r4, [up], #4
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	mov	r6, #0
+	ldr	r5, [up], #4
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	mov	r7, #0
+	ldr	r4, [up], #4
+	str	r6, [rp], #16
+L(lo2):	umaal	r7, r12, r5, v0
+	subs	n, n, #4
+	bhi	L(top)
+
+	mov	r6, #0
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	mov	r0, r12
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/mul_2.asm b/gmp/mpn/arm/v6/mul_2.asm
new file mode 100644
index 0000000000..1679542a3c
--- /dev/null
+++ b/gmp/mpn/arm/v6/mul_2.asm
@@ -0,0 +1,131 @@
+dnl  ARM mpn_mul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.25
+C Cortex-A15	 2.5
+
+C TODO
+C  * This is a trivial edit of the addmul_2 code.  Check for simplifications,
+C    and possible speedups to 2.0 c/l.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+	push	{ r4, r5, r6, r7, r8, r9 }
+
+	ldm	vp, { v0, v1 }
+	mov	cya, #0
+	mov	cyb, #0
+
+	tst	n, #1
+	beq	L(evn)
+L(odd):	mov	r5, #0
+	ldr	u0, [up, #0]
+	mov	r4, #0
+	tst	n, #2
+	beq	L(fi1)
+L(fi3):	sub	up, up, #12
+	sub	rp, rp, #16
+	b	L(lo3)
+L(fi1):	sub	n, n, #1
+	sub	up, up, #4
+	sub	rp, rp, #8
+	b	L(lo1)
+L(evn):	mov	r4, #0
+	ldr	u1, [up, #0]
+	mov	r5, #0
+	tst	n, #2
+	bne	L(fi2)
+L(fi0):	sub	up, up, #8
+	sub	rp, rp, #12
+	b	L(lo0)
+L(fi2):	subs	n, n, #2
+	sub	rp, rp, #4
+	bls	L(end)
+
+	ALIGN(16)
+L(top):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(lo1):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+L(lo0):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(lo3):	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+	subs	n, n, #4
+	bhi	L(top)
+
+L(end):	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	mov	r0, cyb
+
+	pop	{ r4, r5, r6, r7, r8, r9 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/popham.asm b/gmp/mpn/arm/v6/popham.asm
new file mode 100644
index 0000000000..44c8f2361c
--- /dev/null
+++ b/gmp/mpn/arm/v6/popham.asm
@@ -0,0 +1,138 @@
+dnl  ARM mpn_popcount and mpn_hamdist.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		     popcount	      hamdist
+C		    cycles/limb	    cycles/limb
+C StrongARM		 -
+C XScale		 -
+C Cortex-A7		 ?
+C Cortex-A8		 ?
+C Cortex-A9		 8.94		 9.47
+C Cortex-A15		 5.67		 6.44
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	usada8
+C v6t2	-
+C v7a	-
+
+ifdef(`OPERATION_popcount',`
+  define(`func',`mpn_popcount')
+  define(`ap',		`r0')
+  define(`n',		`r1')
+  define(`a0',		`r2')
+  define(`a1',		`r3')
+  define(`s',		`r5')
+  define(`b_01010101',	`r6')
+  define(`b_00110011',	`r7')
+  define(`b_00001111',	`r8')
+  define(`zero',	`r9')
+  define(`POPC',	`$1')
+  define(`HAMD',	`dnl')
+')
+ifdef(`OPERATION_hamdist',`
+  define(`func',`mpn_hamdist')
+  define(`ap',		`r0')
+  define(`bp',		`r1')
+  define(`n',		`r2')
+  define(`a0',		`r6')
+  define(`a1',		`r7')
+  define(`b0',		`r4')
+  define(`b1',		`r5')
+  define(`s',		`r11')
+  define(`b_01010101',	`r8')
+  define(`b_00110011',	`r9')
+  define(`b_00001111',	`r10')
+  define(`zero',	`r3')
+  define(`POPC',	`dnl')
+  define(`HAMD',	`$1')
+')
+
+
+ASM_START()
+PROLOGUE(func)
+POPC(`	push	{ r4-r9 }	')
+HAMD(`	push	{ r4-r11 }	')
+
+	ldr	b_01010101, =0x55555555
+	mov	r12, #0
+	ldr	b_00110011, =0x33333333
+	mov	zero, #0
+	ldr	b_00001111, =0x0f0f0f0f
+
+	tst	n, #1
+	beq	L(evn)
+
+L(odd):	ldr	a1, [ap], #4		C 1 x 32 1-bit accumulators, 0-1
+HAMD(`	ldr	b1, [bp], #4	')	C 1 x 32 1-bit accumulators, 0-1
+HAMD(`	eor	a1, a1, b1	')
+	and	r4, b_01010101, a1, lsr #1
+	sub	a1, a1, r4
+	and	r4, a1, b_00110011
+	bic	r5, a1, b_00110011
+	add	r5, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	subs	n, n, #1
+	b	L(mid)
+
+L(evn):	mov	s, #0
+
+L(top):	ldrd	a0, a1, [ap], #8	C 2 x 32 1-bit accumulators, 0-1
+HAMD(`	ldrd	b0, b1, [bp], #8')
+HAMD(`	eor	a0, a0, b0	')
+HAMD(`	eor	a1, a1, b1	')
+	subs	n, n, #2
+	usada8	r12, s, zero, r12
+	and	r4, b_01010101, a0, lsr #1
+	sub	a0, a0, r4
+	and	r4, b_01010101, a1, lsr #1
+	sub	a1, a1, r4
+	and	r4, a0, b_00110011
+	bic	r5, a0, b_00110011
+	add	a0, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	and	r4, a1, b_00110011
+	bic	r5, a1, b_00110011
+	add	a1, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	add	r5, a0, a1		C 8 4-bit accumulators, 0-8
+L(mid):	and	r4, r5, b_00001111
+	bic	r5, r5, b_00001111
+	add	s, r4, r5, lsr #4	C 4 8-bit accumulators
+	bne	L(top)
+
+	usada8	r0, s, zero, r12
+POPC(`	pop	{ r4-r9 }	')
+HAMD(`	pop	{ r4-r11 }	')
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/sqr_basecase.asm b/gmp/mpn/arm/v6/sqr_basecase.asm
new file mode 100644
index 0000000000..d52970aaa7
--- /dev/null
+++ b/gmp/mpn/arm/v6/sqr_basecase.asm
@@ -0,0 +1,518 @@
+dnl  ARM v6 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Code structure:
+C
+C
+C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
+C           |               |               |               |
+C           |               |               |               |
+C           |               |               |               |
+C          \|/             \|/             \|/             \|/
+C              ____________                   ____________
+C             /            \                 /            \
+C            \|/            \               \|/            \
+C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
+C            \            /|\                \            /|\
+C             \____________/                  \____________/
+C                       \                        /
+C                        \                      /
+C                         \                    /
+C                       tail(0m2)          tail(1m2)
+C                            \              /
+C                             \            /
+C                            sqr_diag_addlsh1
+
+C TODO
+C  * Further tweak counter and updates in outer loops.  (This could save
+C    perhaps 5n cycles).
+C  * Try to use fewer register.  Perhaps coalesce r9 branch target and n_saved.
+C    (This could save 2-3 cycles for n > 4.)
+C  * Optimise sqr_diag_addlsh1 loop.  (This could save O(n) cycles.)
+C  * Implement larger final corners (xit/tix).  Also stop loops earlier
+C    suppressing writes of upper-most rp[] values.  (This could save 10-20
+C    cycles for n > 4.)
+C  * Is the branch table really faster than discrete branches?
+
+define(`rp',      r0)
+define(`up',      r1)
+define(`n',       r2)
+
+define(`v0',      r3)
+define(`v1',      r6)
+define(`i',       r8)
+define(`n_saved', r14)
+define(`cya',     r11)
+define(`cyb',     r12)
+define(`u0',      r7)
+define(`u1',      r9)
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	and	r12, n, #3
+	cmp	n, #4
+	addgt	r12, r12, #4
+	add	pc, pc, r12, lsl #2
+	nop
+	b	L(4)
+	b	L(1)
+	b	L(2)
+	b	L(3)
+	b	L(0m4)
+	b	L(1m4)
+	b	L(2m4)
+	b	L(3m4)
+
+
+L(1m4):	push	{r4-r10,r11,r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_2m4)-.-8
+	ldm	up, {v0,v1,u0}
+	sub	up, up, #4
+	mov	cyb, #0
+	mov	r5, #0
+	umull	r4, cya, v1, v0
+	str	r4, [rp], #-12
+	mov	r4, #0
+	b	L(ko0)
+
+L(3m4):	push	{r4-r10,r11,r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_0m4)-.-8
+	ldm	up, {v0,v1,u0}
+	add	up, up, #4
+	mov	cyb, #0
+	mov	r5, #0
+	umull	r4, cya, v1, v0
+	str	r4, [rp], #-4
+	mov	r4, #0
+	b	L(ko2)
+
+L(2m4):	push	{r4-r10,r11,r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_3m4)-.-8
+	ldm	up, {v0,v1,u1}
+	mov	cyb, #0
+	mov	r4, #0
+	umull	r5, cya, v1, v0
+	str	r5, [rp], #-8
+	mov	r5, #0
+	b	L(ko1)
+
+L(0m4):	push	{r4-r10,r11,r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_1m4)-.-8
+	ldm	up, {v0,v1,u1}
+	mov	cyb, #0
+	mov	r4, #0
+	add	up, up, #8
+	umull	r5, cya, v1, v0
+	str	r5, [rp, #0]
+	mov	r5, #0
+
+L(top):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(ko2):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+L(ko1):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(ko0):	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+	subs	i, i, #4
+	bhi	L(top)
+	bx	r10
+
+L(evnloop):
+	subs	i, n, #4
+	sub	n, n, #2
+	blt	L(tix)
+	ldm	up, {v0,v1,u0}
+	add	up, up, #4
+	mov	cya, #0
+	mov	cyb, #0
+	ldm	rp, {r4,r5}
+	sub	rp, rp, #4
+	umaal	r4, cya, v1, v0
+	str	r4, [rp, #4]
+	ldr	r4, [rp, #12]
+	b	L(lo2)
+L(ua2):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	ldr	r4, [rp, #12]
+	umaal	r5, cyb, u1, v1
+L(lo2):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #16]
+	umaal	r4, cyb, u0, v1
+	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	ldr	r4, [rp, #20]
+	umaal	r5, cyb, u1, v1
+	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	ldr	r5, [rp, #8]
+	umaal	r4, cyb, u0, v1
+	subs	i, i, #4
+	bhi	L(ua2)
+L(am2_0m4):
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	str	cyb, [rp, #16]
+	sub	up, up, n, lsl #2
+	sub	rp, rp, n, lsl #2
+	add	up, up, #8
+	sub	i, n, #4
+	sub	n, n, #2
+	ldm	up, {v0,v1,u0}
+	sub	up, up, #4
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r4, [rp, #24]
+	ldr	r5, [rp, #28]
+	add	rp, rp, #12
+	umaal	r4, cya, v1, v0
+	str	r4, [rp, #12]
+	ldr	r4, [rp, #20]
+	b	L(lo0)
+L(ua0):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	ldr	r4, [rp, #12]
+	umaal	r5, cyb, u1, v1
+	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #16]
+	umaal	r4, cyb, u0, v1
+	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	ldr	r4, [rp, #20]
+	umaal	r5, cyb, u1, v1
+L(lo0):	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	ldr	r5, [rp, #8]
+	umaal	r4, cyb, u0, v1
+	subs	i, i, #4
+	bhi	L(ua0)
+L(am2_2m4):
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	str	cyb, [rp, #16]
+	sub	up, up, n, lsl #2
+	sub	rp, rp, n, lsl #2
+	add	up, up, #8
+	add	rp, rp, #24
+	b	L(evnloop)
+
+
+L(oddloop):
+	subs	i, n, #4
+	sub	n, n, #2
+	blt	L(xit)
+	ldm	up, {v0,v1,u1}
+	mov	cya, #0
+	mov	cyb, #0
+	sub	rp, rp, #8
+	ldr	r5, [rp, #8]
+	ldr	r4, [rp, #12]
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #16]
+	b	L(lo1)
+L(ua1):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	ldr	r4, [rp, #12]
+	umaal	r5, cyb, u1, v1
+	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #16]
+	umaal	r4, cyb, u0, v1
+L(lo1):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	ldr	r4, [rp, #20]
+	umaal	r5, cyb, u1, v1
+	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	ldr	r5, [rp, #8]
+	umaal	r4, cyb, u0, v1
+	subs	i, i, #4
+	bhi	L(ua1)
+L(am2_3m4):
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	str	cyb, [rp, #16]
+	sub	up, up, n, lsl #2
+	sub	rp, rp, n, lsl #2
+	add	up, up, #8
+	add	rp, rp, #24
+	subs	i, n, #4
+	sub	n, n, #2
+	ldm	up, {v0,v1,u1}
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r5, [rp, #0]
+	ldr	r4, [rp, #4]
+	add	up, up, #8
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #0]
+	ldr	r5, [rp, #8]
+	bls	L(e3)
+L(ua3):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	ldr	r4, [rp, #12]
+	umaal	r5, cyb, u1, v1
+	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #16]
+	umaal	r4, cyb, u0, v1
+	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	ldr	r4, [rp, #20]
+	umaal	r5, cyb, u1, v1
+	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	ldr	r5, [rp, #8]
+	umaal	r4, cyb, u0, v1
+	subs	i, i, #4
+	bhi	L(ua3)
+L(e3):
+L(am2_1m4):
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	str	cyb, [rp, #16]
+	sub	up, up, n, lsl #2
+	sub	rp, rp, n, lsl #2
+	add	up, up, #8
+	add	rp, rp, #24
+	b	L(oddloop)
+
+L(xit):	ldm	up!, {v0,u0}
+	ldr	cya, [rp], #12
+	mov	cyb, #0
+	umaal	cya, cyb, u0, v0
+	b	L(sqr_diag_addlsh1)
+
+L(tix):	ldm	up!, {v0,v1,u0}
+	ldm	rp, {r4,r5}
+	mov	cya, #0
+	mov	cyb, #0
+	umaal	r4, cya, v1, v0
+	umaal	r5, cya, u0, v0
+	stm	rp, {r4,r5}
+	umaal	cya, cyb, u0, v1
+	add	rp, rp, #20
+C	b	L(sqr_diag_addlsh1)
+
+
+define(`w0',  r6)
+define(`w1',  r7)
+define(`w2',  r8)
+define(`rbx', r9)
+
+L(sqr_diag_addlsh1):
+	str	cya, [rp, #-12]
+	str	cyb, [rp, #-8]
+	sub	n, n_saved, #1
+	sub	up, up, n_saved, lsl #2
+	sub	rp, rp, n_saved, lsl #3
+	ldr	r3, [up], #4
+	umull	w1, r5, r3, r3
+	mov	w2, #0
+	mov	r10, #0
+C	cmn	r0, #0			C clear cy (already clear by luck)
+	b	L(lm)
+
+L(tsd):	adds	w0, w0, rbx
+	adcs	w1, w1, r4
+	str	w0, [rp, #0]
+L(lm):	ldr	w0, [rp, #4]
+	str	w1, [rp, #4]
+	ldr	w1, [rp, #8]!
+	add	rbx, r5, w2
+	adcs	w0, w0, w0
+	ldr	r3, [up], #4
+	adcs	w1, w1, w1
+	adc	w2, r10, r10
+	umull	r4, r5, r3, r3
+	subs	n, n, #1
+	bne	L(tsd)
+
+	adds	w0, w0, rbx
+	adcs	w1, w1, r4
+	adc	w2, r5, w2
+	stm	rp, {w0,w1,w2}
+
+	pop	{r4-r10,r11,pc}
+
+
+C Straight line code for n <= 4
+
+L(1):	ldr	r3, [up, #0]
+	umull	r1, r2, r3, r3
+	stm	rp, {r1,r2}
+	bx	r14
+
+L(2):	push	{r4-r5}
+	ldm	up, {r5,r12}
+	umull	r1, r2, r5, r5
+	umull	r3, r4, r12, r12
+	umull	r5, r12, r5, r12
+	adds	r5, r5, r5
+	adcs	r12, r12, r12
+	adc	r4, r4, #0
+	adds	r2, r2, r5
+	adcs	r3, r3, r12
+	adc	r4, r4, #0
+	stm	rp, {r1,r2,r3,r4}
+	pop	{r4-r5}
+	bx	r14
+
+L(3):	push	{r4-r11}
+	ldm	up, {r7,r8,r9}
+	umull	r1, r2, r7, r7
+	umull	r3, r4, r8, r8
+	umull	r5, r6, r9, r9
+	umull	r10, r11, r7, r8
+	mov	r12, #0
+	umlal	r11, r12, r7, r9
+	mov	r7, #0
+	umlal	r12, r7, r8, r9
+	adds	r10, r10, r10
+	adcs	r11, r11, r11
+	adcs	r12, r12, r12
+	adcs	r7, r7, r7
+	adc	r6, r6, #0
+	adds	r2, r2, r10
+	adcs	r3, r3, r11
+	adcs	r4, r4, r12
+	adcs	r5, r5, r7
+	adc	r6, r6, #0
+	stm	rp, {r1,r2,r3,r4,r5,r6}
+	pop	{r4-r11}
+	bx	r14
+
+L(4):	push	{r4-r11, r14}
+	ldm	up, {r9,r10,r11,r12}
+	umull	r1, r2, r9, r9
+	umull	r3, r4, r10, r10
+	umull	r5, r6, r11, r11
+	umull	r7, r8, r12, r12
+	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
+	umull	r1, r2, r9, r10
+	mov	r3, #0
+	umlal	r2, r3, r9, r11
+	mov	r4, #0
+	umlal	r3, r4, r9, r12
+	mov	r5, #0
+	umlal	r3, r5, r10, r11
+	umaal	r4, r5, r10, r12
+	mov	r6, #0
+	umlal	r5, r6, r11, r12
+	adds	r1, r1, r1
+	adcs	r2, r2, r2
+	adcs	r3, r3, r3
+	adcs	r4, r4, r4
+	adcs	r5, r5, r5
+	adcs	r6, r6, r6
+	adc	r7, r8, #0
+	add	rp, rp, #4
+	ldm	rp, {r8,r9,r10,r11,r12,r14}
+	adds	r1, r1, r8
+	adcs	r2, r2, r9
+	adcs	r3, r3, r10
+	adcs	r4, r4, r11
+	adcs	r5, r5, r12
+	adcs	r6, r6, r14
+	adc	r7, r7, #0
+	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
+	pop	{r4-r11, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6/submul_1.asm b/gmp/mpn/arm/v6/submul_1.asm
new file mode 100644
index 0000000000..8a21733a0a
--- /dev/null
+++ b/gmp/mpn/arm/v6/submul_1.asm
@@ -0,0 +1,125 @@
+dnl  ARM mpn_submul_1.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.75
+C Cortex-A15	 4.0
+
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n v
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+	ands	r6, n, #3
+	mov	r12, v0
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #12
+	mvn	r4, r4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up, #-8]
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #16
+	mvn	r5, r5
+	ldr	r7, [rp], #4
+	ldr	r4, [up, #-12]
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4
+	mvn	r4, r4
+	ldr	r6, [rp], #8
+	subs	n, n, #1
+	beq	L(1)
+	ldr	r5, [up]
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #8
+	mvn	r5, r5
+	ldr	r7, [rp], #12
+	ldr	r4, [up, #-4]
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r6, [rp, #-8]
+	ldr	r5, [up]
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0
+	add	up, up, #16
+	mvn	r5, r5
+	ldr	r7, [rp, #-4]
+	ldr	r4, [up, #-12]
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up, #-8]
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	mvn	r5, r5
+	ldr	r7, [rp, #4]
+	ldr	r4, [up, #-4]
+	str	r6, [rp], #16
+L(lo2):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
+	subs	n, n, #4
+	bhi	L(top)
+
+	ldr	r6, [rp, #-8]
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	sub	r0, v0, r12
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6t2/divrem_1.asm b/gmp/mpn/arm/v6t2/divrem_1.asm
new file mode 100644
index 0000000000..be24615acb
--- /dev/null
+++ b/gmp/mpn/arm/v6t2/divrem_1.asm
@@ -0,0 +1,212 @@
+dnl  ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		norm	unorm	frac
+C StrongARM	 -	 -	 -
+C XScale	 -	 -	 -
+C Cortex-A7	 ?	 ?	 ?
+C Cortex-A8	 ?	 ?	 ?
+C Cortex-A9	13	14	13
+C Cortex-A15	11.4	11.8	11.1
+
+C TODO
+C  * Optimise inner-loops better, they could likely run a cycle or two faster.
+C  * Decrease register usage, streamline non-loop code.
+
+define(`qp_arg',  `r0')
+define(`fn',      `r1')
+define(`up_arg',  `r2')
+define(`n_arg',   `r3')
+define(`d_arg',   `0')
+define(`dinv_arg',`4')
+define(`cnt_arg', `8')
+
+define(`n',       `r9')
+define(`qp',      `r5')
+define(`up',      `r6')
+define(`cnt',     `r7')
+define(`tnc',     `r10')
+define(`dinv',    `r0')
+define(`d',       `r4')
+
+ASM_START()
+PROLOGUE(mpn_preinv_divrem_1)
+	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+	ldr	d,    [sp, #9*4+d_arg]
+	ldr	cnt,  [sp, #9*4+cnt_arg]
+	str	r1, [sp, #9*4+d_arg]	C reuse d stack slot for fn
+	sub	n, r3, #1
+	add	r3, r1, n
+	cmp	d, #0
+	add	qp, qp_arg, r3, lsl #2	C put qp at Q[] end
+	add	up, up_arg, n, lsl #2	C put up at U[] end
+	ldr	dinv, [sp, #9*4+dinv_arg]
+	blt	L(nent)
+	b	L(uent)
+EPILOGUE()
+
+PROLOGUE(mpn_divrem_1)
+	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+	sub	n, r3, #1
+	ldr	d, [sp, #9*4+d_arg]	C d
+	str	r1, [sp, #9*4+d_arg]	C reuse d stack slot for fn
+	add	r3, r1, n
+	cmp	d, #0
+	add	qp, qp_arg, r3, lsl #2	C put qp at Q[] end
+	add	up, up_arg, n, lsl #2	C put up at U[] end
+	blt	L(normalised)
+
+L(unnorm):
+	clz	cnt, d
+	mov	r0, d, lsl cnt		C pass d << cnt
+	bl	mpn_invert_limb
+L(uent):
+	mov	d, d, lsl cnt		C d <<= cnt
+	cmp	n, #0
+	mov	r1, #0			C r
+	blt	L(frac)
+
+	ldr	r11, [up, #0]
+
+	rsb	tnc, cnt, #32
+	mov	r1, r11, lsr tnc
+	mov	r11, r11, lsl cnt
+	beq	L(uend)
+
+	ldr	r3, [up, #-4]!
+	orr	r2, r11, r3, lsr tnc
+	b	L(mid)
+
+L(utop):
+	mls	r1, d, r8, r11
+	mov	r11, r3, lsl cnt
+	ldr	r3, [up, #-4]!
+	cmp	r1, r2
+	addhi	r1, r1, d
+	subhi	r8, r8, #1
+	orr	r2, r11, r3, lsr tnc
+	cmp	r1, d
+	bcs	L(ufx)
+L(uok):	str	r8, [qp], #-4
+L(mid):	add	r8, r1, #1
+	mov	r11, r2
+	umlal	r2, r8, r1, dinv
+	subs	n, n, #1
+	bne	L(utop)
+
+	mls	r1, d, r8, r11
+	mov	r11, r3, lsl cnt
+	cmp	r1, r2
+	addhi	r1, r1, d
+	subhi	r8, r8, #1
+	cmp	r1, d
+	rsbcs	r1, d, r1
+	addcs	r8, r8, #1
+	str	r8, [qp], #-4
+
+L(uend):add	r8, r1, #1
+	mov	r2, r11
+	umlal	r2, r8, r1, dinv
+	mls	r1, d, r8, r11
+	cmp	r1, r2
+	addhi	r1, r1, d
+	subhi	r8, r8, #1
+	cmp	r1, d
+	rsbcs	r1, d, r1
+	addcs	r8, r8, #1
+	str	r8, [qp], #-4
+L(frac):
+	ldr	r2, [sp, #9*4+d_arg]	C fn
+	cmp	r2, #0
+	beq	L(fend)
+
+L(ftop):mov	r6, #0
+	add	r3, r1, #1
+	umlal	r6, r3, r1, dinv
+	mov	r8, #0
+	mls	r1, d, r3, r8
+	cmp	r1, r6
+	addhi	r1, r1, d
+	subhi	r3, r3, #1
+	subs	r2, r2, #1
+	str	r3, [qp], #-4
+	bne	L(ftop)
+
+L(fend):mov	r11, r1, lsr cnt
+L(rtn):	mov	r0, r11
+	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+L(normalised):
+	mov	r0, d
+	bl	mpn_invert_limb
+L(nent):
+	cmp	n, #0
+	mov	r11, #0			C r
+	blt	L(nend)
+
+	ldr	r11, [up, #0]
+	cmp	r11, d
+	movlo	r2, #0			C hi q limb
+	movhs	r2, #1			C hi q limb
+	subhs	r11, r11, d
+
+	str	r2, [qp], #-4
+	cmp	n, #0
+	beq	L(nend)
+
+L(ntop):ldr	r1, [up, #-4]!
+	add	r12, r11, #1
+	umlal	r1, r12, r11, dinv
+	ldr	r3, [up, #0]
+	mls	r11, d, r12, r3
+	cmp	r11, r1
+	addhi	r11, r11, d
+	subhi	r12, r12, #1
+	cmp	d, r11
+	bls	L(nfx)
+L(nok):	str	r12, [qp], #-4
+	subs	n, n, #1
+	bne	L(ntop)
+
+L(nend):mov	r1, r11			C r
+	mov	cnt, #0			C shift cnt
+	b	L(frac)
+
+L(nfx):	add	r12, r12, #1
+	rsb	r11, d, r11
+	b	L(nok)
+L(ufx):	rsb	r1, d, r1
+	add	r8, r8, #1
+	b	L(uok)
+EPILOGUE()
diff --git a/gmp/mpn/arm/v6t2/gcd_1.asm b/gmp/mpn/arm/v6t2/gcd_1.asm
new file mode 100644
index 0000000000..2063647963
--- /dev/null
+++ b/gmp/mpn/arm/v6t2/gcd_1.asm
@@ -0,0 +1,115 @@
+dnl  ARM v6t2 mpn_gcd_1.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjörn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/bit (approx)
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.3
+C Cortex-A15	 3.5
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+C TODO
+C  * Optimise inner-loop better.
+C  * Push saving/restoring of callee-user regs into call code
+
+C Threshold of when to call bmod when U is one limb.  Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 7)
+
+C INPUT PARAMETERS
+define(`up',    `r0')
+define(`n',     `r1')
+define(`v0',    `r2')
+
+ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
+  `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_1)
+	push	{r4, r7, lr}
+	ldr	r3, [up]	C U low limb
+
+	orr	r3, r3, v0
+	rbit	r4, r3
+	clz	r4, r4		C min(ctz(u0),ctz(v0))
+
+	rbit	r12, v0
+	clz	r12, r12
+	mov	v0, v0, lsr r12
+
+	mov	r7, v0
+
+	cmp	n, #1
+	bne	L(nby1)
+
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+	ldr	r3, [up]
+	cmp	v0, r3, lsr #BMOD_THRES_LOG2
+	bhi	L(red1)
+
+L(bmod):mov	r3, #0		C carry argument
+	bl	mpn_modexact_1c_odd
+	b	L(red0)
+
+L(nby1):cmp	n, #BMOD_1_TO_MOD_1_THRESHOLD
+	blo	L(bmod)
+
+	bl	mpn_mod_1
+
+L(red0):mov	r3, r0
+L(red1):cmp	r3, #0
+	rbit	r12, r3
+	clz	r12, r12
+	bne	L(mid)
+	b	L(end)
+
+	ALIGN(8)
+L(top):	movcs	r3, r1		C if x-y < 0
+	movcs	r7, r0		C use x,y-x
+L(mid):	mov	r3, r3, lsr r12	C
+	mov	r0, r3		C
+	subs	r1, r7, r3	C
+	rsb	r3, r7, r3	C
+	rbit	r12, r1
+	clz	r12, r12	C
+	bne	L(top)		C
+
+L(end):	mov	r0, r7, lsl r4
+	pop	{r4, r7, pc}
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/addmul_1.asm b/gmp/mpn/arm/v7a/cora15/addmul_1.asm
new file mode 100644
index 0000000000..c2277b32b2
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/addmul_1.asm
@@ -0,0 +1,145 @@
+dnl  ARM mpn_addmul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 6			3.25
+C Cortex-A15	 2			this
+
+C This code uses umlal for adding in the rp[] data, keeping the recurrency path
+C separate from any multiply instructions.  It performs well on A15, at umlal's
+C bandwidth.
+C
+C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
+C for all loads and stores.  Alternatively, it could do 2-way or 4-way, but
+C then alignment aware code will be necessary (adding O(1) bookkeeping
+C overhead).
+C
+C We don't use r12 due to ldrd and strd limitations.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8')  define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	push	{ r4-r11 }
+
+	ands	r6, n, #3
+	sub	n, n, #3
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	mov	r6, #0
+	cmn	r13, #0			C carry clear
+	ldr	u1, [up], #-4
+	ldr	w1, [rp], #-4
+	mov	r7, #0
+	b	L(mid)
+
+L(b00):	ldrd	u0, u1, [up]
+	ldrd	w0, w1, [rp]
+	mov	r6, #0
+	umlal	w0, r6, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r7, #0
+	str	w0, [rp]
+	b	L(mid)
+
+L(b10):	ldrd	u0, u1, [up], #8
+	ldrd	w0, w1, [rp]
+	mov	r4, #0
+	umlal	w0, r4, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r5, #0
+	str	w0, [rp], #8
+	umlal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+	b	L(top)
+
+L(b01):	mov	r4, #0
+	ldr	u1, [up], #4
+	ldr	w1, [rp], #4
+	mov	r5, #0
+	umlal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+
+	ALIGN(16)
+L(top):	ldrd	u0, u1, [up, #0]
+	adcs	r4, r4, w1
+	ldrd	w0, w1, [rp, #0]
+	mov	r6, #0
+	umlal	w0, r6, u0, v0		C 1 2
+	adcs	r5, r5, w0
+	mov	r7, #0
+	strd	r4, r5, [rp, #-4]
+L(mid):	umlal	w1, r7, u1, v0		C 2 3
+	ldrd	u0, u1, [up, #8]
+	adcs	r6, r6, w1
+	ldrd	w0, w1, [rp, #8]
+	mov	r4, #0
+	umlal	w0, r4, u0, v0		C 3 4
+	adcs	r7, r7, w0
+	mov	r5, #0
+	strd	r6, r7, [rp, #4]
+	umlal	w1, r5, u1, v0		C 0 1
+	sub	n, n, #4
+	add	up, up, #16
+	add	rp, rp, #16
+	tst	n, n
+	bpl	L(top)
+
+L(end):	adcs	r4, r4, w1
+	str	r4, [rp, #-4]
+	adc	r0, r5, #0
+	pop	{ r4-r11 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/aors_n.asm b/gmp/mpn/arm/v7a/cora15/aors_n.asm
new file mode 100644
index 0000000000..dc3f83992e
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/aors_n.asm
@@ -0,0 +1,162 @@
+dnl  ARM mpn_add_n/mpn_sub_n optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.55			2.5
+C Cortex-A15	 1.27			this
+
+C This was a major improvement compared to the code we had before, but it might
+C not be the best 8-way code possible.  We've tried some permutations of auto-
+C increments and separate pointer updates, but they all ran at the same speed
+C on A15.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_add_n', `
+  define(`ADDSUBC',	adcs)
+  define(`IFADD',	`$1')
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`RETVAL2',	`adc	r0, n, #1')
+  define(`func',	mpn_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(`ADDSUBC',	sbcs)
+  define(`IFADD',	`')
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`RETVAL2',	`RETVAL')
+  define(`func',	mpn_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	ldr	r12, [sp]
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(func)
+	mov	r12, #0
+L(ent):	push	{ r4-r9 }
+
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	SETCY(	r12)
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	SETCY(	r12)
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	SETCY(	r12)
+	ADDSUBC	r9, r5, r7
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	SETCY(	r12)
+	sub	rp, rp, #8
+	b	L(lo)
+
+	ALIGN(16)
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #16]
+	ldrd	r6, r7, [vp, #16]
+	strd	r8, r9, [rp, #16]
+	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	ldrd	r6, r7, [vp, #24]
+	strd	r8, r9, [rp, #24]
+	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #32]!
+	ldrd	r6, r7, [vp, #32]!
+	strd	r8, r9, [rp, #32]!
+L(lo):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	tst	n, n
+	bne	L(top)
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	RETVAL
+	pop	{ r4-r9 }
+	bx	r14
+L(dne):	strd	r8, r9, [rp, #24]
+	RETVAL2
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
new file mode 100644
index 0000000000..b9e5cd3f79
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
@@ -0,0 +1,158 @@
+dnl  ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.75			 3
+C Cortex-A15	 1.78			this
+
+C This code does not run as well as one could have hoped, since 1.5 c/l seems
+C realistic for this insn mix.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`cnd',`r0')
+define(`rp', `r1')
+define(`up', `r2')
+define(`vp', `r3')
+define(`n',  `r12')
+
+ifdef(`OPERATION_cnd_add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`IFADD',	`$1')
+  define(`INITCY',      `cmn	r0, #0')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`RETVAL2',	`adc	r0, n, #1')
+  define(`func',	mpn_cnd_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_cnd_sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`IFADD',	`')
+  define(`INITCY',      `cmp	r0, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`RETVAL2',	`RETVAL')
+  define(`func',	mpn_cnd_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	ldr	n, [sp]
+	push	{ r4-r9 }
+
+	cmp	cnd, #1
+	sbc	cnd, cnd, cnd		C conditionally set to 0xffffffff
+
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	bic	r7, r7, cnd
+	ADDSUB	r9, r5, r7
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	INITCY
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	bic	r7, r7, cnd
+	ADDSUB	r9, r5, r7
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	INITCY
+	sub	rp, rp, #8
+	b	L(lo)
+
+	ALIGN(16)
+L(top):	ldrd	r6, r7, [vp, #8]
+	ldrd	r4, r5, [up, #8]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	strd	r8, r9, [rp, #8]
+L(mid):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r6, r7, [vp, #16]!
+	ldrd	r4, r5, [up, #16]!
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	sub	n, n, #1
+	strd	r8, r9, [rp, #16]!
+L(lo):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	tst	n, n
+	bne	L(top)
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	RETVAL
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/com.asm b/gmp/mpn/arm/v7a/cora15/com.asm
new file mode 100644
index 0000000000..a258afe934
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/com.asm
@@ -0,0 +1,180 @@
+dnl  ARM mpn_com optimised for A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	2.5
+C Cortex-A15	1.0
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
+define(`UNROLL', 4x2)		C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	push	{ r4-r5,r8-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00a)
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+	tst	r12, #2
+	beq	L(b00)
+L(bx0):	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+L(b00a):ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+	and	r12, n, #3
+	mov	n, n, lsr #2
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+L(bx0):	tst	r12, #2
+	beq	L(b00)
+	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r12, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #0]
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	mvn	r9, r5
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	sub	rp, rp, #8
+	b	L(lo)
+')
+	ALIGN(16)
+ifelse(UNROLL,4,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]!
+	strd	r8, r9, [rp, #16]!
+	sub	n, n, #1
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]
+	strd	r8, r9, [rp, #16]
+	mvn	r8, r4
+	mvn	r9, r5
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	strd	r8, r9, [rp, #24]
+	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #32]!
+	strd	r8, r9, [rp, #32]!
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	pop	{ r4-r5,r8-r9 }
+	bx	r14
+ifelse(UNROLL,4x2,`
+L(dne):	strd	r8, r9, [rp, #24]
+	pop	{ r4-r5,r8-r9 }
+	bx	r14
+')
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
new file mode 100644
index 0000000000..2a06532b3e
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
@@ -0,0 +1,197 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1700MHz Cortex-A15 with Neon (in spite of file position) */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           15
+
+#define MUL_TOOM22_THRESHOLD                23
+#define MUL_TOOM33_THRESHOLD                90
+#define MUL_TOOM44_THRESHOLD               262
+#define MUL_TOOM6H_THRESHOLD               351
+#define MUL_TOOM8H_THRESHOLD               557
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      90
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     160
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     169
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 43
+#define SQR_TOOM3_THRESHOLD                138
+#define SQR_TOOM4_THRESHOLD                363
+#define SQR_TOOM6_THRESHOLD                517
+#define SQR_TOOM8_THRESHOLD                725
+
+#define MULMID_TOOM42_THRESHOLD             52
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             550  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    550, 5}, {     25, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 6}, {     39, 7}, {     25, 6}, \
+    {     51, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     51, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     99, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {    103,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95, 9}, {    191,10}, {    111,11}, {     63,10}, \
+    {    159,11}, {     95,10}, {    191, 9}, {    383,10}, \
+    {    207,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    831,12}, {    447,11}, {    895,13}, \
+    {    255,12}, {    511,11}, {   1023,12}, {    575,11}, \
+    {   1151,12}, {    639,11}, {   1279,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1663,12}, {    895,13}, {    511,12}, {   1087,13}, \
+    {    639,12}, {   1407,13}, {    767,12}, {   1599,13}, \
+    {    895,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1279,14}, {    767,13}, \
+    {   1535,12}, {   3071,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1279,13}, {   2559,12}, {   5119,13}, \
+    {   2815,12}, {   5631,13}, {   2943,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 137
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             525  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    525, 5}, {     25, 6}, {     27, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     25, 6}, {     51, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     51, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     39, 9}, \
+    {     23, 8}, {     55,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,11}, {    351,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    831,12}, {    447,11}, {    895,12}, {    511,11}, \
+    {   1023,12}, {    575,11}, {   1151,12}, {    639,11}, \
+    {   1343,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1663,12}, {    895,13}, \
+    {    511,12}, {   1087,13}, {    639,12}, {   1407,13}, \
+    {    767,12}, {   1727,13}, {    895,14}, {    511,13}, \
+    {   1023,12}, {   2047,13}, {   1151,12}, {   2431,13}, \
+    {   1279,14}, {    767,13}, {   1535,12}, {   3071,15}, \
+    {    511,14}, {   1023,13}, {   2047,12}, {   4095,13}, \
+    {   2175,14}, {   1279,13}, {   2559,12}, {   5119,13}, \
+    {   2687,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 139
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             9
+#define MULLO_DC_THRESHOLD                  39
+#define MULLO_MUL_N_THRESHOLD            11278
+
+#define DC_DIV_QR_THRESHOLD                 54
+#define DC_DIVAPPR_Q_THRESHOLD             296
+#define DC_BDIV_QR_THRESHOLD                52
+#define DC_BDIV_Q_THRESHOLD                300
+
+#define INV_MULMOD_BNM1_THRESHOLD           44
+#define INV_NEWTON_THRESHOLD               294
+#define INV_APPR_THRESHOLD                 294
+
+#define BINV_NEWTON_THRESHOLD              375
+#define REDC_1_TO_REDC_2_THRESHOLD         102
+#define REDC_2_TO_REDC_N_THRESHOLD           0  /* always */
+
+#define MU_DIV_QR_THRESHOLD               1718
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              108
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               1718
+
+#define POWM_SEC_TABLE  3,32,70,416,1464
+
+#define MATRIX22_STRASSEN_THRESHOLD         22
+#define HGCD_THRESHOLD                     152
+#define HGCD_APPR_THRESHOLD                230
+#define HGCD_REDUCE_THRESHOLD             3259
+#define GCD_DC_THRESHOLD                   702
+#define GCDEXT_DC_THRESHOLD                538
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                18
+#define GET_STR_PRECOMPUTE_THRESHOLD        32
+#define SET_STR_DC_THRESHOLD               119
+#define SET_STR_PRECOMPUTE_THRESHOLD      1063
+
+#define FAC_DSC_THRESHOLD                  262
+#define FAC_ODD_THRESHOLD                   26
diff --git a/gmp/mpn/arm/v7a/cora15/logops_n.asm b/gmp/mpn/arm/v7a/cora15/logops_n.asm
new file mode 100644
index 0000000000..06026143e1
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/logops_n.asm
@@ -0,0 +1,253 @@
+dnl  ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc, optimised for A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb             cycles/limb
+C          and andn ior xor         nand iorn nior xnor
+C StrongARM	 ?			 ?
+C XScale	 ?			 ?
+C Cortex-A7	 ?			 ?
+C Cortex-A8	 ?			 ?
+C Cortex-A9	3.5			3.56
+C Cortex-A15	1.27			1.64
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
+define(`UNROLL', 4x2)		C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+define(`POSTOP')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',    `mpn_and_n')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',    `mpn_andn_n')
+  define(`LOGOP',   `bic	$1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',    `mpn_nand_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',    `mpn_ior_n')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',    `mpn_iorn_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `bic	$1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',    `mpn_nior_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',    `mpn_xor_n')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',    `mpn_xnor_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{ r4-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00a)
+	tst	r6, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #4
+	tst	r6, #2
+	beq	L(b00)
+L(bx0):	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+L(b00a):ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+	and	r6, n, #3
+	mov	n, n, lsr #2
+	tst	r6, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #4
+L(bx0):	tst	r6, #2
+	beq	L(b00)
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	POSTOP(	r9)
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	sub	rp, rp, #8
+	b	L(lo)
+')
+	ALIGN(16)
+ifelse(UNROLL,4,`
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(mid):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #16]!
+	ldrd	r6, r7, [vp, #16]!
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #16]!
+	sub	n, n, #1
+L(lo):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	tst	n, n
+	bne	L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(mid):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #16]
+	ldrd	r6, r7, [vp, #16]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #16]
+	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	ldrd	r6, r7, [vp, #24]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #24]
+	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #32]!
+	ldrd	r6, r7, [vp, #32]!
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #32]!
+L(lo):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	tst	n, n
+	bne	L(top)
+')
+
+L(end):	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(wd1):	pop	{ r4-r9 }
+	bx	r14
+ifelse(UNROLL,4x2,`
+L(dne):	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #24]
+	pop	{ r4-r9 }
+	bx	r14
+')
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/mul_1.asm b/gmp/mpn/arm/v7a/cora15/mul_1.asm
new file mode 100644
index 0000000000..766ba5c57f
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/mul_1.asm
@@ -0,0 +1,104 @@
+dnl  ARM mpn_mul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25			3.25
+C Cortex-A15	 2.25			this
+
+
+C This runs well on A15 but very poorly on A9.  By scheduling loads and adds
+C it is possible to get good A9 performance as well, but at the cost of using
+C many more (callee-saves) registers.
+
+C This is armv5 code, optimized for the armv7a cpu A15.  Its location in the
+C GMP file structure might be misleading.
+
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	ldr	r12, [sp]
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+	mov	r12, #0
+L(ent):	push	{r4-r7}
+
+	ldr	r6, [up], #4
+	tst	n, #1
+	beq	L(bx0)
+
+L(bx1):	umull	r4, r7, r6, v0
+	adds	r4, r4, r12
+	tst	n, #2
+	beq	L(lo1)
+	b	L(lo3)
+
+L(bx0):	umull	r4, r5, r6, v0
+	adds	r4, r4, r12
+	tst	n, #2
+	beq	L(lo0)
+	b	L(lo2)
+
+L(top):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r5, r6, v0
+	adds	r4, r4, r7
+L(lo0):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r7, r6, v0
+	adcs	r4, r4, r5
+L(lo3):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r5, r6, v0
+	adcs	r4, r4, r7
+L(lo2):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r7, r6, v0
+	adcs	r4, r4, r5
+L(lo1):	adc	r7, r7, #0
+	subs	n, n, #4
+	bgt	L(top)
+
+	str	r4, [rp]
+	mov	r0, r7
+	pop	{r4-r7}
+	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
new file mode 100644
index 0000000000..d8cfe3f78f
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH,		1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
new file mode 100644
index 0000000000..b48204d926
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH,		2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
new file mode 100644
index 0000000000..16c34a2699
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
@@ -0,0 +1,144 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25
+C Cortex-A15	 2.25
+
+C TODO
+C  * Consider using 4-way feed-in code.
+C  * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C    insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`DO_add', `
+  define(`ADCSBCS',	`adcs	$1, $2, $3')
+  define(`CLRCY',	`cmn	r13, #1')
+  define(`RETVAL',	`adc	r0, $1, #0')
+  define(`func',	mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADCSBCS',	`sbcs	$1, $2, $3')
+  define(`CLRCY',	`cmp	r13, #0')
+  define(`RETVAL',	`sbc	$2, $2, $2
+			cmn	$2, #1
+			adc	 r0, $1, #0')
+  define(`func',	mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADCSBCS',	`sbcs	$1, $3, $2')
+  define(`CLRCY',	`cmp	r13, #0')
+  define(`RETVAL',	`sbc	r0, $1, #0')
+  define(`func',	mpn_rsblsh`'LSH`'_n)')
+
+
+ASM_START()
+PROLOGUE(func)
+	push	 {r4-r10}
+	vmov.i8	 d0, #0			C could feed carry through here
+	CLRCY
+	tst	n, #1
+	beq	L(bb0)
+
+L(bb1):	vld1.32	 {d3[0]}, [vp]!
+	vsli.u32 d0, d3, #LSH
+	ldr	 r12, [up], #4
+	vmov.32	 r5, d0[0]
+	vshr.u32 d0, d3, #32-LSH
+	ADCSBCS( r12, r12, r5)
+	str	 r12, [rp], #4
+	bics	 n, n, #1
+	beq	 L(rtn)
+
+L(bb0):	tst	n, #2
+	beq	L(b00)
+
+L(b10):	vld1.32	 {d3}, [vp]!
+	vsli.u64 d0, d3, #LSH
+	ldmia	 up!, {r10,r12}
+	vmov	 r4, r5, d0
+	vshr.u64 d0, d3, #64-LSH
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r10,r12}
+	bics	 n, n, #2
+	beq	 L(rtn)
+
+L(b00):	vld1.32	 {d2}, [vp]!
+	vsli.u64 d0, d2, #LSH
+	vshr.u64 d1, d2, #64-LSH
+	vld1.32	 {d3}, [vp]!
+	vsli.u64 d1, d3, #LSH
+	vmov	 r6, r7, d0
+	vshr.u64 d0, d3, #64-LSH
+	sub	 n, n, #4
+	tst	 n, n
+	beq	 L(end)
+
+	ALIGN(16)
+L(top):	ldmia	 up!, {r8,r9,r10,r12}
+	vld1.32	 {d2}, [vp]!
+	vsli.u64 d0, d2, #LSH
+	vmov	 r4, r5, d1
+	vshr.u64 d1, d2, #64-LSH
+	ADCSBCS( r8, r8, r6)
+	ADCSBCS( r9, r9, r7)
+	vld1.32	 {d3}, [vp]!
+	vsli.u64 d1, d3, #LSH
+	vmov	 r6, r7, d0
+	vshr.u64 d0, d3, #64-LSH
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r8,r9,r10,r12}
+	sub	 n, n, #4
+	tst	 n, n
+	bne	 L(top)
+
+L(end):	ldmia	 up!, {r8,r9,r10,r12}
+	vmov	 r4, r5, d1
+	ADCSBCS( r8, r8, r6)
+	ADCSBCS( r9, r9, r7)
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r8,r9,r10,r12}
+L(rtn):	vmov.32	 r0, d0[0]
+	RETVAL(	 r0, r1)
+	pop	 {r4-r10}
+	bx	 r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/com.asm b/gmp/mpn/arm/v7a/cora15/neon/com.asm
new file mode 100644
index 0000000000..9e7a629287
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/com.asm
@@ -0,0 +1,97 @@
+dnl  ARM Neon mpn_com optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.1
+C Cortex-A15	 0.65
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	cmp		n, #7
+	ble		L(bc)
+
+C Perform a few initial operation until rp is 128-bit aligned
+	tst		rp, #4
+	beq		L(al1)
+	vld1.32		{d0[0]}, [up]!
+	sub		n, n, #1
+	vmvn		d0, d0
+	vst1.32		{d0[0]}, [rp]!
+L(al1):	tst		rp, #8
+	beq		L(al2)
+	vld1.32		{d0}, [up]!
+	sub		n, n, #2
+	vmvn		d0, d0
+	vst1.32		{d0}, [rp:64]!
+L(al2):	vld1.32		{q2}, [up]!
+	subs		n, n, #12
+	blt		L(end)
+
+	ALIGN(16)
+L(top):	vld1.32		{q0}, [up]!
+	vmvn		q2, q2
+	subs		n, n, #8
+	vst1.32		{q2}, [rp:128]!
+	vld1.32		{q2}, [up]!
+	vmvn		q0, q0
+	vst1.32		{q0}, [rp:128]!
+	bge	L(top)
+
+L(end):	vmvn		q2, q2
+	vst1.32		{q2}, [rp:128]!
+
+C Handle last 0-7 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst		n, #4
+	beq		L(tl1)
+	vld1.32		{q0}, [up]!
+	vmvn		q0, q0
+	vst1.32		{q0}, [rp]!
+L(tl1):	tst		n, #2
+	beq		L(tl2)
+	vld1.32		{d0}, [up]!
+	vmvn		d0, d0
+	vst1.32		{d0}, [rp]!
+L(tl2):	tst		n, #1
+	beq		L(tl3)
+	vld1.32		{d0[0]}, [up]
+	vmvn		d0, d0
+	vst1.32		{d0[0]}, [rp]
+L(tl3):	bx		lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
new file mode 100644
index 0000000000..98fe535def
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
@@ -0,0 +1,110 @@
+dnl  ARM Neon mpn_copyd optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.75		slower than core register code
+C Cortex-A15	 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	add	rp, rp, n, lsl #2
+	add	up, up, n, lsl #2
+
+	cmp	n, #7
+	ble	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tst	rp, #4
+	beq	L(al1)
+	sub	up, up, #4
+	vld1.32	{d22[0]}, [up]
+	sub	n, n, #1
+	sub	rp, rp, #4
+	vst1.32	{d22[0]}, [rp]
+L(al1):	tst	rp, #8
+	beq	L(al2)
+	sub	up, up, #8
+	vld1.32	{d22}, [up]
+	sub	n, n, #2
+	sub	rp, rp, #8
+	vst1.32	{d22}, [rp:64]
+L(al2):	sub	up, up, #16
+	vld1.32	{d26-d27}, [up]
+	subs	n, n, #12
+	sub	rp, rp, #16			C offset rp for loop
+	blt	L(end)
+
+	sub	up, up, #16			C offset up for loop
+	mov	r12, #-16
+
+	ALIGN(16)
+L(top):	vld1.32	{d22-d23}, [up], r12
+	vst1.32	{d26-d27}, [rp:128], r12
+	vld1.32	{d26-d27}, [up], r12
+	vst1.32	{d22-d23}, [rp:128], r12
+	subs	n, n, #8
+	bge	L(top)
+
+	add	up, up, #16			C undo up offset
+						C rp offset undoing folded
+L(end):	vst1.32	{d26-d27}, [rp:128]
+
+C Copy last 0-7 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst	n, #4
+	beq	L(tl1)
+	sub	up, up, #16
+	vld1.32	{d22-d23}, [up]
+	sub	rp, rp, #16
+	vst1.32	{d22-d23}, [rp]
+L(tl1):	tst	n, #2
+	beq	L(tl2)
+	sub	up, up, #8
+	vld1.32	{d22}, [up]
+	sub	rp, rp, #8
+	vst1.32	{d22}, [rp]
+L(tl2):	tst	n, #1
+	beq	L(tl3)
+	sub	up, up, #4
+	vld1.32	{d22[0]}, [up]
+	sub	rp, rp, #4
+	vst1.32	{d22[0]}, [rp]
+L(tl3):	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
new file mode 100644
index 0000000000..2e05afe5e8
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
@@ -0,0 +1,90 @@
+dnl  ARM Neon mpn_copyi optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.75		slower than core register code
+C Cortex-A15	 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	cmp	n, #7
+	ble	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tst	rp, #4
+	beq	L(al1)
+	vld1.32	{d22[0]}, [up]!
+	sub	n, n, #1
+	vst1.32	{d22[0]}, [rp]!
+L(al1):	tst	rp, #8
+	beq	L(al2)
+	vld1.32	{d22}, [up]!
+	sub	n, n, #2
+	vst1.32	{d22}, [rp:64]!
+L(al2):	vld1.32	{d26-d27}, [up]!
+	subs	n, n, #12
+	blt	L(end)
+
+	ALIGN(16)
+L(top):	vld1.32	{d22-d23}, [up]!
+	vst1.32	{d26-d27}, [rp:128]!
+	vld1.32	{d26-d27}, [up]!
+	vst1.32	{d22-d23}, [rp:128]!
+	subs	n, n, #8
+	bge	L(top)
+
+L(end):	vst1.32	{d26-d27}, [rp:128]!
+
+C Copy last 0-7 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst	n, #4
+	beq	L(tl1)
+	vld1.32	{d22-d23}, [up]!
+	vst1.32	{d22-d23}, [rp]!
+L(tl1):	tst	n, #2
+	beq	L(tl2)
+	vld1.32	{d22}, [up]!
+	vst1.32	{d22}, [rp]!
+L(tl2):	tst	n, #1
+	beq	L(tl3)
+	vld1.32	{d22[0]}, [up]
+	vst1.32	{d22[0]}, [rp]
+L(tl3):	bx	lr
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
new file mode 100644
index 0000000000..2c11d6debd
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
@@ -0,0 +1,177 @@
+dnl  ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	4-5
+C Cortex-A15	 2.5
+
+C TODO
+C  * Try to make this smaller, its size (384 bytes) is excessive.
+C  * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
+C  * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C    insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUBS',	`adds	$1, $2, $3')
+  define(`ADCSBCS',	`adcs	$1, $2, $3')
+  define(`IFADD',	`$1')
+  define(`IFSUB',	`')
+  define(`func',	mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUBS',	`subs	$1, $2, $3')
+  define(`ADCSBCS',	`sbcs	$1, $2, $3')
+  define(`IFADD',	`')
+  define(`IFSUB',	`$1')
+  define(`func',	mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	 {r4-r10}
+
+	ands	r4, n, #3
+	beq	L(b00)
+	cmp	r4, #2
+	blo	L(b01)
+	beq	L(b10)
+
+L(b11):	ldmia	 up!, {r9,r10,r12}
+	ldmia	 vp!, {r5,r6,r7}
+	ADDSUBS( r9, r9, r5)
+	vmov	 d4, r9, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vshr.u64 d3, d4, #1
+	vmov	 d1, r10, r12
+	vsli.u64 d3, d1, #31
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3[0], [rp]!
+	bics	 n, n, #3
+	beq	 L(wd2)
+L(gt3):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	b	 L(mi0)
+
+L(b10):	ldmia	 up!, {r10,r12}
+	ldmia	 vp!, {r6,r7}
+	ADDSUBS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vmov	 d4, r10, r12
+	bics	 n, n, #2
+	vshr.u64 d2, d4, #1
+	beq	 L(wd2)
+L(gt2):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	b	 L(mi0)
+
+L(b01):	ldr	 r12, [up], #4
+	ldr	 r7, [vp], #4
+	ADDSUBS( r12, r12, r7)
+	vmov	 d4, r12, r12
+	bics	 n, n, #1
+	bne	 L(gt1)
+	mov	 r5, r12, lsr #1
+IFADD(`	adc	 r1, n, #0')
+IFSUB(`	adc	 r1, n, #1')
+	bfi	 r5, r1, #31, #1
+	str	 r5, [rp]
+	and	 r0, r12, #1
+	pop	 {r4-r10}
+	bx	 r14
+L(gt1):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	vshr.u64 d2, d4, #1
+	ADCSBCS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d0, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vsli.u64 d2, d0, #31
+	vshr.u64 d3, d0, #1
+	vst1.32	 d2[0], [rp]!
+	b	 L(mi1)
+
+L(b00):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	ADDSUBS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d4, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vshr.u64 d3, d4, #1
+	b	 L(mi1)
+
+	ALIGN(16)
+L(top):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	vsli.u64 d3, d1, #63
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3, [rp]!
+L(mi0):	ADCSBCS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d0, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vsli.u64 d2, d0, #63
+	vshr.u64 d3, d0, #1
+	vst1.32	 d2, [rp]!
+L(mi1):	vmov	 d1, r10, r12
+	sub	 n, n, #4
+	tst	 n, n
+	bne	 L(top)
+
+L(end):	vsli.u64 d3, d1, #63
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3, [rp]!
+L(wd2):	vmov	 r4, r5, d2
+IFADD(`	adc	 r1, n, #0')
+IFSUB(`	adc	 r1, n, #1')
+	bfi	 r5, r1, #31, #1
+	stm	 rp, {r4,r5}
+
+L(rtn):	vmov.32	 r0, d4[0]
+	and	 r0, r0, #1
+	pop	 {r4-r10}
+	bx	 r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/submul_1.asm b/gmp/mpn/arm/v7a/cora15/submul_1.asm
new file mode 100644
index 0000000000..ed7bfe820b
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora15/submul_1.asm
@@ -0,0 +1,159 @@
+dnl  ARM mpn_submul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.75			3.75
+C Cortex-A15	 2.32			this
+
+C This code uses umlal and umaal for adding in the rp[] data, keeping the
+C recurrency path separate from any multiply instructions.  It performs well on
+C A15, but not quite at the multiply bandwidth like the corresponding addmul_1
+C code.
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n v
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8')  define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sub	sp, sp, #32
+	strd	r10, r11, [sp, #24]
+	strd	r8, r9, [sp, #16]
+	strd	r6, r7, [sp, #8]
+	strd	r4, r5, [sp, #0]
+C	push	{ r4-r11 }
+
+	ands	r6, n, #3
+	sub	n, n, #3
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	mov	r6, #0
+	ldr	u1, [up], #-4
+	ldr	w1, [rp], #-16
+	mvn	u1, u1
+	adds	r7, v0, #0
+	b	L(mid)
+
+L(b00):	ldrd	u0, u1, [up]
+	ldrd	w0, w1, [rp], #-12
+	mvn	u0, u0
+	mvn	u1, u1
+	mov	r6, v0
+	umaal	w0, r6, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r7, #0
+	str	w0, [rp, #12]
+	b	L(mid)
+
+L(b10):	ldrd	u0, u1, [up], #8
+	ldrd	w0, w1, [rp]
+	mvn	u0, u0
+	mvn	u1, u1
+	mov	r4, v0
+	umaal	w0, r4, u0, v0
+	mov	r5, #0
+	str	w0, [rp], #-4
+	umlal	w1, r5, u1, v0
+	adds	n, n, #0
+	bmi	L(end)
+	b	L(top)
+
+L(b01):	ldr	u1, [up], #4
+	ldr	w1, [rp], #-8
+	mvn	u1, u1
+	mov	r5, v0
+	mov	r4, #0
+	umaal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+
+C	ALIGN(16)
+L(top):	ldrd	u0, u1, [up, #0]
+	adcs	r4, r4, w1
+	mvn	u0, u0
+	ldrd	w0, w1, [rp, #12]
+	mvn	u1, u1
+	mov	r6, #0
+	umlal	w0, r6, u0, v0		C 1 2
+	adcs	r5, r5, w0
+	mov	r7, #0
+	strd	r4, r5, [rp, #8]
+L(mid):	umaal	w1, r7, u1, v0		C 2 3
+	ldrd	u0, u1, [up, #8]
+	add	up, up, #16
+	adcs	r6, r6, w1
+	mvn	u0, u0
+	ldrd	w0, w1, [rp, #20]
+	mvn	u1, u1
+	mov	r4, #0
+	umlal	w0, r4, u0, v0		C 3 4
+	adcs	r7, r7, w0
+	mov	r5, #0
+	strd	r6, r7, [rp, #16]!
+	sub	n, n, #4
+	umlal	w1, r5, u1, v0		C 0 1
+	tst	n, n
+	bpl	L(top)
+
+L(end):	adcs	r4, r4, w1
+	str	r4, [rp, #8]
+	adc	r0, r5, #0
+	sub	r0, v0, r0
+	pop	{ r4-r11 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora9/gmp-mparam.h b/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
new file mode 100644
index 0000000000..9660257820
--- /dev/null
+++ b/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1000MHz Cortex-A9 */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define MUL_TOOM22_THRESHOLD                45
+#define MUL_TOOM33_THRESHOLD               129
+#define MUL_TOOM44_THRESHOLD               387
+#define MUL_TOOM6H_THRESHOLD               517
+#define MUL_TOOM8H_THRESHOLD               774
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     222
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     235
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     208
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 54
+#define SQR_TOOM3_THRESHOLD                181
+#define SQR_TOOM4_THRESHOLD                490
+#define SQR_TOOM6_THRESHOLD                656
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             64
+
+#define MULMOD_BNM1_THRESHOLD               26
+#define SQRMOD_BNM1_THRESHOLD               28
+
+#define MUL_FFT_MODF_THRESHOLD             624  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    624, 5}, {     28, 6}, {     15, 5}, {     34, 6}, \
+    {     18, 5}, {     37, 6}, {     28, 7}, {     15, 6}, \
+    {     36, 7}, {     19, 6}, {     40, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     47, 7}, {     25, 6}, \
+    {     51, 7}, {     27, 6}, {     55, 7}, {     29, 8}, \
+    {     15, 7}, {     31, 6}, {     63, 7}, {     37, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     51, 8}, \
+    {     27, 7}, {     57, 9}, {     15, 8}, {     31, 7}, \
+    {     65, 8}, {     35, 7}, {     71, 8}, {     43, 9}, \
+    {     23, 8}, {     55,10}, {     15, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     99, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    511, 9}, \
+    {   1023,10}, {    543,11}, {    287,10}, {    575, 9}, \
+    {   1151,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    607,12}, {    319,11}, {    735,12}, \
+    {    383,11}, {    863,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1279,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    831,11}, \
+    {   1663,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1023,11}, {   2047,12}, {   1215,13}, {    639,12}, \
+    {   1407,13}, {    767,12}, {   1663,13}, {    895,12}, \
+    {   1791,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2559,13}, \
+    {   1407,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1663,12}, {   3455,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2047,12}, {   4095,13}, {   2175,12}, \
+    {   4351,13}, {   2431,14}, {   1279,13}, {   2559,12}, \
+    {   5119,13}, {   2815,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 160
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             560  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    560, 5}, {     19, 4}, {     39, 5}, {     21, 4}, \
+    {     43, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     35, 6}, {     36, 7}, {     19, 6}, \
+    {     40, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 9}, {     15, 8}, {     31, 7}, \
+    {     65, 8}, {     35, 7}, {     71, 8}, {     43, 9}, \
+    {     23, 8}, {     55, 9}, {     31, 8}, {     71, 9}, \
+    {     39, 8}, {     83, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {    103,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    159,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511, 8}, {   1023, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415, 9}, {    831,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    511, 9}, {   1023,10}, \
+    {    543,11}, {    287,10}, {    575, 9}, {   1151,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,10}, {    831,13}, {    127,11}, {    511,10}, \
+    {   1023,11}, {    543,10}, {   1087,11}, {    575,10}, \
+    {   1151,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    863,12}, \
+    {    447,11}, {    959,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1663,12}, {    895,11}, \
+    {   1791,12}, {    959,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1215,13}, {    639,12}, {   1407,13}, \
+    {    767,12}, {   1663,13}, {    895,12}, {   1791,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2559,13}, {   1407,14}, \
+    {    767,13}, {   1535,12}, {   3071,13}, {   1663,12}, \
+    {   3455,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2047,12}, {   4095,13}, {   2175,12}, {   4351,13}, \
+    {   2431,14}, {   1279,13}, {   2559,12}, {   5119,13}, \
+    {   2815,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 167
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  38
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 42
+#define DC_DIVAPPR_Q_THRESHOLD             100
+#define DC_BDIV_QR_THRESHOLD                43
+#define DC_BDIV_Q_THRESHOLD                104
+
+#define INV_MULMOD_BNM1_THRESHOLD           98
+#define INV_NEWTON_THRESHOLD               138
+#define INV_APPR_THRESHOLD                 133
+
+#define BINV_NEWTON_THRESHOLD              333
+#define REDC_1_TO_REDC_2_THRESHOLD           2
+#define REDC_2_TO_REDC_N_THRESHOLD         142
+
+#define MU_DIV_QR_THRESHOLD               2350
+#define MU_DIVAPPR_Q_THRESHOLD            2259
+#define MUPI_DIV_QR_THRESHOLD               70
+#define MU_BDIV_QR_THRESHOLD              2089
+#define MU_BDIV_Q_THRESHOLD               2172
+
+#define POWM_SEC_TABLE  37,48,81,615,1925
+
+#define MATRIX22_STRASSEN_THRESHOLD         22
+#define HGCD_THRESHOLD                      64
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             4284
+#define GCD_DC_THRESHOLD                   416
+#define GCDEXT_DC_THRESHOLD                298
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                18
+#define GET_STR_PRECOMPUTE_THRESHOLD        33
+#define SET_STR_DC_THRESHOLD               140
+#define SET_STR_PRECOMPUTE_THRESHOLD       748
+
+#define FAC_DSC_THRESHOLD                  309
+#define FAC_ODD_THRESHOLD                   29
author	Pedro Alvarez <pedro.alvarez@codethink.co.uk>	2014-12-22 00:55:04 +0000
committer	Pedro Alvarez <pedro.alvarez@codethink.co.uk>	2014-12-22 00:56:42 +0000
commit	54eea31d0053620bab65153ab39d61e5575aaf1b (patch)
tree	5f97c96dffdb6b27df36795689abfb9086011585 /gmp/mpn/arm
parent	c16297b7cfb0c1708f1d84b5d0f90be0844d07ce (diff)
download	gcc-tarball-54eea31d0053620bab65153ab39d61e5575aaf1b.tar.gz