Diffstat (limited to 'gmp/mpn/x86_64/dive_1.asm')
-rw-r--r--  gmp/mpn/x86_64/dive_1.asm | 197
1 file changed, 100 insertions, 97 deletions
diff --git a/gmp/mpn/x86_64/dive_1.asm b/gmp/mpn/x86_64/dive_1.asm
index 988bdab632..4889faccb5 100644
--- a/gmp/mpn/x86_64/dive_1.asm
+++ b/gmp/mpn/x86_64/dive_1.asm
@@ -1,44 +1,31 @@
dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
-dnl Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 10
-C AMD K10 10
-C Intel P4 33
-C Intel core2 13.25
-C Intel corei 14
-C Intel atom 42
-C VIA nano 43
+C K8,K9: 10
+C K10: 10
+C P4: 33
+C P6-15 (Core2): 13.25
+C P6-28 (Atom): 42
C A quick adoption of the 32-bit K7 code.
@@ -49,66 +36,67 @@ C up rsi
C n rdx
C divisor rcx
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_divexact_1)
- FUNC_ENTRY(4)
- push %rbx
+ pushq %rbx
- mov %rcx, %rax
- xor R32(%rcx), R32(%rcx) C shift count
- mov %rdx, %r8
+ movq %rcx, %rax
+ movl $0, %ecx C shift count
+ movq %rdx, %r8
- bt $0, R32(%rax)
+ btl $0, %eax
jnc L(evn) C skip bsfq unless divisor is even
-L(odd): mov %rax, %rbx
- shr R32(%rax)
- and $127, R32(%rax) C d/2, 7 bits
+L(odd): movq %rax, %rbx
+ shrl %eax
+ andl $127, %eax C d/2, 7 bits
- LEA( binvert_limb_table, %rdx)
+ifdef(`PIC',`
+ movq binvert_limb_table@GOTPCREL(%rip), %rdx
+',`
+ movabsq $binvert_limb_table, %rdx
+')
- movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
+ movzbl (%rax,%rdx), %eax C inv 8 bits
- mov %rbx, %r11 C d without twos
+ movq %rbx, %r11 C d without twos
- lea (%rax,%rax), R32(%rdx) C 2*inv
- imul R32(%rax), R32(%rax) C inv*inv
- imul R32(%rbx), R32(%rax) C inv*inv*d
- sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
+ leal (%rax,%rax), %edx C 2*inv
+ imull %eax, %eax C inv*inv
+ imull %ebx, %eax C inv*inv*d
+ subl %eax, %edx C inv = 2*inv - inv*inv*d, 16 bits
- lea (%rdx,%rdx), R32(%rax) C 2*inv
- imul R32(%rdx), R32(%rdx) C inv*inv
- imul R32(%rbx), R32(%rdx) C inv*inv*d
- sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
+ leal (%rdx,%rdx), %eax C 2*inv
+ imull %edx, %edx C inv*inv
+ imull %ebx, %edx C inv*inv*d
+ subl %edx, %eax C inv = 2*inv - inv*inv*d, 32 bits
- lea (%rax,%rax), %r10 C 2*inv
- imul %rax, %rax C inv*inv
- imul %rbx, %rax C inv*inv*d
- sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits
+ leaq (%rax,%rax), %rdx C 2*inv
+ imulq %rax, %rax C inv*inv
+ imulq %rbx, %rax C inv*inv*d
+ subq %rax, %rdx C inv = 2*inv - inv*inv*d, 64 bits
- lea (%rsi,%r8,8), %rsi C up end
- lea -8(%rdi,%r8,8), %rdi C rp end
- neg %r8 C -n
+ leaq (%rsi,%r8,8), %rsi C up end
+ leaq -8(%rdi,%r8,8), %rdi C rp end
+ negq %r8 C -n
- mov (%rsi,%r8,8), %rax C up[0]
+ movq %rdx, %r10 C final inverse
+ movq (%rsi,%r8,8), %rax C up[0]
- inc %r8
+ incq %r8
jz L(one)
- mov (%rsi,%r8,8), %rdx C up[1]
+ movq (%rsi,%r8,8), %rdx C up[1]
- shrd R8(%rcx), %rdx, %rax
+ shrdq %cl, %rdx, %rax
- xor R32(%rbx), R32(%rbx)
- jmp L(ent)
+ xorl %ebx, %ebx
+ jmp L(entry)
-L(evn): bsf %rax, %rcx
- shr R8(%rcx), %rax
+L(evn): bsfq %rax, %rcx
+ shrq %cl, %rax
jmp L(odd)
ALIGN(8)
@@ -120,39 +108,54 @@ L(top):
C rsi up end
C rdi rp end
C r8 counter, limbs, negative
- C r10 d^(-1) mod 2^64
- C r11 d, shifted down
-
- mul %r11 C carry limb in rdx 0 10
- mov -8(%rsi,%r8,8), %rax C
- mov (%rsi,%r8,8), %r9 C
- shrd R8(%rcx), %r9, %rax C
- nop C
- sub %rbx, %rax C apply carry bit
- setc %bl C
- sub %rdx, %rax C apply carry limb 5
- adc $0, %rbx C 6
-L(ent): imul %r10, %rax C 6
- mov %rax, (%rdi,%r8,8) C
- inc %r8 C
+
+ mulq %r11 C carry limb in rdx
+
+ movq -8(%rsi,%r8,8), %rax
+ movq (%rsi,%r8,8), %r9
+
+ shrdq %cl, %r9, %rax
+ nop
+
+ subq %rbx, %rax C apply carry bit
+ setc %bl
+
+ subq %rdx, %rax C apply carry limb
+ adcq $0, %rbx
+
+L(entry):
+ imulq %r10, %rax
+
+ movq %rax, (%rdi,%r8,8)
+ incq %r8
jnz L(top)
- mul %r11 C carry limb in rdx
- mov -8(%rsi), %rax C up high limb
- shr R8(%rcx), %rax
- sub %rbx, %rax C apply carry bit
- sub %rdx, %rax C apply carry limb
- imul %r10, %rax
- mov %rax, (%rdi)
- pop %rbx
- FUNC_EXIT()
+
+ mulq %r11 C carry limb in rdx
+
+ movq -8(%rsi), %rax C up high limb
+ shrq %cl, %rax
+
+ subq %rbx, %rax C apply carry bit
+
+ subq %rdx, %rax C apply carry limb
+
+ imulq %r10, %rax
+
+ movq %rax, (%rdi)
+
+ popq %rbx
ret
-L(one): shr R8(%rcx), %rax
- imul %r10, %rax
- mov %rax, (%rdi)
- pop %rbx
- FUNC_EXIT()
+
+L(one):
+ shrq %cl, %rax
+
+ imulq %r10, %rax
+
+ movq %rax, (%rdi)
+
+ popq %rbx
ret
EPILOGUE()
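
For reference, the method the routine implements can be sketched in a few lines of C (a minimal sketch assuming 64-bit limbs and a compiler providing unsigned __int128, such as GCC or Clang). The names binvert64 and divexact_1_ref are illustrative only, not GMP's API; GMP's generic C version (mpn/generic/dive_1.c) follows the same idea, and the assembly above seeds the Newton iteration from the 8-bit binvert_limb_table rather than the algebraic seed used here.

#include <stdint.h>
#include <stddef.h>

/* Inverse of an odd d modulo 2^64 by the Newton iteration
   inv <- 2*inv - inv*inv*d, which doubles the number of correct
   low bits at each step.  The assembly seeds it from an 8-bit
   table lookup; here a 4-bit algebraic seed is used instead.  */
static uint64_t binvert64(uint64_t d)          /* d must be odd */
{
    uint64_t inv = (3 * d) ^ 2;                /* correct to at least 4 bits */
    inv = 2 * inv - inv * inv * d;             /* 8 bits  */
    inv = 2 * inv - inv * inv * d;             /* 16 bits */
    inv = 2 * inv - inv * inv * d;             /* 32 bits */
    inv = 2 * inv - inv * inv * d;             /* 64 bits */
    return inv;
}

/* Reference for mpn_divexact_1: divide the n-limb number {up,n} by d
   (d nonzero, dividing the input exactly) and store n quotient limbs
   at rp.  Mirrors the structure of the assembly loop above.          */
static void divexact_1_ref(uint64_t *rp, const uint64_t *up, size_t n, uint64_t d)
{
    /* Strip the factors of two from d; the dividend is shifted down
       by the same amount limb by limb (the shrd instructions).       */
    unsigned shift = 0;
    while ((d & 1) == 0) { d >>= 1; shift++; }

    uint64_t dinv   = binvert64(d);            /* d * dinv == 1 (mod 2^64)    */
    uint64_t borrow = 0;                       /* borrow bit, %rbx in the asm */
    uint64_t chi    = 0;                       /* carry limb hi(q*d), %rdx    */

    for (size_t i = 0; i < n; i++) {
        /* Shifted dividend limb: low bits from up[i], high bits from up[i+1]. */
        uint64_t s = up[i] >> shift;
        if (shift != 0 && i + 1 < n)
            s |= up[i + 1] << (64 - shift);

        uint64_t t = s - borrow;               /* apply carry bit  */
        uint64_t b = (s < borrow);
        uint64_t u = t - chi;                  /* apply carry limb */
        b += (t < chi);

        uint64_t q = u * dinv;                 /* exact quotient limb */
        rp[i] = q;

        /* High half of q*d is subtracted from the next limb. */
        chi    = (uint64_t)(((unsigned __int128)q * d) >> 64);
        borrow = b;
    }
}

Since d*dinv is 1 modulo 2^64, each quotient limb falls out of a single multiply by the inverse; the only values carried from one iteration to the next are the borrow bit and the high half of q*d, exactly the %rbx and %rdx recurrences in the loop above.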