summary | refs | log | tree | commit | diff
path: root/mpn
diff options
context:
space:
mode:
author    Torbjorn Granlund <tg@gmplib.org>  2019-08-22 15:09:51 +0200
committer Torbjorn Granlund <tg@gmplib.org>  2019-08-22 15:09:51 +0200
commit    ae6b9cf24aa6625ba84a508f581037204786da10 (patch)
tree      5916508189cdb809c8e0629f37ab9367d3c84e72 /mpn
parent    831196f27bf83250d4820168c77019d1c668a745 (diff)
download  gmp-ae6b9cf24aa6625ba84a508f581037204786da10.tar.gz
Make sure rdx is zero on return to benefit gcd_22's private calls.
Make the gcd_11 files more similar in register use.
Diffstat (limited to 'mpn')
-rw-r--r--  mpn/x86_64/bd2/gcd_11.asm    39
-rw-r--r--  mpn/x86_64/bd4/gcd_11.asm    38
-rw-r--r--  mpn/x86_64/core2/gcd_11.asm  27
-rw-r--r--  mpn/x86_64/gcd_11.asm        37
4 files changed, 73 insertions(+), 68 deletions(-)
diff --git a/mpn/x86_64/bd2/gcd_11.asm b/mpn/x86_64/bd2/gcd_11.asm
index 6260b2536..edf8f4c28 100644
--- a/mpn/x86_64/bd2/gcd_11.asm
+++ b/mpn/x86_64/bd2/gcd_11.asm
@@ -39,13 +39,13 @@ C cycles/bit (approx)
C AMD K8,K9 -
C AMD K10 -
C AMD bd1 -
-C AMD bd2 3.27 *
+C AMD bd2 3.27 *
C AMD bd3 ?
C AMD bd4 3.79
C AMD bt1 -
-C AMD bt2 3.64 *
-C AMD zn1 3.25
-C AMD zn2 3.50
+C AMD bt2 3.64 *
+C AMD zn1 3.25 *
+C AMD zn2 3.25 *
C Intel P4 -
C Intel CNR -
C Intel PNR -
@@ -73,21 +73,24 @@ ASM_START()
ALIGN(16)
PROLOGUE(mpn_gcd_11)
FUNC_ENTRY(2)
- mov v0, %rax C
- sub u0, v0 C
- jz L(end) C
+ mov v0, %rdx
+ sub u0, %rdx
+ jz L(end)
- ALIGN(16) C
-L(top): rep;bsf v0, %rcx C tzcnt!
- mov u0, %r9 C
- sub %rax, u0 C u - v
- cmovc v0, u0 C u = |u - v|
- cmovc %r9, %rax C v = min(u,v)
- shr R8(%rcx), u0 C
- mov %rax, v0 C
- sub u0, v0 C v - u
- jnz L(top) C
+ ALIGN(16)
+L(top): rep;bsf %rdx, %rcx C tzcnt!
+ mov u0, %rax
+ sub v0, u0 C u - v
+ cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+ shr R8(%rcx), u0
+ mov v0, %rdx
+ sub u0, %rdx C v - u
+ jnz L(top)
-L(end): FUNC_EXIT()
+L(end): mov v0, %rax
+ C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/bd4/gcd_11.asm b/mpn/x86_64/bd4/gcd_11.asm
index d34fb4f2a..4c4cc2dfe 100644
--- a/mpn/x86_64/bd4/gcd_11.asm
+++ b/mpn/x86_64/bd4/gcd_11.asm
@@ -41,11 +41,11 @@ C AMD K10 -
C AMD bd1 -
C AMD bd2 -
C AMD bd3 -
-C AMD bd4 2.86 *
+C AMD bd4 4.0 *
C AMD bt1 -
C AMD bt2 -
-C AMD zn1 2.66 *
-C AMD zn2 3.48
+C AMD zn1 3.25 *
+C AMD zn2 3.50
C Intel P4 -
C Intel CNR -
C Intel PNR -
@@ -73,22 +73,24 @@ ASM_START()
ALIGN(16)
PROLOGUE(mpn_gcd_11)
FUNC_ENTRY(2)
- mov v0, %rax C
- sub u0, v0 C
- jz L(end) C
- mov u0, %r9
+ mov u0, %rax
+ mov v0, %rdx
+ sub u0, %rdx C v - u
+ jz L(end)
- ALIGN(16) C
-L(top): rep;bsf v0, %rcx C
- sub %rax, u0 C u - v
- cmovc v0, u0 C u = |u - v|
- cmovc %r9, %rax C v = min(u,v)
- shrx( %rcx, u0, %r9) C
- shrx( %rcx, u0, u0) C
- mov %rax, v0 C
- sub u0, v0 C v - u
- jnz L(top) C
+ ALIGN(16)
+L(top): rep;bsf %rdx, %rcx C tzcnt!
+ sub v0, u0 C u - v
+ cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+ shrx( %rcx, u0, %rax)
+ shrx( %rcx, u0, u0)
+ mov v0, %rdx
+ sub %rax, %rdx C v - u
+ jnz L(top)
-L(end): FUNC_EXIT()
+L(end): C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/gcd_11.asm b/mpn/x86_64/core2/gcd_11.asm
index 11aca4746..ba7e1b6d9 100644
--- a/mpn/x86_64/core2/gcd_11.asm
+++ b/mpn/x86_64/core2/gcd_11.asm
@@ -73,20 +73,21 @@ ASM_START()
ALIGN(16)
PROLOGUE(mpn_gcd_11)
FUNC_ENTRY(2)
- mov v0, %rax C
- jmp L(odd) C
+ jmp L(odd)
- ALIGN(16) C
-L(top): cmovc v0, u0 C u = |u - v|
- cmovc %r9, %rax C v = min(u,v)
- shr R8(%rcx), u0 C
- mov %rax, v0 C
-L(odd): sub u0, v0 C
- bsf v0, %rcx C
- mov u0, %r9 C
- sub %rax, u0 C
- jnz L(top) C
+ ALIGN(16)
+L(top): cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+ shr R8(%rcx), u0
+L(odd): mov v0, %rdx
+ sub u0, %rdx C v - u
+ bsf %rdx, %rcx
+ mov u0, %rax
+ sub v0, u0 C u - v
+ jnz L(top)
-L(end): FUNC_EXIT()
+L(end): C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/gcd_11.asm b/mpn/x86_64/gcd_11.asm
index a10cc003e..40273e0d9 100644
--- a/mpn/x86_64/gcd_11.asm
+++ b/mpn/x86_64/gcd_11.asm
@@ -73,30 +73,29 @@ ASM_START()
ALIGN(16)
PROLOGUE(mpn_gcd_11)
FUNC_ENTRY(2)
- mov u0, %rax
-
- LEA( ctz_table, %rdx)
+ LEA( ctz_table, %r8)
jmp L(ent)
- ALIGN(16) C K8
-L(top): cmovc %rcx, %rax C if x-y < 0 0
- cmovc %rdi, v0 C use x,y-x 0
-L(mid): and $MASK, R32(%rcx) C 0
- movzbl (%rdx,%rcx), R32(%rcx) C 1
- jz L(shift_alot) C 1
- shr R8(%rcx), %rax C 3
- mov %rax, %rdi C 4
-L(ent): mov v0, %rcx C 3
- sub %rax, %rcx C 4
- sub v0, %rax C 4
- jnz L(top) C
-
-L(end): mov v0, %rax
+ ALIGN(16)
+L(top): cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+L(mid): and $MASK, R32(%rdx)
+ movzbl (%r8,%rdx), R32(%rcx)
+ jz L(shift_alot)
+ shr R8(%rcx), u0
+L(ent): mov u0, %rax
+ mov v0, %rdx
+ sub u0, %rdx
+ sub v0, u0
+ jnz L(top)
+
+L(end): C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
FUNC_EXIT()
ret
L(shift_alot):
- shr $MAXSHIFT, %rax
- mov %rax, %rcx
+ shr $MAXSHIFT, u0
+ mov u0, %rdx
jmp L(mid)
EPILOGUE()