diff options
Diffstat (limited to 'mpn')
-rw-r--r-- | mpn/x86_64/bd2/gcd_11.asm | 39
-rw-r--r-- | mpn/x86_64/bd4/gcd_11.asm | 38
-rw-r--r-- | mpn/x86_64/core2/gcd_11.asm | 27
-rw-r--r-- | mpn/x86_64/gcd_11.asm | 37
4 files changed, 73 insertions, 68 deletions
diff --git a/mpn/x86_64/bd2/gcd_11.asm b/mpn/x86_64/bd2/gcd_11.asm index 6260b2536..edf8f4c28 100644 --- a/mpn/x86_64/bd2/gcd_11.asm +++ b/mpn/x86_64/bd2/gcd_11.asm @@ -39,13 +39,13 @@ C cycles/bit (approx) C AMD K8,K9 - C AMD K10 - C AMD bd1 - -C AMD bd2 3.27 * +C AMD bd2 3.27 * C AMD bd3 ? C AMD bd4 3.79 C AMD bt1 - -C AMD bt2 3.64 * -C AMD zn1 3.25 -C AMD zn2 3.50 +C AMD bt2 3.64 * +C AMD zn1 3.25 * +C AMD zn2 3.25 * C Intel P4 - C Intel CNR - C Intel PNR - @@ -73,21 +73,24 @@ ASM_START() ALIGN(16) PROLOGUE(mpn_gcd_11) FUNC_ENTRY(2) - mov v0, %rax C - sub u0, v0 C - jz L(end) C + mov v0, %rdx + sub u0, %rdx + jz L(end) - ALIGN(16) C -L(top): rep;bsf v0, %rcx C tzcnt! - mov u0, %r9 C - sub %rax, u0 C u - v - cmovc v0, u0 C u = |u - v| - cmovc %r9, %rax C v = min(u,v) - shr R8(%rcx), u0 C - mov %rax, v0 C - sub u0, v0 C v - u - jnz L(top) C + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt! + mov u0, %rax + sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 + mov v0, %rdx + sub u0, %rdx C v - u + jnz L(top) -L(end): FUNC_EXIT() +L(end): mov v0, %rax + C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/bd4/gcd_11.asm b/mpn/x86_64/bd4/gcd_11.asm index d34fb4f2a..4c4cc2dfe 100644 --- a/mpn/x86_64/bd4/gcd_11.asm +++ b/mpn/x86_64/bd4/gcd_11.asm @@ -41,11 +41,11 @@ C AMD K10 - C AMD bd1 - C AMD bd2 - C AMD bd3 - -C AMD bd4 2.86 * +C AMD bd4 4.0 * C AMD bt1 - C AMD bt2 - -C AMD zn1 2.66 * -C AMD zn2 3.48 +C AMD zn1 3.25 * +C AMD zn2 3.50 C Intel P4 - C Intel CNR - C Intel PNR - @@ -73,22 +73,24 @@ ASM_START() ALIGN(16) PROLOGUE(mpn_gcd_11) FUNC_ENTRY(2) - mov v0, %rax C - sub u0, v0 C - jz L(end) C - mov u0, %r9 + mov u0, %rax + mov v0, %rdx + sub u0, %rdx C v - u + jz L(end) - ALIGN(16) C -L(top): rep;bsf v0, %rcx C - sub %rax, u0 C u - v - cmovc v0, u0 C u = |u - v| - cmovc %r9, %rax C v = min(u,v) - shrx( %rcx, u0, %r9) C - shrx( %rcx, u0, u0) C - mov 
%rax, v0 C - sub u0, v0 C v - u - jnz L(top) C + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt! + sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shrx( %rcx, u0, %rax) + shrx( %rcx, u0, u0) + mov v0, %rdx + sub %rax, %rdx C v - u + jnz L(top) -L(end): FUNC_EXIT() +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/core2/gcd_11.asm b/mpn/x86_64/core2/gcd_11.asm index 11aca4746..ba7e1b6d9 100644 --- a/mpn/x86_64/core2/gcd_11.asm +++ b/mpn/x86_64/core2/gcd_11.asm @@ -73,20 +73,21 @@ ASM_START() ALIGN(16) PROLOGUE(mpn_gcd_11) FUNC_ENTRY(2) - mov v0, %rax C - jmp L(odd) C + jmp L(odd) - ALIGN(16) C -L(top): cmovc v0, u0 C u = |u - v| - cmovc %r9, %rax C v = min(u,v) - shr R8(%rcx), u0 C - mov %rax, v0 C -L(odd): sub u0, v0 C - bsf v0, %rcx C - mov u0, %r9 C - sub %rax, u0 C - jnz L(top) C + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 +L(odd): mov v0, %rdx + sub u0, %rdx C v - u + bsf %rdx, %rcx + mov u0, %rax + sub v0, u0 C u - v + jnz L(top) -L(end): FUNC_EXIT() +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() ret EPILOGUE() diff --git a/mpn/x86_64/gcd_11.asm b/mpn/x86_64/gcd_11.asm index a10cc003e..40273e0d9 100644 --- a/mpn/x86_64/gcd_11.asm +++ b/mpn/x86_64/gcd_11.asm @@ -73,30 +73,29 @@ ASM_START() ALIGN(16) PROLOGUE(mpn_gcd_11) FUNC_ENTRY(2) - mov u0, %rax - - LEA( ctz_table, %rdx) + LEA( ctz_table, %r8) jmp L(ent) - ALIGN(16) C K8 -L(top): cmovc %rcx, %rax C if x-y < 0 0 - cmovc %rdi, v0 C use x,y-x 0 -L(mid): and $MASK, R32(%rcx) C 0 - movzbl (%rdx,%rcx), R32(%rcx) C 1 - jz L(shift_alot) C 1 - shr R8(%rcx), %rax C 3 - mov %rax, %rdi C 4 -L(ent): mov v0, %rcx C 3 - sub %rax, %rcx C 4 - sub v0, %rax C 4 - jnz L(top) C - -L(end): mov v0, %rax + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) +L(mid): and $MASK, R32(%rdx) + 
movzbl (%r8,%rdx), R32(%rcx) + jz L(shift_alot) + shr R8(%rcx), u0 +L(ent): mov u0, %rax + mov v0, %rdx + sub u0, %rdx + sub v0, u0 + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call FUNC_EXIT() ret L(shift_alot): - shr $MAXSHIFT, %rax - mov %rax, %rcx + shr $MAXSHIFT, u0 + mov u0, %rdx jmp L(mid) EPILOGUE() |