diff options
Diffstat (limited to 'gmp/mpn/x86_64/dive_1.asm')
-rw-r--r-- | gmp/mpn/x86_64/dive_1.asm | 197 |
1 files changed, 100 insertions, 97 deletions
diff --git a/gmp/mpn/x86_64/dive_1.asm b/gmp/mpn/x86_64/dive_1.asm index 988bdab632..4889faccb5 100644 --- a/gmp/mpn/x86_64/dive_1.asm +++ b/gmp/mpn/x86_64/dive_1.asm @@ -1,44 +1,31 @@ dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. -dnl Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc. +dnl Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. -dnl + dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C AMD K8,K9 10 -C AMD K10 10 -C Intel P4 33 -C Intel core2 13.25 -C Intel corei 14 -C Intel atom 42 -C VIA nano 43 +C K8,K9: 10 +C K10: 10 +C P4: 33 +C P6-15 (Core2):13.25 +C P6-28 (Atom): 42 C A quick adoption of the 32-bit K7 code. @@ -49,66 +36,67 @@ C up rsi C n rdx C divisor rcx -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_divexact_1) - FUNC_ENTRY(4) - push %rbx + pushq %rbx - mov %rcx, %rax - xor R32(%rcx), R32(%rcx) C shift count - mov %rdx, %r8 + movq %rcx, %rax + movl $0, %ecx C shift count + movq %rdx, %r8 - bt $0, R32(%rax) + btl $0, %eax jnc L(evn) C skip bsfq unless divisor is even -L(odd): mov %rax, %rbx - shr R32(%rax) - and $127, R32(%rax) C d/2, 7 bits +L(odd): movq %rax, %rbx + shrl %eax + andl $127, %eax C d/2, 7 bits - LEA( binvert_limb_table, %rdx) +ifdef(`PIC',` + movq binvert_limb_table@GOTPCREL(%rip), %rdx +',` + movabsq $binvert_limb_table, %rdx +') - movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + movzbl (%rax,%rdx), %eax C inv 8 bits - mov %rbx, %r11 C d without twos + movq %rbx, %r11 C d without twos - lea (%rax,%rax), R32(%rdx) C 2*inv - imul R32(%rax), R32(%rax) C inv*inv - imul R32(%rbx), R32(%rax) C inv*inv*d - sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + leal (%rax,%rax), %edx C 2*inv + imull %eax, %eax C inv*inv + imull %ebx, %eax C inv*inv*d + subl %eax, %edx C inv = 2*inv - inv*inv*d, 16 bits - lea (%rdx,%rdx), R32(%rax) C 2*inv - imul R32(%rdx), R32(%rdx) C inv*inv - imul R32(%rbx), R32(%rdx) C inv*inv*d - sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + leal (%rdx,%rdx), %eax C 2*inv + imull %edx, %edx C inv*inv + imull %ebx, %edx C inv*inv*d + subl %edx, %eax C inv = 2*inv - inv*inv*d, 32 bits - lea (%rax,%rax), %r10 C 2*inv - imul %rax, %rax C inv*inv - imul %rbx, %rax C inv*inv*d - sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits + leaq (%rax,%rax), %rdx C 2*inv + imulq %rax, %rax C inv*inv + imulq %rbx, %rax C inv*inv*d + subq %rax, %rdx C inv = 2*inv - inv*inv*d, 64 bits - lea (%rsi,%r8,8), %rsi C up end - lea -8(%rdi,%r8,8), %rdi C rp end - neg %r8 C -n + leaq (%rsi,%r8,8), %rsi C up end + leaq -8(%rdi,%r8,8), %rdi C rp end + negq %r8 C -n - mov (%rsi,%r8,8), %rax C up[0] + movq %rdx, %r10 C final inverse + movq (%rsi,%r8,8), %rax C up[0] - inc %r8 + incq %r8 jz L(one) - mov (%rsi,%r8,8), %rdx C up[1] + movq (%rsi,%r8,8), %rdx C up[1] - shrd R8(%rcx), %rdx, %rax + shrdq %cl, %rdx, %rax - xor R32(%rbx), R32(%rbx) - jmp L(ent) + xorl %ebx, %ebx + jmp L(entry) -L(evn): bsf %rax, %rcx - shr R8(%rcx), %rax +L(evn): bsfq %rax, %rcx + shrq %cl, %rax jmp L(odd) ALIGN(8) @@ -120,39 +108,54 @@ L(top): C rsi up end C rdi rp end C r8 counter, limbs, negative - C r10 d^(-1) mod 2^64 - C r11 d, shifted down - - mul %r11 C carry limb in rdx 0 10 - mov -8(%rsi,%r8,8), %rax C - mov (%rsi,%r8,8), %r9 C - shrd R8(%rcx), %r9, %rax C - nop C - sub %rbx, %rax C apply carry bit - setc %bl C - sub %rdx, %rax C apply carry limb 5 - adc $0, %rbx C 6 -L(ent): imul %r10, %rax C 6 - mov %rax, (%rdi,%r8,8) C - inc %r8 C + + mulq %r11 C carry limb in rdx + + movq -8(%rsi,%r8,8), %rax + movq (%rsi,%r8,8), %r9 + + shrdq %cl, %r9, %rax + nop + + subq %rbx, %rax C apply carry bit + setc %bl + + subq %rdx, %rax C apply carry limb + adcq $0, %rbx + +L(entry): + imulq %r10, %rax + + movq %rax, (%rdi,%r8,8) + incq %r8 jnz L(top) - mul %r11 C carry limb in rdx - mov -8(%rsi), %rax C up high limb - shr R8(%rcx), %rax - sub %rbx, %rax C apply carry bit - sub %rdx, %rax C apply carry limb - imul %r10, %rax - mov %rax, (%rdi) - pop %rbx - FUNC_EXIT() + + mulq %r11 C carry limb in rdx + + movq -8(%rsi), %rax C up high limb + shrq %cl, %rax + + subq %rbx, %rax C apply carry bit + + subq %rdx, %rax C apply carry limb + + imulq %r10, %rax + + movq %rax, (%rdi) + + popq %rbx ret -L(one): shr R8(%rcx), %rax - imul %r10, %rax - mov %rax, (%rdi) - pop %rbx - FUNC_EXIT() + +L(one): + shrq %cl, %rax + + imulq %r10, %rax + + movq %rax, (%rdi) + + popq %rbx ret EPILOGUE() |