diff options
Diffstat (limited to 'gmp/mpn/x86/pentium4')
28 files changed, 866 insertions, 1870 deletions
diff --git a/gmp/mpn/x86/pentium4/README b/gmp/mpn/x86/pentium4/README index 90f752e5d5..8dc0479f04 100644 --- a/gmp/mpn/x86/pentium4/README +++ b/gmp/mpn/x86/pentium4/README @@ -3,28 +3,17 @@ Copyright 2001 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of either: - - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. - -or - - * the GNU General Public License as published by the Free Software - Foundation; either version 2 of the License, or (at your option) any - later version. - -or both in parallel, as here. +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. -You should have received copies of the GNU General Public License and the -GNU Lesser General Public License along with the GNU MP Library. If not, -see https://www.gnu.org/licenses/. +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. diff --git a/gmp/mpn/x86/pentium4/copyd.asm b/gmp/mpn/x86/pentium4/copyd.asm index 82af81c522..491ad60128 100644 --- a/gmp/mpn/x86/pentium4/copyd.asm +++ b/gmp/mpn/x86/pentium4/copyd.asm @@ -1,32 +1,22 @@ dnl Pentium-4 mpn_copyd -- copy limb vector, decrementing. - -dnl Copyright 1999-2001 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or + +dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc. dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. dnl The std/rep/movsl/cld is very slow for small blocks on pentium4. Its diff --git a/gmp/mpn/x86/pentium4/copyi.asm b/gmp/mpn/x86/pentium4/copyi.asm index b6148879fa..bf812c822b 100644 --- a/gmp/mpn/x86/pentium4/copyi.asm +++ b/gmp/mpn/x86/pentium4/copyi.asm @@ -1,32 +1,22 @@ dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing. - -dnl Copyright 1999-2001 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or + +dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc. dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. dnl The rep/movsl is very slow for small blocks on pentium4. Its startup diff --git a/gmp/mpn/x86/pentium4/mmx/lshift.asm b/gmp/mpn/x86/pentium4/mmx/lshift.asm index b5eca66698..5d316d5da4 100644 --- a/gmp/mpn/x86/pentium4/mmx/lshift.asm +++ b/gmp/mpn/x86/pentium4/mmx/lshift.asm @@ -1,32 +1,21 @@ dnl Intel Pentium-4 mpn_lshift -- left shift. dnl Copyright 2001, 2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/pentium4/mmx/popham.asm b/gmp/mpn/x86/pentium4/mmx/popham.asm index 9563cb57e4..2e79816821 100644 --- a/gmp/mpn/x86/pentium4/mmx/popham.asm +++ b/gmp/mpn/x86/pentium4/mmx/popham.asm @@ -1,33 +1,22 @@ dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and dnl hamming distance. -dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/pentium4/mmx/rshift.asm b/gmp/mpn/x86/pentium4/mmx/rshift.asm index 3ac0094a5a..a7dec54a3a 100644 --- a/gmp/mpn/x86/pentium4/mmx/rshift.asm +++ b/gmp/mpn/x86/pentium4/mmx/rshift.asm @@ -1,32 +1,21 @@ dnl Intel Pentium-4 mpn_rshift -- right shift. dnl Copyright 2001, 2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/pentium4/sse2/add_n.asm b/gmp/mpn/x86/pentium4/sse2/add_n.asm index 8e2380e493..04c0c68d0e 100644 --- a/gmp/mpn/x86/pentium4/sse2/add_n.asm +++ b/gmp/mpn/x86/pentium4/sse2/add_n.asm @@ -1,44 +1,36 @@ dnl Intel Pentium-4 mpn_add_n -- mpn addition. dnl Copyright 2001, 2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C dst!=src1,2 dst==src1 dst==src2 -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) ? -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 4 6 6 -C P4 model 3-4 (Prescott) 4.25 7.5 7.5 +C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2 +C 6.0 cycles/limb if dst==src1 or dst==src2 +C P4 Prescott: >= 5 cycles/limb + +C mp_limb_t mpn_add_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t mpn_add_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C The 4 c/l achieved here isn't particularly good, but is better than 9 c/l +C for a basic adc loop. defframe(PARAM_CARRY,20) defframe(PARAM_SIZE, 16) @@ -54,25 +46,29 @@ define(SAVE_EBX,`PARAM_SRC1') PROLOGUE(mpn_add_nc) deflit(`FRAME',0) + movd PARAM_CARRY, %mm0 jmp L(start_nc) + EPILOGUE() ALIGN(8) PROLOGUE(mpn_add_n) deflit(`FRAME',0) + pxor %mm0, %mm0 + L(start_nc): - mov PARAM_SRC1, %eax - mov %ebx, SAVE_EBX - mov PARAM_SRC2, %ebx - mov PARAM_DST, %edx - mov PARAM_SIZE, %ecx + movl PARAM_SRC1, %eax + movl %ebx, SAVE_EBX + movl PARAM_SRC2, %ebx + movl PARAM_DST, %edx + movl PARAM_SIZE, %ecx - lea (%eax,%ecx,4), %eax C src1 end - lea (%ebx,%ecx,4), %ebx C src2 end - lea (%edx,%ecx,4), %edx C dst end - neg %ecx C -size + leal (%eax,%ecx,4), %eax C src1 end + leal (%ebx,%ecx,4), %ebx C src2 end + leal (%edx,%ecx,4), %edx C dst end + negl %ecx C -size L(top): C eax src1 end @@ -90,11 +86,12 @@ L(top): psrlq $32, %mm0 - add $1, %ecx + addl $1, %ecx jnz L(top) + movd %mm0, %eax - mov SAVE_EBX, %ebx + movl SAVE_EBX, %ebx emms ret diff --git a/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm b/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm index 93b63b2018..46b0903c50 100644 --- a/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm +++ b/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm @@ -1,45 +1,33 @@ dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y. -dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or +dnl Copyright 2001, 2002, 2003, 2004, 2006 Free Software Foundation, Inc. dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C dst!=src1,2 dst==src1 dst==src2 -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) ? -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 4.25 6 6 -C P4 model 3-4 (Prescott) 5 8.5 8.5 +C cycles/limb (approx) +C dst!=src1,2 dst==src1 dst==src2 +C P4 m2: 4.5 ?7.25 ?6.75 +C P4 m3: 5.3 ? ? +C mp_limb_t mpn_addlsh1_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C C The slightly strange combination of indexing and pointer incrementing C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or C src2 is a slowdown. @@ -63,18 +51,18 @@ define(SAVE_EBX,`PARAM_SRC1') PROLOGUE(mpn_addlsh1_n) deflit(`FRAME',0) - mov PARAM_SRC1, %eax - mov %ebx, SAVE_EBX + movl PARAM_SRC1, %eax + movl %ebx, SAVE_EBX - mov PARAM_SRC2, %ebx + movl PARAM_SRC2, %ebx pxor %mm0, %mm0 C initial carry - mov PARAM_DST, %edx + movl PARAM_DST, %edx - mov PARAM_SIZE, %ecx + movl PARAM_SIZE, %ecx - lea (%edx,%ecx,4), %edx C dst end - neg %ecx C -size + leal (%edx,%ecx,4), %edx C dst end + negl %ecx C -size L(top): C eax src1 end @@ -83,24 +71,24 @@ L(top): C edx dst end C mm0 carry - movd (%ebx), %mm2 movd (%eax), %mm1 + movd (%ebx), %mm2 psrlq $32, %mm0 - lea 4(%eax), %eax - lea 4(%ebx), %ebx + leal 4(%eax), %eax + leal 4(%ebx), %ebx - psllq $1, %mm2 + paddq %mm2, %mm1 paddq %mm2, %mm1 paddq %mm1, %mm0 movd %mm0, (%edx,%ecx,4) - add $1, %ecx + addl $1, %ecx jnz L(top) psrlq $32, %mm0 - mov SAVE_EBX, %ebx + movl SAVE_EBX, %ebx movd %mm0, %eax emms ret diff --git a/gmp/mpn/x86/pentium4/sse2/addmul_1.asm b/gmp/mpn/x86/pentium4/sse2/addmul_1.asm index 78102072bf..3a8d0bb9bd 100644 --- a/gmp/mpn/x86/pentium4/sse2/addmul_1.asm +++ b/gmp/mpn/x86/pentium4/sse2/addmul_1.asm @@ -1,48 +1,37 @@ dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). -dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc. - +dnl Copyright 2005, 2007 Free Software Foundation, Inc. +dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) 5.24 -C P6 model 13 (Dothan) 5.24 -C P4 model 0-1 (Willamette) 5 -C P4 model 2 (Northwood) 5 -C P4 model 3-4 (Prescott) 5 - C TODO: C * Tweak eax/edx offsets in loop as to save some lea's C * Perhaps software pipeline small-case code +C cycles/limb +C P6 model 0-8,10-12) - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 5.24 +C P4 model 0-1 (Willamette): 5 +C P4 model 2 (Northwood): 5 +C P4 model 3-4 (Prescott): 5 + C INPUT PARAMETERS C rp sp + 4 C up sp + 8 @@ -51,13 +40,22 @@ C v0 sp + 16 TEXT ALIGN(16) +PROLOGUE(mpn_addmul_1c) + mov 4(%esp), %edx + mov 8(%esp), %eax + mov 12(%esp), %ecx + movd 16(%esp), %mm7 + movd 20(%esp), %mm6 + jmp L(ent) +EPILOGUE() + ALIGN(16) PROLOGUE(mpn_addmul_1) - pxor %mm6, %mm6 -L(ent): mov 4(%esp), %edx + mov 4(%esp), %edx mov 8(%esp), %eax mov 12(%esp), %ecx movd 16(%esp), %mm7 - cmp $4, %ecx + pxor %mm6, %mm6 +L(ent): cmp $4, %ecx jnc L(big) L(lp0): movd (%eax), %mm0 @@ -183,7 +181,3 @@ L(end): pmuludq %mm7, %mm2 emms ret EPILOGUE() -PROLOGUE(mpn_addmul_1c) - movd 20(%esp), %mm6 - jmp L(ent) -EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm deleted file mode 100644 index 354300e4de..0000000000 --- a/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm +++ /dev/null @@ -1,141 +0,0 @@ -dnl Intel Atom mpn_bdiv_dbm1. - -dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. - -dnl Copyright 2011 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/limb -C cycles/limb -C P5 - -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) 9.75 -C P6 model 13 (Dothan) -C P4 model 0 (Willamette) -C P4 model 1 (?) -C P4 model 2 (Northwood) 8.25 -C P4 model 3 (Prescott) -C P4 model 4 (Nocona) -C Intel Atom 8 -C AMD K6 - -C AMD K7 - -C AMD K8 -C AMD K10 - -C TODO: This code was optimised for atom-32, consider moving it back to atom -C dir(atom currently grabs this code), and write a 4-way version(7c/l). - -defframe(PARAM_CARRY,20) -defframe(PARAM_MUL, 16) -defframe(PARAM_SIZE, 12) -defframe(PARAM_SRC, 8) -defframe(PARAM_DST, 4) - -dnl re-use parameter space -define(SAVE_RP,`PARAM_MUL') -define(SAVE_UP,`PARAM_SIZE') - -define(`rp', `%edi') -define(`up', `%esi') -define(`n', `%ecx') -define(`reg', `%edx') -define(`cy', `%eax') C contains the return value - -ASM_START() - TEXT - ALIGN(16) -deflit(`FRAME',0) - -PROLOGUE(mpn_bdiv_dbm1c) - mov PARAM_SIZE, n C size - mov up, SAVE_UP - mov PARAM_SRC, up - movd PARAM_MUL, %mm7 - mov rp, SAVE_RP - mov PARAM_DST, rp - - movd (up), %mm0 - pmuludq %mm7, %mm0 - shr n - mov PARAM_CARRY, cy - jz L(eq1) - - movd 4(up), %mm1 - jc L(odd) - - lea 4(up), up - pmuludq %mm7, %mm1 - movd %mm0, reg - psrlq $32, %mm0 - sub reg, cy - movd %mm0, reg - movq %mm1, %mm0 - dec n - mov cy, (rp) - lea 4(rp), rp - jz L(end) - -C ALIGN(16) -L(top): movd 4(up), %mm1 - sbb reg, cy -L(odd): movd %mm0, reg - psrlq $32, %mm0 - pmuludq %mm7, %mm1 - sub reg, cy - lea 8(up), up - movd %mm0, reg - movd (up), %mm0 - mov cy, (rp) - sbb reg, cy - movd %mm1, reg - psrlq $32, %mm1 - sub reg, cy - movd %mm1, reg - pmuludq %mm7, %mm0 - dec n - mov cy, 4(rp) - lea 8(rp), rp - jnz L(top) - -L(end): sbb reg, cy - -L(eq1): movd %mm0, reg - psrlq $32, %mm0 - mov SAVE_UP, up - sub reg, cy - movd %mm0, reg - emms - mov cy, (rp) - sbb reg, cy - - mov SAVE_RP, rp - ret -EPILOGUE() -ASM_END() diff --git a/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm deleted file mode 100644 index f7f461d56f..0000000000 --- a/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm +++ /dev/null @@ -1,233 +0,0 @@ -dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. - -dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato. - -dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C P4: 19.0 cycles/limb - -C Pairs of movd's are used to avoid unaligned loads. Despite the loads not -C being on the dependent chain and there being plenty of cycles available, -C using an unaligned movq on every second iteration measured about 23 c/l. -C - -defframe(PARAM_SHIFT, 24) -defframe(PARAM_INVERSE,20) -defframe(PARAM_DIVISOR,16) -defframe(PARAM_SIZE, 12) -defframe(PARAM_SRC, 8) -defframe(PARAM_DST, 4) - - TEXT - -C mp_limb_t -C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, -C mp_limb_t inverse, int shift) - ALIGN(32) -PROLOGUE(mpn_pi1_bdiv_q_1) -deflit(`FRAME',0) - - movl PARAM_SIZE, %edx - - movl PARAM_SRC, %eax - - movl PARAM_DIVISOR, %ecx - - movd %ecx, %mm6 - movl PARAM_SHIFT, %ecx - - movd %ecx, %mm7 C shift - - C - - movl PARAM_INVERSE, %ecx - movd %ecx, %mm5 C inv - - movl PARAM_DST, %ecx - pxor %mm1, %mm1 C initial carry limb - pxor %mm0, %mm0 C initial carry bit - - subl $1, %edx - jz L(done) - - pcmpeqd %mm4, %mm4 - psrlq $32, %mm4 C 0x00000000FFFFFFFF - -C The dependent chain here is as follows. -C -C latency -C psubq s = (src-cbit) - climb 2 -C pmuludq q = s*inverse 8 -C pmuludq prod = q*divisor 8 -C psrlq climb = high(prod) 2 -C -- -C 20 -C -C Yet the loop measures 19.0 c/l, so obviously there's something gained -C there over a straight reading of the chip documentation. - -L(top): - C eax src, incrementing - C ebx - C ecx dst, incrementing - C edx counter, size-1 iterations - C - C mm0 carry bit - C mm1 carry limb - C mm4 0x00000000FFFFFFFF - C mm5 inverse - C mm6 divisor - C mm7 shift - - movd (%eax), %mm2 - movd 4(%eax), %mm3 - addl $4, %eax - punpckldq %mm3, %mm2 - - psrlq %mm7, %mm2 - pand %mm4, %mm2 C src - psubq %mm0, %mm2 C src - cbit - - psubq %mm1, %mm2 C src - cbit - climb - movq %mm2, %mm0 - psrlq $63, %mm0 C new cbit - - pmuludq %mm5, %mm2 C s*inverse - movd %mm2, (%ecx) C q - addl $4, %ecx - - movq %mm6, %mm1 - pmuludq %mm2, %mm1 C q*divisor - psrlq $32, %mm1 C new climb - -L(entry): - subl $1, %edx - jnz L(top) - -L(done): - movd (%eax), %mm2 - psrlq %mm7, %mm2 C src - psubq %mm0, %mm2 C src - cbit - - psubq %mm1, %mm2 C src - cbit - climb - - pmuludq %mm5, %mm2 C s*inverse - movd %mm2, (%ecx) C q - - emms - ret - -EPILOGUE() - - ALIGN(16) -C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, -C mp_limb_t divisor); -C -PROLOGUE(mpn_bdiv_q_1) -deflit(`FRAME',0) - - movl PARAM_SIZE, %edx - - movl PARAM_DIVISOR, %ecx - - C eax src - C ebx - C ecx divisor - C edx size-1 - - movl %ecx, %eax - bsfl %ecx, %ecx C trailing twos - - shrl %cl, %eax C d = divisor without twos - movd %eax, %mm6 - movd %ecx, %mm7 C shift - - shrl %eax C d/2 - - andl $127, %eax C d/2, 7 bits - -ifdef(`PIC',` - LEA( binvert_limb_table, %ecx) - movzbl (%eax,%ecx), %eax C inv 8 bits -',` - movzbl binvert_limb_table(%eax), %eax C inv 8 bits -') - - C - - movd %eax, %mm5 C inv - - movd %eax, %mm0 C inv - - pmuludq %mm5, %mm5 C inv*inv - - C - - pmuludq %mm6, %mm5 C inv*inv*d - paddd %mm0, %mm0 C 2*inv - - C - - psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d - pxor %mm5, %mm5 - - paddd %mm0, %mm5 - pmuludq %mm0, %mm0 C inv*inv - - pcmpeqd %mm4, %mm4 - psrlq $32, %mm4 C 0x00000000FFFFFFFF - - C - - pmuludq %mm6, %mm0 C inv*inv*d - paddd %mm5, %mm5 C 2*inv - - movl PARAM_SRC, %eax - movl PARAM_DST, %ecx - pxor %mm1, %mm1 C initial carry limb - - C - - psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d - - ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS - pushl %eax FRAME_pushl() - movq %mm6, %mm0 - pmuludq %mm5, %mm0 - movd %mm0, %eax - cmpl $1, %eax - popl %eax FRAME_popl()') - - pxor %mm0, %mm0 C initial carry bit - jmp L(entry) - -EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm b/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm deleted file mode 100644 index b3f3474e67..0000000000 --- a/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm +++ /dev/null @@ -1,95 +0,0 @@ -dnl Intel Pentium-4 mpn_cnd_add_n -- mpn addition. - -dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) 4.67 -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 5 -C P4 model 3-4 (Prescott) 5.25 - -defframe(PARAM_SIZE, 20) -defframe(PARAM_SRC2, 16) -defframe(PARAM_SRC1, 12) -defframe(PARAM_DST, 8) -defframe(PARAM_CND, 4) - -dnl re-use parameter space -define(SAVE_EBX,`PARAM_SRC1') - -define(`cnd', `%mm3') - - TEXT - ALIGN(8) - - ALIGN(8) -PROLOGUE(mpn_cnd_add_n) -deflit(`FRAME',0) - pxor %mm0, %mm0 - - mov PARAM_CND, %eax - neg %eax - sbb %eax, %eax - movd %eax, cnd - - mov PARAM_SRC1, %eax - mov %ebx, SAVE_EBX - mov PARAM_SRC2, %ebx - mov PARAM_DST, %edx - mov PARAM_SIZE, %ecx - - lea (%eax,%ecx,4), %eax C src1 end - lea (%ebx,%ecx,4), %ebx C src2 end - lea (%edx,%ecx,4), %edx C dst end - neg %ecx C -size - -L(top): movd (%ebx,%ecx,4), %mm2 - movd (%eax,%ecx,4), %mm1 - pand cnd, %mm2 - paddq %mm2, %mm1 - - paddq %mm1, %mm0 - movd %mm0, (%edx,%ecx,4) - - psrlq $32, %mm0 - - add $1, %ecx - jnz L(top) - - movd %mm0, %eax - mov SAVE_EBX, %ebx - emms - ret - -EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm deleted file mode 100644 index 339a23e0b6..0000000000 --- a/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm +++ /dev/null @@ -1,114 +0,0 @@ -dnl Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction. - -dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/limb -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) 4.67 -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 5 -C P4 model 3-4 (Prescott) 5.25 - -defframe(PARAM_SIZE, 20) -defframe(PARAM_SRC2, 16) -defframe(PARAM_SRC1, 12) -defframe(PARAM_DST, 8) -defframe(PARAM_CND, 4) - -dnl re-use parameter space -define(SAVE_EBX,`PARAM_SRC1') - -define(`cnd', `%mm3') - - TEXT - ALIGN(8) - - ALIGN(8) -PROLOGUE(mpn_cnd_sub_n) -deflit(`FRAME',0) - pxor %mm0, %mm0 - - mov PARAM_CND, %eax - neg %eax - sbb %eax, %eax - movd %eax, cnd - - mov PARAM_SRC1, %eax - mov %ebx, SAVE_EBX - mov PARAM_SRC2, %ebx - mov PARAM_DST, %edx - mov PARAM_SIZE, %ecx - - lea (%eax,%ecx,4), %eax C src1 end - lea (%ebx,%ecx,4), %ebx C src2 end - lea (%edx,%ecx,4), %edx C dst end - neg %ecx C -size - -L(top): movd (%ebx,%ecx,4), %mm2 - movd (%eax,%ecx,4), %mm1 - pand cnd, %mm2 - psubq %mm2, %mm1 - - psubq %mm0, %mm1 - movd %mm1, (%edx,%ecx,4) - - psrlq $63, %mm1 - - add $1, %ecx - jz L(done_mm1) - - movd (%ebx,%ecx,4), %mm2 - movd (%eax,%ecx,4), %mm0 - pand cnd, %mm2 - psubq %mm2, %mm0 - - psubq %mm1, %mm0 - movd %mm0, (%edx,%ecx,4) - - psrlq $63, %mm0 - - add $1, %ecx - jnz L(top) - - movd %mm0, %eax - mov SAVE_EBX, %ebx - emms - ret - -L(done_mm1): - movd %mm1, %eax - mov SAVE_EBX, %ebx - emms - ret - -EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/dive_1.asm b/gmp/mpn/x86/pentium4/sse2/dive_1.asm index 238f0dd8a5..c50ef7d29e 100644 --- a/gmp/mpn/x86/pentium4/sse2/dive_1.asm +++ b/gmp/mpn/x86/pentium4/sse2/dive_1.asm @@ -1,32 +1,21 @@ dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') @@ -137,7 +126,7 @@ ifdef(`PIC',` psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d - ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB pushl %eax FRAME_pushl() movq %mm6, %mm0 pmuludq %mm5, %mm0 @@ -150,13 +139,13 @@ ifdef(`PIC',` C The dependent chain here is as follows. C -C latency -C psubq s = (src-cbit) - climb 2 -C pmuludq q = s*inverse 8 -C pmuludq prod = q*divisor 8 -C psrlq climb = high(prod) 2 -C -- -C 20 +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 C C Yet the loop measures 19.0 c/l, so obviously there's something gained C there over a straight reading of the chip documentation. diff --git a/gmp/mpn/x86/pentium4/sse2/divrem_1.asm b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm index 0146fab117..7f973dbf98 100644 --- a/gmp/mpn/x86/pentium4/sse2/divrem_1.asm +++ b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm @@ -1,32 +1,22 @@ dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. -dnl Copyright 1999-2004 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, +dnl Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h b/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h index a94ae868b3..5071aae092 100644 --- a/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h +++ b/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h @@ -1,206 +1,68 @@ /* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 1991, 1993, 1994, 2000-2005, 2007-2010, 2014 Free Software -Foundation, Inc. +Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, +2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of either: +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. -or +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ - * the GNU General Public License as published by the Free Software - Foundation; either version 2 of the License, or (at your option) any - later version. +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 -or both in parallel, as here. -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received copies of the GNU General Public License and the -GNU Lesser General Public License along with the GNU MP Library. If not, -see https://www.gnu.org/licenses/. */ - -#define GMP_LIMB_BITS 32 -#define GMP_LIMB_BYTES 4 - -/* 2600 MHz P4 Northwood */ -/* FFT tuning limit = 12500000 */ -/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */ - -#define MOD_1_NORM_THRESHOLD 24 -#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ -#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 2 -#define USE_PREINV_DIVREM_1 1 /* native */ -#define DIV_QR_1N_PI1_METHOD 2 -#define DIV_QR_1_NORM_THRESHOLD 19 -#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ -#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ -#define BMOD_1_TO_MOD_1_THRESHOLD 20 - -#define MUL_TOOM22_THRESHOLD 29 -#define MUL_TOOM33_THRESHOLD 113 -#define MUL_TOOM44_THRESHOLD 288 -#define MUL_TOOM6H_THRESHOLD 454 -#define MUL_TOOM8H_THRESHOLD 592 - -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 118 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 214 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 193 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 186 -#define MUL_TOOM43_TO_TOOM54_THRESHOLD 287 - -#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_TOOM2_THRESHOLD 44 -#define SQR_TOOM3_THRESHOLD 173 -#define SQR_TOOM4_THRESHOLD 390 -#define SQR_TOOM6_THRESHOLD 0 /* always */ -#define SQR_TOOM8_THRESHOLD 915 - -#define MULMID_TOOM42_THRESHOLD 66 - -#define MULMOD_BNM1_THRESHOLD 19 -#define SQRMOD_BNM1_THRESHOLD 23 - -#define MUL_FFT_MODF_THRESHOLD 1147 /* k = 5 */ -#define MUL_FFT_TABLE3 \ - { { 1147, 5}, { 36, 6}, { 19, 5}, { 39, 6}, \ - { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ - { 35, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ - { 43, 7}, { 23, 6}, { 49, 7}, { 27, 6}, \ - { 55, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \ - { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ - { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ - { 39, 7}, { 79, 8}, { 43, 9}, { 23, 8}, \ - { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \ - { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ - { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ - { 47, 9}, { 111,11}, { 31,10}, { 63, 9}, \ - { 143,10}, { 79, 9}, { 159,10}, { 111,11}, \ - { 63,10}, { 127, 9}, { 255,10}, { 159, 9}, \ - { 319,11}, { 95,10}, { 207,12}, { 63,11}, \ - { 127,10}, { 287,11}, { 159,10}, { 335,11}, \ - { 191,10}, { 383,11}, { 223,12}, { 127,11}, \ - { 255,10}, { 511,11}, { 319,10}, { 671,11}, \ - { 351,12}, { 191,11}, { 383,10}, { 799,13}, \ - { 127,12}, { 255,11}, { 511,10}, { 1055, 9}, \ - { 2111,10}, { 1119, 9}, { 2239,11}, { 607,12}, \ - { 319,11}, { 671,10}, { 1407,11}, { 735,10}, \ - { 1471, 9}, { 2943,12}, { 383,11}, { 799,10}, \ - { 1599,11}, { 863,10}, { 1727, 9}, { 3455,12}, \ - { 447,11}, { 895,13}, { 255,12}, { 511,11}, \ - { 1055,10}, { 2111,11}, { 1119,10}, { 2239, 9}, \ - { 4479,12}, { 575,11}, { 1247,10}, { 2495, 9}, \ - { 4991,12}, { 639,11}, { 1471,10}, { 2943,13}, \ - { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ - { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \ - { 511,12}, { 1023,11}, { 2047,12}, { 1087,11}, \ - { 2239,10}, { 4479,12}, { 1215,11}, { 2495,10}, \ - { 4991,13}, { 639,12}, { 1471,11}, { 2943,10}, \ - { 5887,11}, { 3007,13}, { 767,12}, { 1727,11}, \ - { 3455,13}, { 895,12}, { 1791,11}, { 3711,12}, \ - { 1983,11}, { 3967,10}, { 7935,14}, { 511,13}, \ - { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \ - { 2495,11}, { 4991,13}, { 1279,12}, { 2623,13}, \ - { 1407,12}, { 2943,11}, { 5887,12}, { 3007,14}, \ - { 16384,15}, { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 158 -#define MUL_FFT_THRESHOLD 7808 - -#define SQR_FFT_MODF_THRESHOLD 896 /* k = 5 */ -#define SQR_FFT_TABLE3 \ - { { 896, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \ - { 17, 5}, { 35, 6}, { 19, 5}, { 39, 6}, \ - { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ - { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ - { 47, 7}, { 27, 6}, { 55, 7}, { 31, 6}, \ - { 63, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \ - { 23, 7}, { 51, 8}, { 27, 7}, { 55, 8}, \ - { 31, 7}, { 63, 8}, { 39, 7}, { 79, 8}, \ - { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ - { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ - { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ - { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ - { 127,10}, { 79, 9}, { 159,10}, { 95, 9}, \ - { 191,11}, { 63,10}, { 127, 9}, { 255,10}, \ - { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ - { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ - { 543,11}, { 159,10}, { 319, 9}, { 639,11}, \ - { 191,10}, { 383, 9}, { 767,11}, { 223,12}, \ - { 127,11}, { 255,10}, { 511, 9}, { 1055,10}, \ - { 543,11}, { 287,10}, { 607,11}, { 319,12}, \ - { 191,11}, { 383,10}, { 767,13}, { 127,12}, \ - { 255,11}, { 511,10}, { 1055,11}, { 543,10}, \ - { 1119, 9}, { 2239,11}, { 607,12}, { 319,11}, \ - { 671,10}, { 1407,11}, { 735,10}, { 1471, 9}, \ - { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \ - { 863,10}, { 1727,12}, { 447,11}, { 991,13}, \ - { 255,12}, { 511,11}, { 1055,10}, { 2111,11}, \ - { 1119,10}, { 2239,12}, { 575,11}, { 1247,10}, \ - { 2495,12}, { 639,11}, { 1471,10}, { 2943,13}, \ - { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ - { 1727,10}, { 3455,12}, { 959,14}, { 255,13}, \ - { 511,12}, { 1023,11}, { 2111,12}, { 1087,11}, \ - { 2239,10}, { 4479,12}, { 1215,11}, { 2495,13}, \ - { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \ - { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ - { 1791,11}, { 3711,12}, { 1983,11}, { 3967,10}, \ - { 7935,14}, { 511,13}, { 1023,12}, { 2239,11}, \ - { 4479,13}, { 1151,12}, { 2495,11}, { 4991,13}, \ - { 1279,12}, { 2623,13}, { 1407,12}, { 2943,11}, \ - { 5887,14}, { 16384,15}, { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 159 -#define SQR_FFT_THRESHOLD 7296 - -#define MULLO_BASECASE_THRESHOLD 12 -#define MULLO_DC_THRESHOLD 55 -#define MULLO_MUL_N_THRESHOLD 14709 - -#define DC_DIV_QR_THRESHOLD 38 -#define DC_DIVAPPR_Q_THRESHOLD 77 -#define DC_BDIV_QR_THRESHOLD 51 -#define DC_BDIV_Q_THRESHOLD 85 - -#define INV_MULMOD_BNM1_THRESHOLD 56 -#define INV_NEWTON_THRESHOLD 121 -#define INV_APPR_THRESHOLD 93 - -#define BINV_NEWTON_THRESHOLD 366 -#define REDC_1_TO_REDC_N_THRESHOLD 64 - -#define MU_DIV_QR_THRESHOLD 2350 -#define MU_DIVAPPR_Q_THRESHOLD 2172 -#define MUPI_DIV_QR_THRESHOLD 62 -#define MU_BDIV_QR_THRESHOLD 2172 -#define MU_BDIV_Q_THRESHOLD 2304 - -#define POWM_SEC_TABLE 1,19,102,615,2111 - -#define MATRIX22_STRASSEN_THRESHOLD 23 -#define HGCD_THRESHOLD 88 -#define HGCD_APPR_THRESHOLD 93 -#define HGCD_REDUCE_THRESHOLD 5010 -#define GCD_DC_THRESHOLD 379 -#define GCDEXT_DC_THRESHOLD 258 -#define JACOBI_BASE_METHOD 4 - -#define GET_STR_DC_THRESHOLD 12 -#define GET_STR_PRECOMPUTE_THRESHOLD 26 -#define SET_STR_DC_THRESHOLD 147 -#define SET_STR_PRECOMPUTE_THRESHOLD 894 - -#define FAC_DSC_THRESHOLD 906 -#define FAC_ODD_THRESHOLD 28 +/* 2600 MHz Pentium 4 model 2 */ + +/* Generated by tuneup.c, 2009-01-06, gcc 3.4 */ + +#define MUL_KARATSUBA_THRESHOLD 31 +#define MUL_TOOM3_THRESHOLD 119 +#define MUL_TOOM44_THRESHOLD 178 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_KARATSUBA_THRESHOLD 49 +#define SQR_TOOM3_THRESHOLD 165 +#define SQR_TOOM4_THRESHOLD 252 + +#define MULLOW_BASECASE_THRESHOLD 15 +#define MULLOW_DC_THRESHOLD 44 +#define MULLOW_MUL_N_THRESHOLD 363 + +#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_DC_THRESHOLD 33 +#define POWM_THRESHOLD 95 + +#define MATRIX22_STRASSEN_THRESHOLD 23 +#define HGCD_THRESHOLD 64 +#define GCD_DC_THRESHOLD 310 +#define GCDEXT_DC_THRESHOLD 310 +#define JACOBI_BASE_METHOD 1 + +#define USE_PREINV_DIVREM_1 1 /* native */ +#define USE_PREINV_MOD_1 1 /* native */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */ + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 118 +#define SET_STR_PRECOMPUTE_THRESHOLD 1078 + +#define MUL_FFT_TABLE { 560, 928, 1920, 5632, 14336, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 720 +#define MUL_FFT_THRESHOLD 9216 + +#define SQR_FFT_TABLE { 592, 928, 1920, 4608, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 608 +#define SQR_FFT_THRESHOLD 5888 diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1.asm b/gmp/mpn/x86/pentium4/sse2/mod_1.asm new file mode 100644 index 0000000000..0e95f13913 --- /dev/null +++ b/gmp/mpn/x86/pentium4/sse2/mod_1.asm @@ -0,0 +1,391 @@ +dnl Intel Pentium-4 mpn_mod_1 -- mpn by limb remainder. + +dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + + +dnl P4: 31 cycles/limb. + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse); +C +C An idea was tried in the mul-by-inverse to process the last limb by a jump +C back to the top of the loop skipping the -4(%esi) fetch. But that seemed +C to produce slightly strange timings, like 9 and 10 limb operations about +C the same speed. The jump would be successively taken and not-taken, which +C in theory should predict ok, but perhaps isn't enjoyed by the chip. +C Duplicating the loop for the last limb seems to be a couple of cycles +C quicker too. +C +C Enhancements: +C +C The loop measures 31 cycles, but the dependent chain would suggest it +C could be done with 30. Not sure where to start looking for the extra +C cycle. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. +dnl +dnl The inverse takes about 80-90 cycles to calculate, but after that the +dnl multiply is 31 c/l versus division at about 58 c/l. + +deflit(MUL_THRESHOLD, 5) + + +defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1 +defframe(PARAM_CARRY, 16) dnl mpn_mod_1c +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_ESI,`PARAM_SIZE') +define(SAVE_EBP,`PARAM_SRC') + + TEXT + + ALIGN(16) +PROLOGUE(mpn_preinv_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %esi, SAVE_ESI + movl $32, %eax + + movd %eax, %mm6 C l = 0, so 32-l = 32 + movl PARAM_SRC, %esi + movl %ebp, SAVE_EBP + + movd PARAM_DIVISOR, %mm5 + pxor %mm7, %mm7 C l = 0 + + movd -4(%esi,%ecx,4), %mm0 C src high limb + leal -8(%esi,%ecx,4), %esi C &src[size-2] + + movd PARAM_INVERSE, %mm4 + subl $2, %ecx C size-2 + + psubq %mm5, %mm0 C high-divisor + movq %mm0, %mm2 + + psrlq $32, %mm0 C -1 if underflow + + pand %mm5, %mm0 C divisor if underflow + + paddq %mm2, %mm0 C addback if underflow + jz L(inverse_last) C if size==2 + ja L(inverse_top) C if size>2 + + + C if size==1 + movl SAVE_ESI, %esi + movd %mm0, %eax + emms + ret + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_SIZE, %ecx + movl %esi, SAVE_ESI + + movl PARAM_SRC, %esi + movl %ebp, SAVE_EBP + + movl PARAM_CARRY, %edx + orl %ecx, %ecx + jz L(divide_done) C result==carry if size==0 + + movl PARAM_DIVISOR, %ebp + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %esi, SAVE_ESI + + movl PARAM_SRC, %esi + movl %ebp, SAVE_EBP + + movl PARAM_DIVISOR, %ebp + xorl %edx, %edx C result 0 if size==0 + + orl %ecx, %ecx + jz L(divide_done) + movl -4(%esi,%ecx,4), %eax C src high limb + + leal -1(%ecx), %edx + cmpl %ebp, %eax C c if high<divisor + + cmovc( %edx, %ecx) C size-1 if high<divisor + + movl $0, %edx C initial carry + cmovc( %eax, %edx) C src high limb if high<divisor + + orl %ecx, %ecx + jz L(divide_done) C if size==1 and skip div + + +L(start_1c): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + leal -4(%esi,%ecx,4), %esi C &src[size-1] + cmpl $MUL_THRESHOLD, %ecx + jae L(mul_by_inverse) + + +L(divide_top): + C eax + C ebx + C ecx counter, limbs, decrementing + C edx remainder + C esi src, decrementing + C edi + C ebp divisor + + movl (%esi), %eax + subl $4, %esi + + divl %ebp + + subl $1, %ecx + jnz L(divide_top) + + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + movl %edx, %eax + ret + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + bsrl %ebp, %eax C 31-l + + movd %edx, %mm1 C carry + movl %ecx, %edx C size + movl $31, %ecx + + C + + xorl %eax, %ecx C l = leading zeros on d + addl $1, %eax C 32-l + + shll %cl, %ebp C normalize d + movd %ecx, %mm7 C l + leal -1(%edx), %ecx C size-1 + + movd %eax, %mm6 C 32-l + movl $-1, %edx + movl $-1, %eax + + C + + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1 / d) + + movd %ebp, %mm5 C d + movd (%esi), %mm0 C src high limb + punpckldq %mm1, %mm0 + psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l) + + C + + movd %eax, %mm4 C m + + +C The dependent chain here consists of +C +C 2 paddd n1+n2 +C 8 pmuludq m*(n1+n2) +C 2 paddq n2:nadj + m*(n1+n2) +C 2 psrlq q1 +C 8 pmuludq d*q1 +C 2 psubq (n-d)-q1*d +C 2 psrlq high mask +C 2 pand d masked +C 2 paddd n2+d addback +C -- +C 30 +C +C But it seems to run at 31 cycles, so presumably there's something else +C going on. + + + ALIGN(16) +L(inverse_top): + C eax + C ebx + C ecx counter, size-1 to 1 + C edx + C esi src, decrementing + C edi + C ebp + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 32-l + C mm7 l + + ASSERT(b,`C n2<d + movd %mm0, %eax + movd %mm5, %edx + cmpl %edx, %eax') + + movd -4(%esi), %mm1 C next src limbs + movd (%esi), %mm2 + leal -4(%esi), %esi + + punpckldq %mm2, %mm1 + psrlq %mm6, %mm1 C n10 + + movq %mm1, %mm2 C n10 + movq %mm1, %mm3 C n10 + psrad $31, %mm1 C -n1 + pand %mm5, %mm1 C -n1 & d + paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow + + psrld $31, %mm2 C n1 + paddd %mm0, %mm2 C n2+n1 + punpckldq %mm0, %mm1 C n2:nadj + + pmuludq %mm4, %mm2 C m*(n2+n1) + + paddq %mm2, %mm1 C n2:nadj + m*(n2+n1) + + psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1)) + + pmuludq %mm5, %mm1 C q1*d + punpckldq %mm0, %mm3 C n + psubq %mm5, %mm3 C n - d + pxor %mm0, %mm0 + + psubq %mm1, %mm3 C n - (q1+1)*d + + por %mm3, %mm0 C remainder -> n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + pand %mm5, %mm3 C mask & d + + paddd %mm3, %mm0 C addback if necessary + + subl $1, %ecx + jnz L(inverse_top) + + + C Least significant limb. + C Same code as the loop, but there's no -4(%esi) limb to fetch. + +L(inverse_last): + C eax + C ebx + C ecx + C edx + C esi &src[0] + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 32-l + C mm7 l + + movd (%esi), %mm1 C src[0] + psllq %mm7, %mm1 C n10 + + movq %mm1, %mm2 C n10 + movq %mm1, %mm3 C n10 + psrad $31, %mm1 C -n1 + pand %mm5, %mm1 C -n1 & d + paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow + + psrld $31, %mm2 C n1 + paddd %mm0, %mm2 C n2+n1 + punpckldq %mm0, %mm1 C n2:nadj + + pmuludq %mm4, %mm2 C m*(n2+n1) + + paddq %mm2, %mm1 C n2:nadj + m*(n2+n1) + + psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1)) + + pmuludq %mm5, %mm1 C q1*d + punpckldq %mm0, %mm3 C n + psubq %mm5, %mm3 C n - d + pxor %mm0, %mm0 + + psubq %mm1, %mm3 C n - (q1+1)*d + + por %mm3, %mm0 C remainder -> n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + movl SAVE_EBP, %ebp + pand %mm5, %mm3 C mask & d + + movl SAVE_ESI, %esi + paddd %mm3, %mm0 C addback if necessary + + psrld %mm7, %mm0 + + movd %mm0, %eax + + emms + ret + +EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm b/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm deleted file mode 100644 index ee88babeee..0000000000 --- a/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm +++ /dev/null @@ -1,166 +0,0 @@ -dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2009, 2010 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C TODO: -C * Optimize. The present code was written quite straightforwardly. -C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill. -C * Write a cps function that uses sse2 insns. - -C cycles/limb -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) ? -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 16 -C P4 model 3-4 (Prescott) 18 - -C INPUT PARAMETERS -C ap sp + 4 -C n sp + 8 -C b sp + 12 -C cps sp + 16 - -define(`B1modb', `%mm1') -define(`B2modb', `%mm2') -define(`ap', `%edx') -define(`n', `%eax') - - TEXT - ALIGN(16) -PROLOGUE(mpn_mod_1_1p) - push %ebx - mov 8(%esp), ap - mov 12(%esp), n - mov 20(%esp), %ecx - movd 8(%ecx), B1modb - movd 12(%ecx), B2modb - - lea -4(ap,n,4), ap - -C FIXME: See comment in generic/mod_1_1.c. - movd (ap), %mm7 - movd -4(ap), %mm4 - pmuludq B1modb, %mm7 - paddq %mm4, %mm7 - add $-2, n - jz L(end) - - ALIGN(8) -L(top): movq %mm7, %mm6 - psrlq $32, %mm7 C rh - movd -8(ap), %mm0 - add $-4, ap - pmuludq B2modb, %mm7 - pmuludq B1modb, %mm6 - add $-1, n - paddq %mm0, %mm7 - paddq %mm6, %mm7 - jnz L(top) - -L(end): pcmpeqd %mm4, %mm4 - psrlq $32, %mm4 C 0x00000000FFFFFFFF - pand %mm7, %mm4 C rl - psrlq $32, %mm7 C rh - pmuludq B1modb, %mm7 C rh,cl - paddq %mm4, %mm7 C rh,rl - movd 4(%ecx), %mm4 C cnt - psllq %mm4, %mm7 C rh,rl normalized - movq %mm7, %mm2 C rl in low half - psrlq $32, %mm7 C rh - movd (%ecx), %mm1 C bi - pmuludq %mm7, %mm1 C qh,ql - paddq %mm2, %mm1 C qh-1,ql - movd %mm1, %ecx C ql - psrlq $32, %mm1 C qh-1 - movd 16(%esp), %mm3 C b - pmuludq %mm1, %mm3 C (qh-1) * b - psubq %mm3, %mm2 C r in low half (could use psubd) - movd %mm2, %eax C r - mov 16(%esp), %ebx - sub %ebx, %eax C r - cmp %eax, %ecx - lea (%eax,%ebx), %edx - cmovc( %edx, %eax) - movd %mm4, %ecx C cnt - cmp %ebx, %eax - jae L(fix) - emms - pop %ebx - shr %cl, %eax - ret - -L(fix): sub %ebx, %eax - emms - pop %ebx - shr %cl, %eax - ret -EPILOGUE() - -PROLOGUE(mpn_mod_1_1p_cps) -C CAUTION: This is the same code as in k7/mod_1_1.asm - push %ebp - mov 12(%esp), %ebp - push %esi - bsr %ebp, %ecx - push %ebx - xor $31, %ecx - mov 16(%esp), %esi - sal %cl, %ebp - mov %ebp, %edx - not %edx - mov $-1, %eax - div %ebp - mov %eax, (%esi) C store bi - mov %ecx, 4(%esi) C store cnt - xor %ebx, %ebx - sub %ebp, %ebx - mov $1, %edx - shld %cl, %eax, %edx - imul %edx, %ebx - mul %ebx - add %ebx, %edx - not %edx - imul %ebp, %edx - add %edx, %ebp - cmp %edx, %eax - cmovc( %ebp, %edx) - shr %cl, %ebx - mov %ebx, 8(%esi) C store B1modb - shr %cl, %edx - mov %edx, 12(%esi) C store B2modb - pop %ebx - pop %esi - pop %ebp - ret -EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm b/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm deleted file mode 100644 index eb2edb6297..0000000000 --- a/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm +++ /dev/null @@ -1,269 +0,0 @@ -dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F). - -dnl Contributed to the GNU project by Torbjorn Granlund. - -dnl Copyright 2009, 2010 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C TODO: -C * Optimize. The present code was written quite straightforwardly. -C * Optimize post-loop reduction code. -C * Write a cps function that uses sse2 insns. - -C cycles/limb -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) 3.4 -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 4 -C P4 model 3-4 (Prescott) 4.5 - -C INPUT PARAMETERS -C ap sp + 4 -C n sp + 8 -C b sp + 12 -C cps sp + 16 - -define(`B1modb', `%mm1') -define(`B2modb', `%mm2') -define(`B3modb', `%mm3') -define(`B4modb', `%mm4') -define(`B5modb', `%mm5') -define(`ap', `%edx') -define(`n', `%eax') - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_mod_1s_4p) - push %ebx - mov 8(%esp), ap - mov 12(%esp), n - mov 20(%esp), %ecx - - movd 8(%ecx), B1modb - movd 12(%ecx), B2modb - movd 16(%ecx), B3modb - movd 20(%ecx), B4modb - movd 24(%ecx), B5modb - - mov n, %ebx - lea -4(ap,n,4), ap - and $3, %ebx - je L(b0) - cmp $2, %ebx - jc L(b1) - je L(b2) - -L(b3): movd -4(ap), %mm7 - pmuludq B1modb, %mm7 - movd -8(ap), %mm6 - paddq %mm6, %mm7 - movd (ap), %mm6 - pmuludq B2modb, %mm6 - paddq %mm6, %mm7 - lea -24(ap), ap - add $-3, n - jz L(end) - jmp L(top) - -L(b0): movd -8(ap), %mm7 - pmuludq B1modb, %mm7 - movd -12(ap), %mm6 - paddq %mm6, %mm7 - movd -4(ap), %mm6 - pmuludq B2modb, %mm6 - paddq %mm6, %mm7 - movd (ap), %mm6 - pmuludq B3modb, %mm6 - paddq %mm6, %mm7 - lea -28(ap), ap - add $-4, n - jz L(end) - jmp L(top) - -L(b1): movd (ap), %mm7 - lea -16(ap), ap - dec n - jz L(x) - jmp L(top) - -L(b2): movd -4(ap), %mm7 C rl - punpckldq (ap), %mm7 C rh - lea -20(ap), ap - add $-2, n - jz L(end) - - ALIGN(8) -L(top): movd 4(ap), %mm0 - pmuludq B1modb, %mm0 - movd 0(ap), %mm6 - paddq %mm6, %mm0 - - movd 8(ap), %mm6 - pmuludq B2modb, %mm6 - paddq %mm6, %mm0 - - movd 12(ap), %mm6 - pmuludq B3modb, %mm6 - paddq %mm6, %mm0 - - movq %mm7, %mm6 - psrlq $32, %mm7 C rh - pmuludq B5modb, %mm7 - pmuludq B4modb, %mm6 - - paddq %mm0, %mm7 - paddq %mm6, %mm7 - - add $-16, ap - add $-4, n - jnz L(top) - -L(end): pcmpeqd %mm4, %mm4 - psrlq $32, %mm4 C 0x00000000FFFFFFFF - pand %mm7, %mm4 C rl - psrlq $32, %mm7 C rh - pmuludq B1modb, %mm7 C rh,cl - paddq %mm4, %mm7 C rh,rl -L(x): movd 4(%ecx), %mm4 C cnt - psllq %mm4, %mm7 C rh,rl normalized - movq %mm7, %mm2 C rl in low half - psrlq $32, %mm7 C rh - movd (%ecx), %mm1 C bi - pmuludq %mm7, %mm1 C qh,ql - paddq %mm2, %mm1 C qh-1,ql - movd %mm1, %ecx C ql - psrlq $32, %mm1 C qh-1 - movd 16(%esp), %mm3 C b - pmuludq %mm1, %mm3 C (qh-1) * b - psubq %mm3, %mm2 C r in low half (could use psubd) - movd %mm2, %eax C r - mov 16(%esp), %ebx - sub %ebx, %eax C r - cmp %eax, %ecx - lea (%eax,%ebx), %edx - cmovc( %edx, %eax) - movd %mm4, %ecx C cnt - cmp %ebx, %eax - jae L(fix) - emms - pop %ebx - shr %cl, %eax - ret - -L(fix): sub %ebx, %eax - emms - pop %ebx - shr %cl, %eax - ret -EPILOGUE() - - ALIGN(16) -PROLOGUE(mpn_mod_1s_4p_cps) -C CAUTION: This is the same code as in k7/mod_1_4.asm - push %ebp - push %edi - push %esi - push %ebx - mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx - mov 24(%esp), %ebx - bsr %ebx, %ecx - xor $31, %ecx - sal %cl, %ebx C b << cnt - mov %ebx, %edx - not %edx - mov $-1, %eax - div %ebx - xor %edi, %edi - sub %ebx, %edi - mov $1, %esi - mov %eax, (%ebp) C store bi - mov %ecx, 4(%ebp) C store cnt - shld %cl, %eax, %esi - imul %edi, %esi - mov %eax, %edi - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 8(%ebp) C store B1modb - - not %edx - imul %ebx, %edx - lea (%edx,%ebx), %esi - cmp %edx, %eax - cmovnc( %edx, %esi) - mov %edi, %eax - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 12(%ebp) C store B2modb - - not %edx - imul %ebx, %edx - lea (%edx,%ebx), %esi - cmp %edx, %eax - cmovnc( %edx, %esi) - mov %edi, %eax - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 16(%ebp) C store B3modb - - not %edx - imul %ebx, %edx - lea (%edx,%ebx), %esi - cmp %edx, %eax - cmovnc( %edx, %esi) - mov %edi, %eax - mul %esi - - add %esi, %edx - shr %cl, %esi - mov %esi, 20(%ebp) C store B4modb - - not %edx - imul %ebx, %edx - add %edx, %ebx - cmp %edx, %eax - cmovnc( %edx, %ebx) - - shr %cl, %ebx - mov %ebx, 24(%ebp) C store B5modb - - pop %ebx - pop %esi - pop %edi - pop %ebp - ret -EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm index 31e25b79bc..1598b41785 100644 --- a/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm +++ b/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm @@ -1,32 +1,21 @@ dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1. -dnl Copyright 2000-2003 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/pentium4/sse2/mode1o.asm b/gmp/mpn/x86/pentium4/sse2/mode1o.asm index 778c478169..2f0b177a00 100644 --- a/gmp/mpn/x86/pentium4/sse2/mode1o.asm +++ b/gmp/mpn/x86/pentium4/sse2/mode1o.asm @@ -1,32 +1,21 @@ dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder. dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') @@ -112,7 +101,7 @@ ifdef(`PIC',` psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d - ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB pushl %eax FRAME_pushl() movd %mm6, %eax imul PARAM_DIVISOR, %eax @@ -124,13 +113,13 @@ ifdef(`PIC',` C The dependent chain here is as follows. C -C latency -C psubq s = (src-cbit) - climb 2 -C pmuludq q = s*inverse 8 -C pmuludq prod = q*divisor 8 -C psrlq climb = high(prod) 2 -C -- -C 20 +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 C C Yet the loop measures 19.0 c/l, so obviously there's something gained C there over a straight reading of the chip documentation. diff --git a/gmp/mpn/x86/pentium4/sse2/mul_1.asm b/gmp/mpn/x86/pentium4/sse2/mul_1.asm index 6347b8bf62..07be951921 100644 --- a/gmp/mpn/x86/pentium4/sse2/mul_1.asm +++ b/gmp/mpn/x86/pentium4/sse2/mul_1.asm @@ -1,48 +1,37 @@ dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). -dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc. - +dnl Copyright 2005, 2007 Free Software Foundation, Inc. +dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) 4.17 -C P6 model 13 (Dothan) 4.17 -C P4 model 0-1 (Willamette) 4 -C P4 model 2 (Northwood) 4 -C P4 model 3-4 (Prescott) 4.55 - C TODO: C * Tweak eax/edx offsets in loop as to save some lea's C * Perhaps software pipeline small-case code +C cycles/limb +C P6 model 0-8,10-12) - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4.17 +C P4 model 0-1 (Willamette): 4 +C P4 model 2 (Northwood): 4 +C P4 model 3-4 (Prescott): 4.55 + C INPUT PARAMETERS C rp sp + 4 C up sp + 8 @@ -51,13 +40,22 @@ C v0 sp + 16 TEXT ALIGN(16) +PROLOGUE(mpn_mul_1c) + mov 4(%esp), %edx + mov 8(%esp), %eax + mov 12(%esp), %ecx + movd 16(%esp), %mm7 + movd 20(%esp), %mm6 + jmp L(ent) +EPILOGUE() + ALIGN(16) PROLOGUE(mpn_mul_1) - pxor %mm6, %mm6 -L(ent): mov 4(%esp), %edx + mov 4(%esp), %edx mov 8(%esp), %eax mov 12(%esp), %ecx movd 16(%esp), %mm7 - cmp $4, %ecx + pxor %mm6, %mm6 +L(ent): cmp $4, %ecx jnc L(big) L(lp0): movd (%eax), %mm0 @@ -158,7 +156,3 @@ L(end): pmuludq %mm7, %mm2 emms ret EPILOGUE() -PROLOGUE(mpn_mul_1c) - movd 20(%esp), %mm6 - jmp L(ent) -EPILOGUE() diff --git a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm index 6e3775ae09..2628e5eb72 100644 --- a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm +++ b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm @@ -1,32 +1,21 @@ dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc. - +dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/pentium4/sse2/popcount.asm b/gmp/mpn/x86/pentium4/sse2/popcount.asm index b8238b9b66..cb982ade46 100644 --- a/gmp/mpn/x86/pentium4/sse2/popcount.asm +++ b/gmp/mpn/x86/pentium4/sse2/popcount.asm @@ -1,66 +1,52 @@ dnl X86-32 and X86-64 mpn_popcount using SSE2. -dnl Copyright 2006, 2007, 2011 Free Software Foundation, Inc. - +dnl Copyright 2006, 2007 Free Software Foundation, Inc. +dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C 32-bit popcount hamdist -C cycles/limb cycles/limb -C P5 - -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) 4 -C P4 model 0 (Willamette) ? -C P4 model 1 (?) ? -C P4 model 2 (Northwood) 3.9 -C P4 model 3 (Prescott) ? -C P4 model 4 (Nocona) ? -C AMD K6 - -C AMD K7 - -C AMD K8 ? - -C 64-bit popcount hamdist -C cycles/limb cycles/limb -C P4 model 4 (Nocona): 8 -C AMD K8,K9 7.5 -C AMD K10 3.5 -C Intel core2 3.68 -C Intel corei 3.15 -C Intel atom 10.8 -C VIA nano 6.5 +C 32-bit popcount hamdist +C cycles/limb cycles/limb +C P5: - +C P6 model 0-8,10-12) - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4 +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) 3.9 +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C K6: - +C K7: - +C K8: ? + +C 64-bit popcount hamdist +C cycles/limb cycles/limb +C P4 model 4 (Nocona): 8 +C K8: 7.5 +C K10: 3.5 +C P6-15: 3.68 C TODO C * Make a mpn_hamdist based on this. Alignment could either be handled by C using movdqu for one operand and movdqa for the other, or by painfully -C shifting as we go. Unfortunately, there seem to be no usable shift +C shifting as we go. Unfortunately, there seem to be no useable shift C instruction, except for one that takes an immediate count. C * It would probably be possible to cut a few cycles/limb using software C pipelining. diff --git a/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm b/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm index f421d1323e..bbf43245cb 100644 --- a/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm +++ b/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm @@ -1,32 +1,21 @@ dnl Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2 -dnl Copyright 2001-2004 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: +dnl Copyright 2001, 2002, 2003, 2004 Free Software Foundation, Inc. dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') diff --git a/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm b/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm index 2dd57d25d9..fc56f164ed 100644 --- a/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm +++ b/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm @@ -1,32 +1,21 @@ dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. - +dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') @@ -41,7 +30,7 @@ C * Look into different loop alignment, we now expand the code about 50 bytes C with possibly needless alignment. C * Use OSP, should solve feed-in latency problems. C * Address relative slowness for un<=3 for Pentium M. The old code is there -C considerably faster. (1:20/14, 2:34:32, 3:66/57) +C consideraly faster. (1:20/14, 2:34:32, 3:66/57) C INPUT PARAMETERS C rp sp + 4 diff --git a/gmp/mpn/x86/pentium4/sse2/sub_n.asm b/gmp/mpn/x86/pentium4/sse2/sub_n.asm index 5ba1c018ec..02d5f01474 100644 --- a/gmp/mpn/x86/pentium4/sse2/sub_n.asm +++ b/gmp/mpn/x86/pentium4/sse2/sub_n.asm @@ -1,44 +1,37 @@ dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction. dnl Copyright 2001, 2002 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C dst!=src1,2 dst==src1 dst==src2 -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) ? -C P6 model 13 (Dothan) ? -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 4 6 6 -C P4 model 3-4 (Prescott) 4.25 7.5 7.5 +C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2 +C 6.0 cycles/limb if dst==src1 or dst==src2 +C P4 Prescott: >= 5 cycles/limb + + +C mp_limb_t mpn_sub_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t mpn_sub_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C The main loop code is 2x unrolled so that the carry bit can alternate +C between mm0 and mm1. defframe(PARAM_CARRY,20) defframe(PARAM_SIZE, 16) @@ -54,8 +47,10 @@ define(SAVE_EBX,`PARAM_SRC1') PROLOGUE(mpn_sub_nc) deflit(`FRAME',0) + movd PARAM_CARRY, %mm0 jmp L(start_nc) + EPILOGUE() ALIGN(8) @@ -63,16 +58,16 @@ PROLOGUE(mpn_sub_n) deflit(`FRAME',0) pxor %mm0, %mm0 L(start_nc): - mov PARAM_SRC1, %eax - mov %ebx, SAVE_EBX - mov PARAM_SRC2, %ebx - mov PARAM_DST, %edx - mov PARAM_SIZE, %ecx + movl PARAM_SRC1, %eax + movl %ebx, SAVE_EBX + movl PARAM_SRC2, %ebx + movl PARAM_DST, %edx + movl PARAM_SIZE, %ecx - lea (%eax,%ecx,4), %eax C src1 end - lea (%ebx,%ecx,4), %ebx C src2 end - lea (%edx,%ecx,4), %edx C dst end - neg %ecx C -size + leal (%eax,%ecx,4), %eax C src1 end + leal (%ebx,%ecx,4), %ebx C src2 end + leal (%edx,%ecx,4), %edx C dst end + negl %ecx C -size L(top): C eax src1 end @@ -90,7 +85,7 @@ L(top): psrlq $63, %mm1 - add $1, %ecx + addl $1, %ecx jz L(done_mm1) movd (%eax,%ecx,4), %mm0 @@ -102,17 +97,18 @@ L(top): psrlq $63, %mm0 - add $1, %ecx + addl $1, %ecx jnz L(top) + movd %mm0, %eax - mov SAVE_EBX, %ebx + movl SAVE_EBX, %ebx emms ret L(done_mm1): movd %mm1, %eax - mov SAVE_EBX, %ebx + movl SAVE_EBX, %ebx emms ret diff --git a/gmp/mpn/x86/pentium4/sse2/submul_1.asm b/gmp/mpn/x86/pentium4/sse2/submul_1.asm index 020675bd7b..ceb41f2ac0 100644 --- a/gmp/mpn/x86/pentium4/sse2/submul_1.asm +++ b/gmp/mpn/x86/pentium4/sse2/submul_1.asm @@ -1,71 +1,60 @@ dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and dnl subtract the result from a second limb vector. -dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. +dnl Copyright 2001, 2002 Free Software Foundation, Inc. dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. +dnl This file is part of the GNU MP Library. dnl -dnl or both in parallel, as here. +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C P6 model 0-8,10-12 - -C P6 model 9 (Banias) 6.8 -C P6 model 13 (Dothan) 6.9 -C P4 model 0-1 (Willamette) ? -C P4 model 2 (Northwood) 5.87 -C P4 model 3-4 (Prescott) 6.5 +C P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon +C (stepping 10). -C This code represents a step forwards compared to the code available before -C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is -C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and -C Prescott compared to the old code. + +C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C This code is not particularly good at 7 c/l. The dependent chain is only +C 4 c/l and there's only 4 MMX unit instructions, so it's not clear why that +C speed isn't achieved. C C The arrangements made here to get a two instruction dependent chain are -C slightly subtle. In the loop the carry (or borrow rather) is a negative so -C that a paddq can be used to give a low limb ready to store, and a high limb -C ready to become the new carry after a psrlq. +C slightly subtle. In the loop the carry (or borrow rather) is a negative +C so that a paddq can be used to give a low limb ready to store, and a high +C limb ready to become the new carry after a psrlq. C -C If the carry was a simple twos complement negative then the psrlq shift would -C need to bring in 0 bits or 1 bits according to whether the high was zero or -C non-zero, since a non-zero value would represent a negative needing sign -C extension. That wouldn't be particularly easy to arrange and certainly would -C add an instruction to the dependent chain, so instead an offset is applied so -C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to -C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore -C always positive and can always have 0 bits shifted in, which is what psrlq -C does. +C If the carry was a simple twos complement negative then the psrlq shift +C would need to bring in 0 bits or 1 bits according to whether the high was +C zero or non-zero, since a non-zero value would represent a negative +C needing sign extension. That wouldn't be particularly easy to arrange and +C certainly would add an instruction to the dependent chain, so instead an +C offset is applied so that the high limb will be 0xFFFFFFFF+c. With c in +C the range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to +C 0xFFFFFFFF and is therefore always positive and can always have 0 bits +C shifted in, which is what psrlq does. C C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be C done off the dependent chain. The total adjustment then is to add -C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF -C to remove the offset from the current carry, for a net add of -C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when -C fetched. +C 0xFFFFFFFF00000000 to offset the new carry, and subtract +C 0x00000000FFFFFFFF to remove the offset from the current carry, for a net +C add of 0xFFFFFFFE00000001. In the code this is applied to the destination +C limb when fetched. C C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement C negative, which is how it's undone for the return value, but that doesn't @@ -91,16 +80,16 @@ deflit(`FRAME',0) pxor %mm1, %mm1 C initial borrow L(start_1c): - mov PARAM_SRC, %eax + movl PARAM_SRC, %eax pcmpeqd %mm0, %mm0 movd PARAM_MULTIPLIER, %mm7 pcmpeqd %mm6, %mm6 - mov PARAM_DST, %edx + movl PARAM_DST, %edx psrlq $32, %mm0 C 0x00000000FFFFFFFF - mov PARAM_SIZE, %ecx + movl PARAM_SIZE, %ecx psllq $32, %mm6 C 0xFFFFFFFF00000000 psubq %mm0, %mm6 C 0xFFFFFFFE00000001 @@ -108,75 +97,32 @@ L(start_1c): psubq %mm1, %mm0 C 0xFFFFFFFF - borrow - movd (%eax), %mm3 C up - movd (%edx), %mm4 C rp - - add $-1, %ecx - paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 - pmuludq %mm7, %mm3 - jnz L(gt1) - psubq %mm3, %mm4 C prod - paddq %mm4, %mm0 C borrow - movd %mm0, (%edx) C result - jmp L(rt) - -L(gt1): movd 4(%eax), %mm1 C up - movd 4(%edx), %mm2 C rp - - add $-1, %ecx - jz L(eev) - - ALIGN(16) -L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001 + C eax src, incrementing + C ebx + C ecx loop counter, decrementing + C edx dst, incrementing + C + C mm0 0xFFFFFFFF - borrow + C mm6 0xFFFFFFFE00000001 + C mm7 multiplier + +L(loop): + movd (%eax), %mm1 C src + leal 4(%eax), %eax + movd (%edx), %mm2 C dst + paddq %mm6, %mm2 C add 0xFFFFFFFE00000001 pmuludq %mm7, %mm1 - psubq %mm3, %mm4 C prod - movd 8(%eax), %mm3 C up - paddq %mm4, %mm0 C borrow - movd 8(%edx), %mm4 C rp - movd %mm0, (%edx) C result - psrlq $32, %mm0 - - add $-1, %ecx - jz L(eod) - - paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 - pmuludq %mm7, %mm3 psubq %mm1, %mm2 C prod - movd 12(%eax), %mm1 C up paddq %mm2, %mm0 C borrow - movd 12(%edx), %mm2 C rp - movd %mm0, 4(%edx) C result - psrlq $32, %mm0 - - lea 8(%eax), %eax - lea 8(%edx), %edx - add $-1, %ecx - jnz L(top) - - -L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001 - pmuludq %mm7, %mm1 - psubq %mm3, %mm4 C prod - paddq %mm4, %mm0 C borrow + subl $1, %ecx movd %mm0, (%edx) C result psrlq $32, %mm0 - psubq %mm1, %mm2 C prod - paddq %mm2, %mm0 C borrow - movd %mm0, 4(%edx) C result -L(rt): psrlq $32, %mm0 + leal 4(%edx), %edx + jnz L(loop) + movd %mm0, %eax - not %eax + notl %eax emms ret -L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 - pmuludq %mm7, %mm3 - psubq %mm1, %mm2 C prod - paddq %mm2, %mm0 C borrow - movd %mm0, 4(%edx) C result - psrlq $32, %mm0 - psubq %mm3, %mm4 C prod - paddq %mm4, %mm0 C borrow - movd %mm0, 8(%edx) C result - jmp L(rt) EPILOGUE() |