summary | refs | log | tree | commit | diff
path: root/gmp/mpn/x86/pentium4
diff options
context:
space:
mode:
Diffstat (limited to 'gmp/mpn/x86/pentium4')
-rw-r--r-- gmp/mpn/x86/pentium4/README 25
-rw-r--r-- gmp/mpn/x86/pentium4/copyd.asm 36
-rw-r--r-- gmp/mpn/x86/pentium4/copyi.asm 36
-rw-r--r-- gmp/mpn/x86/pentium4/mmx/lshift.asm 33
-rw-r--r-- gmp/mpn/x86/pentium4/mmx/popham.asm 35
-rw-r--r-- gmp/mpn/x86/pentium4/mmx/rshift.asm 33
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/add_n.asm 79
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm 76
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/addmul_1.asm 64
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm 141
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm 233
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm 95
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm 114
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/dive_1.asm 49
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/divrem_1.asm 36
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/gmp-mparam.h 252
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/mod_1.asm 391
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/mod_1_1.asm 166
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/mod_1_4.asm 269
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm 35
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/mode1o.asm 49
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/mul_1.asm 64
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/mul_basecase.asm 27
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/popcount.asm 76
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm 35
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm 29
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/sub_n.asm 82
-rw-r--r-- gmp/mpn/x86/pentium4/sse2/submul_1.asm 176
28 files changed, 866 insertions, 1870 deletions
diff --git a/gmp/mpn/x86/pentium4/README b/gmp/mpn/x86/pentium4/README
index 90f752e5d5..8dc0479f04 100644
--- a/gmp/mpn/x86/pentium4/README
+++ b/gmp/mpn/x86/pentium4/README
@@ -3,28 +3,17 @@ Copyright 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86/pentium4/copyd.asm b/gmp/mpn/x86/pentium4/copyd.asm
index 82af81c522..491ad60128 100644
--- a/gmp/mpn/x86/pentium4/copyd.asm
+++ b/gmp/mpn/x86/pentium4/copyd.asm
@@ -1,32 +1,22 @@
dnl Pentium-4 mpn_copyd -- copy limb vector, decrementing.
-
-dnl Copyright 1999-2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+
+dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl The std/rep/movsl/cld is very slow for small blocks on pentium4. Its
diff --git a/gmp/mpn/x86/pentium4/copyi.asm b/gmp/mpn/x86/pentium4/copyi.asm
index b6148879fa..bf812c822b 100644
--- a/gmp/mpn/x86/pentium4/copyi.asm
+++ b/gmp/mpn/x86/pentium4/copyi.asm
@@ -1,32 +1,22 @@
dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing.
-
-dnl Copyright 1999-2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+
+dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl The rep/movsl is very slow for small blocks on pentium4. Its startup
diff --git a/gmp/mpn/x86/pentium4/mmx/lshift.asm b/gmp/mpn/x86/pentium4/mmx/lshift.asm
index b5eca66698..5d316d5da4 100644
--- a/gmp/mpn/x86/pentium4/mmx/lshift.asm
+++ b/gmp/mpn/x86/pentium4/mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_lshift -- left shift.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/mmx/popham.asm b/gmp/mpn/x86/pentium4/mmx/popham.asm
index 9563cb57e4..2e79816821 100644
--- a/gmp/mpn/x86/pentium4/mmx/popham.asm
+++ b/gmp/mpn/x86/pentium4/mmx/popham.asm
@@ -1,33 +1,22 @@
dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
dnl hamming distance.
-dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/mmx/rshift.asm b/gmp/mpn/x86/pentium4/mmx/rshift.asm
index 3ac0094a5a..a7dec54a3a 100644
--- a/gmp/mpn/x86/pentium4/mmx/rshift.asm
+++ b/gmp/mpn/x86/pentium4/mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_rshift -- right shift.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/add_n.asm b/gmp/mpn/x86/pentium4/sse2/add_n.asm
index 8e2380e493..04c0c68d0e 100644
--- a/gmp/mpn/x86/pentium4/sse2/add_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/add_n.asm
@@ -1,44 +1,36 @@
dnl Intel Pentium-4 mpn_add_n -- mpn addition.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C dst!=src1,2 dst==src1 dst==src2
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4 6 6
-C P4 model 3-4 (Prescott) 4.25 7.5 7.5
+C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
+C 6.0 cycles/limb if dst==src1 or dst==src2
+C P4 Prescott: >= 5 cycles/limb
+
+C mp_limb_t mpn_add_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_add_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C The 4 c/l achieved here isn't particularly good, but is better than 9 c/l
+C for a basic adc loop.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
@@ -54,25 +46,29 @@ define(SAVE_EBX,`PARAM_SRC1')
PROLOGUE(mpn_add_nc)
deflit(`FRAME',0)
+
movd PARAM_CARRY, %mm0
jmp L(start_nc)
+
EPILOGUE()
ALIGN(8)
PROLOGUE(mpn_add_n)
deflit(`FRAME',0)
+
pxor %mm0, %mm0
+
L(start_nc):
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
+ movl PARAM_SRC1, %eax
+ movl %ebx, SAVE_EBX
+ movl PARAM_SRC2, %ebx
+ movl PARAM_DST, %edx
+ movl PARAM_SIZE, %ecx
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
+ leal (%eax,%ecx,4), %eax C src1 end
+ leal (%ebx,%ecx,4), %ebx C src2 end
+ leal (%edx,%ecx,4), %edx C dst end
+ negl %ecx C -size
L(top):
C eax src1 end
@@ -90,11 +86,12 @@ L(top):
psrlq $32, %mm0
- add $1, %ecx
+ addl $1, %ecx
jnz L(top)
+
movd %mm0, %eax
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
emms
ret
diff --git a/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm b/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
index 93b63b2018..46b0903c50 100644
--- a/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
@@ -1,45 +1,33 @@
dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
-dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2001, 2002, 2003, 2004, 2006 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C dst!=src1,2 dst==src1 dst==src2
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4.25 6 6
-C P4 model 3-4 (Prescott) 5 8.5 8.5
+C cycles/limb (approx)
+C dst!=src1,2 dst==src1 dst==src2
+C P4 m2: 4.5 ?7.25 ?6.75
+C P4 m3: 5.3 ? ?
+C mp_limb_t mpn_addlsh1_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
C The slightly strange combination of indexing and pointer incrementing
C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or
C src2 is a slowdown.
@@ -63,18 +51,18 @@ define(SAVE_EBX,`PARAM_SRC1')
PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
+ movl PARAM_SRC1, %eax
+ movl %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
+ movl PARAM_SRC2, %ebx
pxor %mm0, %mm0 C initial carry
- mov PARAM_DST, %edx
+ movl PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
+ movl PARAM_SIZE, %ecx
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
+ leal (%edx,%ecx,4), %edx C dst end
+ negl %ecx C -size
L(top):
C eax src1 end
@@ -83,24 +71,24 @@ L(top):
C edx dst end
C mm0 carry
- movd (%ebx), %mm2
movd (%eax), %mm1
+ movd (%ebx), %mm2
psrlq $32, %mm0
- lea 4(%eax), %eax
- lea 4(%ebx), %ebx
+ leal 4(%eax), %eax
+ leal 4(%ebx), %ebx
- psllq $1, %mm2
+ paddq %mm2, %mm1
paddq %mm2, %mm1
paddq %mm1, %mm0
movd %mm0, (%edx,%ecx,4)
- add $1, %ecx
+ addl $1, %ecx
jnz L(top)
psrlq $32, %mm0
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
movd %mm0, %eax
emms
ret
diff --git a/gmp/mpn/x86/pentium4/sse2/addmul_1.asm b/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
index 78102072bf..3a8d0bb9bd 100644
--- a/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
@@ -1,48 +1,37 @@
dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
-
+dnl Copyright 2005, 2007 Free Software Foundation, Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 5.24
-C P6 model 13 (Dothan) 5.24
-C P4 model 0-1 (Willamette) 5
-C P4 model 2 (Northwood) 5
-C P4 model 3-4 (Prescott) 5
-
C TODO:
C * Tweak eax/edx offsets in loop as to save some lea's
C * Perhaps software pipeline small-case code
+C cycles/limb
+C P6 model 0-8,10-12) -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 5.24
+C P4 model 0-1 (Willamette): 5
+C P4 model 2 (Northwood): 5
+C P4 model 3-4 (Prescott): 5
+
C INPUT PARAMETERS
C rp sp + 4
C up sp + 8
@@ -51,13 +40,22 @@ C v0 sp + 16
TEXT
ALIGN(16)
+PROLOGUE(mpn_addmul_1c)
+ mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ movd 20(%esp), %mm6
+ jmp L(ent)
+EPILOGUE()
+ ALIGN(16)
PROLOGUE(mpn_addmul_1)
- pxor %mm6, %mm6
-L(ent): mov 4(%esp), %edx
+ mov 4(%esp), %edx
mov 8(%esp), %eax
mov 12(%esp), %ecx
movd 16(%esp), %mm7
- cmp $4, %ecx
+ pxor %mm6, %mm6
+L(ent): cmp $4, %ecx
jnc L(big)
L(lp0): movd (%eax), %mm0
@@ -183,7 +181,3 @@ L(end): pmuludq %mm7, %mm2
emms
ret
EPILOGUE()
-PROLOGUE(mpn_addmul_1c)
- movd 20(%esp), %mm6
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
deleted file mode 100644
index 354300e4de..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
+++ /dev/null
@@ -1,141 +0,0 @@
-dnl Intel Atom mpn_bdiv_dbm1.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C cycles/limb
-C P5 -
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 9.75
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood) 8.25
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 8
-C AMD K6 -
-C AMD K7 -
-C AMD K8
-C AMD K10
-
-C TODO: This code was optimised for atom-32, consider moving it back to atom
-C dir(atom currently grabs this code), and write a 4-way version(7c/l).
-
-defframe(PARAM_CARRY,20)
-defframe(PARAM_MUL, 16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(SAVE_RP,`PARAM_MUL')
-define(SAVE_UP,`PARAM_SIZE')
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`n', `%ecx')
-define(`reg', `%edx')
-define(`cy', `%eax') C contains the return value
-
-ASM_START()
- TEXT
- ALIGN(16)
-deflit(`FRAME',0)
-
-PROLOGUE(mpn_bdiv_dbm1c)
- mov PARAM_SIZE, n C size
- mov up, SAVE_UP
- mov PARAM_SRC, up
- movd PARAM_MUL, %mm7
- mov rp, SAVE_RP
- mov PARAM_DST, rp
-
- movd (up), %mm0
- pmuludq %mm7, %mm0
- shr n
- mov PARAM_CARRY, cy
- jz L(eq1)
-
- movd 4(up), %mm1
- jc L(odd)
-
- lea 4(up), up
- pmuludq %mm7, %mm1
- movd %mm0, reg
- psrlq $32, %mm0
- sub reg, cy
- movd %mm0, reg
- movq %mm1, %mm0
- dec n
- mov cy, (rp)
- lea 4(rp), rp
- jz L(end)
-
-C ALIGN(16)
-L(top): movd 4(up), %mm1
- sbb reg, cy
-L(odd): movd %mm0, reg
- psrlq $32, %mm0
- pmuludq %mm7, %mm1
- sub reg, cy
- lea 8(up), up
- movd %mm0, reg
- movd (up), %mm0
- mov cy, (rp)
- sbb reg, cy
- movd %mm1, reg
- psrlq $32, %mm1
- sub reg, cy
- movd %mm1, reg
- pmuludq %mm7, %mm0
- dec n
- mov cy, 4(rp)
- lea 8(rp), rp
- jnz L(top)
-
-L(end): sbb reg, cy
-
-L(eq1): movd %mm0, reg
- psrlq $32, %mm0
- mov SAVE_UP, up
- sub reg, cy
- movd %mm0, reg
- emms
- mov cy, (rp)
- sbb reg, cy
-
- mov SAVE_RP, rp
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
deleted file mode 100644
index f7f461d56f..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
+++ /dev/null
@@ -1,233 +0,0 @@
-dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
-
-dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
-
-dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C P4: 19.0 cycles/limb
-
-C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
-C being on the dependent chain and there being plenty of cycles available,
-C using an unaligned movq on every second iteration measured about 23 c/l.
-C
-
-defframe(PARAM_SHIFT, 24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- TEXT
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t inverse, int shift)
- ALIGN(32)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %edx
-
- movl PARAM_SRC, %eax
-
- movl PARAM_DIVISOR, %ecx
-
- movd %ecx, %mm6
- movl PARAM_SHIFT, %ecx
-
- movd %ecx, %mm7 C shift
-
- C
-
- movl PARAM_INVERSE, %ecx
- movd %ecx, %mm5 C inv
-
- movl PARAM_DST, %ecx
- pxor %mm1, %mm1 C initial carry limb
- pxor %mm0, %mm0 C initial carry bit
-
- subl $1, %edx
- jz L(done)
-
- pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
-
-C The dependent chain here is as follows.
-C
-C latency
-C psubq s = (src-cbit) - climb 2
-C pmuludq q = s*inverse 8
-C pmuludq prod = q*divisor 8
-C psrlq climb = high(prod) 2
-C --
-C 20
-C
-C Yet the loop measures 19.0 c/l, so obviously there's something gained
-C there over a straight reading of the chip documentation.
-
-L(top):
- C eax src, incrementing
- C ebx
- C ecx dst, incrementing
- C edx counter, size-1 iterations
- C
- C mm0 carry bit
- C mm1 carry limb
- C mm4 0x00000000FFFFFFFF
- C mm5 inverse
- C mm6 divisor
- C mm7 shift
-
- movd (%eax), %mm2
- movd 4(%eax), %mm3
- addl $4, %eax
- punpckldq %mm3, %mm2
-
- psrlq %mm7, %mm2
- pand %mm4, %mm2 C src
- psubq %mm0, %mm2 C src - cbit
-
- psubq %mm1, %mm2 C src - cbit - climb
- movq %mm2, %mm0
- psrlq $63, %mm0 C new cbit
-
- pmuludq %mm5, %mm2 C s*inverse
- movd %mm2, (%ecx) C q
- addl $4, %ecx
-
- movq %mm6, %mm1
- pmuludq %mm2, %mm1 C q*divisor
- psrlq $32, %mm1 C new climb
-
-L(entry):
- subl $1, %edx
- jnz L(top)
-
-L(done):
- movd (%eax), %mm2
- psrlq %mm7, %mm2 C src
- psubq %mm0, %mm2 C src - cbit
-
- psubq %mm1, %mm2 C src - cbit - climb
-
- pmuludq %mm5, %mm2 C s*inverse
- movd %mm2, (%ecx) C q
-
- emms
- ret
-
-EPILOGUE()
-
- ALIGN(16)
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %edx
-
- movl PARAM_DIVISOR, %ecx
-
- C eax src
- C ebx
- C ecx divisor
- C edx size-1
-
- movl %ecx, %eax
- bsfl %ecx, %ecx C trailing twos
-
- shrl %cl, %eax C d = divisor without twos
- movd %eax, %mm6
- movd %ecx, %mm7 C shift
-
- shrl %eax C d/2
-
- andl $127, %eax C d/2, 7 bits
-
-ifdef(`PIC',`
- LEA( binvert_limb_table, %ecx)
- movzbl (%eax,%ecx), %eax C inv 8 bits
-',`
- movzbl binvert_limb_table(%eax), %eax C inv 8 bits
-')
-
- C
-
- movd %eax, %mm5 C inv
-
- movd %eax, %mm0 C inv
-
- pmuludq %mm5, %mm5 C inv*inv
-
- C
-
- pmuludq %mm6, %mm5 C inv*inv*d
- paddd %mm0, %mm0 C 2*inv
-
- C
-
- psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
- pxor %mm5, %mm5
-
- paddd %mm0, %mm5
- pmuludq %mm0, %mm0 C inv*inv
-
- pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
-
- C
-
- pmuludq %mm6, %mm0 C inv*inv*d
- paddd %mm5, %mm5 C 2*inv
-
- movl PARAM_SRC, %eax
- movl PARAM_DST, %ecx
- pxor %mm1, %mm1 C initial carry limb
-
- C
-
- psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
-
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
- pushl %eax FRAME_pushl()
- movq %mm6, %mm0
- pmuludq %mm5, %mm0
- movd %mm0, %eax
- cmpl $1, %eax
- popl %eax FRAME_popl()')
-
- pxor %mm0, %mm0 C initial carry bit
- jmp L(entry)
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm b/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
deleted file mode 100644
index b3f3474e67..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
+++ /dev/null
@@ -1,95 +0,0 @@
-dnl Intel Pentium-4 mpn_cnd_add_n -- mpn addition.
-
-dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 4.67
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 5
-C P4 model 3-4 (Prescott) 5.25
-
-defframe(PARAM_SIZE, 20)
-defframe(PARAM_SRC2, 16)
-defframe(PARAM_SRC1, 12)
-defframe(PARAM_DST, 8)
-defframe(PARAM_CND, 4)
-
-dnl re-use parameter space
-define(SAVE_EBX,`PARAM_SRC1')
-
-define(`cnd', `%mm3')
-
- TEXT
- ALIGN(8)
-
- ALIGN(8)
-PROLOGUE(mpn_cnd_add_n)
-deflit(`FRAME',0)
- pxor %mm0, %mm0
-
- mov PARAM_CND, %eax
- neg %eax
- sbb %eax, %eax
- movd %eax, cnd
-
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
-
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
-
-L(top): movd (%ebx,%ecx,4), %mm2
- movd (%eax,%ecx,4), %mm1
- pand cnd, %mm2
- paddq %mm2, %mm1
-
- paddq %mm1, %mm0
- movd %mm0, (%edx,%ecx,4)
-
- psrlq $32, %mm0
-
- add $1, %ecx
- jnz L(top)
-
- movd %mm0, %eax
- mov SAVE_EBX, %ebx
- emms
- ret
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
deleted file mode 100644
index 339a23e0b6..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-dnl Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction.
-
-dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 4.67
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 5
-C P4 model 3-4 (Prescott) 5.25
-
-defframe(PARAM_SIZE, 20)
-defframe(PARAM_SRC2, 16)
-defframe(PARAM_SRC1, 12)
-defframe(PARAM_DST, 8)
-defframe(PARAM_CND, 4)
-
-dnl re-use parameter space
-define(SAVE_EBX,`PARAM_SRC1')
-
-define(`cnd', `%mm3')
-
- TEXT
- ALIGN(8)
-
- ALIGN(8)
-PROLOGUE(mpn_cnd_sub_n)
-deflit(`FRAME',0)
- pxor %mm0, %mm0
-
- mov PARAM_CND, %eax
- neg %eax
- sbb %eax, %eax
- movd %eax, cnd
-
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
-
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
-
-L(top): movd (%ebx,%ecx,4), %mm2
- movd (%eax,%ecx,4), %mm1
- pand cnd, %mm2
- psubq %mm2, %mm1
-
- psubq %mm0, %mm1
- movd %mm1, (%edx,%ecx,4)
-
- psrlq $63, %mm1
-
- add $1, %ecx
- jz L(done_mm1)
-
- movd (%ebx,%ecx,4), %mm2
- movd (%eax,%ecx,4), %mm0
- pand cnd, %mm2
- psubq %mm2, %mm0
-
- psubq %mm1, %mm0
- movd %mm0, (%edx,%ecx,4)
-
- psrlq $63, %mm0
-
- add $1, %ecx
- jnz L(top)
-
- movd %mm0, %eax
- mov SAVE_EBX, %ebx
- emms
- ret
-
-L(done_mm1):
- movd %mm1, %eax
- mov SAVE_EBX, %ebx
- emms
- ret
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/dive_1.asm b/gmp/mpn/x86/pentium4/sse2/dive_1.asm
index 238f0dd8a5..c50ef7d29e 100644
--- a/gmp/mpn/x86/pentium4/sse2/dive_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/dive_1.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -137,7 +126,7 @@ ifdef(`PIC',`
psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
movq %mm6, %mm0
pmuludq %mm5, %mm0
@@ -150,13 +139,13 @@ ifdef(`PIC',`
C The dependent chain here is as follows.
C
-C latency
-C psubq s = (src-cbit) - climb 2
-C pmuludq q = s*inverse 8
-C pmuludq prod = q*divisor 8
-C psrlq climb = high(prod) 2
-C --
-C 20
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
C
C Yet the loop measures 19.0 c/l, so obviously there's something gained
C there over a straight reading of the chip documentation.
diff --git a/gmp/mpn/x86/pentium4/sse2/divrem_1.asm b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
index 0146fab117..7f973dbf98 100644
--- a/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
@@ -1,32 +1,22 @@
dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
-dnl Copyright 1999-2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h b/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
index a94ae868b3..5071aae092 100644
--- a/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
+++ b/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -1,206 +1,68 @@
/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2005, 2007-2010, 2014 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008,
+2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-or both in parallel, as here.
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2600 MHz P4 Northwood */
-/* FFT tuning limit = 12500000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */
-
-#define MOD_1_NORM_THRESHOLD 24
-#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 2
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 2
-#define DIV_QR_1_NORM_THRESHOLD 19
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 20
-
-#define MUL_TOOM22_THRESHOLD 29
-#define MUL_TOOM33_THRESHOLD 113
-#define MUL_TOOM44_THRESHOLD 288
-#define MUL_TOOM6H_THRESHOLD 454
-#define MUL_TOOM8H_THRESHOLD 592
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 118
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 214
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 193
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 186
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 287
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 44
-#define SQR_TOOM3_THRESHOLD 173
-#define SQR_TOOM4_THRESHOLD 390
-#define SQR_TOOM6_THRESHOLD 0 /* always */
-#define SQR_TOOM8_THRESHOLD 915
-
-#define MULMID_TOOM42_THRESHOLD 66
-
-#define MULMOD_BNM1_THRESHOLD 19
-#define SQRMOD_BNM1_THRESHOLD 23
-
-#define MUL_FFT_MODF_THRESHOLD 1147 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 1147, 5}, { 36, 6}, { 19, 5}, { 39, 6}, \
- { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \
- { 43, 7}, { 23, 6}, { 49, 7}, { 27, 6}, \
- { 55, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
- { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
- { 39, 7}, { 79, 8}, { 43, 9}, { 23, 8}, \
- { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
- { 47, 9}, { 111,11}, { 31,10}, { 63, 9}, \
- { 143,10}, { 79, 9}, { 159,10}, { 111,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 207,12}, { 63,11}, \
- { 127,10}, { 287,11}, { 159,10}, { 335,11}, \
- { 191,10}, { 383,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 319,10}, { 671,11}, \
- { 351,12}, { 191,11}, { 383,10}, { 799,13}, \
- { 127,12}, { 255,11}, { 511,10}, { 1055, 9}, \
- { 2111,10}, { 1119, 9}, { 2239,11}, { 607,12}, \
- { 319,11}, { 671,10}, { 1407,11}, { 735,10}, \
- { 1471, 9}, { 2943,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 863,10}, { 1727, 9}, { 3455,12}, \
- { 447,11}, { 895,13}, { 255,12}, { 511,11}, \
- { 1055,10}, { 2111,11}, { 1119,10}, { 2239, 9}, \
- { 4479,12}, { 575,11}, { 1247,10}, { 2495, 9}, \
- { 4991,12}, { 639,11}, { 1471,10}, { 2943,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2047,12}, { 1087,11}, \
- { 2239,10}, { 4479,12}, { 1215,11}, { 2495,10}, \
- { 4991,13}, { 639,12}, { 1471,11}, { 2943,10}, \
- { 5887,11}, { 3007,13}, { 767,12}, { 1727,11}, \
- { 3455,13}, { 895,12}, { 1791,11}, { 3711,12}, \
- { 1983,11}, { 3967,10}, { 7935,14}, { 511,13}, \
- { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \
- { 2495,11}, { 4991,13}, { 1279,12}, { 2623,13}, \
- { 1407,12}, { 2943,11}, { 5887,12}, { 3007,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 158
-#define MUL_FFT_THRESHOLD 7808
-
-#define SQR_FFT_MODF_THRESHOLD 896 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 896, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \
- { 17, 5}, { 35, 6}, { 19, 5}, { 39, 6}, \
- { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 27, 6}, { 55, 7}, { 31, 6}, \
- { 63, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \
- { 23, 7}, { 51, 8}, { 27, 7}, { 55, 8}, \
- { 31, 7}, { 63, 8}, { 39, 7}, { 79, 8}, \
- { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
- { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 127,10}, { 79, 9}, { 159,10}, { 95, 9}, \
- { 191,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 191,10}, { 383, 9}, { 767,11}, { 223,12}, \
- { 127,11}, { 255,10}, { 511, 9}, { 1055,10}, \
- { 543,11}, { 287,10}, { 607,11}, { 319,12}, \
- { 191,11}, { 383,10}, { 767,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1055,11}, { 543,10}, \
- { 1119, 9}, { 2239,11}, { 607,12}, { 319,11}, \
- { 671,10}, { 1407,11}, { 735,10}, { 1471, 9}, \
- { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 991,13}, \
- { 255,12}, { 511,11}, { 1055,10}, { 2111,11}, \
- { 1119,10}, { 2239,12}, { 575,11}, { 1247,10}, \
- { 2495,12}, { 639,11}, { 1471,10}, { 2943,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,10}, { 3455,12}, { 959,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2111,12}, { 1087,11}, \
- { 2239,10}, { 4479,12}, { 1215,11}, { 2495,13}, \
- { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \
- { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
- { 1791,11}, { 3711,12}, { 1983,11}, { 3967,10}, \
- { 7935,14}, { 511,13}, { 1023,12}, { 2239,11}, \
- { 4479,13}, { 1151,12}, { 2495,11}, { 4991,13}, \
- { 1279,12}, { 2623,13}, { 1407,12}, { 2943,11}, \
- { 5887,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 159
-#define SQR_FFT_THRESHOLD 7296
-
-#define MULLO_BASECASE_THRESHOLD 12
-#define MULLO_DC_THRESHOLD 55
-#define MULLO_MUL_N_THRESHOLD 14709
-
-#define DC_DIV_QR_THRESHOLD 38
-#define DC_DIVAPPR_Q_THRESHOLD 77
-#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 85
-
-#define INV_MULMOD_BNM1_THRESHOLD 56
-#define INV_NEWTON_THRESHOLD 121
-#define INV_APPR_THRESHOLD 93
-
-#define BINV_NEWTON_THRESHOLD 366
-#define REDC_1_TO_REDC_N_THRESHOLD 64
-
-#define MU_DIV_QR_THRESHOLD 2350
-#define MU_DIVAPPR_Q_THRESHOLD 2172
-#define MUPI_DIV_QR_THRESHOLD 62
-#define MU_BDIV_QR_THRESHOLD 2172
-#define MU_BDIV_Q_THRESHOLD 2304
-
-#define POWM_SEC_TABLE 1,19,102,615,2111
-
-#define MATRIX22_STRASSEN_THRESHOLD 23
-#define HGCD_THRESHOLD 88
-#define HGCD_APPR_THRESHOLD 93
-#define HGCD_REDUCE_THRESHOLD 5010
-#define GCD_DC_THRESHOLD 379
-#define GCDEXT_DC_THRESHOLD 258
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 147
-#define SET_STR_PRECOMPUTE_THRESHOLD 894
-
-#define FAC_DSC_THRESHOLD 906
-#define FAC_ODD_THRESHOLD 28
+/* 2600 MHz Pentium 4 model 2 */
+
+/* Generated by tuneup.c, 2009-01-06, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 31
+#define MUL_TOOM3_THRESHOLD 119
+#define MUL_TOOM44_THRESHOLD 178
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 49
+#define SQR_TOOM3_THRESHOLD 165
+#define SQR_TOOM4_THRESHOLD 252
+
+#define MULLOW_BASECASE_THRESHOLD 15
+#define MULLOW_DC_THRESHOLD 44
+#define MULLOW_MUL_N_THRESHOLD 363
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_DC_THRESHOLD 33
+#define POWM_THRESHOLD 95
+
+#define MATRIX22_STRASSEN_THRESHOLD 23
+#define HGCD_THRESHOLD 64
+#define GCD_DC_THRESHOLD 310
+#define GCDEXT_DC_THRESHOLD 310
+#define JACOBI_BASE_METHOD 1
+
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 118
+#define SET_STR_PRECOMPUTE_THRESHOLD 1078
+
+#define MUL_FFT_TABLE { 560, 928, 1920, 5632, 14336, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 720
+#define MUL_FFT_THRESHOLD 9216
+
+#define SQR_FFT_TABLE { 592, 928, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 608
+#define SQR_FFT_THRESHOLD 5888
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1.asm b/gmp/mpn/x86/pentium4/sse2/mod_1.asm
new file mode 100644
index 0000000000..0e95f13913
--- /dev/null
+++ b/gmp/mpn/x86/pentium4/sse2/mod_1.asm
@@ -0,0 +1,391 @@
+dnl Intel Pentium-4 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl P4: 31 cycles/limb.
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse);
+C
+C An idea was tried in the mul-by-inverse to process the last limb by a jump
+C back to the top of the loop skipping the -4(%esi) fetch. But that seemed
+C to produce slightly strange timings, like 9 and 10 limb operations about
+C the same speed. The jump would be successively taken and not-taken, which
+C in theory should predict ok, but perhaps isn't enjoyed by the chip.
+C Duplicating the loop for the last limb seems to be a couple of cycles
+C quicker too.
+C
+C Enhancements:
+C
+C The loop measures 31 cycles, but the dependent chain would suggest it
+C could be done with 30. Not sure where to start looking for the extra
+C cycle.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 80-90 cycles to calculate, but after that the
+dnl multiply is 31 c/l versus division at about 58 c/l.
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, SAVE_ESI
+ movl $32, %eax
+
+ movd %eax, %mm6 C l = 0, so 32-l = 32
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+
+ movd PARAM_DIVISOR, %mm5
+ pxor %mm7, %mm7 C l = 0
+
+ movd -4(%esi,%ecx,4), %mm0 C src high limb
+ leal -8(%esi,%ecx,4), %esi C &src[size-2]
+
+ movd PARAM_INVERSE, %mm4
+ subl $2, %ecx C size-2
+
+ psubq %mm5, %mm0 C high-divisor
+ movq %mm0, %mm2
+
+ psrlq $32, %mm0 C -1 if underflow
+
+ pand %mm5, %mm0 C divisor if underflow
+
+ paddq %mm2, %mm0 C addback if underflow
+ jz L(inverse_last) C if size==2
+ ja L(inverse_top) C if size>2
+
+
+ C if size==1
+ movl SAVE_ESI, %esi
+ movd %mm0, %eax
+ emms
+ ret
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_SIZE, %ecx
+ movl %esi, SAVE_ESI
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+
+ movl PARAM_CARRY, %edx
+ orl %ecx, %ecx
+ jz L(divide_done) C result==carry if size==0
+
+ movl PARAM_DIVISOR, %ebp
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, SAVE_ESI
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+
+ movl PARAM_DIVISOR, %ebp
+ xorl %edx, %edx C result 0 if size==0
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ leal -1(%ecx), %edx
+ cmpl %ebp, %eax C c if high<divisor
+
+ cmovc( %edx, %ecx) C size-1 if high<divisor
+
+ movl $0, %edx C initial carry
+ cmovc( %eax, %edx) C src high limb if high<divisor
+
+ orl %ecx, %ecx
+ jz L(divide_done) C if size==1 and skip div
+
+
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ leal -4(%esi,%ecx,4), %esi C &src[size-1]
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+L(divide_top):
+ C eax
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx remainder
+ C esi src, decrementing
+ C edi
+ C ebp divisor
+
+ movl (%esi), %eax
+ subl $4, %esi
+
+ divl %ebp
+
+ subl $1, %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ movd %edx, %mm1 C carry
+ movl %ecx, %edx C size
+ movl $31, %ecx
+
+ C
+
+ xorl %eax, %ecx C l = leading zeros on d
+ addl $1, %eax C 32-l
+
+ shll %cl, %ebp C normalize d
+ movd %ecx, %mm7 C l
+ leal -1(%edx), %ecx C size-1
+
+ movd %eax, %mm6 C 32-l
+ movl $-1, %edx
+ movl $-1, %eax
+
+ C
+
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1 / d)
+
+ movd %ebp, %mm5 C d
+ movd (%esi), %mm0 C src high limb
+ punpckldq %mm1, %mm0
+ psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
+
+ C
+
+ movd %eax, %mm4 C m
+
+
+C The dependent chain here consists of
+C
+C 2 paddd n1+n2
+C 8 pmuludq m*(n1+n2)
+C 2 paddq n2:nadj + m*(n1+n2)
+C 2 psrlq q1
+C 8 pmuludq d*q1
+C 2 psubq (n-d)-q1*d
+C 2 psrlq high mask
+C 2 pand d masked
+C 2 paddd n2+d addback
+C --
+C 30
+C
+C But it seems to run at 31 cycles, so presumably there's something else
+C going on.
+
+
+ ALIGN(16)
+L(inverse_top):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx
+ C esi src, decrementing
+ C edi
+ C ebp
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movd -4(%esi), %mm1 C next src limbs
+ movd (%esi), %mm2
+ leal -4(%esi), %esi
+
+ punpckldq %mm2, %mm1
+ psrlq %mm6, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ pand %mm5, %mm3 C mask & d
+
+ paddd %mm3, %mm0 C addback if necessary
+
+ subl $1, %ecx
+ jnz L(inverse_top)
+
+
+ C Least significant limb.
+ C Same code as the loop, but there's no -4(%esi) limb to fetch.
+
+L(inverse_last):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[0]
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ movd (%esi), %mm1 C src[0]
+ psllq %mm7, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ movl SAVE_EBP, %ebp
+ pand %mm5, %mm3 C mask & d
+
+ movl SAVE_ESI, %esi
+ paddd %mm3, %mm0 C addback if necessary
+
+ psrld %mm7, %mm0
+
+ movd %mm0, %eax
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm b/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
deleted file mode 100644
index ee88babeee..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
+++ /dev/null
@@ -1,166 +0,0 @@
-dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO:
-C * Optimize. The present code was written quite straightforwardly.
-C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
-C * Write a cps function that uses sse2 insns.
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 16
-C P4 model 3-4 (Prescott) 18
-
-C INPUT PARAMETERS
-C ap sp + 4
-C n sp + 8
-C b sp + 12
-C cps sp + 16
-
-define(`B1modb', `%mm1')
-define(`B2modb', `%mm2')
-define(`ap', `%edx')
-define(`n', `%eax')
-
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1_1p)
- push %ebx
- mov 8(%esp), ap
- mov 12(%esp), n
- mov 20(%esp), %ecx
- movd 8(%ecx), B1modb
- movd 12(%ecx), B2modb
-
- lea -4(ap,n,4), ap
-
-C FIXME: See comment in generic/mod_1_1.c.
- movd (ap), %mm7
- movd -4(ap), %mm4
- pmuludq B1modb, %mm7
- paddq %mm4, %mm7
- add $-2, n
- jz L(end)
-
- ALIGN(8)
-L(top): movq %mm7, %mm6
- psrlq $32, %mm7 C rh
- movd -8(ap), %mm0
- add $-4, ap
- pmuludq B2modb, %mm7
- pmuludq B1modb, %mm6
- add $-1, n
- paddq %mm0, %mm7
- paddq %mm6, %mm7
- jnz L(top)
-
-L(end): pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
- pand %mm7, %mm4 C rl
- psrlq $32, %mm7 C rh
- pmuludq B1modb, %mm7 C rh,cl
- paddq %mm4, %mm7 C rh,rl
- movd 4(%ecx), %mm4 C cnt
- psllq %mm4, %mm7 C rh,rl normalized
- movq %mm7, %mm2 C rl in low half
- psrlq $32, %mm7 C rh
- movd (%ecx), %mm1 C bi
- pmuludq %mm7, %mm1 C qh,ql
- paddq %mm2, %mm1 C qh-1,ql
- movd %mm1, %ecx C ql
- psrlq $32, %mm1 C qh-1
- movd 16(%esp), %mm3 C b
- pmuludq %mm1, %mm3 C (qh-1) * b
- psubq %mm3, %mm2 C r in low half (could use psubd)
- movd %mm2, %eax C r
- mov 16(%esp), %ebx
- sub %ebx, %eax C r
- cmp %eax, %ecx
- lea (%eax,%ebx), %edx
- cmovc( %edx, %eax)
- movd %mm4, %ecx C cnt
- cmp %ebx, %eax
- jae L(fix)
- emms
- pop %ebx
- shr %cl, %eax
- ret
-
-L(fix): sub %ebx, %eax
- emms
- pop %ebx
- shr %cl, %eax
- ret
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1_1p_cps)
-C CAUTION: This is the same code as in k7/mod_1_1.asm
- push %ebp
- mov 12(%esp), %ebp
- push %esi
- bsr %ebp, %ecx
- push %ebx
- xor $31, %ecx
- mov 16(%esp), %esi
- sal %cl, %ebp
- mov %ebp, %edx
- not %edx
- mov $-1, %eax
- div %ebp
- mov %eax, (%esi) C store bi
- mov %ecx, 4(%esi) C store cnt
- xor %ebx, %ebx
- sub %ebp, %ebx
- mov $1, %edx
- shld %cl, %eax, %edx
- imul %edx, %ebx
- mul %ebx
- add %ebx, %edx
- not %edx
- imul %ebp, %edx
- add %edx, %ebp
- cmp %edx, %eax
- cmovc( %ebp, %edx)
- shr %cl, %ebx
- mov %ebx, 8(%esi) C store B1modb
- shr %cl, %edx
- mov %edx, 12(%esi) C store B2modb
- pop %ebx
- pop %esi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm b/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
deleted file mode 100644
index eb2edb6297..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
+++ /dev/null
@@ -1,269 +0,0 @@
-dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO:
-C * Optimize. The present code was written quite straightforwardly.
-C * Optimize post-loop reduction code.
-C * Write a cps function that uses sse2 insns.
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 3.4
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4
-C P4 model 3-4 (Prescott) 4.5
-
-C INPUT PARAMETERS
-C ap sp + 4
-C n sp + 8
-C b sp + 12
-C cps sp + 16
-
-define(`B1modb', `%mm1')
-define(`B2modb', `%mm2')
-define(`B3modb', `%mm3')
-define(`B4modb', `%mm4')
-define(`B5modb', `%mm5')
-define(`ap', `%edx')
-define(`n', `%eax')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p)
- push %ebx
- mov 8(%esp), ap
- mov 12(%esp), n
- mov 20(%esp), %ecx
-
- movd 8(%ecx), B1modb
- movd 12(%ecx), B2modb
- movd 16(%ecx), B3modb
- movd 20(%ecx), B4modb
- movd 24(%ecx), B5modb
-
- mov n, %ebx
- lea -4(ap,n,4), ap
- and $3, %ebx
- je L(b0)
- cmp $2, %ebx
- jc L(b1)
- je L(b2)
-
-L(b3): movd -4(ap), %mm7
- pmuludq B1modb, %mm7
- movd -8(ap), %mm6
- paddq %mm6, %mm7
- movd (ap), %mm6
- pmuludq B2modb, %mm6
- paddq %mm6, %mm7
- lea -24(ap), ap
- add $-3, n
- jz L(end)
- jmp L(top)
-
-L(b0): movd -8(ap), %mm7
- pmuludq B1modb, %mm7
- movd -12(ap), %mm6
- paddq %mm6, %mm7
- movd -4(ap), %mm6
- pmuludq B2modb, %mm6
- paddq %mm6, %mm7
- movd (ap), %mm6
- pmuludq B3modb, %mm6
- paddq %mm6, %mm7
- lea -28(ap), ap
- add $-4, n
- jz L(end)
- jmp L(top)
-
-L(b1): movd (ap), %mm7
- lea -16(ap), ap
- dec n
- jz L(x)
- jmp L(top)
-
-L(b2): movd -4(ap), %mm7 C rl
- punpckldq (ap), %mm7 C rh
- lea -20(ap), ap
- add $-2, n
- jz L(end)
-
- ALIGN(8)
-L(top): movd 4(ap), %mm0
- pmuludq B1modb, %mm0
- movd 0(ap), %mm6
- paddq %mm6, %mm0
-
- movd 8(ap), %mm6
- pmuludq B2modb, %mm6
- paddq %mm6, %mm0
-
- movd 12(ap), %mm6
- pmuludq B3modb, %mm6
- paddq %mm6, %mm0
-
- movq %mm7, %mm6
- psrlq $32, %mm7 C rh
- pmuludq B5modb, %mm7
- pmuludq B4modb, %mm6
-
- paddq %mm0, %mm7
- paddq %mm6, %mm7
-
- add $-16, ap
- add $-4, n
- jnz L(top)
-
-L(end): pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
- pand %mm7, %mm4 C rl
- psrlq $32, %mm7 C rh
- pmuludq B1modb, %mm7 C rh,cl
- paddq %mm4, %mm7 C rh,rl
-L(x): movd 4(%ecx), %mm4 C cnt
- psllq %mm4, %mm7 C rh,rl normalized
- movq %mm7, %mm2 C rl in low half
- psrlq $32, %mm7 C rh
- movd (%ecx), %mm1 C bi
- pmuludq %mm7, %mm1 C qh,ql
- paddq %mm2, %mm1 C qh-1,ql
- movd %mm1, %ecx C ql
- psrlq $32, %mm1 C qh-1
- movd 16(%esp), %mm3 C b
- pmuludq %mm1, %mm3 C (qh-1) * b
- psubq %mm3, %mm2 C r in low half (could use psubd)
- movd %mm2, %eax C r
- mov 16(%esp), %ebx
- sub %ebx, %eax C r
- cmp %eax, %ecx
- lea (%eax,%ebx), %edx
- cmovc( %edx, %eax)
- movd %mm4, %ecx C cnt
- cmp %ebx, %eax
- jae L(fix)
- emms
- pop %ebx
- shr %cl, %eax
- ret
-
-L(fix): sub %ebx, %eax
- emms
- pop %ebx
- shr %cl, %eax
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p_cps)
-C CAUTION: This is the same code as in k7/mod_1_4.asm
- push %ebp
- push %edi
- push %esi
- push %ebx
- mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
- mov 24(%esp), %ebx
- bsr %ebx, %ecx
- xor $31, %ecx
- sal %cl, %ebx C b << cnt
- mov %ebx, %edx
- not %edx
- mov $-1, %eax
- div %ebx
- xor %edi, %edi
- sub %ebx, %edi
- mov $1, %esi
- mov %eax, (%ebp) C store bi
- mov %ecx, 4(%ebp) C store cnt
- shld %cl, %eax, %esi
- imul %edi, %esi
- mov %eax, %edi
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 8(%ebp) C store B1modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 12(%ebp) C store B2modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 16(%ebp) C store B3modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 20(%ebp) C store B4modb
-
- not %edx
- imul %ebx, %edx
- add %edx, %ebx
- cmp %edx, %eax
- cmovnc( %edx, %ebx)
-
- shr %cl, %ebx
- mov %ebx, 24(%ebp) C store B5modb
-
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
index 31e25b79bc..1598b41785 100644
--- a/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-dnl Copyright 2000-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/mode1o.asm b/gmp/mpn/x86/pentium4/sse2/mode1o.asm
index 778c478169..2f0b177a00 100644
--- a/gmp/mpn/x86/pentium4/sse2/mode1o.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mode1o.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -112,7 +101,7 @@ ifdef(`PIC',`
psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
movd %mm6, %eax
imul PARAM_DIVISOR, %eax
@@ -124,13 +113,13 @@ ifdef(`PIC',`
C The dependent chain here is as follows.
C
-C latency
-C psubq s = (src-cbit) - climb 2
-C pmuludq q = s*inverse 8
-C pmuludq prod = q*divisor 8
-C psrlq climb = high(prod) 2
-C --
-C 20
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
C
C Yet the loop measures 19.0 c/l, so obviously there's something gained
C there over a straight reading of the chip documentation.
diff --git a/gmp/mpn/x86/pentium4/sse2/mul_1.asm b/gmp/mpn/x86/pentium4/sse2/mul_1.asm
index 6347b8bf62..07be951921 100644
--- a/gmp/mpn/x86/pentium4/sse2/mul_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mul_1.asm
@@ -1,48 +1,37 @@
dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
-
+dnl Copyright 2005, 2007 Free Software Foundation, Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 4.17
-C P6 model 13 (Dothan) 4.17
-C P4 model 0-1 (Willamette) 4
-C P4 model 2 (Northwood) 4
-C P4 model 3-4 (Prescott) 4.55
-
C TODO:
C * Tweak eax/edx offsets in loop as to save some lea's
C * Perhaps software pipeline small-case code
+C cycles/limb
+C P6 model 0-8,10-12) -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4.17
+C P4 model 0-1 (Willamette): 4
+C P4 model 2 (Northwood): 4
+C P4 model 3-4 (Prescott): 4.55
+
C INPUT PARAMETERS
C rp sp + 4
C up sp + 8
@@ -51,13 +40,22 @@ C v0 sp + 16
TEXT
ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+ mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ movd 20(%esp), %mm6
+ jmp L(ent)
+EPILOGUE()
+ ALIGN(16)
PROLOGUE(mpn_mul_1)
- pxor %mm6, %mm6
-L(ent): mov 4(%esp), %edx
+ mov 4(%esp), %edx
mov 8(%esp), %eax
mov 12(%esp), %ecx
movd 16(%esp), %mm7
- cmp $4, %ecx
+ pxor %mm6, %mm6
+L(ent): cmp $4, %ecx
jnc L(big)
L(lp0): movd (%eax), %mm0
@@ -158,7 +156,3 @@ L(end): pmuludq %mm7, %mm2
emms
ret
EPILOGUE()
-PROLOGUE(mpn_mul_1c)
- movd 20(%esp), %mm6
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
index 6e3775ae09..2628e5eb72 100644
--- a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
-
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/popcount.asm b/gmp/mpn/x86/pentium4/sse2/popcount.asm
index b8238b9b66..cb982ade46 100644
--- a/gmp/mpn/x86/pentium4/sse2/popcount.asm
+++ b/gmp/mpn/x86/pentium4/sse2/popcount.asm
@@ -1,66 +1,52 @@
dnl X86-32 and X86-64 mpn_popcount using SSE2.
-dnl Copyright 2006, 2007, 2011 Free Software Foundation, Inc.
-
+dnl Copyright 2006, 2007 Free Software Foundation, Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C 32-bit popcount hamdist
-C cycles/limb cycles/limb
-C P5 -
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 4
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) 3.9
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 -
-C AMD K7 -
-C AMD K8 ?
-
-C 64-bit popcount hamdist
-C cycles/limb cycles/limb
-C P4 model 4 (Nocona): 8
-C AMD K8,K9 7.5
-C AMD K10 3.5
-C Intel core2 3.68
-C Intel corei 3.15
-C Intel atom 10.8
-C VIA nano 6.5
+C 32-bit popcount hamdist
+C cycles/limb cycles/limb
+C P5: -
+C P6 model 0-8,10-12) -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 3.9
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C K6: -
+C K7: -
+C K8: ?
+
+C 64-bit popcount hamdist
+C cycles/limb cycles/limb
+C P4 model 4 (Nocona): 8
+C K8: 7.5
+C K10: 3.5
+C P6-15: 3.68
C TODO
C * Make a mpn_hamdist based on this. Alignment could either be handled by
C using movdqu for one operand and movdqa for the other, or by painfully
-C shifting as we go. Unfortunately, there seem to be no usable shift
+C shifting as we go. Unfortunately, there seem to be no useable shift
C instruction, except for one that takes an immediate count.
C * It would probably be possible to cut a few cycles/limb using software
C pipelining.
diff --git a/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm b/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
index f421d1323e..bbf43245cb 100644
--- a/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
-dnl Copyright 2001-2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm b/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
index 2dd57d25d9..fc56f164ed 100644
--- a/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
+++ b/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -41,7 +30,7 @@ C * Look into different loop alignment, we now expand the code about 50 bytes
C with possibly needless alignment.
C * Use OSP, should solve feed-in latency problems.
C * Address relative slowness for un<=3 for Pentium M. The old code is there
-C considerably faster. (1:20/14, 2:34:32, 3:66/57)
+C consideraly faster. (1:20/14, 2:34:32, 3:66/57)
C INPUT PARAMETERS
C rp sp + 4
diff --git a/gmp/mpn/x86/pentium4/sse2/sub_n.asm b/gmp/mpn/x86/pentium4/sse2/sub_n.asm
index 5ba1c018ec..02d5f01474 100644
--- a/gmp/mpn/x86/pentium4/sse2/sub_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/sub_n.asm
@@ -1,44 +1,37 @@
dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C dst!=src1,2 dst==src1 dst==src2
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4 6 6
-C P4 model 3-4 (Prescott) 4.25 7.5 7.5
+C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
+C 6.0 cycles/limb if dst==src1 or dst==src2
+C P4 Prescott: >= 5 cycles/limb
+
+
+C mp_limb_t mpn_sub_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_sub_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C The main loop code is 2x unrolled so that the carry bit can alternate
+C between mm0 and mm1.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
@@ -54,8 +47,10 @@ define(SAVE_EBX,`PARAM_SRC1')
PROLOGUE(mpn_sub_nc)
deflit(`FRAME',0)
+
movd PARAM_CARRY, %mm0
jmp L(start_nc)
+
EPILOGUE()
ALIGN(8)
@@ -63,16 +58,16 @@ PROLOGUE(mpn_sub_n)
deflit(`FRAME',0)
pxor %mm0, %mm0
L(start_nc):
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
+ movl PARAM_SRC1, %eax
+ movl %ebx, SAVE_EBX
+ movl PARAM_SRC2, %ebx
+ movl PARAM_DST, %edx
+ movl PARAM_SIZE, %ecx
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
+ leal (%eax,%ecx,4), %eax C src1 end
+ leal (%ebx,%ecx,4), %ebx C src2 end
+ leal (%edx,%ecx,4), %edx C dst end
+ negl %ecx C -size
L(top):
C eax src1 end
@@ -90,7 +85,7 @@ L(top):
psrlq $63, %mm1
- add $1, %ecx
+ addl $1, %ecx
jz L(done_mm1)
movd (%eax,%ecx,4), %mm0
@@ -102,17 +97,18 @@ L(top):
psrlq $63, %mm0
- add $1, %ecx
+ addl $1, %ecx
jnz L(top)
+
movd %mm0, %eax
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
emms
ret
L(done_mm1):
movd %mm1, %eax
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
emms
ret
diff --git a/gmp/mpn/x86/pentium4/sse2/submul_1.asm b/gmp/mpn/x86/pentium4/sse2/submul_1.asm
index 020675bd7b..ceb41f2ac0 100644
--- a/gmp/mpn/x86/pentium4/sse2/submul_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/submul_1.asm
@@ -1,71 +1,60 @@
dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
dnl subtract the result from a second limb vector.
-dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 6.8
-C P6 model 13 (Dothan) 6.9
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 5.87
-C P4 model 3-4 (Prescott) 6.5
+C P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon
+C (stepping 10).
-C This code represents a step forwards compared to the code available before
-C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is
-C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and
-C Prescott compared to the old code.
+
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C This code is not particularly good at 7 c/l. The dependent chain is only
+C 4 c/l and there's only 4 MMX unit instructions, so it's not clear why that
+C speed isn't achieved.
C
C The arrangements made here to get a two instruction dependent chain are
-C slightly subtle. In the loop the carry (or borrow rather) is a negative so
-C that a paddq can be used to give a low limb ready to store, and a high limb
-C ready to become the new carry after a psrlq.
+C slightly subtle. In the loop the carry (or borrow rather) is a negative
+C so that a paddq can be used to give a low limb ready to store, and a high
+C limb ready to become the new carry after a psrlq.
C
-C If the carry was a simple twos complement negative then the psrlq shift would
-C need to bring in 0 bits or 1 bits according to whether the high was zero or
-C non-zero, since a non-zero value would represent a negative needing sign
-C extension. That wouldn't be particularly easy to arrange and certainly would
-C add an instruction to the dependent chain, so instead an offset is applied so
-C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to
-C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
-C always positive and can always have 0 bits shifted in, which is what psrlq
-C does.
+C If the carry was a simple twos complement negative then the psrlq shift
+C would need to bring in 0 bits or 1 bits according to whether the high was
+C zero or non-zero, since a non-zero value would represent a negative
+C needing sign extension. That wouldn't be particularly easy to arrange and
+C certainly would add an instruction to the dependent chain, so instead an
+C offset is applied so that the high limb will be 0xFFFFFFFF+c. With c in
+C the range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to
+C 0xFFFFFFFF and is therefore always positive and can always have 0 bits
+C shifted in, which is what psrlq does.
C
C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
C done off the dependent chain. The total adjustment then is to add
-C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
-C to remove the offset from the current carry, for a net add of
-C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when
-C fetched.
+C 0xFFFFFFFF00000000 to offset the new carry, and subtract
+C 0x00000000FFFFFFFF to remove the offset from the current carry, for a net
+C add of 0xFFFFFFFE00000001. In the code this is applied to the destination
+C limb when fetched.
C
C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
C negative, which is how it's undone for the return value, but that doesn't
@@ -91,16 +80,16 @@ deflit(`FRAME',0)
pxor %mm1, %mm1 C initial borrow
L(start_1c):
- mov PARAM_SRC, %eax
+ movl PARAM_SRC, %eax
pcmpeqd %mm0, %mm0
movd PARAM_MULTIPLIER, %mm7
pcmpeqd %mm6, %mm6
- mov PARAM_DST, %edx
+ movl PARAM_DST, %edx
psrlq $32, %mm0 C 0x00000000FFFFFFFF
- mov PARAM_SIZE, %ecx
+ movl PARAM_SIZE, %ecx
psllq $32, %mm6 C 0xFFFFFFFF00000000
psubq %mm0, %mm6 C 0xFFFFFFFE00000001
@@ -108,75 +97,32 @@ L(start_1c):
psubq %mm1, %mm0 C 0xFFFFFFFF - borrow
- movd (%eax), %mm3 C up
- movd (%edx), %mm4 C rp
-
- add $-1, %ecx
- paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm3
- jnz L(gt1)
- psubq %mm3, %mm4 C prod
- paddq %mm4, %mm0 C borrow
- movd %mm0, (%edx) C result
- jmp L(rt)
-
-L(gt1): movd 4(%eax), %mm1 C up
- movd 4(%edx), %mm2 C rp
-
- add $-1, %ecx
- jz L(eev)
-
- ALIGN(16)
-L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ C eax src, incrementing
+ C ebx
+ C ecx loop counter, decrementing
+ C edx dst, incrementing
+ C
+ C mm0 0xFFFFFFFF - borrow
+ C mm6 0xFFFFFFFE00000001
+ C mm7 multiplier
+
+L(loop):
+ movd (%eax), %mm1 C src
+ leal 4(%eax), %eax
+ movd (%edx), %mm2 C dst
+ paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
pmuludq %mm7, %mm1
- psubq %mm3, %mm4 C prod
- movd 8(%eax), %mm3 C up
- paddq %mm4, %mm0 C borrow
- movd 8(%edx), %mm4 C rp
- movd %mm0, (%edx) C result
- psrlq $32, %mm0
-
- add $-1, %ecx
- jz L(eod)
-
- paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm3
psubq %mm1, %mm2 C prod
- movd 12(%eax), %mm1 C up
paddq %mm2, %mm0 C borrow
- movd 12(%edx), %mm2 C rp
- movd %mm0, 4(%edx) C result
- psrlq $32, %mm0
-
- lea 8(%eax), %eax
- lea 8(%edx), %edx
- add $-1, %ecx
- jnz L(top)
-
-
-L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm1
- psubq %mm3, %mm4 C prod
- paddq %mm4, %mm0 C borrow
+ subl $1, %ecx
movd %mm0, (%edx) C result
psrlq $32, %mm0
- psubq %mm1, %mm2 C prod
- paddq %mm2, %mm0 C borrow
- movd %mm0, 4(%edx) C result
-L(rt): psrlq $32, %mm0
+ leal 4(%edx), %edx
+ jnz L(loop)
+
movd %mm0, %eax
- not %eax
+ notl %eax
emms
ret
-L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm3
- psubq %mm1, %mm2 C prod
- paddq %mm2, %mm0 C borrow
- movd %mm0, 4(%edx) C result
- psrlq $32, %mm0
- psubq %mm3, %mm4 C prod
- paddq %mm4, %mm0 C borrow
- movd %mm0, 8(%edx) C result
- jmp L(rt)
EPILOGUE()