author     Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
committer  Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
commit     5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree       4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/i386/i686
parent     199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
download   glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.gz

Prepare for radical source tree reorganization. (branch: zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage directory, REORG.TODO, except for files that will certainly still exist in their current form at top level when we're done (COPYING, COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which are moved to the new directory OldChangeLogs, instead), and the generated file INSTALL (which is just deleted; in the new order, there will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/i386/i686')
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/Makefile | 12
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/add_n.S | 110
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/bcopy.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/bzero.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/dl-hash.h | 79
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/ffs.c | 48
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/e_log.S | 29
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S | 94
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S | 22
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S | 325
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps | 2188
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S | 553
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c | 29
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S | 586
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S | 566
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c | 28
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S | 58
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S | 58
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/hp-timing.h | 42
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/init-arch.h | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memcmp.S | 408
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memcpy.S | 98
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memmove.S | 120
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/mempcpy.S | 65
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memset.S | 100
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memusage.h | 21
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/Makefile | 44
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S | 59
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S | 62
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c | 376
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym | 11
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S | 502
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S | 709
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S | 65
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S | 1225
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 2157
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S | 62
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S | 681
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 1809
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 3162
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S | 78
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S | 50
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S | 89
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S | 94
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S | 81
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S | 50
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S | 417
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S | 724
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S | 45
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S | 811
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S | 860
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset.S | 75
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S | 82
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S | 65
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c | 27
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c | 27
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c | 12
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c | 13
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S | 1245
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S | 572
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S | 92
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S | 158
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S | 348
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S | 57
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S | 804
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 2810
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S | 95
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S | 2250
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 3901
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S | 116
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S | 75
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S | 125
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S | 695
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S | 60
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c | 13
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c | 10
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S | 282
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S | 708
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S | 57
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S | 56
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c | 22
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S | 219
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S | 36
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c | 14
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S | 1018
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S | 600
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S | 36
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S | 193
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S | 354
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S | 35
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/nptl/tls.h | 35
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/stack-aliasing.h | 23
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/strcmp.S | 52
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/tst-stack-align.h | 44
167 files changed, 38206 insertions(+), 0 deletions(-)
diff --git a/REORG.TODO/sysdeps/i386/i686/Makefile b/REORG.TODO/sysdeps/i386/i686/Makefile
new file mode 100644
index 0000000000..311042787b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/Makefile
@@ -0,0 +1,12 @@
+# So that we can test __m128's alignment
+stack-align-test-flags += -msse
+
+CFLAGS-.o += -Wa,-mtune=i686
+CFLAGS-.os += -Wa,-mtune=i686
+CFLAGS-.op += -Wa,-mtune=i686
+CFLAGS-.oS += -Wa,-mtune=i686
+
+ASFLAGS-.o += -Wa,-mtune=i686
+ASFLAGS-.os += -Wa,-mtune=i686
+ASFLAGS-.op += -Wa,-mtune=i686
+ASFLAGS-.oS += -Wa,-mtune=i686
diff --git a/REORG.TODO/sysdeps/i386/i686/add_n.S b/REORG.TODO/sysdeps/i386/i686/add_n.S
new file mode 100644
index 0000000000..4afa648ceb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/add_n.S
@@ -0,0 +1,110 @@
+/* Add two limb vectors of the same length > 0 and store sum in a third
+ limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+#ifdef PIC
+L(1): addl (%esp), %eax
+ ret
+#endif
+ENTRY (__mpn_add_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 4)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 0)
+ movl S2(%esp),%edx
+ movl SIZE(%esp),%ecx
+ movl %ecx,%eax
+ shrl $3,%ecx /* compute count for unrolled loop */
+ negl %eax
+ andl $7,%eax /* get index where to start loop */
+ jz L(oop) /* necessary special case for 0 */
+ incl %ecx /* adjust loop count */
+ shll $2,%eax /* adjustment for pointers... */
+ subl %eax,%edi /* ... since they are offset ... */
+ subl %eax,%esi /* ... by a constant when we ... */
+ subl %eax,%edx /* ... enter the loop */
+ shrl $2,%eax /* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC. */
+ leal (L(oop)-L(0)-3)(%eax,%eax,8),%eax
+ call L(1)
+L(0):
+#else
+/* Calculate start address in loop for non-PIC. */
+ leal (L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+ jmp *%eax /* jump into loop */
+ ALIGN (3)
+L(oop): movl (%esi),%eax
+ adcl (%edx),%eax
+ movl %eax,(%edi)
+ movl 4(%esi),%eax
+ adcl 4(%edx),%eax
+ movl %eax,4(%edi)
+ movl 8(%esi),%eax
+ adcl 8(%edx),%eax
+ movl %eax,8(%edi)
+ movl 12(%esi),%eax
+ adcl 12(%edx),%eax
+ movl %eax,12(%edi)
+ movl 16(%esi),%eax
+ adcl 16(%edx),%eax
+ movl %eax,16(%edi)
+ movl 20(%esi),%eax
+ adcl 20(%edx),%eax
+ movl %eax,20(%edi)
+ movl 24(%esi),%eax
+ adcl 24(%edx),%eax
+ movl %eax,24(%edi)
+ movl 28(%esi),%eax
+ adcl 28(%edx),%eax
+ movl %eax,28(%edi)
+ leal 32(%edi),%edi
+ leal 32(%esi),%esi
+ leal 32(%edx),%edx
+ decl %ecx
+ jnz L(oop)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_add_n)
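
[Editor's note: a plain C reference for what __mpn_add_n computes -- my own sketch,
not glibc's generic version.  Add two equal-length limb vectors, store the sum in
a third, and return the final carry.  The assembly above gets the same effect from
an 8-way unrolled adc chain entered via a computed jump (the leal/jmp *%eax
sequence), so partial blocks need no separate cleanup loop.

    typedef unsigned long mp_limb_t;

    mp_limb_t
    mpn_add_n_ref (mp_limb_t *res, const mp_limb_t *s1,
                   const mp_limb_t *s2, long n)
    {
      mp_limb_t carry = 0;
      for (long i = 0; i < n; i++)
        {
          mp_limb_t a = s1[i];
          mp_limb_t sum = a + s2[i] + carry;
          /* Carry out iff the add wrapped: sum < a, or sum == a with a
             carry in (the +1 itself wrapped).  */
          carry = sum < a || (carry && sum == a);
          res[i] = sum;
        }
      return carry;
    }
]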
diff --git a/REORG.TODO/sysdeps/i386/i686/bcopy.S b/REORG.TODO/sysdeps/i386/i686/bcopy.S
new file mode 100644
index 0000000000..15ef9419a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bcopy.S
@@ -0,0 +1,3 @@
+#define USE_AS_BCOPY
+#define memmove bcopy
+#include <sysdeps/i386/i686/memmove.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/bzero.S b/REORG.TODO/sysdeps/i386/i686/bzero.S
new file mode 100644
index 0000000000..c7898f18e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include <sysdeps/i386/i686/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i686/dl-hash.h b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
new file mode 100644
index 0000000000..ceda785b32
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
@@ -0,0 +1,79 @@
+/* Compute hash value for given string according to ELF standard.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _DL_HASH_H
+#define _DL_HASH_H 1
+
+
+/* This is the hashing function specified by the ELF ABI. It is highly
+ optimized for the PII processors. Though it will run on i586 it
+ would be much slower than the generic C implementation. So don't
+ use it. */
+static unsigned int
+__attribute__ ((unused))
+_dl_elf_hash (const char *name)
+{
+ unsigned int result;
+ unsigned int temp0;
+ unsigned int temp1;
+
+ __asm__ __volatile__
+ ("movzbl (%1),%2\n\t"
+ "testl %2, %2\n\t"
+ "jz 1f\n\t"
+ "movl %2, %0\n\t"
+ "movzbl 1(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl %2, %0\n\t"
+ "movzbl 2(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl %2, %0\n\t"
+ "movzbl 3(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl %2, %0\n\t"
+ "movzbl 4(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl $5, %1\n\t"
+ "addl %2, %0\n\t"
+ "movzbl (%1), %2\n\t"
+ "jecxz 1f\n"
+ "2:\t"
+ "shll $4, %0\n\t"
+ "movl $0xf0000000, %3\n\t"
+ "incl %1\n\t"
+ "addl %2, %0\n\t"
+ "andl %0, %3\n\t"
+ "andl $0x0fffffff, %0\n\t"
+ "shrl $24, %3\n\t"
+ "movzbl (%1), %2\n\t"
+ "xorl %3, %0\n\t"
+ "testl %2, %2\n\t"
+ "jnz 2b\n"
+ "1:\t"
+ : "=&r" (result), "=r" (name), "=&c" (temp0), "=&r" (temp1)
+ : "0" (0), "1" ((const unsigned char *) name));
+
+ return result;
+}
+
+#endif /* dl-hash.h */
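
[Editor's note: the inline assembly above is an unrolled form of the standard
System V ABI ELF hash.  A portable C rendering of the same computation -- it is
equivalent to the classic `h &= ~g' formulation, because the unconditional xor
and mask are no-ops whenever g is zero:

    static unsigned int
    elf_hash_ref (const char *name)
    {
      const unsigned char *p = (const unsigned char *) name;
      unsigned int h = 0;

      while (*p != '\0')
        {
          h = (h << 4) + *p++;
          unsigned int g = h & 0xf0000000;  /* top nibble, as in the asm */
          h ^= g >> 24;
          h &= 0x0fffffff;
        }
      return h;
    }
]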
diff --git a/REORG.TODO/sysdeps/i386/i686/ffs.c b/REORG.TODO/sysdeps/i386/i686/ffs.c
new file mode 100644
index 0000000000..cbe36ff873
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/ffs.c
@@ -0,0 +1,48 @@
+/* ffs -- find first set bit in a word, counted from least significant end.
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef ffs
+
+#ifdef __GNUC__
+
+int
+__ffs (int x)
+{
+ int cnt;
+ int tmp;
+
+  asm ("bsfl %2,%0\n"		/* Count low bits in X and store in %0.  */
+ "cmovel %1,%0\n" /* If number was zero, use -1 as result. */
+ : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1));
+
+ return cnt + 1;
+}
+weak_alias (__ffs, ffs)
+libc_hidden_def (__ffs)
+libc_hidden_builtin_def (ffs)
+#undef ffsl
+weak_alias (__ffs, ffsl)
+
+#else
+#include <string/ffs.c>
+#endif
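
[Editor's note: a short usage sketch of the contract the bsf/cmove pair
implements.  ffs returns the 1-based index of the least significant set bit,
and 0 for a zero argument; bsfl leaves its output undefined on zero input, so
the cmove substitutes -1, making cnt + 1 come out to 0.

    #include <strings.h>
    #include <assert.h>

    int
    main (void)
    {
      assert (ffs (0) == 0);      /* no bit set */
      assert (ffs (1) == 1);      /* bit 0 -> position 1 */
      assert (ffs (0x80) == 8);   /* bit 7 -> position 8 */
      assert (ffs (0x18) == 4);   /* lowest of bits 3,4 -> position 4 */
      return 0;
    }
]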
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
new file mode 100644
index 0000000000..73060b088c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+ .text
+ENTRY(__ieee754_log)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ fyl2x // log(x)
+ ret
+END(__log_finite)
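
[Editor's note: both entry points lean on the same identity.  fyl2x computes
ST(1) * log2(ST(0)), and with ln(2) preloaded by fldln2 that is
ln(2) * log2(x) = ln(x).  A C sketch of the identity, for illustration only:

    #include <math.h>

    double
    log_via_log2 (double x)
    {
      return M_LN2 * log2 (x);   /* ln(2) * log2(x) == ln(x) */
    }
]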
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
new file mode 100644
index 0000000000..6fd39d50d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+ .text
+ENTRY(__ieee754_logf)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ fyl2x // log(x)
+ ret
+END(__logf_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
new file mode 100644
index 0000000000..7e3bc8d817
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
@@ -0,0 +1,94 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_logl)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ movzwl 4+8(%esp), %eax
+ cmpl $0xc000, %eax
+ jae 5f // x <= -2, avoid overflow from -LDBL_MAX - 1.
+ fsubl MO(one) // x-1 : x : log(2)
+5: fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2)
+ fcomip %st(1) // |x-1| : x-1 : x : log(2)
+ fstp %st(0) // x-1 : x : log(2)
+ jc 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 4f
+ fabs // log(1) is +0 in all rounding modes.
+4: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+
+2: fstp %st(0) // x : log(2)
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ fadd %st(0)
+ ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2)
+ fcomip %st(1) // |x-1| : x-1 : x : log(2)
+ fstp %st(0) // x-1 : x : log(2)
+ jc 2b
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 6f
+ fabs // log(1) is +0 in all rounding modes.
+6: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__logl_finite)
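
[Editor's note: the limit = 0.29 test above picks between two x87 instructions.
For x close to 1, computing log(x) via fyl2x of x itself loses precision to
cancellation, so the code forms x-1 and uses fyl2xp1, which evaluates
log2(1+y) accurately for small y.  The same split in portable C, with log1p
standing in for fyl2xp1 -- a sketch, not the libm implementation:

    #include <math.h>

    double
    log_near_one (double x)
    {
      double d = x - 1.0;
      if (fabs (d) < 0.29)    /* same safe threshold as `limit' above */
        return log1p (d);     /* accurate for x near 1 */
      return log (x);
    }
]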
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
new file mode 100644
index 0000000000..7d9089232f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),math)
+libm-sysdep_routines += e_expf-sse2 e_expf-ia32 s_sinf-sse2 s_cosf-sse2 \
+ s_sincosf-sse2
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
new file mode 100644
index 0000000000..b486b4d1ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
@@ -0,0 +1,22 @@
+/*
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define __ieee754_expf __ieee754_expf_ia32
+#define __expf_finite __expf_finite_ia32
+
+#include <sysdeps/i386/fpu/e_expf.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
new file mode 100644
index 0000000000..e6bb6fa289
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
@@ -0,0 +1,325 @@
+/* SSE2 version of __ieee754_expf and __expf_finite
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ * Let K = 64 (table size).
+ * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ * where
+ * x = m*log(2)/K + y, y in [0.0..log(2)/K]
+ * m = n*K + j, m,n,j - signed integer, j in [0..K-1]
+ * values of 2^(j/K) are tabulated as T[j].
+ *
+ * P(y) is a minimax polynomial approximation of expf(x)-1
+ * on small interval [0.0..log(2)/K].
+ *
+ * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ * __ieee754_expf_sse2(NaN) = NaN
+ * __ieee754_expf_sse2(+INF) = +INF
+ * __ieee754_expf_sse2(-INF) = 0
+ * __ieee754_expf_sse2(x) = 1 for subnormals
+ * for finite argument, only __ieee754_expf_sse2(0)=1 is exact
+ * __ieee754_expf_sse2(x) overflows if x>700
+ * __ieee754_expf_sse2(x) underflows if x<-700
+ *
+ * Note:
+ * For |x|<700, __ieee754_expf_sse2 computes result in double precision,
+ * with accuracy a bit more than needed for expf, and does not round it
+ * to single precision.
+ */
+
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%edx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%edx,reg2,_scale)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+#endif
+
+ .text
+ENTRY(__ieee754_expf_sse2)
+ /* Input: single precision x on stack at address 4(%esp) */
+
+#ifdef PIC
+ LOAD_PIC_REG(dx)
+#endif
+
+ cvtss2sd 4(%esp), %xmm1 /* Convert x to double precision */
+ mov 4(%esp), %ecx /* Copy x */
+ movsd MO1(DP_KLN2), %xmm2 /* DP K/log(2) */
+ movsd MO1(DP_P2), %xmm3 /* DP P2 */
+ movl %ecx, %eax /* x */
+ mulsd %xmm1, %xmm2 /* DP x*K/log(2) */
+ andl $0x7fffffff, %ecx /* |x| */
+ cmpl $0x442f0000, %ecx /* |x|<700 ? */
+ movsd MO1(DP_P3), %xmm4 /* DP P3 */
+ addsd MO1(DP_RS), %xmm2 /* DP x*K/log(2)+RS */
+ jae L(special_paths)
+
+ /* Here if |x|<700 */
+ cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */
+ jb L(small_arg)
+
+ /* Main path: here if 2^(-28)<=|x|<700 */
+ cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */
+ movd %xmm2, %eax /* bits of n*K+j with trash */
+ subss MO1(SP_RS), %xmm2 /* SP t=round(x*K/log(2)) */
+ movl %eax, %ecx /* n*K+j with trash */
+ cvtss2sd %xmm2, %xmm2 /* DP t */
+ andl $0x3f, %eax /* bits of j */
+ mulsd MO1(DP_NLN2K), %xmm2 /* DP -t*log(2)/K */
+ andl $0xffffffc0, %ecx /* bits of n */
+#ifdef __AVX__
+ vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */
+ vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */
+#else
+ addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */
+ movaps %xmm2, %xmm0 /* DP y */
+ mulsd %xmm2, %xmm2 /* DP z=y*y */
+#endif
+ mulsd %xmm2, %xmm4 /* DP P3*z */
+ addl $0xffc0, %ecx /* bits of n + DP exponent bias */
+ mulsd %xmm2, %xmm3 /* DP P2*z */
+ shrl $2, %ecx /* High 2 bytes of DP 2^n */
+ pxor %xmm1, %xmm1 /* clear %xmm1 */
+ addsd MO1(DP_P1), %xmm4 /* DP P3*z+P1 */
+ addsd MO1(DP_P0), %xmm3 /* DP P2*z+P0 */
+ pinsrw $3, %ecx, %xmm1 /* DP 2^n */
+ mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */
+ mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */
+ addsd %xmm4, %xmm0 /* DP P(y) */
+ mulsd MO2(DP_T,%eax,8), %xmm0 /* DP P(y)*T[j] */
+ addsd MO2(DP_T,%eax,8), %xmm0 /* DP T[j]*(P(y)+1) */
+ mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */
+ cvtsd2ss %xmm0, %xmm1
+
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm1, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ lea 4(%esp), %esp /* Return back 4 bytes of stack frame */
+ ret
+
+ .p2align 4
+L(small_arg):
+ /* Here if 0<=|x|<2^(-28) */
+ movss 4(%esp), %xmm0 /* load x */
+ addss MO1(SP_ONE), %xmm0 /* 1.0 + x */
+ /* Return 1.0 with inexact raised, except for x==0 */
+ jmp L(epilogue)
+
+ .p2align 4
+L(special_paths):
+ /* Here if x is NaN, or Inf, or finite |x|>=700 */
+ movss 4(%esp), %xmm0 /* load x */
+
+ cmpl $0x7f800000, %ecx /* |x| is finite ? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=700 */
+ testl $0x80000000, %eax /* sign of x nonzero ? */
+ je L(res_overflow)
+
+ /* Here if finite x<=-700 */
+ movss MO1(SP_SMALL), %xmm0 /* load small value 2^(-100) */
+ mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */
+ jmp L(epilogue)
+
+ .p2align 4
+L(res_overflow):
+ /* Here if finite x>=700 */
+ movss MO1(SP_LARGE), %xmm0 /* load large value 2^100 */
+ mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NAN */
+ jne L(arg_nan) /* |x| is Inf ? */
+
+ /* Here if |x| is Inf */
+ shrl $31, %eax /* Get sign bit of x */
+ movss MO2(SP_INF_0,%eax,4), %xmm0/* return zero or Inf, depending on sign of x */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_nan):
+ /* Here if |x| is NaN */
+ addss %xmm0, %xmm0 /* Return x+x (raise invalid) */
+
+ .p2align 4
+L(epilogue):
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm0, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ lea 4(%esp), %esp /* Return back 4 bytes of stack frame */
+ ret
+END(__ieee754_expf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
+ .long 0x00000000, 0x3ff00000
+ .long 0x3e778061, 0x3ff02c9a
+ .long 0xd3158574, 0x3ff059b0
+ .long 0x18759bc8, 0x3ff08745
+ .long 0x6cf9890f, 0x3ff0b558
+ .long 0x32d3d1a2, 0x3ff0e3ec
+ .long 0xd0125b51, 0x3ff11301
+ .long 0xaea92de0, 0x3ff1429a
+ .long 0x3c7d517b, 0x3ff172b8
+ .long 0xeb6fcb75, 0x3ff1a35b
+ .long 0x3168b9aa, 0x3ff1d487
+ .long 0x88628cd6, 0x3ff2063b
+ .long 0x6e756238, 0x3ff2387a
+ .long 0x65e27cdd, 0x3ff26b45
+ .long 0xf51fdee1, 0x3ff29e9d
+ .long 0xa6e4030b, 0x3ff2d285
+ .long 0x0a31b715, 0x3ff306fe
+ .long 0xb26416ff, 0x3ff33c08
+ .long 0x373aa9cb, 0x3ff371a7
+ .long 0x34e59ff7, 0x3ff3a7db
+ .long 0x4c123422, 0x3ff3dea6
+ .long 0x21f72e2a, 0x3ff4160a
+ .long 0x6061892d, 0x3ff44e08
+ .long 0xb5c13cd0, 0x3ff486a2
+ .long 0xd5362a27, 0x3ff4bfda
+ .long 0x769d2ca7, 0x3ff4f9b2
+ .long 0x569d4f82, 0x3ff5342b
+ .long 0x36b527da, 0x3ff56f47
+ .long 0xdd485429, 0x3ff5ab07
+ .long 0x15ad2148, 0x3ff5e76f
+ .long 0xb03a5585, 0x3ff6247e
+ .long 0x82552225, 0x3ff66238
+ .long 0x667f3bcd, 0x3ff6a09e
+ .long 0x3c651a2f, 0x3ff6dfb2
+ .long 0xe8ec5f74, 0x3ff71f75
+ .long 0x564267c9, 0x3ff75feb
+ .long 0x73eb0187, 0x3ff7a114
+ .long 0x36cf4e62, 0x3ff7e2f3
+ .long 0x994cce13, 0x3ff82589
+ .long 0x9b4492ed, 0x3ff868d9
+ .long 0x422aa0db, 0x3ff8ace5
+ .long 0x99157736, 0x3ff8f1ae
+ .long 0xb0cdc5e5, 0x3ff93737
+ .long 0x9fde4e50, 0x3ff97d82
+ .long 0x82a3f090, 0x3ff9c491
+ .long 0x7b5de565, 0x3ffa0c66
+ .long 0xb23e255d, 0x3ffa5503
+ .long 0x5579fdbf, 0x3ffa9e6b
+ .long 0x995ad3ad, 0x3ffae89f
+ .long 0xb84f15fb, 0x3ffb33a2
+ .long 0xf2fb5e47, 0x3ffb7f76
+ .long 0x904bc1d2, 0x3ffbcc1e
+ .long 0xdd85529c, 0x3ffc199b
+ .long 0x2e57d14b, 0x3ffc67f1
+ .long 0xdcef9069, 0x3ffcb720
+ .long 0x4a07897c, 0x3ffd072d
+ .long 0xdcfba487, 0x3ffd5818
+ .long 0x03db3285, 0x3ffda9e6
+ .long 0x337b9b5f, 0x3ffdfc97
+ .long 0xe78b3ff6, 0x3ffe502e
+ .long 0xa2a490da, 0x3ffea4af
+ .long 0xee615a27, 0x3ffefa1b
+ .long 0x5b6e4540, 0x3fff5076
+ .long 0x819e90d8, 0x3fffa7c1
+ .type L(DP_T), @object
+ ASM_SIZE_DIRECTIVE(L(DP_T))
+
+ .section .rodata.cst8,"aM",@progbits,8
+ .p2align 3
+L(DP_KLN2): /* double precision K/log(2) */
+ .long 0x652b82fe, 0x40571547
+ .type L(DP_KLN2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_KLN2))
+
+ .p2align 3
+L(DP_NLN2K): /* double precision -log(2)/K */
+ .long 0xfefa39ef, 0xbf862e42
+ .type L(DP_NLN2K), @object
+ ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
+
+ .p2align 3
+L(DP_RS): /* double precision 2^23+2^22 */
+ .long 0x00000000, 0x41680000
+ .type L(DP_RS), @object
+ ASM_SIZE_DIRECTIVE(L(DP_RS))
+
+ .p2align 3
+L(DP_P3): /* double precision polynomial coefficient P3 */
+ .long 0xeb78fa85, 0x3fa56420
+ .type L(DP_P3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P3))
+
+ .p2align 3
+L(DP_P1): /* double precision polynomial coefficient P1 */
+ .long 0x008d6118, 0x3fe00000
+ .type L(DP_P1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P1))
+
+ .p2align 3
+L(DP_P2): /* double precision polynomial coefficient P2 */
+ .long 0xda752d4f, 0x3fc55550
+ .type L(DP_P2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P2))
+
+ .p2align 3
+L(DP_P0): /* double precision polynomial coefficient P0 */
+ .long 0xffffe7c6, 0x3fefffff
+ .type L(DP_P0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P0))
+
+ .p2align 2
+L(SP_INF_0):
+ .long 0x7f800000 /* single precision Inf */
+ .long 0 /* single precision zero */
+ .type L(SP_INF_0), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INF_0))
+
+ .section .rodata.cst4,"aM",@progbits,4
+ .p2align 2
+L(SP_RS): /* single precision 2^23+2^22 */
+ .long 0x4b400000
+ .type L(SP_RS), @object
+ ASM_SIZE_DIRECTIVE(L(SP_RS))
+
+ .p2align 2
+L(SP_SMALL): /* single precision small value 2^(-100) */
+ .long 0x0d800000
+ .type L(SP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(SP_SMALL))
+
+ .p2align 2
+L(SP_LARGE): /* single precision large value 2^100 */
+ .long 0x71800000
+ .type L(SP_LARGE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_LARGE))
+
+ .p2align 2
+L(SP_ONE): /* single precision 1.0 */
+ .long 0x3f800000
+ .type L(SP_ONE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+strong_alias (__ieee754_expf_sse2, __expf_finite_sse2)
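
[Editor's note: a compact C model of the table scheme described in the header
comment.  The names and the Taylor stand-in for the minimax P(y) are mine; the
real code reads T[j] from L(DP_T) and rounds via the 2^23+2^22 "RS" trick
rather than lrint.

    #include <math.h>

    #define K 64

    static double
    expf_sketch (float xf)
    {
      double x = xf;
      long m = lrint (x * (K / M_LN2));  /* m = round(x*K/log(2)) */
      long j = m & (K - 1);              /* table index */
      long n = m >> 6;                   /* floor(m/K) in two's complement */
      double y = x - m * (M_LN2 / K);    /* reduced argument, |y| small */
      double t = exp2 ((double) j / K);  /* T[j] = 2^(j/K); tabulated above */
      double z = y * y;
      double p = y + z / 2 + z * y / 6;  /* stand-in for minimax P(y) */
      return ldexp (t * (1.0 + p), (int) n);  /* 2^n * T[j] * (1 + P(y)) */
    }
]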
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
new file mode 100644
index 0000000000..388cf98a39
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
@@ -0,0 +1,37 @@
+/* Multiple versions of expf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern double __ieee754_expf_sse2 (double);
+extern double __ieee754_expf_ia32 (double);
+
+double __ieee754_expf (double);
+libm_ifunc (__ieee754_expf,
+ HAS_CPU_FEATURE (SSE2)
+ ? __ieee754_expf_sse2
+ : __ieee754_expf_ia32);
+
+extern double __expf_finite_sse2 (double);
+extern double __expf_finite_ia32 (double);
+
+double __expf_finite (double);
+libm_ifunc (__expf_finite,
+ HAS_CPU_FEATURE (SSE2)
+ ? __expf_finite_sse2
+ : __expf_finite_ia32);
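
[Editor's note: libm_ifunc expands to a GNU indirect function, whose resolver
runs once at relocation time and returns the implementation the PLT entry
should bind to; it is not consulted per call.  A self-contained sketch with
hypothetical names -- the stand-in bodies just call exp, and glibc's actual
probe is HAS_CPU_FEATURE (SSE2) from init-arch.h:

    #include <math.h>

    static double expf_sse2_impl (double x) { return exp (x); } /* stand-in */
    static double expf_ia32_impl (double x) { return exp (x); } /* stand-in */

    /* Resolver: runs once under the dynamic linker.  */
    static double (*resolve_expf (void)) (double)
    {
      __builtin_cpu_init ();  /* needed here: resolvers can run before
                                 ordinary constructors */
      return __builtin_cpu_supports ("sse2")
             ? expf_sse2_impl : expf_ia32_impl;
    }

    double my_expf (double) __attribute__ ((ifunc ("resolve_expf")));
]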
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
new file mode 100644
index 0000000000..04bc23b37b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -0,0 +1,2188 @@
+# Begin of automatic generation
+
+# Maximal error of functions:
+Function: "acos":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "acos_downward":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_towardzero":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "acosh":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 2
+
+Function: "acosh_downward":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_towardzero":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 3
+
+Function: "asin":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "asin_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asinh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "asinh_downward":
+double: 1
+float: 1
+idouble: 1
+ildouble: 5
+ldouble: 5
+
+Function: "asinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "asinh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "atan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "atanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "atanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 3
+
+Function: "atanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "cabs":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacos_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "cacos_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "cacos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacosh_downward":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cacosh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacosh_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "carg":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "casin_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "casin_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "casin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casin_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casinh_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Imaginary part of "casinh_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "casinh_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "casinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "catan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "catanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catanh":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cbrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "cbrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccosh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cexp":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cexp":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cexp_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cexp_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "clog":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog10":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog10":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "clog10_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "clog10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos":
+ildouble: 1
+ldouble: 1
+
+Function: "cos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh":
+double: 1
+float: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: "cosh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: Real part of "cpow":
+double: 2
+float: 5
+idouble: 2
+ifloat: 5
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cpow":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "cpow_downward":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cpow_towardzero":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cpow_upward":
+double: 4
+float: 1
+idouble: 4
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cpow_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+
+Function: Real part of "csin_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "csinh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csqrt":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csqrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ctan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctan_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ctanh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctanh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "erf":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erfc":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "erfc_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "exp":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_downward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_upward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "expm1":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "expm1_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "gamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "hypot":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "j0_downward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j0_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "j0_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j1":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j1_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "j1_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: "jn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "jn_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "lgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "log":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log1p":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log1p_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "log2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow_downward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_towardzero":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "sin":
+ildouble: 1
+ldouble: 1
+
+Function: "sin_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sin_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sin_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos":
+ildouble: 1
+ldouble: 1
+
+Function: "sincos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sincos_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sinh":
+double: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sinh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "sinh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "sinh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "tan":
+float: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "tan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "tanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 7
+ldouble: 4
+
+Function: "tanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 4
+
+Function: "tgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_downward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "y0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "y0_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "y1":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "y1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "y1_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y1_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: "yn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "yn_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "yn_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "yn_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+# end of automatic generation
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
new file mode 100644
index 0000000000..193dd704b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
@@ -0,0 +1 @@
+i686
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
new file mode 100644
index 0000000000..f37850d0b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
@@ -0,0 +1,553 @@
+/* SSE2-optimized version of cosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ * 1) if |x| == 0: return 1.0-|x|.
+ * 2) if |x| < 2^-27: return 1.0-|x|.
+ * 3) if |x| < 2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
+ * 4) if |x| < Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ * 5) if |x| < 9*Pi/4:
+ * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
+ * t=|x|-j*Pi/4.
+ * 5.2) Reconstruction:
+ * s = (-1.0)^((n>>2)&1)
+ * if(n&2 != 0) {
+ * using cos(t) polynomial for |t|<Pi/4, result is
+ * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ * } else {
+ * using sin(t) polynomial for |t|<Pi/4, result is
+ * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ * }
+ * 6) if |x| < 2^23, large args:
+ * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ * t=|x|-j*Pi/4.
+ * 6.2) Reconstruction same as (5.2).
+ * 7) if |x| >= 2^23, very large args:
+ * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ * t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2).
+ * 8) if x is Inf, return x-x, and set errno=EDOM.
+ * 9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ * cos(+-0) = 1 not raising inexact,
+ * cos(subnormal) raises inexact,
+ * cos(min_normalized) raises inexact,
+ * cos(normalized) raises inexact,
+ * cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ * cos(NaN) = NaN.
+ */
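+
+/* As a rough C sketch (illustrative only; C0..C4 and S0..S4 stand for
+   the DP_C0..DP_C4 and DP_S0..DP_S4 coefficients in the tables at the
+   end of this file, and n, t come from the range reduction above):
+
+     y = t*t;
+     poly_cos = 1.0 + y*(C0 + y*(C1 + y*(C2 + y*(C3 + y*C4))));
+     poly_sin = t * (1.0 + y*(S0 + y*(S1 + y*(S2 + y*(S3 + y*S4)))));
+     s = ((n>>2) & 1) ? -1.0 : 1.0;
+     result = (n & 2) ? s * poly_cos : s * poly_sin;
+*/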
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG) pushl REG; CFI_PUSH(REG)
+# define POP(REG) popl REG; CFI_POP(REG)
+# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X 8(%esp)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN ret
+# define ARG_X 4(%esp)
+#endif
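+
+/* MO1/MO2 expand to either an absolute memory operand or a %ebx-relative
+   GOTOFF operand, so the constant-table references below are written
+   once and work in both PIC and non-PIC builds; in the PIC case
+   ENTRANCE/RETURN additionally save and restore %ebx, which is used as
+   the GOT base.  */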
+
+ .text
+ENTRY(__cosf_sse2)
+ /* Input: single precision x on stack at address ARG_X */
+
+ ENTRANCE
+ movl ARG_X, %eax /* Bits of x */
+ cvtss2sd ARG_X, %xmm0 /* DP x */
+ andl $0x7fffffff, %eax /* |x| */
+
+ cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */
+ jb L(arg_less_pio4)
+
+ /* Here if |x|>=Pi/4 */
+ movd %eax, %xmm3 /* SP |x| */
+ andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */
+ movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */
+
+ cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */
+ jae L(large_args)
+
+ /* Here if Pi/4<=|x|<9*Pi/4 */
+ mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */
+ cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
+ addl $1, %eax /* k+1 */
+ movl $0x0e, %edx
+ andl %eax, %edx /* j = (k+1)&0x0e */
+ addl $2, %eax /* n */
+ subsd MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+ /* Input: %eax=n, %xmm0=t */
+ testl $2, %eax /* n&2 != 0? */
+ jz L(sin_poly)
+
+/*L(cos_poly):*/
+ /* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+	 * s = (-1.0)^((n>>2)&1)
+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+ */
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_C4), %xmm4 /* C4 */
+ mulsd %xmm0, %xmm4 /* z*C4 */
+ movsd MO1(DP_C3), %xmm3 /* C3 */
+ mulsd %xmm0, %xmm3 /* z*C3 */
+ addsd MO1(DP_C2), %xmm4 /* C2+z*C4 */
+ mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+ addsd MO1(DP_C1), %xmm3 /* C1+z*C3 */
+ mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */
+ addsd MO1(DP_C0), %xmm4 /* C0+z*(C2+z*C4) */
+ mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */
+
+ addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ addsd MO1(DP_ONES), %xmm3
+
+ mulsd MO2(DP_ONES,%eax,8), %xmm3 /* DP result */
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+	/* Return back 8 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(sin_poly):
+ /* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+	 * s = (-1.0)^((n>>2)&1)
+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+ */
+
+ movaps %xmm0, %xmm4 /* t */
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_S4), %xmm2 /* S4 */
+ mulsd %xmm0, %xmm2 /* z*S4 */
+ movsd MO1(DP_S3), %xmm3 /* S3 */
+ mulsd %xmm0, %xmm3 /* z*S3 */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+ addsd MO1(DP_S2), %xmm2 /* S2+z*S4 */
+ mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */
+ addsd MO1(DP_S1), %xmm3 /* S1+z*S3 */
+ mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */
+ addsd MO1(DP_S0), %xmm2 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */
+	/* t*s, where s = (-1.0)^((n>>2)&1) */
+ mulsd MO2(DP_ONES,%eax,8), %xmm4
+ addsd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ mulsd %xmm4, %xmm3
+	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
+ addsd %xmm4, %xmm3
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+	/* Return back 8 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(large_args):
+ /* Here if |x|>=9*Pi/4 */
+ cmpl $0x7f800000, %eax /* x is Inf or NaN? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=9*Pi/4 */
+ cmpl $0x4b000000, %eax /* |x|<2^23? */
+ jae L(very_large_args)
+
+ /* Here if 9*Pi/4<=|x|<2^23 */
+ movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */
+ mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */
+ cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */
+ addl $1, %eax /* k+1 */
+ movl %eax, %edx
+ andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
+ cvtsi2sdl %edx, %xmm4 /* DP j */
+ movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+ mulsd %xmm4, %xmm2 /* -j*PIO4HI */
+ movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+ addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */
+ addl $2, %eax /* n */
+ mulsd %xmm3, %xmm4 /* j*PIO4LO */
+ addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */
+ jmp L(reconstruction)
+
+ .p2align 4
+L(very_large_args):
+ /* Here if finite |x|>=2^23 */
+
+ /* bitpos = (ix>>23) - BIAS_32 + 59; */
+ shrl $23, %eax /* eb = biased exponent of x */
+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+ subl $68, %eax
+ movl $28, %ecx /* %cl=28 */
+ movl %eax, %edx /* bitpos copy */
+
+ /* j = bitpos/28; */
+ div %cl /* j in register %al=%ax/%cl */
+ movapd %xmm0, %xmm3 /* |x| */
+ /* clear unneeded remainder from %ah */
+ andl $0xff, %eax
+
+ imull $28, %eax, %ecx /* j*28 */
+ movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */
+ movapd %xmm0, %xmm5 /* |x| */
+ mulsd -2*8+MO2(_FPI,%eax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */
+ movapd %xmm0, %xmm1 /* |x| */
+ mulsd -1*8+MO2(_FPI,%eax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */
+ mulsd 0*8+MO2(_FPI,%eax,8), %xmm0 /* tmp0 = FPI[j]*|x| */
+ addl $19, %ecx /* j*28+19 */
+ mulsd 1*8+MO2(_FPI,%eax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */
+ cmpl %ecx, %edx /* bitpos>=j*28+19? */
+ jl L(very_large_skip1)
+
+ /* Here if bitpos>=j*28+19 */
+ andpd %xmm3, %xmm4 /* HI(tmp3) */
+ subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+ movsd MO1(DP_2POW52), %xmm6
+ movapd %xmm5, %xmm2 /* tmp2 copy */
+ addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
+ movl $1, %edx
+ addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
+ movsd 8+MO1(DP_2POW52), %xmm4
+ movd %xmm6, %eax /* k = I64_LO(tmp6); */
+ addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
+ comisd %xmm5, %xmm4 /* tmp4 > tmp5? */
+ jbe L(very_large_skip2)
+
+ /* Here if tmp4 > tmp5 */
+ subl $1, %eax /* k-- */
+ addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+ andl %eax, %edx /* k&1 */
+ subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
+ addsd MO2(DP_ZERONE,%edx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */
+ addsd %xmm2, %xmm3 /* t += tmp2 */
+ addsd %xmm3, %xmm0 /* t += tmp0 */
+ addl $3, %eax /* n=k+3 */
+ addsd %xmm1, %xmm0 /* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= Pi/4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ mulsd %xmm0, %xmm0 /* y=x^2 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=x^4 */
+ movsd MO1(DP_C4), %xmm3 /* C4 */
+ mulsd %xmm0, %xmm3 /* z*C4 */
+ movsd MO1(DP_C3), %xmm5 /* C3 */
+ mulsd %xmm0, %xmm5 /* z*C3 */
+ addsd MO1(DP_C2), %xmm3 /* C2+z*C4 */
+ mulsd %xmm0, %xmm3 /* z*(C2+z*C4) */
+ addsd MO1(DP_C1), %xmm5 /* C1+z*C3 */
+ mulsd %xmm0, %xmm5 /* z*(C1+z*C3) */
+ addsd MO1(DP_C0), %xmm3 /* C0+z*(C2+z*C4) */
+ mulsd %xmm1, %xmm3 /* y*(C0+z*(C2+z*C4)) */
+ addsd %xmm5, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ /* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ addsd MO1(DP_ONES), %xmm3
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+
+L(epilogue):
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm3, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ /* Return back 4 bytes of stack frame */
+ lea 4(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ movsd MO1(DP_COS2_1), %xmm3 /* DP DP_COS2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_1 */
+ addsd MO1(DP_COS2_0), %xmm3 /* DP DP_COS2_0+x^2*DP_COS2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */
+ /* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */
+ addsd MO1(DP_ONES), %xmm3
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_less_2pn27):
+ /* Here if |x|<2^-27 */
+ movss ARG_X, %xmm0 /* x */
+ andps MO1(SP_ABS_MASK),%xmm0 /* |x| */
+ movss MO1(SP_ONE), %xmm3 /* 1.0 */
+ subss %xmm0, %xmm3 /* result is 1.0-|x| */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NAN */
+	jne	L(skip_errno_setting)	/* in case x is NaN */
+
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ movl $EDOM, (%eax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NAN. Continued. */
+ movss ARG_X, %xmm3 /* load x */
+ subss %xmm3, %xmm3 /* Result is NaN */
+ jmp L(epilogue)
+END(__cosf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
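+
+/* Each _FPI entry above carries the next 28 significant bits of 4/Pi,
+   so every product FPI[i]*|x| is exact in double precision (the 24-bit
+   SP mantissa of |x| times 28 bits fits in 53).  The starting index j
+   computed in L(very_large_args) skips the leading chunks, whose
+   contribution to |x|*(4/Pi) would be a multiple of 8, i.e. whole 2*Pi
+   periods that cannot affect the result.  */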
+
+/* Coefficients of polynomial
+ for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5. */
+ .p2align 3
+L(DP_COS2_0):
+ .long 0xff5cc6fd,0xbfdfffff
+ .type L(DP_COS2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
+
+ .p2align 3
+L(DP_COS2_1):
+ .long 0xb178dac5,0x3fa55514
+ .type L(DP_COS2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+ .type L(DP_ZERONE),@object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_S3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .type L(DP_S3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+ .p2align 3
+L(DP_S1):
+ .long 0x10c2688b,0x3f811111
+ .type L(DP_S1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+ .p2align 3
+L(DP_S4):
+ .long 0x1674b58a,0xbe5a947e
+ .type L(DP_S4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+ .p2align 3
+L(DP_S2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .type L(DP_S2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+ .p2align 3
+L(DP_S0):
+ .long 0x55551cd9,0xbfc55555
+ .type L(DP_S0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_C3):
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_C3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+ .p2align 3
+L(DP_C1):
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_C1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+ .p2align 3
+L(DP_C4):
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_C4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+ .p2align 3
+L(DP_C2):
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_C2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+ .p2align 3
+L(DP_C0):
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_C0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+ .long 0x54000000,0xbfe921fb /* High part of Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+ .long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+ .p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+ .long 0x7fffffff,0x7fffffff
+ .long 0x7fffffff,0x7fffffff
+ .type L(SP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+ .p2align 2
+L(SP_ONE):
+ .long 0x3f800000 /* 1.0 */
+ .type L(SP_ONE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias (__cosf, cosf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
new file mode 100644
index 0000000000..af588de9dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
@@ -0,0 +1,29 @@
+/* Multiple versions of cosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern float __cosf_sse2 (float);
+extern float __cosf_ia32 (float);
+float __cosf (float);
+
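+/* The IFUNC below is resolved once, at load time: __cosf binds to
+   __cosf_sse2 when the CPU reports SSE2 and to __cosf_ia32 otherwise;
+   the latter is the generic flt-32 implementation compiled below under
+   that name.  */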
+libm_ifunc (__cosf, HAS_CPU_FEATURE (SSE2) ? __cosf_sse2 : __cosf_ia32);
+weak_alias (__cosf, cosf);
+
+#define COSF __cosf_ia32
+#include <sysdeps/ieee754/flt-32/s_cosf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
new file mode 100644
index 0000000000..f31a925522
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
@@ -0,0 +1,586 @@
+/* SSE2-optimized version of sincosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ * 1) if |x|==0: sin(x)=x,
+ * cos(x)=1.
+ * 2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed,
+ * cos(x)=1-|x|.
+ * 3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1,
+ *                  cos(x)=1+1*x^2*DP_COS2_0+x^4*DP_COS2_1
+ * 4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+ * cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ * 5) if |x| < 9*Pi/4:
+ * 5.1) Range reduction:
+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4.
+ * 5.2) Reconstruction:
+ *      sign_sin = sign(x) * (-1.0)^((n>>2)&1)
+ * sign_cos = (-1.0)^(((n+2)>>2)&1)
+ * poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t
+ *      poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*1.0+1.0
+ * if(n&2 != 0) {
+ * using cos(t) and sin(t) polynomials for |t|<Pi/4, results are
+ * cos(x) = poly_sin * sign_cos
+ * sin(x) = poly_cos * sign_sin
+ * } else {
+ * sin(x) = poly_sin * sign_sin
+ * cos(x) = poly_cos * sign_cos
+ * }
+ * 6) if |x| < 2^23, large args:
+ * 6.1) Range reduction:
+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4
+ * 6.2) Reconstruction same as (5.2).
+ * 7) if |x| >= 2^23, very large args:
+ * 7.1) Range reduction:
+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2).
+ * 8) if x is Inf, return x-x, and set errno=EDOM.
+ * 9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ * sin/cos(+-0) = +-0/1 not raising inexact/underflow,
+ * sin/cos(subnormal) raises inexact/underflow,
+ * sin/cos(min_normalized) raises inexact/underflow,
+ * sin/cos(normalized) raises inexact,
+ * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ * sin/cos(NaN) = NaN.
+ */
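+
+/* A rough C sketch of the reconstruction in step 5.2 (illustrative
+   only; poly_sin/poly_cos denote the low and high lanes of the packed
+   polynomial evaluated below, sign_x is the sign bit of x):
+
+     ps = t * (1.0 + t*t*(S0 + ...));      (low DP lane)
+     pc = 1.0 + t*t*(C0 + ...);            (high DP lane)
+     sign_sin = (sign_x ^ ((n>>2) & 1)) ? -1.0 : 1.0;
+     sign_cos = (((n+2)>>2) & 1) ? -1.0 : 1.0;
+     if (n & 2)
+       { *sinp = pc * sign_sin; *cosp = ps * sign_cos; }
+     else
+       { *sinp = ps * sign_sin; *cosp = pc * sign_cos; }
+*/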
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG) pushl REG; CFI_PUSH(REG)
+# define POP(REG) popl REG; CFI_POP(REG)
+# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X 8(%esp)
+# define ARG_SIN_PTR 12(%esp)
+# define ARG_COS_PTR 16(%esp)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN ret
+# define ARG_X 4(%esp)
+# define ARG_SIN_PTR 8(%esp)
+# define ARG_COS_PTR 12(%esp)
+#endif
+
+ .text
+ENTRY(__sincosf_sse2)
+ /* Input: single precision x on stack at address ARG_X */
+ /* pointer to sin result on stack at address ARG_SIN_PTR */
+ /* pointer to cos result on stack at address ARG_COS_PTR */
+
+ ENTRANCE
+ movl ARG_X, %eax /* Bits of x */
+ cvtss2sd ARG_X, %xmm0 /* DP x */
+ andl $0x7fffffff, %eax /* |x| */
+
+ cmpl $0x3f490fdb, %eax /* |x|<Pi/4 ? */
+ jb L(arg_less_pio4)
+
+ /* Here if |x|>=Pi/4 */
+ movd %eax, %xmm3 /* SP |x| */
+ andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */
+ movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */
+
+ cmpl $0x40e231d6, %eax /* |x|<9*Pi/4 ? */
+ jae L(large_args)
+
+ /* Here if Pi/4<=|x|<9*Pi/4 */
+ mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */
+ movl ARG_X, %ecx /* Load x */
+ cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
+ shrl $29, %ecx /* (sign of x) << 2 */
+ addl $1, %eax /* k+1 */
+ movl $0x0e, %edx
+ andl %eax, %edx /* j = (k+1)&0x0e */
+ subsd MO2(PIO4J,%edx,8), %xmm0/* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+	/* Input: %eax=n, %xmm0=t, %ecx=(sign of x)<<2 */
+
+ movaps %xmm0, %xmm4 /* t */
+ movhpd MO1(DP_ONES), %xmm4 /* 1|t */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ movl $2, %edx
+ unpcklpd %xmm0, %xmm0 /* y|y */
+	addl	%eax, %edx		/* n+2 */
+ movaps %xmm0, %xmm1 /* y|y */
+ mulpd %xmm0, %xmm0 /* z=t^4|z=t^4 */
+
+ movaps MO1(DP_SC4), %xmm2 /* S4 */
+ mulpd %xmm0, %xmm2 /* z*S4 */
+ movaps MO1(DP_SC3), %xmm3 /* S3 */
+ mulpd %xmm0, %xmm3 /* z*S3 */
+	xorl	%eax, %ecx		/* (sign_x<<2) ^ n */
+ addpd MO1(DP_SC2), %xmm2 /* S2+z*S4 */
+ mulpd %xmm0, %xmm2 /* z*(S2+z*S4) */
+	shrl	$2, %edx		/* (n+2)>>2 */
+ addpd MO1(DP_SC1), %xmm3 /* S1+z*S3 */
+ mulpd %xmm0, %xmm3 /* z*(S1+z*S3) */
+	shrl	$2, %ecx		/* sign_x ^ (n>>2) */
+ addpd MO1(DP_SC0), %xmm2 /* S0+z*(S2+z*S4) */
+	andl	$1, %edx		/* sign_cos = ((n+2)>>2)&1 */
+ mulpd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */
+	andl	$1, %ecx		/* sign_sin = sign_x ^ ((n>>2)&1) */
+ addpd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ mulpd %xmm4, %xmm3 /*t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+ testl $2, %eax /* n&2 != 0 ? */
+	addpd	%xmm4, %xmm3	/* t+t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ jnz L(sin_result_sin_poly)
+
+/*L(sin_result_cos_poly):*/
+ /*
+ * Here if
+ * cos(x) = poly_sin * sign_cos
+ * sin(x) = poly_cos * sign_sin
+ */
+ movsd MO2(DP_ONES,%ecx,8), %xmm4/* 0|sign_sin */
+ movhpd MO2(DP_ONES,%edx,8), %xmm4/* sign_cos|sign_sin */
+ mulpd %xmm4, %xmm3 /* result_cos|result_sin */
+ movl ARG_SIN_PTR, %eax
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movl ARG_COS_PTR, %ecx
+ movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */
+ movss %xmm0, (%ecx) /* store cos(x) */
+ RETURN
+
+ .p2align 4
+L(sin_result_sin_poly):
+ /*
+ * Here if
+ * sin(x) = poly_sin * sign_sin
+ * cos(x) = poly_cos * sign_cos
+ */
+ movsd MO2(DP_ONES,%edx,8), %xmm4/* 0|sign_cos */
+ movhpd MO2(DP_ONES,%ecx,8), %xmm4/* sign_sin|sign_cos */
+ mulpd %xmm4, %xmm3 /* result_sin|result_cos */
+ movl ARG_SIN_PTR, %eax
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movl ARG_COS_PTR, %ecx
+ movss %xmm0, (%ecx) /* store cos(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move sin(x) to xmm0[0] */
+ movss %xmm0, (%eax) /* store sin(x) */
+ RETURN
+
+ .p2align 4
+L(large_args):
+ /* Here if |x|>=9*Pi/4 */
+ cmpl $0x7f800000, %eax /* x is Inf or NaN ? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=9*Pi/4 */
+ cmpl $0x4b000000, %eax /* |x|<2^23 ? */
+ jae L(very_large_args)
+
+ /* Here if 9*Pi/4<=|x|<2^23 */
+ movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */
+ mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */
+ cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */
+ addl $1, %eax /* k+1 */
+ movl %eax, %edx
+ andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
+ cvtsi2sdl %edx, %xmm4 /* DP j */
+ movl ARG_X, %ecx /* Load x */
+ movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+ shrl $29, %ecx /* (sign of x) << 2 */
+ mulsd %xmm4, %xmm2 /* -j*PIO4HI */
+ movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+ addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */
+ mulsd %xmm3, %xmm4 /* j*PIO4LO */
+ addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */
+ jmp L(reconstruction)
+
+ .p2align 4
+L(very_large_args):
+ /* Here if finite |x|>=2^23 */
+
+ /* bitpos = (ix>>23) - BIAS_32 + 59; */
+ shrl $23, %eax /* eb = biased exponent of x */
+ subl $68, %eax /* bitpos=eb-0x7f+59, where 0x7f */
+					/* is exponent bias */
+ movl $28, %ecx /* %cl=28 */
+ movl %eax, %edx /* bitpos copy */
+
+ /* j = bitpos/28; */
+ div %cl /* j in register %al=%ax/%cl */
+ movapd %xmm0, %xmm3 /* |x| */
+	andl	$0xff, %eax	/* clear unneeded remainder from %ah */
+
+ imull $28, %eax, %ecx /* j*28 */
+ movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */
+ movapd %xmm0, %xmm5 /* |x| */
+ mulsd -2*8+MO2(_FPI,%eax,8), %xmm3/* tmp3 = FPI[j-2]*|x| */
+ movapd %xmm0, %xmm1 /* |x| */
+ mulsd -1*8+MO2(_FPI,%eax,8), %xmm5/* tmp2 = FPI[j-1]*|x| */
+ mulsd 0*8+MO2(_FPI,%eax,8), %xmm0/* tmp0 = FPI[j]*|x| */
+ addl $19, %ecx /* j*28+19 */
+ mulsd 1*8+MO2(_FPI,%eax,8), %xmm1/* tmp1 = FPI[j+1]*|x| */
+ cmpl %ecx, %edx /* bitpos>=j*28+19 ? */
+ jl L(very_large_skip1)
+
+ /* Here if bitpos>=j*28+19 */
+ andpd %xmm3, %xmm4 /* HI(tmp3) */
+ subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+ movsd MO1(DP_2POW52), %xmm6
+ movapd %xmm5, %xmm2 /* tmp2 copy */
+ addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
+ movl $1, %edx
+ addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
+ movsd 8+MO1(DP_2POW52), %xmm4
+ movd %xmm6, %eax /* k = I64_LO(tmp6); */
+ addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
+ movl ARG_X, %ecx /* Load x */
+ comisd %xmm5, %xmm4 /* tmp4 > tmp5 ? */
+ jbe L(very_large_skip2)
+
+ /* Here if tmp4 > tmp5 */
+ subl $1, %eax /* k-- */
+ addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+ andl %eax, %edx /* k&1 */
+ subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
+ addsd MO2(DP_ZERONE,%edx,8), %xmm3/* t = DP_ZERONE[k&1] + tmp3 */
+ addsd %xmm2, %xmm3 /* t += tmp2 */
+ shrl $29, %ecx /* (sign of x) << 2 */
+ addsd %xmm3, %xmm0 /* t += tmp0 */
+ addl $1, %eax /* n=k+1 */
+ addsd %xmm1, %xmm0 /* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= Pi/4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5 ? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ movaps %xmm0, %xmm3 /* DP x */
+ movhpd MO1(DP_ONES), %xmm3 /* DP 1|x */
+ mulsd %xmm0, %xmm0 /* DP y=x^2 */
+ unpcklpd %xmm0, %xmm0 /* DP y|y */
+ movaps %xmm0, %xmm1 /* y|y */
+ mulpd %xmm0, %xmm0 /* z=x^4|z=x^4 */
+
+ movapd MO1(DP_SC4), %xmm4 /* S4 */
+ mulpd %xmm0, %xmm4 /* z*S4 */
+ movapd MO1(DP_SC3), %xmm5 /* S3 */
+ mulpd %xmm0, %xmm5 /* z*S3 */
+ addpd MO1(DP_SC2), %xmm4 /* S2+z*S4 */
+ mulpd %xmm0, %xmm4 /* z*(S2+z*S4) */
+ addpd MO1(DP_SC1), %xmm5 /* S1+z*S3 */
+ mulpd %xmm0, %xmm5 /* z*(S1+z*S3) */
+ addpd MO1(DP_SC0), %xmm4 /* S0+z*(S2+z*S4) */
+ mulpd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */
+ mulpd %xmm3, %xmm5 /* x*z*(S1+z*S3) */
+ mulpd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */
+ addpd %xmm5, %xmm4 /*x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+ movl ARG_SIN_PTR, %eax
+	addpd	%xmm4, %xmm3	/* x+x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ movl ARG_COS_PTR, %ecx
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */
+ movss %xmm0, (%ecx) /* store cos(x) */
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27 ? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ movaps %xmm0, %xmm1 /* DP x */
+ movhpd MO1(DP_ONES), %xmm1 /* DP 1|x */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ unpcklpd %xmm0, %xmm0 /* DP x^2|x^2 */
+
+ movaps MO1(DP_SINCOS2_1), %xmm3/* DP DP_SIN2_1 */
+ mulpd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */
+ addpd MO1(DP_SINCOS2_0), %xmm3/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+ mulpd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+ mulpd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ addpd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ movl ARG_SIN_PTR, %eax
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movl ARG_COS_PTR, %ecx
+ movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */
+ movss %xmm0, (%ecx) /* store cos(x) */
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn27):
+ movss ARG_X, %xmm7 /* SP x */
+ cmpl $0, %eax /* x=0 ? */
+	je	L(arg_zero)	/* in case x=0 return sin(+-0)==+-0, cos(+-0)==1 */
+ /* Here if |x|<2^-27 */
+ /*
+ * Special cases here:
+ * sin(subnormal) raises inexact/underflow
+ * sin(min_normalized) raises inexact/underflow
+ * sin(normalized) raises inexact
+ * cos(here)=1-|x| (raising inexact)
+ */
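+	/* Computing x - x*2^-50 in double and rounding back to single
+	   returns x itself while raising the exceptions listed above;
+	   DP_SMALL below is 2^-50.  */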
+ movaps %xmm0, %xmm3 /* DP x */
+ mulsd MO1(DP_SMALL), %xmm0 /* DP x*DP_SMALL */
+ subsd %xmm0, %xmm3 /* DP sin result is x-x*DP_SMALL */
+ andps MO1(SP_ABS_MASK), %xmm7 /* SP |x| */
+ cvtsd2ss %xmm3, %xmm0 /* sin(x) */
+ movl ARG_SIN_PTR, %eax
+ movss MO1(SP_ONE), %xmm1 /* SP 1.0 */
+ movss %xmm0, (%eax) /* sin(x) store */
+ movl ARG_COS_PTR, %ecx
+ subss %xmm7, %xmm1 /* cos(x) */
+ movss %xmm1, (%ecx) /* cos(x) store */
+ RETURN
+
+ .p2align 4
+L(arg_zero):
+ movss MO1(SP_ONE), %xmm0 /* 1.0 */
+ movl ARG_SIN_PTR, %eax
+ movl ARG_COS_PTR, %ecx
+ movss %xmm7, (%eax) /* sin(+-0)==x */
+ movss %xmm0, (%ecx) /* cos(+-0)==1 */
+ RETURN
+
+ .p2align 4
+L(arg_inf_or_nan):
+ movss ARG_X, %xmm7 /* SP x */
+ /* Here if |x| is Inf or NAN */
+	jne	L(skip_errno_setting)	/* in case x is NaN */
+
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ movl $EDOM, (%eax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NAN. Continued. */
+ subss %xmm7, %xmm7 /* x-x, result is NaN */
+ movl ARG_SIN_PTR, %eax
+ movl ARG_COS_PTR, %ecx
+ movss %xmm7, (%eax)
+ movss %xmm7, (%ecx)
+ RETURN
+END(__sincosf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomials for */
+/* sin(x)~=x+x*x^2*(DP_SIN2_0+x^2*DP_SIN2_1) in low DP part, */
+/* cos(x)~=1+1*x^2*(DP_COS2_0+x^2*DP_COS2_1) in high DP part, */
+/* for |x|<2^-5. */
+ .p2align 4
+L(DP_SINCOS2_0):
+ .long 0x5543d49d,0xbfc55555
+ .long 0xff5cc6fd,0xbfdfffff
+ .type L(DP_SINCOS2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_0))
+
+ .p2align 4
+L(DP_SINCOS2_1):
+ .long 0x75cec8c5,0x3f8110f4
+ .long 0xb178dac5,0x3fa55514
+ .type L(DP_SINCOS2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+ .type L(DP_ZERONE), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomials for */
+/* sin(t)~=t+t*t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))) in low DP part, */
+/* cos(t)~=1+1*t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))) in high DP part, */
+/* for |t|<Pi/4. */
+ .p2align 4
+L(DP_SC4):
+ .long 0x1674b58a,0xbe5a947e
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_SC4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC4))
+
+ .p2align 4
+L(DP_SC3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_SC3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC3))
+
+ .p2align 4
+L(DP_SC2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_SC2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC2))
+
+ .p2align 4
+L(DP_SC1):
+ .long 0x10c2688b,0x3f811111
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_SC1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC1))
+
+ .p2align 4
+L(DP_SC0):
+ .long 0x55551cd9,0xbfc55555
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_SC0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC0))
+
+ .p2align 3
+L(DP_SMALL):
+ .long 0x00000000,0x3cd00000 /* 2^(-50) */
+ .type L(DP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+ .long 0x54000000,0xbfe921fb /* High part of Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+ .long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+ .p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+ .long 0x7fffffff,0x7fffffff
+ .long 0x7fffffff,0x7fffffff
+ .type L(SP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+ .p2align 2
+L(SP_ONE):
+ .long 0x3f800000 /* 1.0 */
+ .type L(SP_ONE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias(__sincosf, sincosf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
new file mode 100644
index 0000000000..9428f9b4ea
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
@@ -0,0 +1,30 @@
+/* Multiple versions of sincosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern void __sincosf_sse2 (float, float *, float *);
+extern void __sincosf_ia32 (float, float *, float *);
+void __sincosf (float, float *, float *);
+
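+/* The IFUNC below is resolved once, at load time: __sincosf binds to
+   __sincosf_sse2 when the CPU reports SSE2 and to __sincosf_ia32
+   otherwise; the latter is the generic flt-32 implementation compiled
+   below under that name.  */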
+libm_ifunc (__sincosf,
+ HAS_CPU_FEATURE (SSE2) ? __sincosf_sse2 : __sincosf_ia32);
+weak_alias (__sincosf, sincosf);
+
+#define SINCOSF __sincosf_ia32
+#include <sysdeps/ieee754/flt-32/s_sincosf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
new file mode 100644
index 0000000000..ee96018061
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
@@ -0,0 +1,566 @@
+/* SSE2-optimized version of sinf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ * 1) if |x| == 0: return x.
+ * 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+ * 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+ * 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+ * 5) if |x| < 9*Pi/4:
+ * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 5.2) Reconstruction:
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * if(n&2 != 0) {
+ * using cos(t) polynomial for |t|<Pi/4, result is
+ * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ * } else {
+ * using sin(t) polynomial for |t|<Pi/4, result is
+ * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ * }
+ * 6) if |x| < 2^23, large args:
+ * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 6.2) Reconstruction same as (5.2).
+ * 7) if |x| >= 2^23, very large args:
+ * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2).
+ * 8) if x is Inf, return x-x, and set errno=EDOM.
+ * 9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ * sin(+-0) = +-0 not raising inexact/underflow,
+ * sin(subnormal) raises inexact/underflow,
+ * sin(min_normalized) raises inexact/underflow,
+ * sin(normalized) raises inexact,
+ * sin(Inf) = NaN, raises invalid, sets errno to EDOM,
+ * sin(NaN) = NaN.
+ */
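+
+/* As a rough C sketch of the sign handling in steps 5-7 (illustrative
+   only; n and t come from the range reduction, sign_x is the sign bit
+   of x, y = t*t):
+
+     s = ((sign_x ^ (n>>2)) & 1) ? -1.0 : 1.0;
+     result = (n & 2)
+       ? s * (1.0 + y*(C0 + y*(C1 + y*(C2 + y*(C3 + y*C4)))))
+       : s * t * (1.0 + y*(S0 + y*(S1 + y*(S2 + y*(S3 + y*S4)))));
+*/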
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG) pushl REG; CFI_PUSH(REG)
+# define POP(REG) popl REG; CFI_POP(REG)
+# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X 8(%esp)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN ret
+# define ARG_X 4(%esp)
+#endif
+
+ .text
+ENTRY(__sinf_sse2)
+ /* Input: single precision x on stack at address ARG_X */
+
+ ENTRANCE
+ movl ARG_X, %eax /* Bits of x */
+ cvtss2sd ARG_X, %xmm0 /* DP x */
+ andl $0x7fffffff, %eax /* |x| */
+
+ cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */
+ jb L(arg_less_pio4)
+
+ /* Here if |x|>=Pi/4 */
+ movd %eax, %xmm3 /* SP |x| */
+ andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */
+ movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */
+
+ cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */
+ jae L(large_args)
+
+ /* Here if Pi/4<=|x|<9*Pi/4 */
+ mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */
+ movl ARG_X, %ecx /* Load x */
+ cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
+ shrl $31, %ecx /* sign of x */
+ addl $1, %eax /* k+1 */
+ movl $0x0e, %edx
+ andl %eax, %edx /* j = (k+1)&0x0e */
+ subsd MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+ /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */
+ testl $2, %eax /* n&2 != 0? */
+ jz L(sin_poly)
+
+/*L(cos_poly):*/
+ /* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+ */
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_C4), %xmm4 /* C4 */
+ mulsd %xmm0, %xmm4 /* z*C4 */
+ xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */
+ movsd MO1(DP_C3), %xmm3 /* C3 */
+ mulsd %xmm0, %xmm3 /* z*C3 */
+ addsd MO1(DP_C2), %xmm4 /* C2+z*C4 */
+ mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+ addsd MO1(DP_C1), %xmm3 /* C1+z*C3 */
+ mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */
+ addsd MO1(DP_C0), %xmm4 /* C0+z*(C2+z*C4) */
+ mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */
+
+ addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ addsd MO1(DP_ONES), %xmm3
+
+ mulsd MO2(DP_ONES,%ecx,8), %xmm3 /* DP result */
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+	/* Return back 8 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(sin_poly):
+ /* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+ */
+
+ movaps %xmm0, %xmm4 /* t */
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_S4), %xmm2 /* S4 */
+ mulsd %xmm0, %xmm2 /* z*S4 */
+ movsd MO1(DP_S3), %xmm3 /* S3 */
+ mulsd %xmm0, %xmm3 /* z*S3 */
+	lea	-8(%esp), %esp		/* Borrow 8 bytes of stack frame */
+ addsd MO1(DP_S2), %xmm2 /* S2+z*S4 */
+ mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */
+ addsd MO1(DP_S1), %xmm3 /* S1+z*S3 */
+ mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */
+ addsd MO1(DP_S0), %xmm2 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */
+ /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */
+ mulsd MO2(DP_ONES,%ecx,8), %xmm4
+ addsd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ mulsd %xmm4, %xmm3
+	/* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
+ addsd %xmm4, %xmm3
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+	/* Return back 8 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(large_args):
+ /* Here if |x|>=9*Pi/4 */
+ cmpl $0x7f800000, %eax /* x is Inf or NaN? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=9*Pi/4 */
+ cmpl $0x4b000000, %eax /* |x|<2^23? */
+ jae L(very_large_args)
+
+ /* Here if 9*Pi/4<=|x|<2^23 */
+ movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */
+ mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */
+ cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */
+ addl $1, %eax /* k+1 */
+ movl %eax, %edx
+ andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
+ cvtsi2sdl %edx, %xmm4 /* DP j */
+ movl ARG_X, %ecx /* Load x */
+ movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+ shrl $31, %ecx /* sign bit of x */
+ mulsd %xmm4, %xmm2 /* -j*PIO4HI */
+ movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+ addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */
+ mulsd %xmm3, %xmm4 /* j*PIO4LO */
+ addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */
+ jmp L(reconstruction)
+
+ .p2align 4
+L(very_large_args):
+ /* Here if finite |x|>=2^23 */
+
+ /* bitpos = (ix>>23) - BIAS_32 + 59; */
+ shrl $23, %eax /* eb = biased exponent of x */
+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+ subl $68, %eax
+ movl $28, %ecx /* %cl=28 */
+ movl %eax, %edx /* bitpos copy */
+
+ /* j = bitpos/28; */
+ div %cl /* j in register %al=%ax/%cl */
+ movapd %xmm0, %xmm3 /* |x| */
+ /* clear unneeded remainder from %ah */
+ andl $0xff, %eax
+
+ imull $28, %eax, %ecx /* j*28 */
+ movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */
+ movapd %xmm0, %xmm5 /* |x| */
+ mulsd -2*8+MO2(_FPI,%eax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */
+ movapd %xmm0, %xmm1 /* |x| */
+ mulsd -1*8+MO2(_FPI,%eax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */
+ mulsd 0*8+MO2(_FPI,%eax,8), %xmm0 /* tmp0 = FPI[j]*|x| */
+ addl $19, %ecx /* j*28+19 */
+ mulsd 1*8+MO2(_FPI,%eax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */
+ cmpl %ecx, %edx /* bitpos>=j*28+19? */
+ jl L(very_large_skip1)
+
+ /* Here if bitpos>=j*28+19 */
+ andpd %xmm3, %xmm4 /* HI(tmp3) */
+ subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+ movsd MO1(DP_2POW52), %xmm6
+ movapd %xmm5, %xmm2 /* tmp2 copy */
+ addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
+ movl $1, %edx
+ addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
+ movsd 8+MO1(DP_2POW52), %xmm4
+ movd %xmm6, %eax /* k = I64_LO(tmp6); */
+ addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
+ movl ARG_X, %ecx /* Load x */
+ comisd %xmm5, %xmm4 /* tmp4 > tmp5? */
+ jbe L(very_large_skip2)
+
+ /* Here if tmp4 > tmp5 */
+ subl $1, %eax /* k-- */
+ addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+ andl %eax, %edx /* k&1 */
+ subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
+ addsd MO2(DP_ZERONE,%edx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */
+ addsd %xmm2, %xmm3 /* t += tmp2 */
+ shrl $31, %ecx /* sign of x */
+ addsd %xmm3, %xmm0 /* t += tmp0 */
+ addl $1, %eax /* n=k+1 */
+ addsd %xmm1, %xmm0 /* t += tmp1 */
+	mulsd	MO1(DP_PIO4), %xmm0	/* t *= Pi/4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
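+
+	/* The +/-2^52 steps above are the usual round-to-integer trick:
+	   for nonnegative v below 2^52, adding and then subtracting 2^52
+	   pushes the fraction bits out of the significand.  A hedged C
+	   sketch, including the round-down correction the code applies:
+
+	     double n = (v + 0x1p52) - 0x1p52;
+	     if (n > v)
+	       n -= 1.0;
+	*/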
+
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ movaps %xmm0, %xmm3 /* x */
+ mulsd %xmm0, %xmm0 /* y=x^2 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=x^4 */
+ movsd MO1(DP_S4), %xmm4 /* S4 */
+ mulsd %xmm0, %xmm4 /* z*S4 */
+ movsd MO1(DP_S3), %xmm5 /* S3 */
+ mulsd %xmm0, %xmm5 /* z*S3 */
+ addsd MO1(DP_S2), %xmm4 /* S2+z*S4 */
+ mulsd %xmm0, %xmm4 /* z*(S2+z*S4) */
+ addsd MO1(DP_S1), %xmm5 /* S1+z*S3 */
+ mulsd %xmm0, %xmm5 /* z*(S1+z*S3) */
+ addsd MO1(DP_S0), %xmm4 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */
+ mulsd %xmm3, %xmm5 /* x*z*(S1+z*S3) */
+ mulsd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */
+ /* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm5, %xmm4
+ /* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm4, %xmm3
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+
+L(epilogue):
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm3, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ /* Return back 4 bytes of stack frame */
+ lea 4(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ movaps %xmm0, %xmm1 /* DP x */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ movsd MO1(DP_SIN2_1), %xmm3 /* DP DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */
+ addsd MO1(DP_SIN2_0), %xmm3 /* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+ mulsd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ addsd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+ jmp L(epilogue)
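+
+	/* Equivalent C for this branch (a sketch; SIN2_0 and SIN2_1 are
+	   the DP_SIN2_* constants defined below):
+
+	     double y = x * x;
+	     return (float) (x + x * y * (SIN2_0 + y * SIN2_1));
+	*/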
+
+ .p2align 4
+L(arg_less_2pn27):
+ movss ARG_X, %xmm3 /* SP x */
+ cmpl $0, %eax /* x=0? */
+	je	L(epilogue)		/* if x == +-0 return sin(+-0) == +-0 */
+ /* Here if |x|<2^-27 */
+ /*
+ * Special cases here:
+ * sin(subnormal) raises inexact/underflow
+ * sin(min_normalized) raises inexact/underflow
+ * sin(normalized) raises inexact
+ */
+ movaps %xmm0, %xmm3 /* Copy of DP x */
+ mulsd MO1(DP_SMALL), %xmm0 /* x*DP_SMALL */
+ subsd %xmm0, %xmm3 /* Result is x-x*DP_SMALL */
+ cvtsd2ss %xmm3, %xmm3 /* Result converted to SP */
+ jmp L(epilogue)
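+
+	/* In C terms (a sketch): computing x - x*2^-50 returns x to
+	   single precision while making the operation inexact, so the
+	   required inexact (and, for tiny x, underflow) exceptions are
+	   raised:
+
+	     return (float) (x - x * 0x1p-50);
+	*/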
+
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NAN */
+	jne	L(skip_errno_setting)	/* if x is NaN */
+
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ movl $EDOM, (%eax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NAN. Continued. */
+ movss ARG_X, %xmm3 /* load x */
+ subss %xmm3, %xmm3 /* Result is NaN */
+ jmp L(epilogue)
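+
+	/* C sketch of this path: an infinite argument sets EDOM, and
+	   x - x produces the NaN result in either case:
+
+	     if (isinf (x))
+	       *__errno_location () = EDOM;
+	     return x - x;
+	*/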
+END(__sinf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5. */
+ .p2align 3
+L(DP_SIN2_0):
+ .long 0x5543d49d,0xbfc55555
+ .type L(DP_SIN2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_0))
+
+ .p2align 3
+L(DP_SIN2_1):
+ .long 0x75cec8c5,0x3f8110f4
+ .type L(DP_SIN2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+ .type L(DP_ZERONE), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_S3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .type L(DP_S3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+ .p2align 3
+L(DP_S1):
+ .long 0x10c2688b,0x3f811111
+ .type L(DP_S1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+ .p2align 3
+L(DP_S4):
+ .long 0x1674b58a,0xbe5a947e
+ .type L(DP_S4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+ .p2align 3
+L(DP_S2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .type L(DP_S2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+ .p2align 3
+L(DP_S0):
+ .long 0x55551cd9,0xbfc55555
+ .type L(DP_S0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+ .p2align 3
+L(DP_SMALL):
+ .long 0x00000000,0x3cd00000 /* 2^(-50) */
+ .type L(DP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_C3):
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_C3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+ .p2align 3
+L(DP_C1):
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_C1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+ .p2align 3
+L(DP_C4):
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_C4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+ .p2align 3
+L(DP_C2):
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_C2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+ .p2align 3
+L(DP_C0):
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_C0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 32 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+weak_alias (__sinf, sinf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
new file mode 100644
index 0000000000..8ccdd2f34d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
@@ -0,0 +1,28 @@
+/* Multiple versions of sinf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern float __sinf_sse2 (float);
+extern float __sinf_ia32 (float);
+float __sinf (float);
+
+libm_ifunc (__sinf, HAS_CPU_FEATURE (SSE2) ? __sinf_sse2 : __sinf_ia32);
+weak_alias (__sinf, sinf);
+#define SINF __sinf_ia32
+#include <sysdeps/ieee754/flt-32/s_sinf.c>
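+
+/* Note: libm_ifunc (defined elsewhere in glibc) emits a GNU indirect
+   function, so the SSE2/ia32 choice is made once, at load time.  A
+   hedged sketch of an equivalent hand-written resolver (illustrative
+   only, not the macro's actual expansion):
+
+     static void *
+     sinf_resolver (void)
+     {
+       return HAS_CPU_FEATURE (SSE2)
+	      ? (void *) __sinf_sse2 : (void *) __sinf_ia32;
+     }
+     float __sinf (float) __attribute__ ((ifunc ("sinf_resolver")));
+*/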
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
new file mode 100644
index 0000000000..ace8db9410
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
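+/* The fcmovu/fcmovb sequence below implements, in C terms (a hedged
+   sketch): a NaN operand counts as a missing argument, and if x is
+   NaN or compares below, the other value is returned:
+
+     double fmax_sketch (double x, double y)
+     {
+       if (isnan (y))
+	 y = x;
+       return (isnan (x) || x < y) ? y : x;
+     }
+*/
+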
+ .text
+ENTRY(__fmax)
+ fldl 4(%esp) // x
+ fldl 12(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fxch
+
+ fucomi %st(1), %st
+ fcmovb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fmax)
+weak_alias (__fmax, fmax)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..3a25951a09
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmaxf)
+ flds 4(%esp) // x
+ flds 8(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fxch
+
+ fucomi %st(1), %st
+ fcmovb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fmaxf)
+weak_alias (__fmaxf, fmaxf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..3f6c21c63d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
@@ -0,0 +1,58 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
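+/* The long double version also has to honor signaling NaNs; in C
+   terms (a hedged sketch using the GNU issignaling extension):
+
+     long double fmaxl_sketch (long double x, long double y)
+     {
+       if (!isnan (x) && !isnan (y))
+	 return x >= y ? x : y;
+       if (isnan (x) && !isnan (y) && !issignaling (x))
+	 return y;
+       if (isnan (y) && !isnan (x) && !issignaling (y))
+	 return x;
+       return x + y;
+     }
+
+   The testb $0x40 instructions below read the quiet bit of the 80-bit
+   significand directly from the arguments on the stack.  */
+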
+ .text
+ENTRY(__fmaxl)
+ fldt 4(%esp) // x
+ fldt 16(%esp) // x : y
+
+ fucomi %st(1), %st
+ jp 2f
+ fcmovb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+
+2: // Unordered.
+ fucomi %st(0), %st
+ jp 3f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 11(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+3: // st(0) is a NaN; st(1) may or may not be.
+ fxch
+ fucomi %st(0), %st
+ jp 4f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 23(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+4: // Both arguments are NaNs, or one is a signaling NaN.
+ faddp
+ ret
+END(__fmaxl)
+weak_alias (__fmaxl, fmaxl)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
new file mode 100644
index 0000000000..72d306fd79
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmin)
+ fldl 4(%esp) // x
+ fldl 12(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fucomi %st(1), %st
+ fcmovnb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fmin)
+weak_alias (__fmin, fmin)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
new file mode 100644
index 0000000000..52ea892bad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fminf)
+ flds 4(%esp) // x
+ flds 8(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fucomi %st(1), %st
+ fcmovnb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fminf)
+weak_alias (__fminf, fminf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
new file mode 100644
index 0000000000..e1cb83fed7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
@@ -0,0 +1,58 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fminl)
+ fldt 4(%esp) // x
+ fldt 16(%esp) // x : y
+
+ fucomi %st(1), %st
+ jp 2f
+ fcmovnb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+
+2: // Unordered.
+ fucomi %st(0), %st
+ jp 3f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 11(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+3: // st(0) is a NaN; st(1) may or may not be.
+ fxch
+ fucomi %st(0), %st
+ jp 4f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 23(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+4: // Both arguments are NaNs, or one is a signaling NaN.
+ faddp
+ ret
+END(__fminl)
+weak_alias (__fminl, fminl)
diff --git a/REORG.TODO/sysdeps/i386/i686/hp-timing.h b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
new file mode 100644
index 0000000000..1b11410feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
@@ -0,0 +1,42 @@
+/* High precision, low overhead timing functions. i686 version.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _HP_TIMING_H
+#define _HP_TIMING_H 1
+
+/* We always assume that the timestamp register is available.  */
+#define HP_TIMING_AVAIL (1)
+#define HP_SMALL_TIMING_AVAIL (1)
+
+/* We indeed have inlined functions. */
+#define HP_TIMING_INLINE (1)
+
+/* We use 64-bit values for the times.  */
+typedef unsigned long long int hp_timing_t;
+
+/* That's quite simple.  Use the `rdtsc' instruction.  Note that the value
+   might not be 100% accurate since other instructions may be executing
+   at the same moment.  This could be changed by using a barrier like
+   'cpuid' right before the `rdtsc' instruction.  But we are not
+   interested in accurate clock cycles here, so we don't do this.  */
+#define HP_TIMING_NOW(Var) __asm__ __volatile__ ("rdtsc" : "=A" (Var))
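+
+/* Typical use (a sketch):
+
+     hp_timing_t start, stop;
+     HP_TIMING_NOW (start);
+     do_work ();
+     HP_TIMING_NOW (stop);
+     hp_timing_t cycles = stop - start;
+*/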
+
+#include <hp-timing-common.h>
+
+#endif /* hp-timing.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/init-arch.h b/REORG.TODO/sysdeps/i386/i686/init-arch.h
new file mode 100644
index 0000000000..f55f80efa0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define MINIMUM_ISA 686
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/memcmp.S b/REORG.TODO/sysdeps/i386/i686/memcmp.S
new file mode 100644
index 0000000000..5140ee2145
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcmp.S
@@ -0,0 +1,408 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* Preserve EBX. */
+#define BLK1 PARMS
+#define BLK2 BLK1+4
+#define LEN BLK2+4
+#define ENTRANCE pushl %ebx; cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (ebx, 0)
+#define RETURN popl %ebx; cfi_adjust_cfa_offset (-4); \
+ cfi_restore (ebx); ret
+
+/* Load an entry in a jump table into EBX.  TABLE is a jump table
+   with relative offsets.  INDEX is a register that contains the
+   index into the jump table.  */
+#define LOAD_JUMP_TABLE_ENTRY(TABLE, INDEX) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,4), %ebx
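+
+/* The table stores each label's offset relative to the table itself,
+   so the dispatch is position independent.  A hedged C analogue using
+   the GCC computed-goto extension:
+
+     int dispatch (int i)
+     {
+       static const int off[] = { &&c0 - &&c0, &&c1 - &&c0 };
+       goto *(&&c0 + off[i]);
+      c0: return 0;
+      c1: return 1;
+     }
+*/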
+
+ .text
+ ALIGN (4)
+ENTRY (memcmp)
+ ENTRANCE
+
+ movl BLK1(%esp), %eax
+ movl BLK2(%esp), %edx
+ movl LEN(%esp), %ecx
+
+ cmpl $1, %ecx
+ jne L(not_1)
+ movzbl (%eax), %ecx /* LEN == 1 */
+ cmpb (%edx), %cl
+ jne L(neq)
+L(bye):
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+L(neq):
+ sbbl %eax, %eax
+ sbbl $-1, %eax
+ RETURN
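+
+/* The sbbl pair above is a branchless sign selection: after the
+   unsigned compare, sbbl %eax, %eax yields 0 or -1 from the carry
+   flag, and sbbl $-1, %eax then turns that into +1 or -1.  */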
+
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+L(not_1):
+ jl L(bye) /* LEN == 0 */
+
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ movl %eax, %esi
+ cfi_rel_offset (esi, 0)
+	cmpl	$32, %ecx
+	jge	L(32bytesormore)	/* LEN >= 32 */
+
+ LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+ addl %ecx, %edx
+ addl %ecx, %esi
+ jmp *%ebx
+
+ ALIGN (4)
+L(28bytes):
+ movl -28(%esi), %eax
+ movl -28(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%esi), %eax
+ movl -24(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%esi), %eax
+ movl -20(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%esi), %eax
+ movl -16(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%esi), %eax
+ movl -12(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%esi), %eax
+ movl -8(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%esi), %eax
+ movl -4(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(0bytes):
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+L(29bytes):
+ movl -29(%esi), %eax
+ movl -29(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(25bytes):
+ movl -25(%esi), %eax
+ movl -25(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(21bytes):
+ movl -21(%esi), %eax
+ movl -21(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(17bytes):
+ movl -17(%esi), %eax
+ movl -17(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(13bytes):
+ movl -13(%esi), %eax
+ movl -13(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(9bytes):
+ movl -9(%esi), %eax
+ movl -9(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(5bytes):
+ movl -5(%esi), %eax
+ movl -5(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(1bytes):
+ movzbl -1(%esi), %eax
+ cmpb -1(%edx), %al
+ jne L(set)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+L(30bytes):
+ movl -30(%esi), %eax
+ movl -30(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(26bytes):
+ movl -26(%esi), %eax
+ movl -26(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(22bytes):
+ movl -22(%esi), %eax
+ movl -22(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(18bytes):
+ movl -18(%esi), %eax
+ movl -18(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(14bytes):
+ movl -14(%esi), %eax
+ movl -14(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(10bytes):
+ movl -10(%esi), %eax
+ movl -10(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(6bytes):
+ movl -6(%esi), %eax
+ movl -6(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%esi), %eax
+ movzwl -2(%edx), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmpl %ecx, %eax
+ jne L(set)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+L(31bytes):
+ movl -31(%esi), %eax
+ movl -31(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%esi), %eax
+ movl -27(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%esi), %eax
+ movl -23(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%esi), %eax
+ movl -19(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%esi), %eax
+ movl -15(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%esi), %eax
+ movl -11(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%esi), %eax
+ movl -7(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%esi), %eax
+ movzwl -3(%edx), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmpl %ecx, %eax
+ jne L(set)
+ movzbl -1(%esi), %eax
+ cmpb -1(%edx), %al
+ jne L(set)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+ ALIGN (4)
+/* ECX >= 32. */
+L(32bytesormore):
+ subl $32, %ecx
+
+ movl (%esi), %eax
+ cmpl (%edx), %eax
+ jne L(load_ecx)
+
+ movl 4(%esi), %eax
+ cmpl 4(%edx), %eax
+ jne L(load_ecx_4)
+
+ movl 8(%esi), %eax
+ cmpl 8(%edx), %eax
+ jne L(load_ecx_8)
+
+ movl 12(%esi), %eax
+ cmpl 12(%edx), %eax
+ jne L(load_ecx_12)
+
+ movl 16(%esi), %eax
+ cmpl 16(%edx), %eax
+ jne L(load_ecx_16)
+
+ movl 20(%esi), %eax
+ cmpl 20(%edx), %eax
+ jne L(load_ecx_20)
+
+ movl 24(%esi), %eax
+ cmpl 24(%edx), %eax
+ jne L(load_ecx_24)
+
+ movl 28(%esi), %eax
+ cmpl 28(%edx), %eax
+ jne L(load_ecx_28)
+
+ addl $32, %esi
+ addl $32, %edx
+ cmpl $32, %ecx
+ jge L(32bytesormore)
+
+ LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+ addl %ecx, %edx
+ addl %ecx, %esi
+ jmp *%ebx
+
+L(load_ecx_28):
+ addl $0x4, %edx
+L(load_ecx_24):
+ addl $0x4, %edx
+L(load_ecx_20):
+ addl $0x4, %edx
+L(load_ecx_16):
+ addl $0x4, %edx
+L(load_ecx_12):
+ addl $0x4, %edx
+L(load_ecx_8):
+ addl $0x4, %edx
+L(load_ecx_4):
+ addl $0x4, %edx
+L(load_ecx):
+ movl (%edx), %ecx
+
+L(find_diff):
+ cmpb %cl, %al
+ jne L(set)
+ cmpb %ch, %ah
+ jne L(set)
+ shrl $16,%eax
+ shrl $16,%ecx
+ cmpb %cl, %al
+ jne L(set)
+ /* We get there only if we already know there is a
+ difference. */
+ cmpl %ecx, %eax
+L(set):
+ sbbl %eax, %eax
+ sbbl $-1, %eax
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ RETURN
+END (memcmp)
+
+ .section .rodata
+ ALIGN (2)
+L(table_32bytes):
+ .long L(0bytes) - L(table_32bytes)
+ .long L(1bytes) - L(table_32bytes)
+ .long L(2bytes) - L(table_32bytes)
+ .long L(3bytes) - L(table_32bytes)
+ .long L(4bytes) - L(table_32bytes)
+ .long L(5bytes) - L(table_32bytes)
+ .long L(6bytes) - L(table_32bytes)
+ .long L(7bytes) - L(table_32bytes)
+ .long L(8bytes) - L(table_32bytes)
+ .long L(9bytes) - L(table_32bytes)
+ .long L(10bytes) - L(table_32bytes)
+ .long L(11bytes) - L(table_32bytes)
+ .long L(12bytes) - L(table_32bytes)
+ .long L(13bytes) - L(table_32bytes)
+ .long L(14bytes) - L(table_32bytes)
+ .long L(15bytes) - L(table_32bytes)
+ .long L(16bytes) - L(table_32bytes)
+ .long L(17bytes) - L(table_32bytes)
+ .long L(18bytes) - L(table_32bytes)
+ .long L(19bytes) - L(table_32bytes)
+ .long L(20bytes) - L(table_32bytes)
+ .long L(21bytes) - L(table_32bytes)
+ .long L(22bytes) - L(table_32bytes)
+ .long L(23bytes) - L(table_32bytes)
+ .long L(24bytes) - L(table_32bytes)
+ .long L(25bytes) - L(table_32bytes)
+ .long L(26bytes) - L(table_32bytes)
+ .long L(27bytes) - L(table_32bytes)
+ .long L(28bytes) - L(table_32bytes)
+ .long L(29bytes) - L(table_32bytes)
+ .long L(30bytes) - L(table_32bytes)
+ .long L(31bytes) - L(table_32bytes)
+
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/memcpy.S b/REORG.TODO/sysdeps/i386/i686/memcpy.S
new file mode 100644
index 0000000000..1d61447430
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcpy.S
@@ -0,0 +1,98 @@
+/* Copy memory block and return pointer to beginning of destination block
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+#define LEN SRC+4
+
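+/* Copy strategy, as a hedged C sketch (x86 tolerates the unaligned
+   word accesses this relies on):
+
+     void *memcpy_sketch (void *dst, const void *src, size_t n)
+     {
+       unsigned char *d = dst;
+       const unsigned char *s = src;
+       if ((((uintptr_t) d ^ (uintptr_t) s) & 3) == 0 && n > 3)
+	 while ((uintptr_t) s & 3)
+	   { *d++ = *s++; n--; }
+       while (n >= 4)
+	 { *(uint32_t *) d = *(const uint32_t *) s;
+	   d += 4; s += 4; n -= 4; }
+       while (n-- > 0)
+	 *d++ = *s++;
+       return dst;
+     }
+
+   The assembly performs the word loop with rep movsl and orders the
+   odd byte/word copies slightly differently on the unaligned path.  */
+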
+ .text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memcpy_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memcpy_chk)
+#endif
+ENTRY (memcpy)
+
+ movl %edi, %eax
+ movl DEST(%esp), %edi
+ movl %esi, %edx
+ movl SRC(%esp), %esi
+
+ movl %edi, %ecx
+ xorl %esi, %ecx
+ andl $3, %ecx
+ movl LEN(%esp), %ecx
+ cld
+ jne .Lunaligned
+
+ cmpl $3, %ecx
+ jbe .Lunaligned
+
+ testl $3, %esi
+ je 1f
+ movsb
+ decl %ecx
+ testl $3, %esi
+ je 1f
+ movsb
+ decl %ecx
+ testl $3, %esi
+ je 1f
+ movsb
+ decl %ecx
+1: pushl %eax
+ movl %ecx, %eax
+ shrl $2, %ecx
+ andl $3, %eax
+ rep
+ movsl
+ movl %eax, %ecx
+ rep
+ movsb
+ popl %eax
+
+.Lend: movl %eax, %edi
+ movl %edx, %esi
+ movl DEST(%esp), %eax
+
+ ret
+
+ /* When we come here the pointers do not have the same
+ alignment or the length is too short. No need to optimize for
+ aligned memory accesses. */
+.Lunaligned:
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: shrl $1, %ecx
+ jnc 2f
+ movsw
+2: rep
+ movsl
+ jmp .Lend
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/memmove.S b/REORG.TODO/sysdeps/i386/i686/memmove.S
new file mode 100644
index 0000000000..f60c3d501b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memmove.S
@@ -0,0 +1,120 @@
+/* Copy memory block and return pointer to beginning of destination block
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 2003.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* one spilled register */
+#define RTN PARMS
+
+ .text
+
+#ifdef USE_AS_BCOPY
+# define SRC RTN
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST RTN
+# define SRC DEST+4
+# define LEN SRC+4
+
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memmove_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memmove_chk)
+# endif
+#endif
+
+ENTRY (memmove)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+
+ movl LEN(%esp), %ecx
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 0)
+ movl %esi, %edx
+ movl SRC(%esp), %esi
+ cfi_register (esi, edx)
+
+ movl %edi, %eax
+ subl %esi, %eax
+ cmpl %eax, %ecx
+ ja 3f
+
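+	/* Copying forward is unsafe only when DEST lies inside the
+	   source range; the unsigned test just done is equivalent to
+	   the C sketch:
+
+	     if ((uintptr_t) dst - (uintptr_t) src < len)
+	       goto backward;
+	*/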
+ cld
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: shrl $1, %ecx
+ jnc 2f
+ movsw
+2: rep
+ movsl
+ movl %edx, %esi
+ cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+#endif
+
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+ cfi_register (esi, edx)
+
+ /* Backward copying. */
+3: std
+ leal -1(%edi, %ecx), %edi
+ leal -1(%esi, %ecx), %esi
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: subl $1, %edi
+ subl $1, %esi
+ shrl $1, %ecx
+ jnc 2f
+ movsw
+2: subl $2, %edi
+ subl $2, %esi
+ rep
+ movsl
+ movl %edx, %esi
+ cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+#endif
+
+ cld
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memmove)
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (memmove)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
new file mode 100644
index 0000000000..31cb4efdb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
@@ -0,0 +1,65 @@
+/* Copy memory block and return pointer to following byte.
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+#define LEN SRC+4
+
+ .text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__mempcpy_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__mempcpy_chk)
+#endif
+ENTRY (__mempcpy)
+
+ movl LEN(%esp), %ecx
+ movl %edi, %eax
+ cfi_register (edi, eax)
+ movl DEST(%esp), %edi
+ movl %esi, %edx
+ cfi_register (esi, edx)
+ movl SRC(%esp), %esi
+ cld
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: shrl $1, %ecx
+ jnc 2f
+ movsw
+2: rep
+ movsl
+ xchgl %edi, %eax
+ cfi_restore (edi)
+ movl %edx, %esi
+ cfi_restore (esi)
+
+ ret
+END (__mempcpy)
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/memset.S b/REORG.TODO/sysdeps/i386/i686/memset.S
new file mode 100644
index 0000000000..24d06178d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memset.S
@@ -0,0 +1,100 @@
+/* memset/bzero -- set memory area to CH/0
+ Highly optimized version for ix86, x>=6.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#ifdef USE_AS_BZERO
+# define DEST PARMS
+# define LEN DEST+4
+#else
+# define RTN PARMS
+# define DEST RTN
+# define CHR DEST+4
+# define LEN CHR+4
+#endif
+
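+/* Fill strategy, in brief: align DEST with up to three byte stores,
+   replicate the fill byte into a 32-bit word, store words with rep
+   stosl, then finish the tail byte-wise.  The replication is the
+   imul by 0x01010101 below; a hedged C sketch:
+
+     uint32_t pattern = (unsigned char) c * 0x01010101u;
+*/
+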
+ .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY_CHK (__memset_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk)
+#endif
+ENTRY (memset)
+
+ cld
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ movl DEST(%esp), %edx
+ movl LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+ xorl %eax, %eax /* fill with 0 */
+#else
+ movzbl CHR(%esp), %eax
+#endif
+ jecxz 1f
+ movl %edx, %edi
+ cfi_rel_offset (edi, 0)
+ andl $3, %edx
+ jz 2f /* aligned */
+ jp 3f /* misaligned at 3, store just one byte below */
+ stosb /* misaligned at 1 or 2, store two bytes */
+ decl %ecx
+ jz 1f
+3: stosb
+ decl %ecx
+ jz 1f
+ xorl $1, %edx
+ jnz 2f /* was misaligned at 2 or 3, now aligned */
+ stosb /* was misaligned at 1, store third byte */
+ decl %ecx
+2: movl %ecx, %edx
+ shrl $2, %ecx
+ andl $3, %edx
+#ifndef USE_AS_BZERO
+ imul $0x01010101, %eax
+#endif
+ rep
+ stosl
+ movl %edx, %ecx
+ rep
+ stosb
+
+1:
+#ifndef USE_AS_BZERO
+ movl DEST(%esp), %eax /* start address of destination is result */
+#endif
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+ && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+ .section .gnu.warning.__memset_zero_constant_len_parameter
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/memusage.h b/REORG.TODO/sysdeps/i386/i686/memusage.h
new file mode 100644
index 0000000000..77a020d7c0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memusage.h
@@ -0,0 +1,21 @@
+/* Copyright (C) 2000-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; })
+#define GETTIME(low,high) asm ("rdtsc" : "=a" (low), "=d" (high))
+
+#include <sysdeps/generic/memusage.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
new file mode 100644
index 0000000000..4a0c20c051
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
@@ -0,0 +1,44 @@
+ifeq ($(subdir),csu)
+tests += test-multiarch
+endif
+
+ifeq ($(subdir),string)
+gen-as-const-headers += locale-defines.sym
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+ memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+ memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+ memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+ memcmp-ssse3 memcmp-sse4 varshift \
+ strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+ strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+ strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
+ strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
+ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
+ memchr-sse2 memchr-sse2-bsf \
+ memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+ rawmemchr-sse2 rawmemchr-sse2-bsf \
+ strnlen-sse2 strnlen-c \
+ strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
+ strncase_l-c strncase-c strncase_l-ssse3 \
+ strcasecmp_l-sse4 strncase_l-sse4 \
+ bcopy-sse2-unaligned memcpy-sse2-unaligned \
+ mempcpy-sse2-unaligned memmove-sse2-unaligned \
+ strcspn-c strpbrk-c strspn-c
+CFLAGS-varshift.c += -msse4
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
+ wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+ wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
+endif
+
+ifeq ($(subdir),math)
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
new file mode 100644
index 0000000000..efef2a10dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 0000000000..cbc8b420e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 0000000000..36aac44b9c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 0000000000..877f82c28f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,59 @@
+/* Multiple versions of bcopy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(bcopy)
+ .type bcopy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bcopy_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
+2: ret
+END(bcopy)
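+
+/* Resolver decision tree above, as a hedged C sketch (cpu_has is
+   illustrative shorthand for the HAS_*_FEATURE checks):
+
+     best = __bcopy_ia32;
+     if (cpu_has (SSE2))
+       {
+	 best = __bcopy_sse2_unaligned;
+	 if (!cpu_has (Fast_Unaligned_Load) && cpu_has (SSSE3))
+	   best = cpu_has (Fast_Rep_String)
+		  ? __bcopy_ssse3_rep : __bcopy_ssse3;
+       }
+*/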
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __bcopy_ia32, @function; \
+ .p2align 4; \
+ .globl __bcopy_ia32; \
+ .hidden __bcopy_ia32; \
+ __bcopy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif
+
+#include "../bcopy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 0000000000..507b288bb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2_rep __bzero_sse2_rep
+#include "memset-sse2-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 0000000000..8d04512e4e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2 __bzero_sse2
+#include "memset-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 0000000000..9dac490aa2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,62 @@
+/* Multiple versions of bzero
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(__bzero)
+ .type __bzero, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bzero_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+	LOAD_FUNC_GOT_EAX (__bzero_sse2)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
+2: ret
+END(__bzero)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __bzero_ia32, @function; \
+ .p2align 4; \
+ .globl __bzero_ia32; \
+ .hidden __bzero_ia32; \
+ __bzero_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library,
+   since they are called without setting up EBX, which is needed for
+   the PLT that the IFUNC resolution uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif
+#endif
+
+#include "../bzero.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e8026a2a78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -0,0 +1,376 @@
+/* Enumerate available IFUNC implementations of a function. i686 version.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ifunc-impl-list.h>
+#include "init-arch.h"
+
+/* Maximum number of IFUNC implementations. */
+#define MAX_IFUNC 4
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+ NAME and return the number of valid entries. */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ assert (max >= MAX_IFUNC);
+
+ size_t i = 0;
+
+ /* Support sysdeps/i386/i686/multiarch/bcopy.S. */
+ IFUNC_IMPL (i, name, bcopy,
+ IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+ __bcopy_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+ __bcopy_ssse3)
+ IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
+ __bcopy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/bzero.S. */
+ IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+ __bzero_sse2_rep)
+ IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+ __bzero_sse2)
+ IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memchr.S. */
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+ __memchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+ __memchr_sse2)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memcmp.S. */
+ IFUNC_IMPL (i, name, memcmp,
+ IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2),
+ __memcmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+ __memcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memmove_chk.S. */
+ IFUNC_IMPL (i, name, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memmove_chk_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memmove_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memmove_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memmove.S. */
+ IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+ __memmove_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+ __memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
+ __memmove_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memrchr.S. */
+ IFUNC_IMPL (i, name, memrchr,
+ IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+ __memrchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+ __memrchr_sse2)
+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memset_chk.S. */
+ IFUNC_IMPL (i, name, __memset_chk,
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memset_chk_sse2_rep)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memset_chk_sse2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memset.S. */
+ IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+ __memset_sse2_rep)
+ IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+ __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/rawmemchr.S. */
+ IFUNC_IMPL (i, name, rawmemchr,
+ IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+ __rawmemchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+ __rawmemchr_sse2)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/stpncpy.S. */
+ IFUNC_IMPL (i, name, stpncpy,
+ IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
+ __stpncpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2),
+ __stpncpy_sse2)
+ IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/stpcpy.S. */
+ IFUNC_IMPL (i, name, stpcpy,
+ IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+ __stpcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2),
+ __stpcpy_sse2)
+ IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcasecmp.S. */
+ IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strcasecmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ HAS_CPU_FEATURE (SSSE3),
+ __strcasecmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S. */
+ IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strcasecmp_l_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ HAS_CPU_FEATURE (SSSE3),
+ __strcasecmp_l_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+ __strcasecmp_l_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcat.S. */
+ IFUNC_IMPL (i, name, strcat,
+ IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+ __strcat_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2),
+ __strcat_sse2)
+ IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strchr.S. */
+ IFUNC_IMPL (i, name, strchr,
+ IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+ __strchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+ __strchr_sse2)
+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcmp.S. */
+ IFUNC_IMPL (i, name, strcmp,
+ IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+ __strcmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+ __strcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcpy.S. */
+ IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+ __strcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2),
+ __strcpy_sse2)
+ IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcspn.S. */
+ IFUNC_IMPL (i, name, strcspn,
+ IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
+ __strcspn_sse42)
+ IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncase.S. */
+ IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strncasecmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ HAS_CPU_FEATURE (SSSE3),
+ __strncasecmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
+ __strncasecmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncase_l.S. */
+ IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strncasecmp_l_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ HAS_CPU_FEATURE (SSSE3),
+ __strncasecmp_l_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
+ __strncasecmp_l_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncat.S. */
+ IFUNC_IMPL (i, name, strncat,
+ IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+ __strncat_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2),
+ __strncat_sse2)
+ IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncpy.S. */
+ IFUNC_IMPL (i, name, strncpy,
+ IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+ __strncpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2),
+ __strncpy_sse2)
+ IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strnlen.S. */
+ IFUNC_IMPL (i, name, strnlen,
+ IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2),
+ __strnlen_sse2)
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strpbrk.S. */
+ IFUNC_IMPL (i, name, strpbrk,
+ IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
+ __strpbrk_sse42)
+ IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strrchr.S. */
+ IFUNC_IMPL (i, name, strrchr,
+ IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+ __strrchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+ __strrchr_sse2)
+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strspn.S. */
+ IFUNC_IMPL (i, name, strspn,
+ IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+ __strspn_sse42)
+ IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcschr.S. */
+ IFUNC_IMPL (i, name, wcschr,
+ IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2),
+ __wcschr_sse2)
+ IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcscmp.S. */
+ IFUNC_IMPL (i, name, wcscmp,
+ IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2),
+ __wcscmp_sse2)
+ IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcscpy.S. */
+ IFUNC_IMPL (i, name, wcscpy,
+ IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+ __wcscpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcslen.S. */
+ IFUNC_IMPL (i, name, wcslen,
+ IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2),
+ __wcslen_sse2)
+ IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcsrchr.S. */
+ IFUNC_IMPL (i, name, wcsrchr,
+ IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2),
+ __wcsrchr_sse2)
+ IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wmemcmp.S. */
+ IFUNC_IMPL (i, name, wmemcmp,
+ IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2),
+ __wmemcmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+ __wmemcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32))
+
+#ifdef SHARED
+ /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S. */
+ IFUNC_IMPL (i, name, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memcpy_chk_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memcpy.S. */
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+ __memcpy_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+ __memcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
+ __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. */
+ IFUNC_IMPL (i, name, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_chk_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __mempcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/mempcpy.S. */
+ IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
+ __mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+ __strlen_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+ __strlen_sse2)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncmp.S. */
+ IFUNC_IMPL (i, name, strncmp,
+ IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+ __strncmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+ __strncmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32))
+#endif
+
+ return i;
+}
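The IFUNC_IMPL_ADD entries above only enumerate the candidate implementations for the test suite and introspection tools; the actual choice is made at load time by a per-function resolver (see multiarch/memchr.S further down). A minimal stand-alone sketch of the same mechanism in C, using GCC's ifunc attribute and the real __builtin_cpu_supports probe; the my_memset* names are hypothetical, not glibc's:

    #include <stddef.h>

    /* Two stand-in implementations; the bodies are identical here,
       whereas the real SSE2/ia32 variants differ in their inner loops.  */
    static void *my_memset_sse2 (void *s, int c, size_t n)
    {
      unsigned char *p = s;
      while (n--)
        *p++ = (unsigned char) c;
      return s;
    }

    static void *my_memset_ia32 (void *s, int c, size_t n)
    {
      unsigned char *p = s;
      while (n--)
        *p++ = (unsigned char) c;
      return s;
    }

    typedef void *(*memset_fn) (void *, int, size_t);

    /* The resolver runs once, at relocation time; every later call
       through the PLT goes straight to the pointer it returned.  */
    static memset_fn my_memset_resolver (void)
    {
      return __builtin_cpu_supports ("sse2")
             ? my_memset_sse2
             : my_memset_ia32;
    }

    void *my_memset (void *s, int c, size_t n)
      __attribute__ ((ifunc ("my_memset_resolver")));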
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
new file mode 100644
index 0000000000..aebff9a4f9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
@@ -0,0 +1,11 @@
+#include <locale/localeinfo.h>
+#include <langinfo.h>
+#include <stddef.h>
+
+--
+
+LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales)
+LC_CTYPE
+_NL_CTYPE_NONASCII_CASE
+LOCALE_DATA_VALUES offsetof (struct __locale_data, values)
+SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0])
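A .sym file is not assembled directly: glibc's gen-as-const machinery compiles the C prologue above the "--" separator, evaluates each NAME/expression pair, and emits a header of plain #define constants that the assembly implementations here (the strcasecmp family) can include. A rough C illustration of the kind of values being captured, using simplified hypothetical stand-in structs rather than the real layouts from locale/localeinfo.h:

    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for struct __locale_data and
       struct __locale_struct.  */
    struct data_stub   { int refcount; union { const char *string; } values[4]; };
    struct locale_stub { struct data_stub *locales[13]; };

    int main (void)
    {
      printf ("LOCALE_T___LOCALES = %zu\n", offsetof (struct locale_stub, locales));
      printf ("LOCALE_DATA_VALUES = %zu\n", offsetof (struct data_stub, values));
      printf ("SIZEOF_VALUES      = %zu\n",
              sizeof (((struct data_stub *) 0)->values[0]));
      return 0;
    }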
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 0000000000..dd316486e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,502 @@
+/* Optimized memchr with sse2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+# define LEN STR2+4
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_sse2_bsf
+# endif
+
+ .text
+ENTRY (MEMCHR)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+# ifndef USE_AS_RAWMEMCHR
+ mov LEN(%esp), %edx
+ test %edx, %edx
+ jz L(return_null_1)
+# endif
+ mov %ecx, %eax
+
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+
+ cmp $48, %ecx
+ ja L(crosscache)
+
+ movdqu (%eax), %xmm0
+ pcmpeqb %xmm1, %xmm0
+/* Check if there is a match. */
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ je L(unaligned_no_match_1)
+/* Check which byte is a match. */
+ bsf %ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+ sub %ecx, %edx
+ jbe L(return_null_1)
+# endif
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+ sub $16, %edx
+ jbe L(return_null_1)
+ PUSH (%edi)
+ lea 16(%eax), %edi
+ and $15, %eax
+ and $-16, %edi
+ add %eax, %edx
+# else
+ lea 16(%eax), %edx
+ and $-16, %edx
+# endif
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(return_null_1):
+ xor %eax, %eax
+ ret
+
+# ifndef USE_AS_RAWMEMCHR
+ CFI_POP (%edi)
+# endif
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string. */
+
+# ifndef USE_AS_RAWMEMCHR
+ PUSH (%edi)
+ mov %eax, %edi
+ and $15, %ecx
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+# else
+ mov %eax, %edx
+ and $15, %ecx
+ and $-16, %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ sar %cl, %eax
+ test %eax, %eax
+ je L(unaligned_no_match)
+/* Check which byte is a match. */
+ bsf %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ sub %eax, %edx
+ jbe L(return_null)
+ add %edi, %eax
+ add %ecx, %eax
+ RETURN
+# else
+ add %edx, %eax
+ add %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate the last acceptable address and check for possible
+ addition overflow by using saturated math:
+ edx = ecx + edx
+ edx |= -(edx < ecx) */
+ add %ecx, %edx
+ sbb %eax, %eax
+ or %eax, %edx
+ sub $16, %edx
+ jbe L(return_null)
+ add $16, %edi
+# else
+ add $16, %edx
+# endif
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+ test $0x3f, %edi
+# else
+ test $0x3f, %edx
+# endif
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm3
+# else
+ movdqa 48(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ test %eax, %eax
+ jnz L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+ mov %edi, %ecx
+ and $-64, %edi
+ and $63, %ecx
+ add %ecx, %edx
+# else
+ and $-64, %edx
+# endif
+
+ .p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+ movdqa 16(%edi), %xmm2
+ movdqa 32(%edi), %xmm3
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa (%edx), %xmm0
+ movdqa 16(%edx), %xmm2
+ movdqa 32(%edx), %xmm3
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+ pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edi
+# else
+ sub $64, %edx
+# endif
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+
+ pcmpeqb %xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+ pcmpeqb 48(%edi), %xmm1
+# else
+ pcmpeqb 48(%edx), %xmm1
+# endif
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ pmovmskb %xmm1, %eax
+ bsf %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 48(%edi, %eax), %eax
+ RETURN
+# else
+ lea 48(%edx, %eax), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb 48(%edi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 16(%edi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ xor %eax, %eax
+ RETURN
+# endif
+ .p2align 4
+L(matches0):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea -16(%eax, %edi), %eax
+ RETURN
+# else
+ lea -16(%eax, %edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ add %edi, %eax
+ RETURN
+# else
+ add %edx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches16):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea 16(%eax, %edi), %eax
+ RETURN
+# else
+ lea 16(%eax, %edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches32):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea 32(%eax, %edi), %eax
+ RETURN
+# else
+ lea 32(%eax, %edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(matches_1):
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ add %edi, %eax
+ RETURN
+
+ .p2align 4
+L(matches16_1):
+ sub $16, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 16(%edi, %eax), %eax
+ RETURN
+
+ .p2align 4
+L(matches32_1):
+ sub $32, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 32(%edi, %eax), %eax
+ RETURN
+
+ .p2align 4
+L(matches48_1):
+ sub $48, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 48(%edi, %eax), %eax
+ RETURN
+# endif
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ RETURN
+# else
+ ret
+# endif
+
+END (MEMCHR)
+#endif
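Two ideas in the file above are worth restating in C. The hot path broadcasts the search byte to all 16 lanes (punpcklbw twice plus pshufd), compares a whole block with pcmpeqb, collapses the result to a bitmask with pmovmskb, and locates the first match with bsf. And the add/sbb/or sequence under L(unaligned_no_match) is a branch-free saturating add: sbb leaves EAX all-ones exactly when the addition carried, so the OR clamps the sum to 0xFFFFFFFF. A sketch of both with SSE2 intrinsics; probe16 covers a single 16-byte step only, with no alignment or tail handling:

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stddef.h>

    /* One 16-byte probe: the core step of __memchr_sse2_bsf.  */
    static const void *probe16 (const unsigned char *p, int c)
    {
      __m128i needle = _mm_set1_epi8 ((char) c);              /* punpcklbw x2 + pshufd */
      __m128i chunk  = _mm_loadu_si128 ((const __m128i *) p); /* movdqu */
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
      if (mask == 0)
        return NULL;
      return p + __builtin_ctz (mask);                        /* bsf */
    }

    /* "add %ecx, %edx; sbb %eax, %eax; or %eax, %edx" in C.  */
    static unsigned int sat_add (unsigned int a, unsigned int b)
    {
      unsigned int sum = a + b;
      return sum | -(unsigned int) (sum < a);  /* all-ones on wrap-around */
    }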
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 0000000000..172d70de13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,709 @@
+/* Optimized memchr with sse2 without bsf
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+# define ENTRANCE PUSH(%edi);
+# define PARMS 8
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+# else
+# define ENTRANCE
+# define PARMS 4
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+# define LEN STR2+4
+# endif
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_sse2
+# endif
+
+ atom_text_section
+ENTRY (MEMCHR)
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+ mov LEN(%esp), %edx
+ test %edx, %edx
+ jz L(return_null)
+# endif
+
+ punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+ mov %ecx, %edi
+# else
+ mov %ecx, %edx
+# endif
+ punpcklbw %xmm1, %xmm1
+
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+ cmp $48, %ecx
+ ja L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqu (%edi), %xmm0
+# else
+ movdqu (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(match_case2_prolog)
+
+ sub $16, %edx
+ jbe L(return_null)
+ lea 16(%edi), %edi
+ and $15, %ecx
+ and $-16, %edi
+ add %ecx, %edx
+# else
+ jnz L(match_case1_prolog)
+ lea 16(%edx), %edx
+ and $-16, %edx
+# endif
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(crosscache):
+ and $15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+# else
+ and $-16, %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ sar %cl, %eax
+ test %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(match_case2_prolog1)
+ /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using
+ "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to void
+ possible addition overflow. */
+ neg %ecx
+ add $16, %ecx
+ sub %ecx, %edx
+ jbe L(return_null)
+ lea 16(%edi), %edi
+# else
+ jnz L(match_case1_prolog1)
+ lea 16(%edx), %edx
+# endif
+
+ .p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 64(%edi), %edi
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%edi), %xmm0
+# else
+ lea 64(%edx), %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 64(%edi), %edi
+ mov %edi, %ecx
+ and $-64, %edi
+ and $63, %ecx
+ add %ecx, %edx
+# else
+ lea 64(%edx), %edx
+ and $-64, %edx
+# endif
+
+ .p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+ movdqa 16(%edi), %xmm2
+ movdqa 32(%edi), %xmm3
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa (%edx), %xmm0
+ movdqa 16(%edx), %xmm2
+ movdqa 32(%edx), %xmm3
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ pmovmskb %xmm4, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edi
+# else
+ sub $64, %edx
+# endif
+
+ pmovmskb %xmm0, %eax
+ xor %ecx, %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+ pmovmskb %xmm2, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ pcmpeqb 48(%edi), %xmm1
+# else
+ pcmpeqb 48(%edx), %xmm1
+# endif
+ pmovmskb %xmm1, %eax
+ lea 16(%ecx), %ecx
+
+ .p2align 4
+L(match_case1):
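+/* Here EAX holds the pcmpeqb bitmask for the matching 16-byte block
+   and ECX that block's offset from the current base pointer.  The
+   lowest set bit is then located with plain byte/nibble/bit tests:
+   this is the variant the resolver selects when Slow_BSF is set.  */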
+# ifndef USE_AS_RAWMEMCHR
+ add %ecx, %edi
+# else
+L(match_case1_prolog1):
+ add %ecx, %edx
+L(match_case1_prolog):
+# endif
+ test %al, %al
+ jz L(match_case1_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case1_8)
+ test $0x01, %al
+ jnz L(ExitCase1_1)
+ test $0x02, %al
+ jnz L(ExitCase1_2)
+ test $0x04, %al
+ jnz L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+ lea 3(%edi), %eax
+ RETURN
+# else
+ lea 3(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_8):
+ test $0x10, %al
+ jnz L(ExitCase1_5)
+ test $0x20, %al
+ jnz L(ExitCase1_6)
+ test $0x40, %al
+ jnz L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+ lea 7(%edi), %eax
+ RETURN
+# else
+ lea 7(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case1_high_8)
+ test $0x01, %ah
+ jnz L(ExitCase1_9)
+ test $0x02, %ah
+ jnz L(ExitCase1_10)
+ test $0x04, %ah
+ jnz L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+ lea 11(%edi), %eax
+ RETURN
+# else
+ lea 11(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_high_8):
+ test $0x10, %ah
+ jnz L(ExitCase1_13)
+ test $0x20, %ah
+ jnz L(ExitCase1_14)
+ test $0x40, %ah
+ jnz L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+ lea 15(%edi), %eax
+ RETURN
+# else
+ lea 15(%edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $32, %edx
+ jbe L(return_null)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb 48(%edi), %xmm1
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+
+ xor %eax, %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+ mov %edi, %eax
+ RETURN
+# else
+ mov %edx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+ lea 1(%edi), %eax
+ RETURN
+# else
+ lea 1(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+ lea 2(%edi), %eax
+ RETURN
+# else
+ lea 2(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+ lea 4(%edi), %eax
+ RETURN
+# else
+ lea 4(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+ lea 5(%edi), %eax
+ RETURN
+# else
+ lea 5(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+ lea 6(%edi), %eax
+ RETURN
+# else
+ lea 6(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+ lea 8(%edi), %eax
+ RETURN
+# else
+ lea 8(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+ lea 9(%edi), %eax
+ RETURN
+# else
+ lea 9(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+ lea 10(%edi), %eax
+ RETURN
+# else
+ lea 10(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+ lea 12(%edi), %eax
+ RETURN
+# else
+ lea 12(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+ lea 13(%edi), %eax
+ RETURN
+# else
+ lea 13(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+ lea 14(%edi), %eax
+ RETURN
+# else
+ lea 14(%edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(match_case2):
+ sub %ecx, %edx
+L(match_case2_prolog1):
+ add %ecx, %edi
+L(match_case2_prolog):
+ test %al, %al
+ jz L(match_case2_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case2_8)
+ test $0x01, %al
+ jnz L(ExitCase2_1)
+ test $0x02, %al
+ jnz L(ExitCase2_2)
+ test $0x04, %al
+ jnz L(ExitCase2_3)
+ sub $4, %edx
+ jb L(return_null)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_8):
+ test $0x10, %al
+ jnz L(ExitCase2_5)
+ test $0x20, %al
+ jnz L(ExitCase2_6)
+ test $0x40, %al
+ jnz L(ExitCase2_7)
+ sub $8, %edx
+ jb L(return_null)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case2_high_8)
+ test $0x01, %ah
+ jnz L(ExitCase2_9)
+ test $0x02, %ah
+ jnz L(ExitCase2_10)
+ test $0x04, %ah
+ jnz L(ExitCase2_11)
+ sub $12, %edx
+ jb L(return_null)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high_8):
+ test $0x10, %ah
+ jnz L(ExitCase2_13)
+ test $0x20, %ah
+ jnz L(ExitCase2_14)
+ test $0x40, %ah
+ jnz L(ExitCase2_15)
+ sub $16, %edx
+ jb L(return_null)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_1):
+ mov %edi, %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_2):
+ sub $2, %edx
+ jb L(return_null)
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_3):
+ sub $3, %edx
+ jb L(return_null)
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_5):
+ sub $5, %edx
+ jb L(return_null)
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_6):
+ sub $6, %edx
+ jb L(return_null)
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_7):
+ sub $7, %edx
+ jb L(return_null)
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_9):
+ sub $9, %edx
+ jb L(return_null)
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_10):
+ sub $10, %edx
+ jb L(return_null)
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_11):
+ sub $11, %edx
+ jb L(return_null)
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_13):
+ sub $13, %edx
+ jb L(return_null)
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_14):
+ sub $14, %edx
+ jb L(return_null)
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_15):
+ sub $15, %edx
+ jb L(return_null)
+ lea 14(%edi), %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ RETURN
+# else
+ ret
+# endif
+
+END (MEMCHR)
+#endif
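The ExitCase1_*/ExitCase2_* ladders above replace the bit scan: once pmovmskb produces a nonzero mask, the match index is found with byte, nibble, and single-bit tests only. The equivalent control flow in C, as a sketch (mask is the 16-bit pcmpeqb mask, assumed nonzero):

    /* Locate the lowest set bit of a nonzero 16-bit mask without BSF,
       mirroring L(match_case1) / L(match_case1_high) above.  */
    int first_match_index (unsigned int mask)
    {
      int base = 0;
      if ((mask & 0xff) == 0)       /* L(match_case1_high): low byte empty */
        {
          mask >>= 8;
          base = 8;
        }
      if ((mask & 0x0f) == 0)       /* L(match_case1_8): low nibble empty */
        {
          mask >>= 4;
          base += 4;
        }
      if (mask & 0x1) return base;
      if (mask & 0x2) return base + 1;
      if (mask & 0x4) return base + 2;
      return base + 3;
    }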
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 0000000000..bd0dace290
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of memchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__memchr)
+ .type __memchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 3f
+
+ LOAD_FUNC_GOT_EAX (__memchr_sse2)
+ ret
+
+2: LOAD_FUNC_GOT_EAX (__memchr_ia32)
+ ret
+
+3: LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)
+ ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memchr_ia32, @function; \
+ .globl __memchr_ia32; \
+ .p2align 4; \
+ __memchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without EBX set up, and EBX is needed
+   for the PLT that IFUNC relies on.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
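In C terms, the resolver above makes a three-way choice; this sketch mirrors its branch order (the extern declarations refer to the implementations added by this commit, so it compiles as a translation unit but links only against them, and the two flags stand in for HAS_CPU_FEATURE / HAS_ARCH_FEATURE):

    #include <stddef.h>

    void *__memchr_ia32 (const void *, int, size_t);
    void *__memchr_sse2 (const void *, int, size_t);
    void *__memchr_sse2_bsf (const void *, int, size_t);

    typedef void *(*memchr_fn) (const void *, int, size_t);

    /* No SSE2 -> ia32; SSE2 with a fast BSF -> the bsf variant;
       SSE2 with the Slow_BSF quirk -> the no-bsf variant.  */
    memchr_fn pick_memchr (int has_sse2, int slow_bsf)
    {
      if (!has_sse2)
        return __memchr_ia32;
      return slow_bsf ? __memchr_sse2 : __memchr_sse2_bsf;
    }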
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..2aa13048b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1225 @@
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_2
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1 + 4
+# define LEN BLK2 + 4
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+/* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+/* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+/* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+/* We loaded the jump table and adjusted EDX/ESI. Go. */ \
+ jmp *%ebx
+# else
+# define JMPTBL(I, B) I
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+ movl BLK1(%esp), %eax
+ movl BLK2(%esp), %edx
+ movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(return0)
+# else
+ cmp $1, %ecx
+ jbe L(less1bytes)
+# endif
+
+ pxor %xmm0, %xmm0
+ cmp $64, %ecx
+ ja L(64bytesormore)
+ cmp $8, %ecx
+
+# ifndef USE_AS_WMEMCMP
+ PUSH (%ebx)
+ jb L(less8bytes)
+# else
+ jb L(less8bytes)
+ PUSH (%ebx)
+# endif
+
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %bl
+ cmpb (%edx), %bl
+ jne L(nonzero)
+
+ mov 1(%eax), %bl
+ cmpb 1(%edx), %bl
+ jne L(nonzero)
+
+ cmp $2, %ecx
+ jz L(0bytes)
+
+ mov 2(%eax), %bl
+ cmpb 2(%edx), %bl
+ jne L(nonzero)
+
+ cmp $3, %ecx
+ jz L(0bytes)
+
+ mov 3(%eax), %bl
+ cmpb 3(%edx), %bl
+ jne L(nonzero)
+
+ cmp $4, %ecx
+ jz L(0bytes)
+
+ mov 4(%eax), %bl
+ cmpb 4(%edx), %bl
+ jne L(nonzero)
+
+ cmp $5, %ecx
+ jz L(0bytes)
+
+ mov 5(%eax), %bl
+ cmpb 5(%edx), %bl
+ jne L(nonzero)
+
+ cmp $6, %ecx
+ jz L(0bytes)
+
+ mov 6(%eax), %bl
+ cmpb 6(%edx), %bl
+ je L(0bytes)
+
+L(nonzero):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(above)
+ neg %eax
+L(above):
+ ret
+ CFI_PUSH (%ebx)
+# endif
+
+ .p2align 4
+L(0bytes):
+ POP (%ebx)
+ xor %eax, %eax
+ ret
+
+# ifdef USE_AS_WMEMCMP
+
+/* For wmemcmp, the case N == 1.  */
+
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ je L(return0)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+ .p2align 4
+L(return0):
+ xor %eax, %eax
+ ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less1bytes):
+ jb L(0bytesend)
+ movzbl (%eax), %eax
+ movzbl (%edx), %edx
+ sub %edx, %eax
+ ret
+
+ .p2align 4
+L(0bytesend):
+ xor %eax, %eax
+ ret
+# endif
+ .p2align 4
+L(64bytesormore):
+ PUSH (%ebx)
+ mov %ecx, %ebx
+ mov $64, %ecx
+ sub $64, %ebx
+L(64bytesormore_loop):
+ movdqu (%eax), %xmm1
+ movdqu (%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_16diff)
+
+ movdqu 16(%eax), %xmm1
+ movdqu 16(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_32diff)
+
+ movdqu 32(%eax), %xmm1
+ movdqu 32(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_48diff)
+
+ movdqu 48(%eax), %xmm1
+ movdqu 48(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_64diff)
+ add %ecx, %eax
+ add %ecx, %edx
+ sub %ecx, %ebx
+ jae L(64bytesormore_loop)
+ add %ebx, %ecx
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifdef USE_AS_WMEMCMP
+
+/* Label needed only to fill table_64bytes. */
+L(unreal_case):
+/* no code here */
+
+# endif
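+/* Fall-through cascade: entering at an earlier label subtracts another
+   16 from ECX for each later chunk, so the two adds below leave EAX
+   and EDX pointing just past the differing 16-byte block, which
+   L(16bytes) then recompares four bytes at a time.  */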
+ .p2align 4
+L(find_16diff):
+ sub $16, %ecx
+L(find_32diff):
+ sub $16, %ecx
+L(find_48diff):
+ sub $16, %ecx
+L(find_64diff):
+ add %ecx, %edx
+ add %ecx, %eax
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# else
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ cmp -4(%edx), %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(49bytes):
+ movdqu -49(%eax), %xmm1
+ movdqu -49(%edx), %xmm2
+ mov $-49, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(33bytes):
+ movdqu -33(%eax), %xmm1
+ movdqu -33(%edx), %xmm2
+ mov $-33, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(17bytes):
+ mov -17(%eax), %ecx
+ mov -17(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(13bytes):
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(9bytes):
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(5bytes):
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(50bytes):
+ mov $-50, %ebx
+ movdqu -50(%eax), %xmm1
+ movdqu -50(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ mov $-34, %ebx
+ movdqu -34(%eax), %xmm1
+ movdqu -34(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%eax), %ecx
+ mov -18(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(14bytes):
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(10bytes):
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(6bytes):
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(51bytes):
+ mov $-51, %ebx
+ movdqu -51(%eax), %xmm1
+ movdqu -51(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(35bytes):
+ mov $-35, %ebx
+ movdqu -35(%eax), %xmm1
+ movdqu -35(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(19bytes):
+ movl -19(%eax), %ecx
+ movl -19(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+L(1bytes):
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+ .p2align 4
+L(52bytes):
+ movdqu -52(%eax), %xmm1
+ movdqu -52(%edx), %xmm2
+ mov $-52, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%eax), %xmm1
+ movdqu -36(%edx), %xmm2
+ mov $-36, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%eax), %xmm1
+ movdqu -20(%edx), %xmm2
+ mov $-20, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(53bytes):
+ movdqu -53(%eax), %xmm1
+ movdqu -53(%edx), %xmm2
+ mov $-53, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(37bytes):
+ mov $-37, %ebx
+ movdqu -37(%eax), %xmm1
+ movdqu -37(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(21bytes):
+ mov $-21, %ebx
+ movdqu -21(%eax), %xmm1
+ movdqu -21(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(54bytes):
+ movdqu -54(%eax), %xmm1
+ movdqu -54(%edx), %xmm2
+ mov $-54, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ mov $-38, %ebx
+ movdqu -38(%eax), %xmm1
+ movdqu -38(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ mov $-22, %ebx
+ movdqu -22(%eax), %xmm1
+ movdqu -22(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(55bytes):
+ movdqu -55(%eax), %xmm1
+ movdqu -55(%edx), %xmm2
+ mov $-55, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(39bytes):
+ mov $-39, %ebx
+ movdqu -39(%eax), %xmm1
+ movdqu -39(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(23bytes):
+ mov $-23, %ebx
+ movdqu -23(%eax), %xmm1
+ movdqu -23(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+ .p2align 4
+L(56bytes):
+ movdqu -56(%eax), %xmm1
+ movdqu -56(%edx), %xmm2
+ mov $-56, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ mov $-40, %ebx
+ movdqu -40(%eax), %xmm1
+ movdqu -40(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ mov $-24, %ebx
+ movdqu -24(%eax), %xmm1
+ movdqu -24(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(57bytes):
+ movdqu -57(%eax), %xmm1
+ movdqu -57(%edx), %xmm2
+ mov $-57, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(41bytes):
+ mov $-41, %ebx
+ movdqu -41(%eax), %xmm1
+ movdqu -41(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(25bytes):
+ mov $-25, %ebx
+ movdqu -25(%eax), %xmm1
+ movdqu -25(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(58bytes):
+ movdqu -58(%eax), %xmm1
+ movdqu -58(%edx), %xmm2
+ mov $-58, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ mov $-42, %ebx
+ movdqu -42(%eax), %xmm1
+ movdqu -42(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ mov $-26, %ebx
+ movdqu -26(%eax), %xmm1
+ movdqu -26(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(59bytes):
+ movdqu -59(%eax), %xmm1
+ movdqu -59(%edx), %xmm2
+ mov $-59, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(43bytes):
+ mov $-43, %ebx
+ movdqu -43(%eax), %xmm1
+ movdqu -43(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(27bytes):
+ mov $-27, %ebx
+ movdqu -27(%eax), %xmm1
+ movdqu -27(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+ .p2align 4
+L(60bytes):
+ movdqu -60(%eax), %xmm1
+ movdqu -60(%edx), %xmm2
+ mov $-60, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ mov $-44, %ebx
+ movdqu -44(%eax), %xmm1
+ movdqu -44(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ mov $-28, %ebx
+ movdqu -28(%eax), %xmm1
+ movdqu -28(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(61bytes):
+ movdqu -61(%eax), %xmm1
+ movdqu -61(%edx), %xmm2
+ mov $-61, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(45bytes):
+ mov $-45, %ebx
+ movdqu -45(%eax), %xmm1
+ movdqu -45(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(29bytes):
+ mov $-29, %ebx
+ movdqu -29(%eax), %xmm1
+ movdqu -29(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(62bytes):
+ movdqu -62(%eax), %xmm1
+ movdqu -62(%edx), %xmm2
+ mov $-62, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ mov $-46, %ebx
+ movdqu -46(%eax), %xmm1
+ movdqu -46(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ mov $-30, %ebx
+ movdqu -30(%eax), %xmm1
+ movdqu -30(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(63bytes):
+ movdqu -63(%eax), %xmm1
+ movdqu -63(%edx), %xmm2
+ mov $-63, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(47bytes):
+ mov $-47, %ebx
+ movdqu -47(%eax), %xmm1
+ movdqu -47(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(31bytes):
+ mov $-31, %ebx
+ movdqu -31(%eax), %xmm1
+ movdqu -31(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+
+ .p2align 4
+L(64bytes):
+ movdqu -64(%eax), %xmm1
+ movdqu -64(%edx), %xmm2
+ mov $-64, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%eax), %xmm1
+ movdqu -48(%edx), %xmm2
+ mov $-48, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%eax), %xmm1
+ movdqu -32(%edx), %xmm2
+ mov $-32, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -16(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ mov (%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ mov 4(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ mov 8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ mov 12(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# else
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ cmp 4(%edx), %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ cmp 8(%edx), %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ cmp 12(%edx), %ecx
+
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
+
+ .p2align 4
+L(find_diff):
+# ifndef USE_AS_WMEMCMP
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ shr $16,%ecx
+ shr $16,%ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+L(end):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(bigger)
+ neg %eax
+L(bigger):
+ ret
+# else
+ POP (%ebx)
+ mov $1, %eax
+ jg L(bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(bigger):
+ ret
+# endif
+END (MEMCMP)
+
+ .section .rodata.sse4.2,"a",@progbits
+ .p2align 2
+ .type L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(1bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(3bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(5bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(7bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(9bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(11bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(13bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(15bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(17bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(19bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(21bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(23bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(25bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(27bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(29bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(31bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(33bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(35bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(37bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(39bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(41bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(43bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(45bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(47bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(49bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(51bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(53bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(55bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(57bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(59bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(61bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(63bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+# endif
+#endif
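Two conventions in the file above deserve a gloss. With XMM0 held at zero, "ptest %xmm2, %xmm0" sets CF only when XMM2 (the XOR of the two input blocks) is all zero, so every "jnc L(find_*diff)" fires exactly when the blocks differ. And BRANCH_TO_JMPTBL_ENTRY dispatches short lengths and loop remainders (at most 64 bytes) through L(table_64bytes) after advancing both pointers by the remaining length, with the L(Nbytes) labels falling through into one another. The same shape in C using GCC's computed-goto extension, trimmed to lengths 0-2 as a sketch:

    #include <stddef.h>

    int tail_memcmp (const unsigned char *a, const unsigned char *b,
                     size_t n)      /* n <= 2 in this trimmed sketch */
    {
      static const void *const table[] = { &&len0, &&len1, &&len2 };
      a += n;                       /* like "add %ecx, %eax" */
      b += n;                       /* like "add %ecx, %edx" */
      goto *table[n];
    len2:                           /* compare the byte two from the end... */
      if (a[-2] != b[-2])
        return a[-2] < b[-2] ? -1 : 1;
    len1:                           /* ...then fall through to the last byte */
      if (a[-1] != b[-1])
        return a[-1] < b[-1] ? -1 : 1;
    len0:
      return 0;
    }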
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..5ebf5a4d73
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,2157 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1+4
+# define LEN BLK2+4
+# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
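+
+/* Stack layout at entry (cdecl, return address at (%esp)):
+     BLK1 =  4(%esp) -- first block
+     BLK2 =  8(%esp) -- second block
+     LEN  = 12(%esp) -- length (bytes; wchar_t elements for wmemcmp)  */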
+
+/* Warning!
+   wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
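+
+/* For reference, the comparison semantics being implemented match the
+   following C sketch (illustrative only; assumes 4-byte signed wchar_t,
+   as on i386, and returns +1/-1 rather than an arbitrary difference):
+
+     int memcmp_ref (const void *s1, const void *s2, size_t n)
+     {
+       const unsigned char *a = s1, *b = s2;
+       for (size_t i = 0; i < n; i++)
+         if (a[i] != b[i])
+           return a[i] < b[i] ? -1 : 1;   // unsigned byte comparison
+       return 0;
+     }
+
+     int wmemcmp_ref (const wchar_t *a, const wchar_t *b, size_t n)
+     {
+       for (size_t i = 0; i < n; i++)
+         if (a[i] != b[i])
+           return a[i] < b[i] ? -1 : 1;   // signed element comparison
+       return 0;
+     }
+*/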
+
+ atom_text_section
+ENTRY (MEMCMP)
+ movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(zero)
+# endif
+
+ movl BLK1(%esp), %eax
+ cmp $48, %ecx
+ movl BLK2(%esp), %edx
+ jae L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
+ cmp $1, %ecx
+ jbe L(less1bytes)
+# endif
+
+ PUSH (%ebx)
+ add %ecx, %edx
+ add %ecx, %eax
+ jmp L(less48bytes)
+
+ CFI_POP (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less1bytes):
+ jb L(zero)
+ movb (%eax), %cl
+ cmp (%edx), %cl
+ je L(zero)
+ mov $1, %eax
+ ja L(1bytesend)
+ neg %eax
+L(1bytesend):
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(48bytesormore):
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
+ cfi_remember_state
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
+ movl %eax, %edi
+ movl %edx, %esi
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%edi), %edi
+
+ sub $0xffff, %edx
+ lea 16(%esi), %esi
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %edx, %edi
+ sub %edx, %esi
+ add %edx, %ecx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %edx, %esi
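+
+/* %edx now holds the misalignment of blk2 within its 16-byte chunk;
+   blk1 (%edi) and blk2 (%esi) have both been rounded down to aligned
+   addresses.  Each L(shr_N) path below uses palignr $N on aligned
+   loads from blk2, so both inputs can be read with aligned loads.  */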
+
+# ifndef USE_AS_WMEMCMP
+ cmp $8, %edx
+ jae L(next_unaligned_table)
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $1, %edx
+ je L(shr_1)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $3, %edx
+ je L(shr_3)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $5, %edx
+ je L(shr_5)
+ cmp $6, %edx
+ je L(shr_6)
+ jmp L(shr_7)
+
+ .p2align 2
+L(next_unaligned_table):
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $9, %edx
+ je L(shr_9)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $11, %edx
+ je L(shr_11)
+ cmp $12, %edx
+ je L(shr_12)
+ cmp $13, %edx
+ je L(shr_13)
+ cmp $14, %edx
+ je L(shr_14)
+ jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
+
+ .p2align 4
+L(shr_0):
+ cmp $80, %ecx
+ jae L(shr_0_gobble)
+ lea -48(%ecx), %ecx
+ xor %eax, %eax
+ movaps (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+ movaps 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ add $32, %edi
+ add $32, %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_0_gobble):
+ lea -48(%ecx), %ecx
+ movdqa (%esi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+L(shr_0_gobble_loop):
+ pand %xmm0, %xmm2
+ sub $32, %ecx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%esi), %xmm0
+ movdqa 48(%esi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%edi), %xmm0
+ pcmpeqb 48(%edi), %xmm2
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ jz L(shr_0_gobble_loop)
+
+ pand %xmm0, %xmm2
+ cmp $0, %ecx
+ jge L(shr_0_gobble_loop_next)
+ inc %edx
+ add $32, %ecx
+L(shr_0_gobble_loop_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_1):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_1_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $1,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $1,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 1(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_1_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $1,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $1,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $1,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $1,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_1_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_1_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_1_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 1(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_2):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $2,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_2_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $2,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $2,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $2,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $2,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_2_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_3):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_3_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $3,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $3,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 3(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_3_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $3,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $3,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $3,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $3,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_3_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_3_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_3_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 3(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_4):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $4,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_4_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $4,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $4,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $4,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $4,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_4_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_5):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_5_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $5,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $5,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 5(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_5_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $5,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $5,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $5,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $5,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_5_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_5_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_5_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 5(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_6):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $6,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_6_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $6,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $6,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $6,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $6,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_6_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_7):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_7_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $7,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $7,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 7(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_7_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $7,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $7,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $7,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $7,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_7_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_7_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_7_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 7(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_8):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $8,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_8_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $8,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $8,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $8,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $8,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_8_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_9):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_9_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $9,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $9,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 9(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_9_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $9,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $9,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $9,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $9,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_9_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_9_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_9_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 9(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_10):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $10,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_10_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $10, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $10, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $10,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $10,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_10_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_11):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_11_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $11, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $11, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 11(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_11_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $11, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $11, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $11,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $11,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_11_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_11_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_11_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 11(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_12):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_12_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $12, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $12, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $12,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $12,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_12_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_13):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_13_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $13, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $13, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 13(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_13_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $13, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $13, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $13,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $13,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_13_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_13_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_13_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 13(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_14):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_14_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $14, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $14, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $14,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $14,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_14_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_15):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_15_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $15, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $15, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 15(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_15_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $15, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $15, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $15,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $15,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_15_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_15_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_15_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 15(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(exit):
+ pmovmskb %xmm1, %ebx
+ sub $0xffff, %ebx
+ jz L(first16bytes)
+ lea -16(%esi), %esi
+ lea -16(%edi), %edi
+ mov %ebx, %edx
+
+L(first16bytes):
+ add %eax, %esi
+L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
+ test %dl, %dl
+ jz L(next_24_bytes)
+
+ test $0x01, %dl
+ jnz L(Byte16)
+
+ test $0x02, %dl
+ jnz L(Byte17)
+
+ test $0x04, %dl
+ jnz L(Byte18)
+
+ test $0x08, %dl
+ jnz L(Byte19)
+
+ test $0x10, %dl
+ jnz L(Byte20)
+
+ test $0x20, %dl
+ jnz L(Byte21)
+
+ test $0x40, %dl
+ jnz L(Byte22)
+L(Byte23):
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte16):
+ movzbl -16(%edi), %eax
+ movzbl -16(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte17):
+ movzbl -15(%edi), %eax
+ movzbl -15(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte18):
+ movzbl -14(%edi), %eax
+ movzbl -14(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte19):
+ movzbl -13(%edi), %eax
+ movzbl -13(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte20):
+ movzbl -12(%edi), %eax
+ movzbl -12(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte21):
+ movzbl -11(%edi), %eax
+ movzbl -11(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte22):
+ movzbl -10(%edi), %eax
+ movzbl -10(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(next_24_bytes):
+ lea 8(%edi), %edi
+ lea 8(%esi), %esi
+ test $0x01, %dh
+ jnz L(Byte16)
+
+ test $0x02, %dh
+ jnz L(Byte17)
+
+ test $0x04, %dh
+ jnz L(Byte18)
+
+ test $0x08, %dh
+ jnz L(Byte19)
+
+ test $0x10, %dh
+ jnz L(Byte20)
+
+ test $0x20, %dh
+ jnz L(Byte21)
+
+ test $0x40, %dh
+ jnz L(Byte22)
+
+ .p2align 4
+L(Byte31):
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
+ sub %edx, %eax
+ RETURN_END
+# else
+
+/* Special case for wmemcmp: locate the differing 4-byte element.  */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%edi), %eax
+ cmp -16(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov -12(%edi), %eax
+ cmp -12(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%edi), %eax
+ cmp -8(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov -4(%edi), %eax
+ cmp -4(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(nequal_bigger):
+ RETURN_END
+# endif
+
+ CFI_PUSH (%ebx)
+
+ .p2align 4
+L(more8bytes):
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $9, %ecx
+ je L(9bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $11, %ecx
+ je L(11bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ cmp $13, %ecx
+ je L(13bytes)
+ cmp $14, %ecx
+ je L(14bytes)
+ jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
+
+ .p2align 4
+L(more16bytes):
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $17, %ecx
+ je L(17bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $19, %ecx
+ je L(19bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ cmp $21, %ecx
+ je L(21bytes)
+ cmp $22, %ecx
+ je L(22bytes)
+ jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
+
+ .p2align 4
+L(more24bytes):
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $25, %ecx
+ je L(25bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $27, %ecx
+ je L(27bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ cmp $29, %ecx
+ je L(29bytes)
+ cmp $30, %ecx
+ je L(30bytes)
+ jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
+
+ .p2align 4
+L(more32bytes):
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $33, %ecx
+ je L(33bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $35, %ecx
+ je L(35bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ cmp $37, %ecx
+ je L(37bytes)
+ cmp $38, %ecx
+ je L(38bytes)
+ jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ .p2align 4
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
+
+ .p2align 4
+L(more40bytes):
+ cmp $40, %ecx
+ je L(40bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $41, %ecx
+ je L(41bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $43, %ecx
+ je L(43bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ cmp $45, %ecx
+ je L(45bytes)
+ cmp $46, %ecx
+ je L(46bytes)
+ jmp L(47bytes)
+
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ mov -44(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ mov -40(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ mov -36(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ mov -32(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ mov -28(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ mov -24(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ mov -20(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# else
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ cmp -44(%edx), %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ cmp -40(%edx), %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ cmp -36(%edx), %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ cmp -32(%edx), %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ cmp -28(%edx), %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ cmp -24(%edx), %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ cmp -20(%edx), %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ xor %eax, %eax
+ cmp -4(%edx), %ecx
+ jne L(find_diff)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# endif
+
+# ifndef USE_AS_WMEMCMP
+
+ .p2align 4
+L(45bytes):
+ mov -45(%eax), %ecx
+ mov -45(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(41bytes):
+ mov -41(%eax), %ecx
+ mov -41(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(37bytes):
+ mov -37(%eax), %ecx
+ mov -37(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(33bytes):
+ mov -33(%eax), %ecx
+ mov -33(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(29bytes):
+ mov -29(%eax), %ecx
+ mov -29(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(25bytes):
+ mov -25(%eax), %ecx
+ mov -25(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(21bytes):
+ mov -21(%eax), %ecx
+ mov -21(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(17bytes):
+ mov -17(%eax), %ecx
+ mov -17(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(13bytes):
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(9bytes):
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(5bytes):
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
+ .p2align 4
+L(46bytes):
+ mov -46(%eax), %ecx
+ mov -46(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(42bytes):
+ mov -42(%eax), %ecx
+ mov -42(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(38bytes):
+ mov -38(%eax), %ecx
+ mov -38(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(34bytes):
+ mov -34(%eax), %ecx
+ mov -34(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(30bytes):
+ mov -30(%eax), %ecx
+ mov -30(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(26bytes):
+ mov -26(%eax), %ecx
+ mov -26(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(22bytes):
+ mov -22(%eax), %ecx
+ mov -22(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(18bytes):
+ mov -18(%eax), %ecx
+ mov -18(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(14bytes):
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(10bytes):
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(6bytes):
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
+ .p2align 4
+L(47bytes):
+ movl -47(%eax), %ecx
+ movl -47(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(43bytes):
+ movl -43(%eax), %ecx
+ movl -43(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(39bytes):
+ movl -39(%eax), %ecx
+ movl -39(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(35bytes):
+ movl -35(%eax), %ecx
+ movl -35(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(31bytes):
+ movl -31(%eax), %ecx
+ movl -31(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%eax), %ecx
+ movl -27(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%eax), %ecx
+ movl -23(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%eax), %ecx
+ movl -19(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
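+/* %ecx and %ebx hold the first differing 4-byte words of the two
+   blocks.  Compare the low byte, the low word and then the high half,
+   so the flags reflect the lowest-addressed byte that differs; L(end)
+   converts the unsigned comparison into a +1/-1 return value.  */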
+ .p2align 4
+L(find_diff):
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ shr $16,%ecx
+ shr $16,%ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+
+ .p2align 4
+L(end):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(bigger)
+ neg %eax
+L(bigger):
+ ret
+# else
+
+/* find_diff for wmemcmp: a signed comparison decides the result.  */
+ .p2align 4
+L(find_diff):
+ POP (%ebx)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+# endif
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..1fc5994a17
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,62 @@
+/* Multiple versions of memcmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(memcmp)
+ .type memcmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
+2: ret
+END(memcmp)
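+
+/* The dispatch above selects, in decreasing order of preference,
+   SSE4.2, SSSE3 or the plain ia32 implementation.  Roughly equivalent
+   C for the selection (cpu_has is illustrative, not a real glibc
+   interface):
+
+     void *memcmp_ifunc (void)
+     {
+       if (cpu_has (SSE4_2))
+         return __memcmp_sse4_2;
+       if (cpu_has (SSSE3))
+         return __memcmp_ssse3;
+       return __memcmp_ia32;
+     }
+*/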
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memcmp_ia32, @function; \
+ .p2align 4; \
+ .globl __memcmp_ia32; \
+ .hidden __memcmp_ia32; \
+ __memcmp_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library,
+   since they may be called without EBX set up for the PLT, which the
+   IFUNC resolution relies on.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..2fe2072cb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY __memcpy_sse2_unaligned
+# define MEMCPY_CHK __memcpy_chk_sse2_unaligned
+# endif
+
+# ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+# else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+# endif
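+
+/* The offsets swap for bcopy because its argument order is
+   bcopy (src, dest, len), while memcpy takes (dest, src, len).  */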
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+
+ .section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+ cmp %edx, %eax
+
+# ifdef USE_AS_MEMMOVE
+ jg L(check_forward)
+
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+ separately. */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_backward)
+
+ cmpl $32, %ecx
+ jg L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_32_or_more_backward):
+ cmpl $64, %ecx
+ jg L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_64_or_more_backward):
+ cmpl $128, %ecx
+ jg L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_128_or_more_backward):
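+/* If src + len <= dst, the blocks do not overlap, so the plain
+   forward copy can be used even though dst lies above src.  */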
+ add %ecx, %eax
+ cmp %edx, %eax
+ movl SRC(%esp), %eax
+ jle L(forward)
+ PUSH (%esi)
+ PUSH (%edi)
+ PUSH (%ebx)
+
+/* Align the address of the destination.  */
+ movdqu (%eax), %xmm4
+ movdqu 16(%eax), %xmm5
+ movdqu 32(%eax), %xmm6
+ movdqu 48(%eax), %xmm7
+ leal (%edx, %ecx), %esi
+ movdqu -16(%eax, %ecx), %xmm0
+ subl $16, %esp
+ movdqu %xmm0, (%esp)
+ mov %ecx, %edi
+ movl %esi, %ecx
+ andl $-16, %ecx
+ leal (%ecx), %ebx
+ subl %edx, %ebx
+ leal (%eax, %ebx), %eax
+ shrl $6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %edi
+# else
+# ifdef SHARED
+ PUSH (%ebx)
+ SETUP_PIC_REG (bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+ POP (%ebx)
+# else
+ cmp __x86_shared_cache_size_half, %edi
+# endif
+# endif
+ jae L(mm_large_page_loop_backward)
+
+ .p2align 4
+L(mm_main_loop_backward):
+
+ prefetcht0 -128(%eax)
+
+ movdqu -64(%eax), %xmm0
+ movdqu -48(%eax), %xmm1
+ movdqu -32(%eax), %xmm2
+ movdqu -16(%eax), %xmm3
+ movaps %xmm0, -64(%ecx)
+ subl $64, %eax
+ movaps %xmm1, -48(%ecx)
+ movaps %xmm2, -32(%ecx)
+ movaps %xmm3, -16(%ecx)
+ subl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_main_loop_backward)
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, -16(%esi)
+ movdqu %xmm4, (%edx)
+ movdqu %xmm5, 16(%edx)
+ movdqu %xmm6, 32(%edx)
+ movdqu %xmm7, 48(%edx)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+/* Copy [0..16] and return. */
+L(mm_len_0_16_bytes_backward):
+ testb $24, %cl
+ jnz L(mm_len_9_16_bytes_backward)
+ testb $4, %cl
+ .p2align 4,,5
+ jnz L(mm_len_5_8_bytes_backward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_3_4_bytes_backward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(return)
+
+L(mm_len_3_4_bytes_backward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(return)
+
+L(mm_len_9_16_bytes_backward):
+ PUSH (%esi)
+ movl -4(%eax,%ecx), %ebx
+ movl -8(%eax,%ecx), %esi
+ movl %ebx, -4(%edx,%ecx)
+ movl %esi, -8(%edx,%ecx)
+ subl $8, %ecx
+ POP (%esi)
+ jmp L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+ jmp L(return)
+
+/* Backward copy loop for large lengths, using non-temporal stores.  */
+ .p2align 4
+L(mm_large_page_loop_backward):
+ movdqu -64(%eax), %xmm0
+ movdqu -48(%eax), %xmm1
+ movdqu -32(%eax), %xmm2
+ movdqu -16(%eax), %xmm3
+ movntdq %xmm0, -64(%ecx)
+ subl $64, %eax
+ movntdq %xmm1, -48(%ecx)
+ movntdq %xmm2, -32(%ecx)
+ movntdq %xmm3, -16(%ecx)
+ subl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_large_page_loop_backward)
+ sfence
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, -16(%esi)
+ movdqu %xmm4, (%edx)
+ movdqu %xmm5, 16(%edx)
+ movdqu %xmm6, 32(%edx)
+ movdqu %xmm7, 48(%edx)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+L(check_forward):
+ add %edx, %ecx
+ cmp %eax, %ecx
+ movl LEN(%esp), %ecx
+ jle L(forward)
+
+/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
+   separately.  */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_forward)
+
+ cmpl $32, %ecx
+ ja L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_32_or_more_forward):
+ cmpl $64, %ecx
+ ja L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_64_or_more_forward):
+ cmpl $128, %ecx
+ ja L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_128_or_more_forward):
+ PUSH (%esi)
+ PUSH (%edi)
+ PUSH (%ebx)
+
+/* Align the address of the destination.  */
+ movdqu -16(%eax, %ecx), %xmm4
+ movdqu -32(%eax, %ecx), %xmm5
+ movdqu -48(%eax, %ecx), %xmm6
+ movdqu -64(%eax, %ecx), %xmm7
+ leal (%edx, %ecx), %esi
+ movdqu (%eax), %xmm0
+ subl $16, %esp
+ movdqu %xmm0, (%esp)
+ mov %ecx, %edi
+ leal 16(%edx), %ecx
+ andl $-16, %ecx
+ movl %ecx, %ebx
+ subl %edx, %ebx
+ addl %ebx, %eax
+ movl %esi, %ebx
+ subl %ecx, %ebx
+ shrl $6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %edi
+# else
+# ifdef SHARED
+ PUSH (%ebx)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+ POP (%ebx)
+# else
+ cmp __x86_shared_cache_size_half, %edi
+# endif
+# endif
+ jae L(mm_large_page_loop_forward)
+
+ .p2align 4
+L(mm_main_loop_forward):
+
+ prefetcht0 128(%eax)
+
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqa %xmm0, (%ecx)
+ addl $64, %eax
+ movaps %xmm1, 16(%ecx)
+ movaps %xmm2, 32(%ecx)
+ movaps %xmm3, 48(%ecx)
+ addl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_main_loop_forward)
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, (%edx)
+ movdqu %xmm4, -16(%esi)
+ movdqu %xmm5, -32(%esi)
+ movdqu %xmm6, -48(%esi)
+ movdqu %xmm7, -64(%esi)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+L(mm_len_0_16_bytes_forward):
+ testb $24, %cl
+ jne L(mm_len_9_16_bytes_forward)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(mm_len_5_8_bytes_forward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_2_4_bytes_forward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(return)
+
+L(mm_len_2_4_bytes_forward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(return)
+
+L(mm_len_5_8_bytes_forward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+ jmp L(return)
+
+L(mm_len_9_16_bytes_forward):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(return)
+
+L(mm_return_pop_all):
+ movl %edx, %eax
+ POP (%edi)
+ POP (%esi)
+ RETURN
+
+/* Forward copy loop for large lengths, using non-temporal stores.  */
+ .p2align 4
+L(mm_large_page_loop_forward):
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movntdq %xmm0, (%ecx)
+ addl $64, %eax
+ movntdq %xmm1, 16(%ecx)
+ movntdq %xmm2, 32(%ecx)
+ movntdq %xmm3, 48(%ecx)
+ addl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_large_page_loop_forward)
+ sfence
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, (%edx)
+ movdqu %xmm4, -16(%esi)
+ movdqu %xmm5, -32(%esi)
+ movdqu %xmm6, -48(%esi)
+ movdqu %xmm7, -64(%esi)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+# endif
+
+L(forward):
+ cmp $16, %ecx
+ jbe L(len_0_16_bytes)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+# endif
+ jae L(large_page)
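+
+/* Copies of at least half the shared cache size go to L(large_page),
+   which uses non-temporal (movntdq) stores so that a huge copy does
+   not evict the whole cache.  */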
+
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ cmpl $32, %ecx
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 16(%eax), %xmm0
+ movdqu -32(%eax, %ecx), %xmm1
+ cmpl $64, %ecx
+ movdqu %xmm0, 16(%edx)
+ movdqu %xmm1, -32(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 32(%eax), %xmm0
+ movdqu 48(%eax), %xmm1
+ movdqu -48(%eax, %ecx), %xmm2
+ movdqu -64(%eax, %ecx), %xmm3
+ cmpl $128, %ecx
+ movdqu %xmm0, 32(%edx)
+ movdqu %xmm1, 48(%edx)
+ movdqu %xmm2, -48(%edx, %ecx)
+ movdqu %xmm3, -64(%edx, %ecx)
+ jbe L(return)
+
+/* Now the main loop: we align the address of the destination. */
+ leal 64(%edx), %ebx
+ andl $-64, %ebx
+
+ addl %edx, %ecx
+ andl $-64, %ecx
+
+ subl %edx, %eax
+
+/* We stop two iterations before the end so that the prefetches
+   issued inside the loop do not reach past the source data.  */
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_just_one_iteration)
+
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_last_two_iterations)
+
+ .p2align 4
+L(main_loop_cache):
+
+ prefetcht0 128(%ebx, %eax)
+
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ lea 64(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ movaps %xmm4, 64(%ebx)
+ movaps %xmm5, 80(%ebx)
+ movaps %xmm6, 96(%ebx)
+ movaps %xmm7, 112(%ebx)
+ jmp L(return)
+
+L(main_loop_just_one_iteration):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ jmp L(return)
+
+L(large_page):
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+
+ movdqu 64(%eax), %xmm0
+ movdqu 80(%eax), %xmm1
+ movdqu 96(%eax), %xmm2
+ movdqu 112(%eax), %xmm3
+ movdqu -128(%eax, %ecx), %xmm4
+ movdqu -112(%eax, %ecx), %xmm5
+ movdqu -96(%eax, %ecx), %xmm6
+ movdqu -80(%eax, %ecx), %xmm7
+ movdqu %xmm0, 64(%edx)
+ movdqu %xmm1, 80(%edx)
+ movdqu %xmm2, 96(%edx)
+ movdqu %xmm3, 112(%edx)
+ movdqu %xmm4, -128(%edx, %ecx)
+ movdqu %xmm5, -112(%edx, %ecx)
+ movdqu %xmm6, -96(%edx, %ecx)
+ movdqu %xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non-temporal stores.  We align
+   the address of the destination.  */
+ leal 128(%edx), %ebx
+ andl $-128, %ebx
+
+ addl %edx, %ecx
+ andl $-128, %ecx
+
+ subl %edx, %eax
+
+ .p2align 4
+L(main_loop_large_page):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movntdq %xmm0, (%ebx)
+ movntdq %xmm1, 16(%ebx)
+ movntdq %xmm2, 32(%ebx)
+ movntdq %xmm3, 48(%ebx)
+ movntdq %xmm4, 64(%ebx)
+ movntdq %xmm5, 80(%ebx)
+ movntdq %xmm6, 96(%ebx)
+ movntdq %xmm7, 112(%ebx)
+ lea 128(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_large_page)
+ sfence
+ jmp L(return)
+
+L(len_0_16_bytes):
+ testb $24, %cl
+ jne L(len_9_16_bytes)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(len_5_8_bytes)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%eax), %ebx
+ testb $2, %cl
+ movb %bl, (%edx)
+ je L(return)
+ movzwl -2(%eax,%ecx), %ebx
+ movw %bx, -2(%edx,%ecx)
+ jmp L(return)
+
+L(len_9_16_bytes):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(return)
+
+L(len_5_8_bytes):
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ movl -4(%eax,%ecx), %ebx
+ movl %ebx, -4(%edx,%ecx)
+
+L(return):
+ movl %edx, %eax
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+ RETURN
+
+END (MEMCPY)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 0000000000..687e083147
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1809 @@
+/* memcpy with SSSE3 and REP string.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_rep
+# define MEMCPY_CHK __memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+	/* EBX now holds the target address.  Go.  */ \
+ jmp *%ebx
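+
+/* In effect: EBX = &TABLE; EBX += TABLE[INDEX], where each entry holds
+   (label - TABLE).  Using table-relative entries keeps the table free
+   of run-time relocations in the shared build.  */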
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
+ addl $(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+	/* EBX now holds the target address.  Go.  */ \
+ jmp *%ebx
+#else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
+
+ .section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
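+/* Copy backward only if DEST lies inside (SRC, SRC + LEN); any other
+   layout is safe for the forward copy.  */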
+ cmp %eax, %edx
+ jb L(copy_forward)
+ je L(fwd_write_0bytes)
+ cmp $48, %ecx
+ jb L(bk_write_less48bytes)
+ add %ecx, %eax
+ cmp %eax, %edx
+ movl SRC(%esp), %eax
+ jb L(copy_backward)
+
+L(copy_forward):
+#endif
+ cmp $48, %ecx
+ jae L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+ cmp %dl, %al
+ jb L(bk_write)
+#endif
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+ ALIGN (4)
+/* ECX is at least 48 here; EDX (DEST) is not necessarily aligned.  */
+L(48bytesormore):
+ movdqu (%eax), %xmm0
+ PUSH (%edi)
+ movl %edx, %edi
+ and $-16, %edx
+ PUSH (%esi)
+ cfi_remember_state
+ add $16, %edx
+ movl %edi, %esi
+ sub %edx, %edi
+ add %edi, %ecx
+ sub %edi, %eax
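+/* EDX has been advanced to the next 16-byte boundary; ESI keeps the
+   original DEST (the unaligned head in XMM0 is stored there later),
+   and SRC and LEN have been adjusted by the head bytes.  */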
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+#endif
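+/* Copies of at least half the shared cache size take the non-temporal
+   path so they do not displace the existing cache contents.  */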
+
+ mov %eax, %edi
+ jae L(large_page)
+ and $0xf, %edi
+ jz L(shl_0)
+
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+ ALIGN (4)
+L(shl_0):
+ movdqu %xmm0, (%esi)
+ xor %edi, %edi
+ cmp $127, %ecx
+ ja L(shl_0_gobble)
+ lea -32(%ecx), %ecx
+L(shl_0_loop):
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+L(shl_0_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ add %edi, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+ mov __x86_data_cache_size_half, %edi
+# endif
+#endif
+ mov %edi, %esi
+ shr $3, %esi
+ sub %esi, %edi
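+/* EDI = 7/8 of half the data cache size; copies at least that large
+   use the prefetching memory loop instead of the plain cache loop.  */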
+ cmp %edi, %ecx
+ jae L(shl_0_gobble_mem_start)
+ sub $128, %ecx
+ ALIGN (4)
+L(shl_0_gobble_cache_loop):
+ movdqa (%eax), %xmm0
+ movaps 0x10(%eax), %xmm1
+ movaps 0x20(%eax), %xmm2
+ movaps 0x30(%eax), %xmm3
+ movaps 0x40(%eax), %xmm4
+ movaps 0x50(%eax), %xmm5
+ movaps 0x60(%eax), %xmm6
+ movaps 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm1, 0x10(%edx)
+ movaps %xmm2, 0x20(%edx)
+ movaps %xmm3, 0x30(%edx)
+ movaps %xmm4, 0x40(%edx)
+ movaps %xmm5, 0x50(%edx)
+ movaps %xmm6, 0x60(%edx)
+ movaps %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_cache_loop)
+ add $0x80, %ecx
+ cmp $0x40, %ecx
+ jb L(shl_0_cache_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_cache_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_cache_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_cache_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_cache_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_cache_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_0_gobble_mem_start):
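+/* If SRC and DEST share their low address byte they are mutually
+   aligned modulo 256, a case the REP MOVS path handles well.  */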
+ cmp %al, %dl
+ je L(copy_page_by_rep)
+ sub $128, %ecx
+L(shl_0_gobble_mem_loop):
+ prefetchnta 0x1c0(%eax)
+ prefetchnta 0x280(%eax)
+ prefetchnta 0x1c0(%edx)
+ prefetchnta 0x280(%edx)
+
+ movdqa (%eax), %xmm0
+ movaps 0x10(%eax), %xmm1
+ movaps 0x20(%eax), %xmm2
+ movaps 0x30(%eax), %xmm3
+ movaps 0x40(%eax), %xmm4
+ movaps 0x50(%eax), %xmm5
+ movaps 0x60(%eax), %xmm6
+ movaps 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $0x80, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm1, 0x10(%edx)
+ movaps %xmm2, 0x20(%edx)
+ movaps %xmm3, 0x30(%edx)
+ movaps %xmm4, 0x40(%edx)
+ movaps %xmm5, 0x50(%edx)
+ movaps %xmm6, 0x60(%edx)
+ movaps %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_mem_loop)
+ add $0x80, %ecx
+ cmp $0x40, %ecx
+ jb L(shl_0_mem_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_mem_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_mem_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_mem_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_mem_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_mem_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_1):
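+/* SRC is 1 byte past a 16-byte boundary while DEST is now aligned:
+   round SRC down and stitch consecutive aligned loads together with
+   PALIGNR $1.  The shl_2 ... shl_15 cases below differ only in the
+   shift amount.  */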
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $1, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_1_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_1_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_1_loop)
+
+L(shl_1_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 1(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_2):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $2, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_2_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_2_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_2_loop)
+
+L(shl_2_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 2(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_3):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $3, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_3_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_3_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_3_loop)
+
+L(shl_3_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 3(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_4):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $4, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_4_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_4_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_4_loop)
+
+L(shl_4_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 4(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_5):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $5, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_5_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_5_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_5_loop)
+
+L(shl_5_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 5(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_6):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $6, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_6_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_6_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_6_loop)
+
+L(shl_6_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 6(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_7):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $7, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_7_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_7_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_7_loop)
+
+L(shl_7_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 7(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_8):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $8, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_8_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_8_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_8_loop)
+
+L(shl_8_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 8(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_9):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $9, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_9_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_9_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_9_loop)
+
+L(shl_9_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 9(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_10):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $10, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_10_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_10_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_10_loop)
+
+L(shl_10_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 10(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_11):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $11, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_11_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_11_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_11_loop)
+
+L(shl_11_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 11(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_12):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $12, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_12_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_12_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_12_loop)
+
+L(shl_12_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 12(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_13):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $13, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_13_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_13_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_13_loop)
+
+L(shl_13_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 13(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_14):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $14, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_14_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_14_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_14_loop)
+
+L(shl_14_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 14(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_15):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $15, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_15_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_15_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_15_loop)
+
+L(shl_15_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 15(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(fwd_write_44bytes):
+ movl -44(%eax), %ecx
+ movl %ecx, -44(%edx)
+L(fwd_write_40bytes):
+ movl -40(%eax), %ecx
+ movl %ecx, -40(%edx)
+L(fwd_write_36bytes):
+ movl -36(%eax), %ecx
+ movl %ecx, -36(%edx)
+L(fwd_write_32bytes):
+ movl -32(%eax), %ecx
+ movl %ecx, -32(%edx)
+L(fwd_write_28bytes):
+ movl -28(%eax), %ecx
+ movl %ecx, -28(%edx)
+L(fwd_write_24bytes):
+ movl -24(%eax), %ecx
+ movl %ecx, -24(%edx)
+L(fwd_write_20bytes):
+ movl -20(%eax), %ecx
+ movl %ecx, -20(%edx)
+L(fwd_write_16bytes):
+ movl -16(%eax), %ecx
+ movl %ecx, -16(%edx)
+L(fwd_write_12bytes):
+ movl -12(%eax), %ecx
+ movl %ecx, -12(%edx)
+L(fwd_write_8bytes):
+ movl -8(%eax), %ecx
+ movl %ecx, -8(%edx)
+L(fwd_write_4bytes):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_5bytes):
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_45bytes):
+ movl -45(%eax), %ecx
+ movl %ecx, -45(%edx)
+L(fwd_write_41bytes):
+ movl -41(%eax), %ecx
+ movl %ecx, -41(%edx)
+L(fwd_write_37bytes):
+ movl -37(%eax), %ecx
+ movl %ecx, -37(%edx)
+L(fwd_write_33bytes):
+ movl -33(%eax), %ecx
+ movl %ecx, -33(%edx)
+L(fwd_write_29bytes):
+ movl -29(%eax), %ecx
+ movl %ecx, -29(%edx)
+L(fwd_write_25bytes):
+ movl -25(%eax), %ecx
+ movl %ecx, -25(%edx)
+L(fwd_write_21bytes):
+ movl -21(%eax), %ecx
+ movl %ecx, -21(%edx)
+L(fwd_write_17bytes):
+ movl -17(%eax), %ecx
+ movl %ecx, -17(%edx)
+L(fwd_write_13bytes):
+ movl -13(%eax), %ecx
+ movl %ecx, -13(%edx)
+L(fwd_write_9bytes):
+ movl -9(%eax), %ecx
+ movl %ecx, -9(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+L(fwd_write_1bytes):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_46bytes):
+ movl -46(%eax), %ecx
+ movl %ecx, -46(%edx)
+L(fwd_write_42bytes):
+ movl -42(%eax), %ecx
+ movl %ecx, -42(%edx)
+L(fwd_write_38bytes):
+ movl -38(%eax), %ecx
+ movl %ecx, -38(%edx)
+L(fwd_write_34bytes):
+ movl -34(%eax), %ecx
+ movl %ecx, -34(%edx)
+L(fwd_write_30bytes):
+ movl -30(%eax), %ecx
+ movl %ecx, -30(%edx)
+L(fwd_write_26bytes):
+ movl -26(%eax), %ecx
+ movl %ecx, -26(%edx)
+L(fwd_write_22bytes):
+ movl -22(%eax), %ecx
+ movl %ecx, -22(%edx)
+L(fwd_write_18bytes):
+ movl -18(%eax), %ecx
+ movl %ecx, -18(%edx)
+L(fwd_write_14bytes):
+ movl -14(%eax), %ecx
+ movl %ecx, -14(%edx)
+L(fwd_write_10bytes):
+ movl -10(%eax), %ecx
+ movl %ecx, -10(%edx)
+L(fwd_write_6bytes):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+L(fwd_write_2bytes):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_47bytes):
+ movl -47(%eax), %ecx
+ movl %ecx, -47(%edx)
+L(fwd_write_43bytes):
+ movl -43(%eax), %ecx
+ movl %ecx, -43(%edx)
+L(fwd_write_39bytes):
+ movl -39(%eax), %ecx
+ movl %ecx, -39(%edx)
+L(fwd_write_35bytes):
+ movl -35(%eax), %ecx
+ movl %ecx, -35(%edx)
+L(fwd_write_31bytes):
+ movl -31(%eax), %ecx
+ movl %ecx, -31(%edx)
+L(fwd_write_27bytes):
+ movl -27(%eax), %ecx
+ movl %ecx, -27(%edx)
+L(fwd_write_23bytes):
+ movl -23(%eax), %ecx
+ movl %ecx, -23(%edx)
+L(fwd_write_19bytes):
+ movl -19(%eax), %ecx
+ movl %ecx, -19(%edx)
+L(fwd_write_15bytes):
+ movl -15(%eax), %ecx
+ movl %ecx, -15(%edx)
+L(fwd_write_11bytes):
+ movl -11(%eax), %ecx
+ movl %ecx, -11(%edx)
+L(fwd_write_7bytes):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+L(fwd_write_3bytes):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN_END
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(large_page):
+ movdqu (%eax), %xmm1
+ movdqu %xmm0, (%esi)
+ movntdq %xmm1, (%edx)
+ add $0x10, %eax
+ add $0x10, %edx
+ sub $0x10, %ecx
+ cmp %al, %dl
+ je L(copy_page_by_rep)
+L(large_page_loop_init):
+ POP (%esi)
+ sub $0x80, %ecx
+ POP (%edi)
+L(large_page_loop):
+ prefetchnta 0x1c0(%eax)
+ prefetchnta 0x280(%eax)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ movdqu 0x40(%eax), %xmm4
+ movdqu 0x50(%eax), %xmm5
+ movdqu 0x60(%eax), %xmm6
+ movdqu 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
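+	/* LFENCE keeps the loads above ordered ahead of the
+	   non-temporal stores below (presumably a tuning choice for
+	   the CPUs this variant targets).  */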
+ lfence
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ movntdq %xmm4, 0x40(%edx)
+ movntdq %xmm5, 0x50(%edx)
+ movntdq %xmm6, 0x60(%edx)
+ movntdq %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+ jae L(large_page_loop)
+ add $0x80, %ecx
+ cmp $0x40, %ecx
+ jb L(large_page_less_64bytes)
+
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ lea 0x40(%eax), %eax
+
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ lea 0x40(%edx), %edx
+ sub $0x40, %ecx
+L(large_page_less_64bytes):
+ cmp $32, %ecx
+ jb L(large_page_less_32bytes)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ lea 0x20(%eax), %eax
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ lea 0x20(%edx), %edx
+ sub $0x20, %ecx
+L(large_page_less_32bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ sfence
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(copy_page_by_rep):
+ mov %eax, %esi
+ mov %edx, %edi
+ mov %ecx, %edx
+ shr $2, %ecx
+ and $3, %edx
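+	/* Copy ECX/4 dwords with REP MOVSL, then the 0..3 leftover
+	   bytes (EDX) by hand; REP MOVS leaves the flags untouched, so
+	   the JZ below still tests the AND above.  */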
+ rep movsl
+ jz L(copy_page_by_rep_exit)
+ cmp $2, %edx
+ jb L(copy_page_by_rep_left_1)
+ movzwl (%esi), %eax
+ movw %ax, (%edi)
+ add $2, %esi
+ add $2, %edi
+ sub $2, %edx
+ jz L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+ movzbl (%esi), %eax
+ movb %al, (%edi)
+L(copy_page_by_rep_exit):
+ POP (%esi)
+ POP (%edi)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_44bytes):
+ movl 40(%eax), %ecx
+ movl %ecx, 40(%edx)
+L(bk_write_40bytes):
+ movl 36(%eax), %ecx
+ movl %ecx, 36(%edx)
+L(bk_write_36bytes):
+ movl 32(%eax), %ecx
+ movl %ecx, 32(%edx)
+L(bk_write_32bytes):
+ movl 28(%eax), %ecx
+ movl %ecx, 28(%edx)
+L(bk_write_28bytes):
+ movl 24(%eax), %ecx
+ movl %ecx, 24(%edx)
+L(bk_write_24bytes):
+ movl 20(%eax), %ecx
+ movl %ecx, 20(%edx)
+L(bk_write_20bytes):
+ movl 16(%eax), %ecx
+ movl %ecx, 16(%edx)
+L(bk_write_16bytes):
+ movl 12(%eax), %ecx
+ movl %ecx, 12(%edx)
+L(bk_write_12bytes):
+ movl 8(%eax), %ecx
+ movl %ecx, 8(%edx)
+L(bk_write_8bytes):
+ movl 4(%eax), %ecx
+ movl %ecx, 4(%edx)
+L(bk_write_4bytes):
+ movl (%eax), %ecx
+ movl %ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_45bytes):
+ movl 41(%eax), %ecx
+ movl %ecx, 41(%edx)
+L(bk_write_41bytes):
+ movl 37(%eax), %ecx
+ movl %ecx, 37(%edx)
+L(bk_write_37bytes):
+ movl 33(%eax), %ecx
+ movl %ecx, 33(%edx)
+L(bk_write_33bytes):
+ movl 29(%eax), %ecx
+ movl %ecx, 29(%edx)
+L(bk_write_29bytes):
+ movl 25(%eax), %ecx
+ movl %ecx, 25(%edx)
+L(bk_write_25bytes):
+ movl 21(%eax), %ecx
+ movl %ecx, 21(%edx)
+L(bk_write_21bytes):
+ movl 17(%eax), %ecx
+ movl %ecx, 17(%edx)
+L(bk_write_17bytes):
+ movl 13(%eax), %ecx
+ movl %ecx, 13(%edx)
+L(bk_write_13bytes):
+ movl 9(%eax), %ecx
+ movl %ecx, 9(%edx)
+L(bk_write_9bytes):
+ movl 5(%eax), %ecx
+ movl %ecx, 5(%edx)
+L(bk_write_5bytes):
+ movl 1(%eax), %ecx
+ movl %ecx, 1(%edx)
+L(bk_write_1bytes):
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_46bytes):
+ movl 42(%eax), %ecx
+ movl %ecx, 42(%edx)
+L(bk_write_42bytes):
+ movl 38(%eax), %ecx
+ movl %ecx, 38(%edx)
+L(bk_write_38bytes):
+ movl 34(%eax), %ecx
+ movl %ecx, 34(%edx)
+L(bk_write_34bytes):
+ movl 30(%eax), %ecx
+ movl %ecx, 30(%edx)
+L(bk_write_30bytes):
+ movl 26(%eax), %ecx
+ movl %ecx, 26(%edx)
+L(bk_write_26bytes):
+ movl 22(%eax), %ecx
+ movl %ecx, 22(%edx)
+L(bk_write_22bytes):
+ movl 18(%eax), %ecx
+ movl %ecx, 18(%edx)
+L(bk_write_18bytes):
+ movl 14(%eax), %ecx
+ movl %ecx, 14(%edx)
+L(bk_write_14bytes):
+ movl 10(%eax), %ecx
+ movl %ecx, 10(%edx)
+L(bk_write_10bytes):
+ movl 6(%eax), %ecx
+ movl %ecx, 6(%edx)
+L(bk_write_6bytes):
+ movl 2(%eax), %ecx
+ movl %ecx, 2(%edx)
+L(bk_write_2bytes):
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_47bytes):
+ movl 43(%eax), %ecx
+ movl %ecx, 43(%edx)
+L(bk_write_43bytes):
+ movl 39(%eax), %ecx
+ movl %ecx, 39(%edx)
+L(bk_write_39bytes):
+ movl 35(%eax), %ecx
+ movl %ecx, 35(%edx)
+L(bk_write_35bytes):
+ movl 31(%eax), %ecx
+ movl %ecx, 31(%edx)
+L(bk_write_31bytes):
+ movl 27(%eax), %ecx
+ movl %ecx, 27(%edx)
+L(bk_write_27bytes):
+ movl 23(%eax), %ecx
+ movl %ecx, 23(%edx)
+L(bk_write_23bytes):
+ movl 19(%eax), %ecx
+ movl %ecx, 19(%edx)
+L(bk_write_19bytes):
+ movl 15(%eax), %ecx
+ movl %ecx, 15(%edx)
+L(bk_write_15bytes):
+ movl 11(%eax), %ecx
+ movl %ecx, 11(%edx)
+L(bk_write_11bytes):
+ movl 7(%eax), %ecx
+ movl %ecx, 7(%edx)
+L(bk_write_7bytes):
+ movl 3(%eax), %ecx
+ movl %ecx, 3(%edx)
+L(bk_write_3bytes):
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN_END
+
+
+ .pushsection .rodata.ssse3,"a",@progbits
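+
+/* Jump tables: one entry per residual length 0..47 (or per source
+   alignment for L(shl_table)); under SHARED each entry is the label's
+   offset relative to the table itself.  */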
+ ALIGN (2)
+L(table_48bytes_fwd):
+ .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+ ALIGN (2)
+L(shl_table):
+ .int JMPTBL (L(shl_0), L(shl_table))
+ .int JMPTBL (L(shl_1), L(shl_table))
+ .int JMPTBL (L(shl_2), L(shl_table))
+ .int JMPTBL (L(shl_3), L(shl_table))
+ .int JMPTBL (L(shl_4), L(shl_table))
+ .int JMPTBL (L(shl_5), L(shl_table))
+ .int JMPTBL (L(shl_6), L(shl_table))
+ .int JMPTBL (L(shl_7), L(shl_table))
+ .int JMPTBL (L(shl_8), L(shl_table))
+ .int JMPTBL (L(shl_9), L(shl_table))
+ .int JMPTBL (L(shl_10), L(shl_table))
+ .int JMPTBL (L(shl_11), L(shl_table))
+ .int JMPTBL (L(shl_12), L(shl_table))
+ .int JMPTBL (L(shl_13), L(shl_table))
+ .int JMPTBL (L(shl_14), L(shl_table))
+ .int JMPTBL (L(shl_15), L(shl_table))
+
+ ALIGN (2)
+L(table_48_bytes_bwd):
+ .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+ .popsection
+
+#ifdef USE_AS_MEMMOVE
+ ALIGN (4)
+L(copy_backward):
+ PUSH (%esi)
+ movl %eax, %esi
+ add %ecx, %edx
+ add %ecx, %esi
+ testl $0x3, %edx
+ jnz L(bk_align)
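+
+/* ESI and EDX now point one past the last SRC/DEST byte; the copy
+   proceeds downward from here.  */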
+
+L(bk_aligned_4):
+ cmp $64, %ecx
+ jae L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+ cmp $32, %ecx
+ jb L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+ /* Copy 32 bytes at a time. */
+ sub $32, %ecx
+ movl -4(%esi), %eax
+ movl %eax, -4(%edx)
+ movl -8(%esi), %eax
+ movl %eax, -8(%edx)
+ movl -12(%esi), %eax
+ movl %eax, -12(%edx)
+ movl -16(%esi), %eax
+ movl %eax, -16(%edx)
+ movl -20(%esi), %eax
+ movl %eax, -20(%edx)
+ movl -24(%esi), %eax
+ movl %eax, -24(%edx)
+ movl -28(%esi), %eax
+ movl %eax, -28(%edx)
+ movl -32(%esi), %eax
+ movl %eax, -32(%edx)
+ sub $32, %edx
+ sub $32, %esi
+
+L(bk_write_less32bytes):
+ movl %esi, %eax
+ sub %ecx, %edx
+ sub %ecx, %eax
+ POP (%esi)
+L(bk_write_less48bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+ CFI_PUSH (%esi)
+ ALIGN (4)
+L(bk_align):
+ cmp $8, %ecx
+ jbe L(bk_write_less32bytes)
+ testl $1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+ jz L(bk_got2)
+ sub $1, %esi
+ sub $1, %ecx
+ sub $1, %edx
+ movzbl (%esi), %eax
+ movb %al, (%edx)
+
+ testl $2, %edx
+ jz L(bk_aligned_4)
+
+L(bk_got2):
+ sub $2, %esi
+ sub $2, %ecx
+ sub $2, %edx
+ movzwl (%esi), %eax
+ movw %ax, (%edx)
+ jmp L(bk_aligned_4)
+
+ ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check the 16-byte alignment of the destination end (EDX).  */
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+/* EDX is 4-byte aligned, but not 16-byte aligned.  */
+L(bk_ssse3_align):
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+ cmp $64, %ecx
+ jb L(bk_write_more32bytes)
+
+L(bk_ssse3_cpy):
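+	/* Copy 64 bytes per iteration, walking backward; EDX is
+	   16-byte aligned here, so the stores can use MOVDQA.  */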
+ sub $64, %esi
+ sub $64, %ecx
+ sub $64, %edx
+ movdqu 0x30(%esi), %xmm3
+ movdqa %xmm3, 0x30(%edx)
+ movdqu 0x20(%esi), %xmm2
+ movdqa %xmm2, 0x20(%edx)
+ movdqu 0x10(%esi), %xmm1
+ movdqa %xmm1, 0x10(%edx)
+ movdqu (%esi), %xmm0
+ movdqa %xmm0, (%edx)
+ cmp $64, %ecx
+ jae L(bk_ssse3_cpy)
+ jmp L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..53e8a6ca1d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3162 @@
+/* memcpy with SSSE3
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# endif
+
+# ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+# else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx, INDEX, SCALE), %ebx; \
+	/* EBX now holds the target address.  Go.  */ \
+ jmp *%ebx
+# else
+
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(, INDEX, SCALE)
+# endif
+
+ .section .text.ssse3,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
+# ifdef USE_AS_MEMMOVE
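+/* Copy backward only when DEST lies inside (SRC, SRC + LEN); short
+   overlapping copies go through the backward jump table directly.  */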
+ cmp %eax, %edx
+ jb L(copy_forward)
+ je L(fwd_write_0bytes)
+ cmp $32, %ecx
+ jae L(memmove_bwd)
+ jmp L(bk_write_less32bytes_2)
+
+ .p2align 4
+L(memmove_bwd):
+ add %ecx, %eax
+ cmp %eax, %edx
+ movl SRC(%esp), %eax
+ jb L(copy_backward)
+
+L(copy_forward):
+# endif
+ cmp $48, %ecx
+ jae L(48bytesormore)
+
+L(fwd_write_less32bytes):
+# ifndef USE_AS_MEMMOVE
+ cmp %dl, %al
+ jb L(bk_write)
+# endif
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+# ifndef USE_AS_MEMMOVE
+ .p2align 4
+L(bk_write):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+# endif
+
+ .p2align 4
+L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+ movlpd (%eax), %xmm0
+ movlpd 8(%eax), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+# else
+ movdqu (%eax), %xmm0
+# endif
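+/* For memcpy the first 16 bytes are stored immediately as two 8-byte
+   moves; for memmove they stay in XMM0 and are stored only after the
+   overlap-sensitive reads in the shl_* cases below.  */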
+ PUSH (%edi)
+ movl %edx, %edi
+ and $-16, %edx
+ add $16, %edx
+ sub %edx, %edi
+ add %edi, %ecx
+ sub %edi, %eax
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+# endif
+
+ mov %eax, %edi
+ jae L(large_page)
+ and $0xf, %edi
+ jz L(shl_0)
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+ .p2align 4
+L(shl_0):
+# ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+# endif
+ xor %edi, %edi
+ cmp $127, %ecx
+ ja L(shl_0_gobble)
+ lea -32(%ecx), %ecx
+
+ .p2align 4
+L(shl_0_loop):
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+
+L(shl_0_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ add %edi, %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
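+/* POP and LEA leave the flags intact, so the JAE below still tests
+   the cache-size compare above.  */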
+ POP (%edi)
+ lea -128(%ecx), %ecx
+ jae L(shl_0_gobble_mem_loop)
+
+ .p2align 4
+L(shl_0_gobble_cache_loop):
+ movdqa (%eax), %xmm0
+ movdqa 0x10(%eax), %xmm1
+ movdqa 0x20(%eax), %xmm2
+ movdqa 0x30(%eax), %xmm3
+ movdqa 0x40(%eax), %xmm4
+ movdqa 0x50(%eax), %xmm5
+ movdqa 0x60(%eax), %xmm6
+ movdqa 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa %xmm2, 0x20(%edx)
+ movdqa %xmm3, 0x30(%edx)
+ movdqa %xmm4, 0x40(%edx)
+ movdqa %xmm5, 0x50(%edx)
+ movdqa %xmm6, 0x60(%edx)
+ movdqa %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_cache_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_cache_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+
+L(shl_0_cache_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_cache_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+
+L(shl_0_cache_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_cache_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+
+L(shl_0_cache_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ .p2align 4
+L(shl_0_gobble_mem_loop):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x280(%eax)
+ prefetcht0 0x1c0(%edx)
+
+ movdqa (%eax), %xmm0
+ movdqa 0x10(%eax), %xmm1
+ movdqa 0x20(%eax), %xmm2
+ movdqa 0x30(%eax), %xmm3
+ movdqa 0x40(%eax), %xmm4
+ movdqa 0x50(%eax), %xmm5
+ movdqa 0x60(%eax), %xmm6
+ movdqa 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $0x80, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa %xmm2, 0x20(%edx)
+ movdqa %xmm3, 0x30(%edx)
+ movdqa %xmm4, 0x40(%edx)
+ movdqa %xmm5, 0x50(%edx)
+ movdqa %xmm6, 0x60(%edx)
+ movdqa %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_mem_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_mem_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+
+L(shl_0_mem_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_mem_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+
+L(shl_0_mem_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_mem_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+
+L(shl_0_mem_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+ .p2align 4
+L(shl_1):
+# ifndef USE_AS_MEMMOVE
+ movaps -1(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -1(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_1_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl1LoopStart):
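+	/* 64 bytes per iteration: four aligned loads starting at
+	   SRC + 15 stitched with PALIGNR $1; XMM7 carries the newest
+	   block into the next iteration through XMM1.  */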
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ movaps 47(%eax), %xmm4
+ movaps 63(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $1, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_1_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -1(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_1_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_1_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 1(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_2):
+# ifndef USE_AS_MEMMOVE
+ movaps -2(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -2(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_2_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl2LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ movaps 46(%eax), %xmm4
+ movaps 62(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $2, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_2_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -2(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_2_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_2_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 2(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_3):
+# ifndef USE_AS_MEMMOVE
+ movaps -3(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -3(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_3_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl3LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ movaps 45(%eax), %xmm4
+ movaps 61(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $3, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_3_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -3(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_3_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_3_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_3_no_prefetch_loop)
+
+L(sh_3_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 3(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_4):
+# ifndef USE_AS_MEMMOVE
+ movaps -4(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -4(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_4_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl4LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ movaps 44(%eax), %xmm4
+ movaps 60(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ palignr $4, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $4, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_4_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -4(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_4_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_4_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_4_no_prefetch_loop)
+
+L(sh_4_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 4(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_5):
+# ifndef USE_AS_MEMMOVE
+ movaps -5(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -5(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_5_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl5LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ movaps 43(%eax), %xmm4
+ movaps 59(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ palignr $5, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $5, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_5_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -5(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_5_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_5_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_5_no_prefetch_loop)
+
+L(sh_5_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 5(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_6):
+# ifndef USE_AS_MEMMOVE
+ movaps -6(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -6(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_6_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl6LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ movaps 42(%eax), %xmm4
+ movaps 58(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ palignr $6, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $6, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_6_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -6(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_6_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_6_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_6_no_prefetch_loop)
+
+L(sh_6_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 6(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_7):
+# ifndef USE_AS_MEMMOVE
+ movaps -7(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -7(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_7_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl7LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ movaps 41(%eax), %xmm4
+ movaps 57(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ palignr $7, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $7, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_7_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -7(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_7_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_7_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_7_no_prefetch_loop)
+
+L(sh_7_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 7(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_8):
+# ifndef USE_AS_MEMMOVE
+ movaps -8(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -8(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_8_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl8LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ movaps 40(%eax), %xmm4
+ movaps 56(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ palignr $8, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $8, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl8LoopStart)
+
+L(Shl8LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_8_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -8(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_8_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_8_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_8_no_prefetch_loop)
+
+L(sh_8_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 8(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_9):
+# ifndef USE_AS_MEMMOVE
+ movaps -9(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -9(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_9_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl9LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ movaps 39(%eax), %xmm4
+ movaps 55(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ palignr $9, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $9, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_9_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -9(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_9_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_9_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_9_no_prefetch_loop)
+
+L(sh_9_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 9(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_10):
+# ifndef USE_AS_MEMMOVE
+ movaps -10(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -10(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_10_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl10LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ movaps 38(%eax), %xmm4
+ movaps 54(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ palignr $10, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $10, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_10_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -10(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_10_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_10_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_10_no_prefetch_loop)
+
+L(sh_10_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 10(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_11):
+# ifndef USE_AS_MEMMOVE
+ movaps -11(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -11(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_11_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl11LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 5(%eax), %xmm2
+ movaps 21(%eax), %xmm3
+ movaps 37(%eax), %xmm4
+ movaps 53(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $11, %xmm4, %xmm5
+ palignr $11, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $11, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl11LoopStart)
+
+L(Shl11LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 5(%eax), %xmm2
+ movaps 21(%eax), %xmm3
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_11_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -11(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_11_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_11_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_11_no_prefetch_loop)
+
+L(sh_11_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 11(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_12):
+# ifndef USE_AS_MEMMOVE
+ movaps -12(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -12(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_12_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl12LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 4(%eax), %xmm2
+ movaps 20(%eax), %xmm3
+ movaps 36(%eax), %xmm4
+ movaps 52(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ palignr $12, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $12, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl12LoopStart)
+
+L(Shl12LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 4(%eax), %xmm2
+ movaps 20(%eax), %xmm3
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_12_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -12(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_12_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_12_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_12_no_prefetch_loop)
+
+L(sh_12_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 12(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_13):
+# ifndef USE_AS_MEMMOVE
+ movaps -13(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -13(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_13_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl13LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 3(%eax), %xmm2
+ movaps 19(%eax), %xmm3
+ movaps 35(%eax), %xmm4
+ movaps 51(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $13, %xmm4, %xmm5
+ palignr $13, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $13, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl13LoopStart)
+
+L(Shl13LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 3(%eax), %xmm2
+ movaps 19(%eax), %xmm3
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_13_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -13(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_13_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_13_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_13_no_prefetch_loop)
+
+L(sh_13_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 13(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_14):
+# ifndef USE_AS_MEMMOVE
+ movaps -14(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -14(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_14_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl14LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 2(%eax), %xmm2
+ movaps 18(%eax), %xmm3
+ movaps 34(%eax), %xmm4
+ movaps 50(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $14, %xmm4, %xmm5
+ palignr $14, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $14, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl14LoopStart)
+
+L(Shl14LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 2(%eax), %xmm2
+ movaps 18(%eax), %xmm3
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_14_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -14(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_14_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_14_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_14_no_prefetch_loop)
+
+L(sh_14_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 14(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_15):
+# ifndef USE_AS_MEMMOVE
+ movaps -15(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -15(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_15_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl15LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 1(%eax), %xmm2
+ movaps 17(%eax), %xmm3
+ movaps 33(%eax), %xmm4
+ movaps 49(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $15, %xmm4, %xmm5
+ palignr $15, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $15, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl15LoopStart)
+
+L(Shl15LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 1(%eax), %xmm2
+ movaps 17(%eax), %xmm3
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_15_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -15(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_15_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_15_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_15_no_prefetch_loop)
+
+L(sh_15_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 15(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_end_0):
+ lea 32(%ecx), %ecx
+ lea (%edx, %ecx), %edx
+ lea (%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
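+	/* Forward tail writes, reached through L(table_48bytes_fwd)
+	   with %ecx = 0..47 bytes left and %eax/%edx pointing one past
+	   the end.  Each chain handles one residue mod 8: every entry
+	   copies 8 bytes at a negative offset, then falls through to
+	   the next smaller case.  */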
+ .p2align 4
+L(fwd_write_44bytes):
+ movq -44(%eax), %xmm0
+ movq %xmm0, -44(%edx)
+L(fwd_write_36bytes):
+ movq -36(%eax), %xmm0
+ movq %xmm0, -36(%edx)
+L(fwd_write_28bytes):
+ movq -28(%eax), %xmm0
+ movq %xmm0, -28(%edx)
+L(fwd_write_20bytes):
+ movq -20(%eax), %xmm0
+ movq %xmm0, -20(%edx)
+L(fwd_write_12bytes):
+ movq -12(%eax), %xmm0
+ movq %xmm0, -12(%edx)
+L(fwd_write_4bytes):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
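+	/* Set the return value: mempcpy returns the end of the
+	   destination, memcpy/memmove return its start, bcopy returns
+	   nothing.  */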
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_40bytes):
+ movq -40(%eax), %xmm0
+ movq %xmm0, -40(%edx)
+L(fwd_write_32bytes):
+ movq -32(%eax), %xmm0
+ movq %xmm0, -32(%edx)
+L(fwd_write_24bytes):
+ movq -24(%eax), %xmm0
+ movq %xmm0, -24(%edx)
+L(fwd_write_16bytes):
+ movq -16(%eax), %xmm0
+ movq %xmm0, -16(%edx)
+L(fwd_write_8bytes):
+ movq -8(%eax), %xmm0
+ movq %xmm0, -8(%edx)
+L(fwd_write_0bytes):
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_5bytes):
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_45bytes):
+ movq -45(%eax), %xmm0
+ movq %xmm0, -45(%edx)
+L(fwd_write_37bytes):
+ movq -37(%eax), %xmm0
+ movq %xmm0, -37(%edx)
+L(fwd_write_29bytes):
+ movq -29(%eax), %xmm0
+ movq %xmm0, -29(%edx)
+L(fwd_write_21bytes):
+ movq -21(%eax), %xmm0
+ movq %xmm0, -21(%edx)
+L(fwd_write_13bytes):
+ movq -13(%eax), %xmm0
+ movq %xmm0, -13(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_41bytes):
+ movq -41(%eax), %xmm0
+ movq %xmm0, -41(%edx)
+L(fwd_write_33bytes):
+ movq -33(%eax), %xmm0
+ movq %xmm0, -33(%edx)
+L(fwd_write_25bytes):
+ movq -25(%eax), %xmm0
+ movq %xmm0, -25(%edx)
+L(fwd_write_17bytes):
+ movq -17(%eax), %xmm0
+ movq %xmm0, -17(%edx)
+L(fwd_write_9bytes):
+ movq -9(%eax), %xmm0
+ movq %xmm0, -9(%edx)
+L(fwd_write_1bytes):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_46bytes):
+ movq -46(%eax), %xmm0
+ movq %xmm0, -46(%edx)
+L(fwd_write_38bytes):
+ movq -38(%eax), %xmm0
+ movq %xmm0, -38(%edx)
+L(fwd_write_30bytes):
+ movq -30(%eax), %xmm0
+ movq %xmm0, -30(%edx)
+L(fwd_write_22bytes):
+ movq -22(%eax), %xmm0
+ movq %xmm0, -22(%edx)
+L(fwd_write_14bytes):
+ movq -14(%eax), %xmm0
+ movq %xmm0, -14(%edx)
+L(fwd_write_6bytes):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_42bytes):
+ movq -42(%eax), %xmm0
+ movq %xmm0, -42(%edx)
+L(fwd_write_34bytes):
+ movq -34(%eax), %xmm0
+ movq %xmm0, -34(%edx)
+L(fwd_write_26bytes):
+ movq -26(%eax), %xmm0
+ movq %xmm0, -26(%edx)
+L(fwd_write_18bytes):
+ movq -18(%eax), %xmm0
+ movq %xmm0, -18(%edx)
+L(fwd_write_10bytes):
+ movq -10(%eax), %xmm0
+ movq %xmm0, -10(%edx)
+L(fwd_write_2bytes):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_47bytes):
+ movq -47(%eax), %xmm0
+ movq %xmm0, -47(%edx)
+L(fwd_write_39bytes):
+ movq -39(%eax), %xmm0
+ movq %xmm0, -39(%edx)
+L(fwd_write_31bytes):
+ movq -31(%eax), %xmm0
+ movq %xmm0, -31(%edx)
+L(fwd_write_23bytes):
+ movq -23(%eax), %xmm0
+ movq %xmm0, -23(%edx)
+L(fwd_write_15bytes):
+ movq -15(%eax), %xmm0
+ movq %xmm0, -15(%edx)
+L(fwd_write_7bytes):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_43bytes):
+ movq -43(%eax), %xmm0
+ movq %xmm0, -43(%edx)
+L(fwd_write_35bytes):
+ movq -35(%eax), %xmm0
+ movq %xmm0, -35(%edx)
+L(fwd_write_27bytes):
+ movq -27(%eax), %xmm0
+ movq %xmm0, -27(%edx)
+L(fwd_write_19bytes):
+ movq -19(%eax), %xmm0
+ movq %xmm0, -19(%edx)
+L(fwd_write_11bytes):
+ movq -11(%eax), %xmm0
+ movq %xmm0, -11(%edx)
+L(fwd_write_3bytes):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
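+	/* Tail chains for the aligned path: the end pointers sit
+	   exactly %ecx bytes past a 16-byte boundary here, so the
+	   movdqa accesses below are aligned.  */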
+ .p2align 4
+L(fwd_write_40bytes_align):
+ movdqa -40(%eax), %xmm0
+ movdqa %xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+ movdqa -24(%eax), %xmm0
+ movdqa %xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+ movq -8(%eax), %xmm0
+ movq %xmm0, -8(%edx)
+L(fwd_write_0bytes_align):
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_32bytes_align):
+ movdqa -32(%eax), %xmm0
+ movdqa %xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+ movdqa -16(%eax), %xmm0
+ movdqa %xmm0, -16(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_5bytes_align):
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_45bytes_align):
+ movdqa -45(%eax), %xmm0
+ movdqa %xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+ movdqa -29(%eax), %xmm0
+ movdqa %xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+ movq -13(%eax), %xmm0
+ movq %xmm0, -13(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_37bytes_align):
+ movdqa -37(%eax), %xmm0
+ movdqa %xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+ movdqa -21(%eax), %xmm0
+ movdqa %xmm0, -21(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_41bytes_align):
+ movdqa -41(%eax), %xmm0
+ movdqa %xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+ movdqa -25(%eax), %xmm0
+ movdqa %xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+ movq -9(%eax), %xmm0
+ movq %xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_33bytes_align):
+ movdqa -33(%eax), %xmm0
+ movdqa %xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+ movdqa -17(%eax), %xmm0
+ movdqa %xmm0, -17(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_46bytes_align):
+ movdqa -46(%eax), %xmm0
+ movdqa %xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+ movdqa -30(%eax), %xmm0
+ movdqa %xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+ movq -14(%eax), %xmm0
+ movq %xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_38bytes_align):
+ movdqa -38(%eax), %xmm0
+ movdqa %xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+ movdqa -22(%eax), %xmm0
+ movdqa %xmm0, -22(%edx)
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_42bytes_align):
+ movdqa -42(%eax), %xmm0
+ movdqa %xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+ movdqa -26(%eax), %xmm0
+ movdqa %xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+ movq -10(%eax), %xmm0
+ movq %xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_34bytes_align):
+ movdqa -34(%eax), %xmm0
+ movdqa %xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+ movdqa -18(%eax), %xmm0
+ movdqa %xmm0, -18(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_47bytes_align):
+ movdqa -47(%eax), %xmm0
+ movdqa %xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+ movdqa -31(%eax), %xmm0
+ movdqa %xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+ movq -15(%eax), %xmm0
+ movq %xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_39bytes_align):
+ movdqa -39(%eax), %xmm0
+ movdqa %xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+ movdqa -23(%eax), %xmm0
+ movdqa %xmm0, -23(%edx)
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_43bytes_align):
+ movdqa -43(%eax), %xmm0
+ movdqa %xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+ movdqa -27(%eax), %xmm0
+ movdqa %xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+ movq -11(%eax), %xmm0
+ movq %xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_35bytes_align):
+ movdqa -35(%eax), %xmm0
+ movdqa %xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+ movdqa -19(%eax), %xmm0
+ movdqa %xmm0, -19(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_44bytes_align):
+ movdqa -44(%eax), %xmm0
+ movdqa %xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+ movdqa -28(%eax), %xmm0
+ movdqa %xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+ movq -12(%eax), %xmm0
+ movq %xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_36bytes_align):
+ movdqa -36(%eax), %xmm0
+ movdqa %xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+ movdqa -20(%eax), %xmm0
+ movdqa %xmm0, -20(%edx)
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN_END
+
+ CFI_PUSH (%edi)
+
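+	/* Copies too large to cache: stream the destination with
+	   non-temporal movntdq stores, 128 bytes per iteration, and
+	   finish with sfence to order the weakly-ordered stores before
+	   the tail dispatch.  */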
+ .p2align 4
+L(large_page):
+ movdqu (%eax), %xmm1
+# ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+# endif
+ lea 16(%eax), %eax
+ movntdq %xmm1, (%edx)
+ lea 16(%edx), %edx
+ lea -0x90(%ecx), %ecx
+ POP (%edi)
+
+ .p2align 4
+L(large_page_loop):
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ movdqu 0x40(%eax), %xmm4
+ movdqu 0x50(%eax), %xmm5
+ movdqu 0x60(%eax), %xmm6
+ movdqu 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ movntdq %xmm4, 0x40(%edx)
+ movntdq %xmm5, 0x50(%edx)
+ movntdq %xmm6, 0x60(%edx)
+ movntdq %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+ jae L(large_page_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(large_page_less_64bytes)
+
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ lea 0x40(%eax), %eax
+
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ lea 0x40(%edx), %edx
+ sub $0x40, %ecx
+L(large_page_less_64bytes):
+ cmp $32, %ecx
+ jb L(large_page_less_32bytes)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ lea 0x20(%eax), %eax
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ lea 0x20(%edx), %edx
+ sub $0x20, %ecx
+L(large_page_less_32bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ sfence
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
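+	/* Backward tail writes, dispatched through
+	   L(table_48_bytes_bwd): %eax/%edx point at the base of the
+	   final 0..47 bytes, so these chains mirror the forward ones
+	   with positive offsets.  */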
+ .p2align 4
+L(bk_write_44bytes):
+ movq 36(%eax), %xmm0
+ movq %xmm0, 36(%edx)
+L(bk_write_36bytes):
+ movq 28(%eax), %xmm0
+ movq %xmm0, 28(%edx)
+L(bk_write_28bytes):
+ movq 20(%eax), %xmm0
+ movq %xmm0, 20(%edx)
+L(bk_write_20bytes):
+ movq 12(%eax), %xmm0
+ movq %xmm0, 12(%edx)
+L(bk_write_12bytes):
+ movq 4(%eax), %xmm0
+ movq %xmm0, 4(%edx)
+L(bk_write_4bytes):
+ movl (%eax), %ecx
+ movl %ecx, (%edx)
+L(bk_write_0bytes):
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_40bytes):
+ movq 32(%eax), %xmm0
+ movq %xmm0, 32(%edx)
+L(bk_write_32bytes):
+ movq 24(%eax), %xmm0
+ movq %xmm0, 24(%edx)
+L(bk_write_24bytes):
+ movq 16(%eax), %xmm0
+ movq %xmm0, 16(%edx)
+L(bk_write_16bytes):
+ movq 8(%eax), %xmm0
+ movq %xmm0, 8(%edx)
+L(bk_write_8bytes):
+ movq (%eax), %xmm0
+ movq %xmm0, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_45bytes):
+ movq 37(%eax), %xmm0
+ movq %xmm0, 37(%edx)
+L(bk_write_37bytes):
+ movq 29(%eax), %xmm0
+ movq %xmm0, 29(%edx)
+L(bk_write_29bytes):
+ movq 21(%eax), %xmm0
+ movq %xmm0, 21(%edx)
+L(bk_write_21bytes):
+ movq 13(%eax), %xmm0
+ movq %xmm0, 13(%edx)
+L(bk_write_13bytes):
+ movq 5(%eax), %xmm0
+ movq %xmm0, 5(%edx)
+L(bk_write_5bytes):
+ movl 1(%eax), %ecx
+ movl %ecx, 1(%edx)
+L(bk_write_1bytes):
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_41bytes):
+ movq 33(%eax), %xmm0
+ movq %xmm0, 33(%edx)
+L(bk_write_33bytes):
+ movq 25(%eax), %xmm0
+ movq %xmm0, 25(%edx)
+L(bk_write_25bytes):
+ movq 17(%eax), %xmm0
+ movq %xmm0, 17(%edx)
+L(bk_write_17bytes):
+ movq 9(%eax), %xmm0
+ movq %xmm0, 9(%edx)
+L(bk_write_9bytes):
+ movq 1(%eax), %xmm0
+ movq %xmm0, 1(%edx)
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_46bytes):
+ movq 38(%eax), %xmm0
+ movq %xmm0, 38(%edx)
+L(bk_write_38bytes):
+ movq 30(%eax), %xmm0
+ movq %xmm0, 30(%edx)
+L(bk_write_30bytes):
+ movq 22(%eax), %xmm0
+ movq %xmm0, 22(%edx)
+L(bk_write_22bytes):
+ movq 14(%eax), %xmm0
+ movq %xmm0, 14(%edx)
+L(bk_write_14bytes):
+ movq 6(%eax), %xmm0
+ movq %xmm0, 6(%edx)
+L(bk_write_6bytes):
+ movl 2(%eax), %ecx
+ movl %ecx, 2(%edx)
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_42bytes):
+ movq 34(%eax), %xmm0
+ movq %xmm0, 34(%edx)
+L(bk_write_34bytes):
+ movq 26(%eax), %xmm0
+ movq %xmm0, 26(%edx)
+L(bk_write_26bytes):
+ movq 18(%eax), %xmm0
+ movq %xmm0, 18(%edx)
+L(bk_write_18bytes):
+ movq 10(%eax), %xmm0
+ movq %xmm0, 10(%edx)
+L(bk_write_10bytes):
+ movq 2(%eax), %xmm0
+ movq %xmm0, 2(%edx)
+L(bk_write_2bytes):
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_47bytes):
+ movq 39(%eax), %xmm0
+ movq %xmm0, 39(%edx)
+L(bk_write_39bytes):
+ movq 31(%eax), %xmm0
+ movq %xmm0, 31(%edx)
+L(bk_write_31bytes):
+ movq 23(%eax), %xmm0
+ movq %xmm0, 23(%edx)
+L(bk_write_23bytes):
+ movq 15(%eax), %xmm0
+ movq %xmm0, 15(%edx)
+L(bk_write_15bytes):
+ movq 7(%eax), %xmm0
+ movq %xmm0, 7(%edx)
+L(bk_write_7bytes):
+ movl 3(%eax), %ecx
+ movl %ecx, 3(%edx)
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_43bytes):
+ movq 35(%eax), %xmm0
+ movq %xmm0, 35(%edx)
+L(bk_write_35bytes):
+ movq 27(%eax), %xmm0
+ movq %xmm0, 27(%edx)
+L(bk_write_27bytes):
+ movq 19(%eax), %xmm0
+ movq %xmm0, 19(%edx)
+L(bk_write_19bytes):
+ movq 11(%eax), %xmm0
+ movq %xmm0, 11(%edx)
+L(bk_write_11bytes):
+ movq 3(%eax), %xmm0
+ movq %xmm0, 3(%edx)
+L(bk_write_3bytes):
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN_END
+
+
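+/* Dispatch tables.  Each entry is JMPTBL (target, table): the target
+   offset relative to the table itself, which keeps the section
+   position-independent; BRANCH_TO_JMPTBL_ENTRY adds the table address
+   back before jumping.  L(shl_table) is indexed by the source
+   misalignment within 16 bytes once the destination is aligned.  */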
+ .pushsection .rodata.ssse3,"a",@progbits
+ .p2align 2
+L(table_48bytes_fwd):
+ .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+ .p2align 2
+L(table_48bytes_fwd_align):
+ .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+ .p2align 2
+L(shl_table):
+ .int JMPTBL (L(shl_0), L(shl_table))
+ .int JMPTBL (L(shl_1), L(shl_table))
+ .int JMPTBL (L(shl_2), L(shl_table))
+ .int JMPTBL (L(shl_3), L(shl_table))
+ .int JMPTBL (L(shl_4), L(shl_table))
+ .int JMPTBL (L(shl_5), L(shl_table))
+ .int JMPTBL (L(shl_6), L(shl_table))
+ .int JMPTBL (L(shl_7), L(shl_table))
+ .int JMPTBL (L(shl_8), L(shl_table))
+ .int JMPTBL (L(shl_9), L(shl_table))
+ .int JMPTBL (L(shl_10), L(shl_table))
+ .int JMPTBL (L(shl_11), L(shl_table))
+ .int JMPTBL (L(shl_12), L(shl_table))
+ .int JMPTBL (L(shl_13), L(shl_table))
+ .int JMPTBL (L(shl_14), L(shl_table))
+ .int JMPTBL (L(shl_15), L(shl_table))
+
+ .p2align 2
+L(table_48_bytes_bwd):
+ .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+ .popsection
+
+# ifdef USE_AS_MEMMOVE
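+	/* Overlapping memmove with the destination above the source:
+	   move both pointers to the end of the buffers and copy
+	   downward.  The destination end is aligned first to 4 bytes
+	   (L(bk_align)) and then to 16 (L(bk_ssse3_align)) so the main
+	   loop can use movdqa stores.  */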
+ .p2align 4
+L(copy_backward):
+ PUSH (%edi)
+ movl %eax, %edi
+	lea	(%ecx, %edx), %edx
+	lea	(%ecx, %edi), %edi
+ testl $0x3, %edx
+ jnz L(bk_align)
+
+L(bk_aligned_4):
+ cmp $64, %ecx
+ jae L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+ cmp $32, %ecx
+ jb L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+ /* Copy 32 bytes at a time. */
+ sub $32, %ecx
+ movq -8(%edi), %xmm0
+ movq %xmm0, -8(%edx)
+ movq -16(%edi), %xmm0
+ movq %xmm0, -16(%edx)
+ movq -24(%edi), %xmm0
+ movq %xmm0, -24(%edx)
+ movq -32(%edi), %xmm0
+ movq %xmm0, -32(%edx)
+ sub $32, %edx
+ sub $32, %edi
+
+L(bk_write_less32bytes):
+ movl %edi, %eax
+ sub %ecx, %edx
+ sub %ecx, %eax
+ POP (%edi)
+L(bk_write_less32bytes_2):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(bk_align):
+ cmp $8, %ecx
+ jbe L(bk_write_less32bytes)
+ testl $1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+ jz L(bk_got2)
+ sub $1, %edi
+ sub $1, %ecx
+ sub $1, %edx
+ movzbl (%edi), %eax
+ movb %al, (%edx)
+
+ testl $2, %edx
+ jz L(bk_aligned_4)
+
+L(bk_got2):
+ sub $2, %edi
+ sub $2, %ecx
+ sub $2, %edx
+ movzwl (%edi), %eax
+ movw %ax, (%edx)
+ jmp L(bk_aligned_4)
+
+ .p2align 4
+L(bk_write_more64bytes):
+ /* Check alignment of last byte. */
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+/* EDX is 4-byte aligned, but not 16-byte aligned.  */
+L(bk_ssse3_align):
+ sub $4, %edi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%edi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %edi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%edi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %edi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%edi), %eax
+ movl %eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+ cmp $64, %ecx
+ jb L(bk_write_more32bytes)
+
+ .p2align 4
+L(bk_ssse3_cpy):
+ sub $64, %edi
+ sub $64, %ecx
+ sub $64, %edx
+ movdqu 0x30(%edi), %xmm3
+ movdqa %xmm3, 0x30(%edx)
+ movdqu 0x20(%edi), %xmm2
+ movdqa %xmm2, 0x20(%edx)
+ movdqu 0x10(%edi), %xmm1
+ movdqa %xmm1, 0x10(%edx)
+ movdqu (%edi), %xmm0
+ movdqa %xmm0, (%edx)
+ cmp $64, %ecx
+ jae L(bk_ssse3_cpy)
+ jmp L(bk_write_64bytesless)
+
+# endif
+
+END (MEMCPY)
+
+#endif
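
The L(copy_backward) path above implements the overlap-safe direction for
the memmove variants: both pointers are moved to the ends of the regions,
the destination is brought to 4-byte and then 16-byte alignment with single
byte/word/dword moves, and the bulk is copied 64 bytes per iteration from
high addresses down. A minimal C sketch of the underlying invariant, byte
granularity only (the function name is hypothetical):

    #include <stddef.h>

    /* Copying from the highest address downward means a destination that
       overlaps the tail of its source is read before it is overwritten --
       the invariant the assembly preserves while also doing alignment
       fix-up and 64-byte SSSE3 blocks.  */
    static void
    copy_backward (unsigned char *dst, const unsigned char *src, size_t n)
    {
      while (n-- > 0)
        dst[n] = src[n];
    }
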
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..f725944620
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need memcpy before initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+ .text
+ENTRY(memcpy)
+ .type memcpy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2: ret
+END(memcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memcpy_ia32, @function; \
+ .p2align 4; \
+ .globl __memcpy_ia32; \
+ .hidden __memcpy_ia32; \
+ __memcpy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memcpy_chk_ia32, @function; \
+ .globl __memcpy_chk_ia32; \
+ .p2align 4; \
+ __memcpy_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without EBX set up as needed for the
+   PLT, which IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..1b4fbe2e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __memcpy_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  There are no multiarch memcpy functions for static
+   binaries.  */
+#if IS_IN (libc)
+# ifdef SHARED
+ .text
+ENTRY(__memcpy_chk)
+ .type __memcpy_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2: ret
+END(__memcpy_chk)
+# else
+# include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000000..3873594cb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_sse2_unaligned
+#define MEMCPY_CHK __memmove_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 0000000000..d202fc4a13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_ssse3_rep
+#define MEMCPY_CHK __memmove_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_ssse3
+#define MEMCPY_CHK __memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 0000000000..6eb418ca7f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,89 @@
+/* Multiple versions of memmove
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(memmove)
+ .type memmove, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2: ret
+END(memmove)
+
+# ifdef SHARED
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memmove_ia32, @function; \
+ .p2align 4; \
+ .globl __memmove_ia32; \
+ .hidden __memmove_ia32; \
+ __memmove_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# else
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memmove_ia32, @function; \
+ .globl __memmove_ia32; \
+ .p2align 4; \
+ __memmove_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# endif
+
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memmove_chk_ia32, @function; \
+ .globl __memmove_chk_ia32; \
+ .p2align 4; \
+ __memmove_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without EBX set up as needed for the
+   PLT, which IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..314834c4c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,94 @@
+/* Multiple versions of __memmove_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(__memmove_chk)
+ .type __memmove_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
+2: ret
+END(__memmove_chk)
+
+# ifndef SHARED
+ .type __memmove_chk_sse2_unaligned, @function
+ .p2align 4;
+__memmove_chk_sse2_unaligned:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_sse2_unaligned
+ cfi_endproc
+ .size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
+ .type __memmove_chk_ssse3, @function
+ .p2align 4;
+__memmove_chk_ssse3:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ssse3
+ cfi_endproc
+ .size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+ .type __memmove_chk_ssse3_rep, @function
+ .p2align 4;
+__memmove_chk_ssse3_rep:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ssse3_rep
+ cfi_endproc
+ .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+ .type __memmove_chk_ia32, @function
+ .p2align 4;
+__memmove_chk_ia32:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ia32
+ cfi_endproc
+ .size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
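
The four non-shared wrappers above all follow the same _FORTIFY_SOURCE
pattern: the compiler passes the known size of the destination object as a
fourth argument, and the wrapper diverts to __chk_fail before any byte
moves if the requested length exceeds it. The same check as a C sketch
(the function name is hypothetical; __chk_fail is the real glibc symbol):

    #include <stddef.h>
    #include <string.h>

    extern void __chk_fail (void) __attribute__ ((__noreturn__));

    static void *
    memmove_chk_sketch (void *dst, const void *src, size_t len,
                        size_t dstlen)
    {
      if (dstlen < len)   /* cmpl %eax, 16(%esp); jb __chk_fail */
        __chk_fail ();
      return memmove (dst, src, len);
    }
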
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..a1cea50771
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_sse2_unaligned
+#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 0000000000..5357b33e18
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_ssse3_rep
+#define MEMCPY_CHK __mempcpy_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000000..822d98e954
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_ssse3
+#define MEMCPY_CHK __mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 0000000000..06e377fbc9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,81 @@
+/* Multiple versions of mempcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need mempcpy before initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+ .text
+ENTRY(__mempcpy)
+ .type __mempcpy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)
+2: ret
+END(__mempcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __mempcpy_ia32, @function; \
+ .p2align 4; \
+ .globl __mempcpy_ia32; \
+ .hidden __mempcpy_ia32; \
+ __mempcpy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __mempcpy_chk_ia32, @function; \
+ .globl __mempcpy_chk_ia32; \
+ .p2align 4; \
+ __mempcpy_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without EBX set up as needed for the
+   PLT, which IFUNC uses.  */
+# define libc_hidden_def(name) \
+ .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+# define libc_hidden_builtin_def(name) \
+ .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+#endif
+
+#include "../mempcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..e13e5248a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __mempcpy_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  There are no multiarch mempcpy functions for static
+   binaries.  */
+#if IS_IN (libc)
+# ifdef SHARED
+ .text
+ENTRY(__mempcpy_chk)
+ .type __mempcpy_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)
+2: ret
+END(__mempcpy_chk)
+# else
+# include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 0000000000..ef7bbbe792
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#if IS_IN (libc)
+# define MEMRCHR __memrchr_ia32
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);
+#endif
+
+#include "string/memrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 0000000000..dbbe94fd08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,417 @@
+/* Optimized memrchr with sse2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+# define MEMCHR __memrchr_sse2_bsf
+
+ .text
+ENTRY (MEMCHR)
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+
+ sub $16, %edx
+ jbe L(length_less16)
+
+ punpcklbw %xmm1, %xmm1
+ add %edx, %ecx
+ punpcklbw %xmm1, %xmm1
+
+ movdqu (%ecx), %xmm0
+ pshufd $0, %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %ecx
+ mov %ecx, %eax
+ and $15, %eax
+ jz L(loop_prolog)
+
+ add $16, %ecx
+ add $16, %edx
+ sub %eax, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+/* The loop starts on a 16-byte aligned block.  */
+L(loop_prolog):
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ mov %ecx, %eax
+ and $63, %eax
+ test %eax, %eax
+ jz L(align64_loop)
+
+ add $64, %ecx
+ add $64, %edx
+ sub %eax, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%ecx), %xmm0
+ movdqa 16(%ecx), %xmm2
+ movdqa 32(%ecx), %xmm3
+ movdqa 48(%ecx), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%ecx), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ bsr %eax, %eax
+
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb (%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 32(%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches0):
+ bsr %eax, %eax
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16):
+ bsr %eax, %eax
+ lea 16(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches32):
+ bsr %eax, %eax
+ lea 32(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches48):
+ bsr %eax, %eax
+ lea 48(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches0_1):
+ bsr %eax, %eax
+ sub $64, %edx
+ add %eax, %edx
+ jl L(return_null)
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ bsr %eax, %eax
+ sub $48, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 16(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ bsr %eax, %eax
+ sub $32, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 32(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ bsr %eax, %eax
+ sub $16, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 48(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%eax), %xmm1
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+ mov %edx, %ecx
+
+ pmovmskb %xmm1, %edx
+
+ and %ecx, %edx
+ test %edx, %edx
+ jz L(return_null)
+
+ bsr %edx, %ecx
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ mov %ecx, %eax
+ punpcklbw %xmm1, %xmm1
+ add $16, %edx
+ jz L(return_null)
+
+ pshufd $0, %xmm1, %xmm1
+ and $15, %ecx
+ jz L(length_less16_offset0)
+
+ PUSH (%edi)
+ mov %cl, %dh
+ add %dl, %dh
+ and $-16, %eax
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ sar %cl, %edi
+ add %ecx, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%eax), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %edi
+
+ mov %cl, %ch
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+
+ test %edi, %edi
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ mov %ch, %cl
+ sar %cl, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ xor %ch, %ch
+ add %ecx, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %edi, %edi
+ lea 16(%eax, %edi), %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ret_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+END (MEMCHR)
+#endif
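
The core of __memrchr_sse2_bsf above: broadcast the target byte across an
XMM register, compare 16 bytes at a time with pcmpeqb while walking
backward, compress each result into a 16-bit mask with pmovmskb, and run
bsr on the first nonzero mask so the highest set bit names the last
matching byte. A hedged intrinsics sketch of that inner step, assuming for
simplicity that n is a multiple of 16 (the unaligned-head and short-length
paths of the assembly are omitted):

    #include <emmintrin.h>
    #include <stddef.h>

    static void *
    memrchr_by_16 (const unsigned char *p, int c, size_t n)
    {
      __m128i needle = _mm_set1_epi8 ((char) c);
      size_t i = n;                      /* n assumed multiple of 16 */
      while (i >= 16)
        {
          i -= 16;
          __m128i blk = _mm_loadu_si128 ((const __m128i *) (p + i));
          int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (blk, needle));
          if (mask != 0)
            /* Index of the highest set bit = last match; this is what
               the BSR instruction computes in the assembly.  */
            return (void *) (p + i + (31 - __builtin_clz ((unsigned) mask)));
        }
      return NULL;
    }
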
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000000..5f7853f683
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,724 @@
+/* Optimized memrchr with sse2 without bsf
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+ atom_text_section
+ENTRY (__memrchr_sse2)
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+
+ sub $16, %edx
+ jbe L(length_less16)
+
+ punpcklbw %xmm1, %xmm1
+ add %edx, %ecx
+ punpcklbw %xmm1, %xmm1
+
+ movdqu (%ecx), %xmm0
+ pshufd $0, %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ mov %ecx, %eax
+ and $15, %eax
+ jz L(loop_prolog)
+
+ lea 16(%ecx), %ecx
+ lea 16(%edx), %edx
+ sub %eax, %edx
+ and $-16, %ecx
+
+ .p2align 4
+/* The loop starts on a 16-byte aligned block.  */
+L(loop_prolog):
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ mov %ecx, %eax
+ and $63, %eax
+ test %eax, %eax
+ jz L(align64_loop)
+
+ lea 64(%ecx), %ecx
+ lea 64(%edx), %edx
+ and $-64, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%ecx), %xmm0
+ movdqa 16(%ecx), %xmm2
+ movdqa 32(%ecx), %xmm3
+ movdqa 48(%ecx), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%ecx), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb (%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 32(%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches16):
+ lea 16(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32):
+ lea 32(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48):
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch):
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_8):
+ test $0x80, %al
+ jnz L(exit_8)
+ test $0x40, %al
+ jnz L(exit_7)
+ test $0x20, %al
+ jnz L(exit_6)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(exit_dispatch_high_8)
+ test $0x08, %ah
+ jnz L(exit_12)
+ test $0x04, %ah
+ jnz L(exit_11)
+ test $0x02, %ah
+ jnz L(exit_10)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high_8):
+ test $0x80, %ah
+ jnz L(exit_16)
+ test $0x40, %ah
+ jnz L(exit_15)
+ test $0x20, %ah
+ jnz L(exit_14)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_2):
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_3):
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_4):
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_6):
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_7):
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_8):
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_10):
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_11):
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_12):
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_14):
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_15):
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_16):
+ lea 15(%ecx), %eax
+ ret
+
+ .p2align 4
+L(matches0_1):
+ lea -64(%edx), %edx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ lea -48(%edx), %edx
+ lea 16(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ lea -32(%edx), %edx
+ lea 32(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ lea -16(%edx), %edx
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch_1):
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_8):
+ test $0x80, %al
+ jnz L(exit_1_8)
+ test $0x40, %al
+ jnz L(exit_1_7)
+ test $0x20, %al
+ jnz L(exit_1_6)
+ add $4, %edx
+ jl L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high):
+ mov %ah, %al
+ and $15 << 4, %al
+ jnz L(exit_dispatch_1_high_8)
+ test $0x08, %ah
+ jnz L(exit_1_12)
+ test $0x04, %ah
+ jnz L(exit_1_11)
+ test $0x02, %ah
+ jnz L(exit_1_10)
+ add $8, %edx
+ jl L(return_null)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high_8):
+ test $0x80, %ah
+ jnz L(exit_1_16)
+ test $0x40, %ah
+ jnz L(exit_1_15)
+ test $0x20, %ah
+ jnz L(exit_1_14)
+ add $12, %edx
+ jl L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_2):
+ add $1, %edx
+ jl L(return_null)
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_3):
+ add $2, %edx
+ jl L(return_null)
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_4):
+ add $3, %edx
+ jl L(return_null)
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_6):
+ add $5, %edx
+ jl L(return_null)
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_7):
+ add $6, %edx
+ jl L(return_null)
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_8):
+ add $7, %edx
+ jl L(return_null)
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_10):
+ add $9, %edx
+ jl L(return_null)
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_11):
+ add $10, %edx
+ jl L(return_null)
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_12):
+ add $11, %edx
+ jl L(return_null)
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_14):
+ add $13, %edx
+ jl L(return_null)
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_15):
+ add $14, %edx
+ jl L(return_null)
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_16):
+ add $15, %edx
+ jl L(return_null)
+ lea 15(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%eax), %xmm1
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ mov %eax, %ecx
+ pmovmskb %xmm1, %eax
+
+ and %edx, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ add $16, %edx
+ je L(return_null)
+ punpcklbw %xmm1, %xmm1
+
+ mov %ecx, %eax
+ pshufd $0, %xmm1, %xmm1
+
+ and $15, %ecx
+ jz L(length_less16_offset0)
+
+ PUSH (%edi)
+
+ mov %cl, %dh
+ add %dl, %dh
+ and $-16, %eax
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ sar %cl, %edi
+ add %ecx, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%eax), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %edi
+
+ mov %cl, %ch
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+
+ test %edi, %edi
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ mov %ch, %cl
+ sar %cl, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ xor %ch, %ch
+ add %ecx, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %edi, %edi
+ lea 16(%eax, %edi), %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ret_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+END (__memrchr_sse2)
+#endif
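
Unlike the _bsf variant, __memrchr_sse2 above keeps BSR off its main
dispatch paths: the L(exit_dispatch*) labels decode the pmovmskb result
with a tree of test-and-branch steps instead, the profitable choice on CPUs
flagged Slow_BSF (hence atom_text_section). The same computation sketched
in C, using shifts where the assembly tests individual bits:

    /* Index of the highest set bit in a nonzero 16-bit mask without a
       bit-scan instruction; mirrors the byte, nibble, then bit dispatch
       done by L(exit_dispatch).  */
    static int
    highest_bit_index (unsigned int mask)   /* nonzero, <= 0xffff */
    {
      int idx = 0;
      if (mask & 0xff00) { idx += 8; mask >>= 8; }  /* test %ah, %ah */
      if (mask & 0x00f0) { idx += 4; mask >>= 4; }  /* and $15 << 4 */
      if (mask & 0x000c) { idx += 2; mask >>= 2; }
      if (mask & 0x0002) { idx += 1; }
      return idx;
    }
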
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 0000000000..d4253a553b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,45 @@
+/* Multiple versions of memrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__memrchr)
+ .type __memrchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 3f
+
+ LOAD_FUNC_GOT_EAX (__memrchr_sse2)
+ ret
+
+2: LOAD_FUNC_GOT_EAX (__memrchr_ia32)
+ ret
+
+3: LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
+ ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 0000000000..3221077e49
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,811 @@
+/* memset with SSE2 and REP string.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST PARMS
+# define LEN DEST+4
+# define SETRTNVAL
+#else
+# define DEST PARMS
+# define CHR DEST+4
+# define LEN CHR+4
+# define SETRTNVAL movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define PARMS 8 /* Preserve EBX. */
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ add $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ add (%ebx,%ecx,4), %ebx; \
+ add %ecx, %edx; \
+ /* We loaded the jump table and adjusted EDX. Go. */ \
+ jmp *%ebx
+#else
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define PARMS 4
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ add %ecx, %edx; \
+ jmp *TABLE(,%ecx,4)
+#endif
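
In the SHARED case, BRANCH_TO_JMPTBL_ENTRY above cannot keep absolute
handler addresses in .rodata without load-time relocations, so each entry
stores handler minus table base (JMPTBL(I, B) = I - B) and the PC-derived
base is added back at run time. The same position-independent dispatch as
a short C sketch (names hypothetical; the EDX adjustment done by the macro
is left out):

    #include <stdint.h>

    extern const int32_t jmptbl[];   /* entry i holds handler_i - jmptbl */

    static void
    dispatch (unsigned int i)
    {
      /* add (%ebx,%ecx,4), %ebx -- base plus relative entry yields the
         absolute handler address, relocation-free.  */
      uintptr_t target = (uintptr_t) jmptbl + (intptr_t) jmptbl[i];
      ((void (*) (void)) target) ();
    }
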
+
+ .section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2_rep)
+#endif
+ENTRY (__memset_sse2_rep)
+ ENTRANCE
+
+ movl LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl CHR(%esp), %eax
+ movb %al, %ah
+	/* Fill the whole EAX with the pattern.  */
+ movl %eax, %edx
+ shl $16, %eax
+ or %edx, %eax
+#endif
+ movl DEST(%esp), %edx
+ cmp $32, %ecx
+ jae L(32bytesormore)
+
+L(write_less32bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_less_32bytes):
+ .int JMPTBL (L(write_0bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_1bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_2bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_3bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_4bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_5bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_6bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_7bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_8bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_9bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_10bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_11bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_12bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_13bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_14bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_15bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_16bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_17bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_18bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_19bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_20bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_21bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_22bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_23bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_24bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_25bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_26bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_27bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_28bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_29bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_30bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_31bytes), L(table_less_32bytes))
+ .popsection
+
+ ALIGN (4)
+L(write_28bytes):
+ movl %eax, -28(%edx)
+L(write_24bytes):
+ movl %eax, -24(%edx)
+L(write_20bytes):
+ movl %eax, -20(%edx)
+L(write_16bytes):
+ movl %eax, -16(%edx)
+L(write_12bytes):
+ movl %eax, -12(%edx)
+L(write_8bytes):
+ movl %eax, -8(%edx)
+L(write_4bytes):
+ movl %eax, -4(%edx)
+L(write_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_29bytes):
+ movl %eax, -29(%edx)
+L(write_25bytes):
+ movl %eax, -25(%edx)
+L(write_21bytes):
+ movl %eax, -21(%edx)
+L(write_17bytes):
+ movl %eax, -17(%edx)
+L(write_13bytes):
+ movl %eax, -13(%edx)
+L(write_9bytes):
+ movl %eax, -9(%edx)
+L(write_5bytes):
+ movl %eax, -5(%edx)
+L(write_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_30bytes):
+ movl %eax, -30(%edx)
+L(write_26bytes):
+ movl %eax, -26(%edx)
+L(write_22bytes):
+ movl %eax, -22(%edx)
+L(write_18bytes):
+ movl %eax, -18(%edx)
+L(write_14bytes):
+ movl %eax, -14(%edx)
+L(write_10bytes):
+ movl %eax, -10(%edx)
+L(write_6bytes):
+ movl %eax, -6(%edx)
+L(write_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_31bytes):
+ movl %eax, -31(%edx)
+L(write_27bytes):
+ movl %eax, -27(%edx)
+L(write_23bytes):
+ movl %eax, -23(%edx)
+L(write_19bytes):
+ movl %eax, -19(%edx)
+L(write_15bytes):
+ movl %eax, -15(%edx)
+L(write_11bytes):
+ movl %eax, -11(%edx)
+L(write_7bytes):
+ movl %eax, -7(%edx)
+L(write_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
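
The L(write_Nbytes) ladders above finish a residue of 0-31 bytes with no
loop: EDX points one past the end of the region (the jump-table macro
added ECX to it), the table picks an entry point, and each label falls
through into the next-smaller store, so a length of 7 becomes exactly one
dword, one word and one byte store at negative offsets. EAX holds the fill
byte replicated into all four lanes (the movb/shl/or sequence after ENTRY
computes c * 0x01010101). A looped C stand-in for the computed goto (the
function name is hypothetical):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* end points one past the last byte to set; pat4 is the fill byte
       replicated four times.  The assembly jumps straight to an unrolled
       label instead of looping.  */
    static void
    set_tail (unsigned char *end, uint32_t pat4, size_t n)  /* n < 32 */
    {
      while (n >= 4) { memcpy (end - n, &pat4, 4); n -= 4; }
      if (n >= 2)    { memcpy (end - n, &pat4, 2); n -= 2; }
      if (n != 0)    end[-1] = (unsigned char) pat4;
    }
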
+
+ ALIGN (4)
+/* ECX >= 32 and EDX is 4-byte aligned.  */
+L(32bytesormore):
+ /* Fill xmm0 with the pattern. */
+#ifdef USE_AS_BZERO
+ pxor %xmm0, %xmm0
+#else
+ movd %eax, %xmm0
+ pshufd $0, %xmm0, %xmm0
+#endif
+ testl $0xf, %edx
+ jz L(aligned_16)
+/* ECX >= 32 and EDX is not 16-byte aligned.  */
+L(not_aligned_16):
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ and $-16, %edx
+ add $16, %edx
+ sub %edx, %eax
+ add %eax, %ecx
+ movd %xmm0, %eax
+
+ ALIGN (4)
+L(aligned_16):
+ cmp $128, %ecx
+ jae L(128bytesormore)
+
+L(aligned_16_less128bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytesormore):
+ PUSH (%edi)
+#ifdef DATA_CACHE_SIZE
+ PUSH (%ebx)
+ mov $DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+ PUSH (%ebx)
+ mov __x86_data_cache_size, %ebx
+# endif
+#endif
+ mov %ebx, %edi
+ shr $4, %ebx
+ sub %ebx, %edi
+#if defined DATA_CACHE_SIZE || !defined SHARED
+ POP (%ebx)
+#endif
+/*
+ * When the data size approaches the L1 data cache size, the fast
+ * string operation will prefetch and combine data efficiently.
+ */
+ cmp %edi, %ecx
+ jae L(128bytesormore_endof_L1)
+ subl $128, %ecx
+L(128bytesormore_normal):
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jb L(128bytesless_normal)
+
+
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jae L(128bytesormore_normal)
+
+L(128bytesless_normal):
+ POP (%edi)
+ add $128, %ecx
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ CFI_PUSH (%edi)
+ ALIGN (4)
+L(128bytesormore_endof_L1):
+ mov %edx, %edi
+ mov %ecx, %edx
+ shr $2, %ecx
+ and $3, %edx
+ rep stosl
+ jz L(copy_page_by_rep_exit)
+ cmp $2, %edx
+ jb L(copy_page_by_rep_left_1)
+ movw %ax, (%edi)
+ add $2, %edi
+ sub $2, %edx
+ jz L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+ movb %al, (%edi)
+L(copy_page_by_rep_exit):
+ POP (%edi)
+ SETRTNVAL
+ RETURN
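
Past roughly 15/16 of the data cache size (the EDI threshold computed
above), the code stops using SSE stores and lets "rep stosl" do the work:
ECX is split into a dword count for the string instruction and a 0-3 byte
remainder finished with a word and/or byte store. The same split in C,
with a plain loop standing in for rep stosl (the function name is
hypothetical):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    set_by_dwords (unsigned char *p, uint32_t pat4, size_t n)
    {
      size_t dwords = n >> 2;               /* shr $2, %ecx */
      size_t rest = n & 3;                  /* and $3, %edx */
      for (size_t i = 0; i < dwords; i++)   /* rep stosl */
        memcpy (p + 4 * i, &pat4, 4);
      p += 4 * dwords;
      if (rest >= 2) { memcpy (p, &pat4, 2); p += 2; rest -= 2; }
      if (rest != 0) *p = (unsigned char) pat4;
    }
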
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_16_128bytes):
+ .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+ .popsection
+
+ ALIGN (4)
+L(aligned_16_112bytes):
+ movdqa %xmm0, -112(%edx)
+L(aligned_16_96bytes):
+ movdqa %xmm0, -96(%edx)
+L(aligned_16_80bytes):
+ movdqa %xmm0, -80(%edx)
+L(aligned_16_64bytes):
+ movdqa %xmm0, -64(%edx)
+L(aligned_16_48bytes):
+ movdqa %xmm0, -48(%edx)
+L(aligned_16_32bytes):
+ movdqa %xmm0, -32(%edx)
+L(aligned_16_16bytes):
+ movdqa %xmm0, -16(%edx)
+L(aligned_16_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_113bytes):
+ movdqa %xmm0, -113(%edx)
+L(aligned_16_97bytes):
+ movdqa %xmm0, -97(%edx)
+L(aligned_16_81bytes):
+ movdqa %xmm0, -81(%edx)
+L(aligned_16_65bytes):
+ movdqa %xmm0, -65(%edx)
+L(aligned_16_49bytes):
+ movdqa %xmm0, -49(%edx)
+L(aligned_16_33bytes):
+ movdqa %xmm0, -33(%edx)
+L(aligned_16_17bytes):
+ movdqa %xmm0, -17(%edx)
+L(aligned_16_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_114bytes):
+ movdqa %xmm0, -114(%edx)
+L(aligned_16_98bytes):
+ movdqa %xmm0, -98(%edx)
+L(aligned_16_82bytes):
+ movdqa %xmm0, -82(%edx)
+L(aligned_16_66bytes):
+ movdqa %xmm0, -66(%edx)
+L(aligned_16_50bytes):
+ movdqa %xmm0, -50(%edx)
+L(aligned_16_34bytes):
+ movdqa %xmm0, -34(%edx)
+L(aligned_16_18bytes):
+ movdqa %xmm0, -18(%edx)
+L(aligned_16_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_115bytes):
+ movdqa %xmm0, -115(%edx)
+L(aligned_16_99bytes):
+ movdqa %xmm0, -99(%edx)
+L(aligned_16_83bytes):
+ movdqa %xmm0, -83(%edx)
+L(aligned_16_67bytes):
+ movdqa %xmm0, -67(%edx)
+L(aligned_16_51bytes):
+ movdqa %xmm0, -51(%edx)
+L(aligned_16_35bytes):
+ movdqa %xmm0, -35(%edx)
+L(aligned_16_19bytes):
+ movdqa %xmm0, -19(%edx)
+L(aligned_16_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_116bytes):
+ movdqa %xmm0, -116(%edx)
+L(aligned_16_100bytes):
+ movdqa %xmm0, -100(%edx)
+L(aligned_16_84bytes):
+ movdqa %xmm0, -84(%edx)
+L(aligned_16_68bytes):
+ movdqa %xmm0, -68(%edx)
+L(aligned_16_52bytes):
+ movdqa %xmm0, -52(%edx)
+L(aligned_16_36bytes):
+ movdqa %xmm0, -36(%edx)
+L(aligned_16_20bytes):
+ movdqa %xmm0, -20(%edx)
+L(aligned_16_4bytes):
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_117bytes):
+ movdqa %xmm0, -117(%edx)
+L(aligned_16_101bytes):
+ movdqa %xmm0, -101(%edx)
+L(aligned_16_85bytes):
+ movdqa %xmm0, -85(%edx)
+L(aligned_16_69bytes):
+ movdqa %xmm0, -69(%edx)
+L(aligned_16_53bytes):
+ movdqa %xmm0, -53(%edx)
+L(aligned_16_37bytes):
+ movdqa %xmm0, -37(%edx)
+L(aligned_16_21bytes):
+ movdqa %xmm0, -21(%edx)
+L(aligned_16_5bytes):
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_118bytes):
+ movdqa %xmm0, -118(%edx)
+L(aligned_16_102bytes):
+ movdqa %xmm0, -102(%edx)
+L(aligned_16_86bytes):
+ movdqa %xmm0, -86(%edx)
+L(aligned_16_70bytes):
+ movdqa %xmm0, -70(%edx)
+L(aligned_16_54bytes):
+ movdqa %xmm0, -54(%edx)
+L(aligned_16_38bytes):
+ movdqa %xmm0, -38(%edx)
+L(aligned_16_22bytes):
+ movdqa %xmm0, -22(%edx)
+L(aligned_16_6bytes):
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_119bytes):
+ movdqa %xmm0, -119(%edx)
+L(aligned_16_103bytes):
+ movdqa %xmm0, -103(%edx)
+L(aligned_16_87bytes):
+ movdqa %xmm0, -87(%edx)
+L(aligned_16_71bytes):
+ movdqa %xmm0, -71(%edx)
+L(aligned_16_55bytes):
+ movdqa %xmm0, -55(%edx)
+L(aligned_16_39bytes):
+ movdqa %xmm0, -39(%edx)
+L(aligned_16_23bytes):
+ movdqa %xmm0, -23(%edx)
+L(aligned_16_7bytes):
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_120bytes):
+ movdqa %xmm0, -120(%edx)
+L(aligned_16_104bytes):
+ movdqa %xmm0, -104(%edx)
+L(aligned_16_88bytes):
+ movdqa %xmm0, -88(%edx)
+L(aligned_16_72bytes):
+ movdqa %xmm0, -72(%edx)
+L(aligned_16_56bytes):
+ movdqa %xmm0, -56(%edx)
+L(aligned_16_40bytes):
+ movdqa %xmm0, -40(%edx)
+L(aligned_16_24bytes):
+ movdqa %xmm0, -24(%edx)
+L(aligned_16_8bytes):
+ movq %xmm0, -8(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_121bytes):
+ movdqa %xmm0, -121(%edx)
+L(aligned_16_105bytes):
+ movdqa %xmm0, -105(%edx)
+L(aligned_16_89bytes):
+ movdqa %xmm0, -89(%edx)
+L(aligned_16_73bytes):
+ movdqa %xmm0, -73(%edx)
+L(aligned_16_57bytes):
+ movdqa %xmm0, -57(%edx)
+L(aligned_16_41bytes):
+ movdqa %xmm0, -41(%edx)
+L(aligned_16_25bytes):
+ movdqa %xmm0, -25(%edx)
+L(aligned_16_9bytes):
+ movq %xmm0, -9(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_122bytes):
+ movdqa %xmm0, -122(%edx)
+L(aligned_16_106bytes):
+ movdqa %xmm0, -106(%edx)
+L(aligned_16_90bytes):
+ movdqa %xmm0, -90(%edx)
+L(aligned_16_74bytes):
+ movdqa %xmm0, -74(%edx)
+L(aligned_16_58bytes):
+ movdqa %xmm0, -58(%edx)
+L(aligned_16_42bytes):
+ movdqa %xmm0, -42(%edx)
+L(aligned_16_26bytes):
+ movdqa %xmm0, -26(%edx)
+L(aligned_16_10bytes):
+ movq %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_123bytes):
+ movdqa %xmm0, -123(%edx)
+L(aligned_16_107bytes):
+ movdqa %xmm0, -107(%edx)
+L(aligned_16_91bytes):
+ movdqa %xmm0, -91(%edx)
+L(aligned_16_75bytes):
+ movdqa %xmm0, -75(%edx)
+L(aligned_16_59bytes):
+ movdqa %xmm0, -59(%edx)
+L(aligned_16_43bytes):
+ movdqa %xmm0, -43(%edx)
+L(aligned_16_27bytes):
+ movdqa %xmm0, -27(%edx)
+L(aligned_16_11bytes):
+ movq %xmm0, -11(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_124bytes):
+ movdqa %xmm0, -124(%edx)
+L(aligned_16_108bytes):
+ movdqa %xmm0, -108(%edx)
+L(aligned_16_92bytes):
+ movdqa %xmm0, -92(%edx)
+L(aligned_16_76bytes):
+ movdqa %xmm0, -76(%edx)
+L(aligned_16_60bytes):
+ movdqa %xmm0, -60(%edx)
+L(aligned_16_44bytes):
+ movdqa %xmm0, -44(%edx)
+L(aligned_16_28bytes):
+ movdqa %xmm0, -28(%edx)
+L(aligned_16_12bytes):
+ movq %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_125bytes):
+ movdqa %xmm0, -125(%edx)
+L(aligned_16_109bytes):
+ movdqa %xmm0, -109(%edx)
+L(aligned_16_93bytes):
+ movdqa %xmm0, -93(%edx)
+L(aligned_16_77bytes):
+ movdqa %xmm0, -77(%edx)
+L(aligned_16_61bytes):
+ movdqa %xmm0, -61(%edx)
+L(aligned_16_45bytes):
+ movdqa %xmm0, -45(%edx)
+L(aligned_16_29bytes):
+ movdqa %xmm0, -29(%edx)
+L(aligned_16_13bytes):
+ movq %xmm0, -13(%edx)
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_126bytes):
+ movdqa %xmm0, -126(%edx)
+L(aligned_16_110bytes):
+ movdqa %xmm0, -110(%edx)
+L(aligned_16_94bytes):
+ movdqa %xmm0, -94(%edx)
+L(aligned_16_78bytes):
+ movdqa %xmm0, -78(%edx)
+L(aligned_16_62bytes):
+ movdqa %xmm0, -62(%edx)
+L(aligned_16_46bytes):
+ movdqa %xmm0, -46(%edx)
+L(aligned_16_30bytes):
+ movdqa %xmm0, -30(%edx)
+L(aligned_16_14bytes):
+ movq %xmm0, -14(%edx)
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_127bytes):
+ movdqa %xmm0, -127(%edx)
+L(aligned_16_111bytes):
+ movdqa %xmm0, -111(%edx)
+L(aligned_16_95bytes):
+ movdqa %xmm0, -95(%edx)
+L(aligned_16_79bytes):
+ movdqa %xmm0, -79(%edx)
+L(aligned_16_63bytes):
+ movdqa %xmm0, -63(%edx)
+L(aligned_16_47bytes):
+ movdqa %xmm0, -47(%edx)
+L(aligned_16_31bytes):
+ movdqa %xmm0, -31(%edx)
+L(aligned_16_15bytes):
+ movq %xmm0, -15(%edx)
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN_END
+
+END (__memset_sse2_rep)
+
+#endif
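
The aligned_16_* blocks that close __memset_sse2_rep all follow one
pattern: %edx points one past the last byte to set, the jump table
indexes on the residual count, and each target chains movdqa stores at
descending offsets before finishing with at most one 8-, 4-, 2- and
1-byte store.  A minimal C sketch of that tail decomposition
(illustration only, not part of the patch; PAT is assumed to hold the
fill byte replicated into all eight byte lanes):

    #include <stdint.h>
    #include <string.h>

    /* Set the last N bytes (N < 16) ending at END, mirroring how e.g.
       L(aligned_16_7bytes) finishes with movl/movw/movb stores at
       -7/-3/-1(%edx).  Every byte of PAT is the fill byte, so copying
       any prefix of it stores the right pattern.  */
    static void
    set_tail (unsigned char *end, uint64_t pat, unsigned int n)
    {
      unsigned char *p = end - n;
      if (n & 8) { memcpy (p, &pat, 8); p += 8; }
      if (n & 4) { memcpy (p, &pat, 4); p += 4; }
      if (n & 2) { memcpy (p, &pat, 2); p += 2; }
      if (n & 1) *p = (unsigned char) pat;
    }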
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 0000000000..d7b8be9114
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,860 @@
+/* memset with SSE2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST PARMS
+# define LEN DEST+4
+# define SETRTNVAL
+#else
+# define DEST PARMS
+# define CHR DEST+4
+# define LEN CHR+4
+# define SETRTNVAL movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define PARMS 8 /* Preserve EBX. */
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ add $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ add (%ebx,%ecx,4), %ebx; \
+ add %ecx, %edx; \
+ /* We loaded the jump table and adjusted EDX. Go. */ \
+ jmp *%ebx
+#else
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define PARMS 4
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ add %ecx, %edx; \
+ jmp *TABLE(,%ecx,4)
+#endif
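
The two BRANCH_TO_JMPTBL_ENTRY variants above differ only in how the
table entries are encoded: with absolute addresses the dispatch is a
single indirect jmp, while the SHARED build stores each target as an
offset relative to the table itself, so the .rodata needs no run-time
relocation and the dispatcher adds the table address back in.  A rough
GNU C analogue using labels-as-values (a sketch; the assembly makes the
offsets relative to the table address, the sketch uses an anchor label,
but the idea is the same):

    #include <stddef.h>

    static void
    dispatch (unsigned int i)	/* assumes i < 3 */
    {
      /* Offsets relative to a fixed anchor; no absolute addresses
         appear in the table, so it is position-independent.  */
      static const ptrdiff_t table[3] =
        { &&case0 - &&case0, &&case1 - &&case0, &&case2 - &&case0 };
      goto *(&&case0 + table[i]);
     case2:
      ;	/* handle the third case, then fall through */
     case1:
      ;	/* handle the second case, then fall through */
     case0:
      return;
    }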
+
+ .section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2)
+#endif
+ENTRY (__memset_sse2)
+ ENTRANCE
+
+ movl LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl CHR(%esp), %eax
+ movb %al, %ah
+ /* Fill the whole EAX with pattern. */
+ movl %eax, %edx
+ shl $16, %eax
+ or %edx, %eax
+#endif
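
The CHR setup above replicates the fill byte across %eax with a
movb/shl/or sequence; later, movd plus pshufd $0 broadcasts that 32-bit
pattern into all four lanes of %xmm0.  The same replication in one line
of C (sketch):

    #include <stdint.h>

    /* 0xAB -> 0xABABABAB, matching movb %al,%ah; shl $16; or.  */
    static inline uint32_t
    splat_byte (int c)
    {
      return (uint32_t) (uint8_t) c * 0x01010101u;
    }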
+ movl DEST(%esp), %edx
+ cmp $32, %ecx
+ jae L(32bytesormore)
+
+L(write_less32bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_less_32bytes):
+ .int JMPTBL (L(write_0bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_1bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_2bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_3bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_4bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_5bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_6bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_7bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_8bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_9bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_10bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_11bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_12bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_13bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_14bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_15bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_16bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_17bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_18bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_19bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_20bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_21bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_22bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_23bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_24bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_25bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_26bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_27bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_28bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_29bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_30bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_31bytes), L(table_less_32bytes))
+ .popsection
+
+ ALIGN (4)
+L(write_28bytes):
+ movl %eax, -28(%edx)
+L(write_24bytes):
+ movl %eax, -24(%edx)
+L(write_20bytes):
+ movl %eax, -20(%edx)
+L(write_16bytes):
+ movl %eax, -16(%edx)
+L(write_12bytes):
+ movl %eax, -12(%edx)
+L(write_8bytes):
+ movl %eax, -8(%edx)
+L(write_4bytes):
+ movl %eax, -4(%edx)
+L(write_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_29bytes):
+ movl %eax, -29(%edx)
+L(write_25bytes):
+ movl %eax, -25(%edx)
+L(write_21bytes):
+ movl %eax, -21(%edx)
+L(write_17bytes):
+ movl %eax, -17(%edx)
+L(write_13bytes):
+ movl %eax, -13(%edx)
+L(write_9bytes):
+ movl %eax, -9(%edx)
+L(write_5bytes):
+ movl %eax, -5(%edx)
+L(write_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_30bytes):
+ movl %eax, -30(%edx)
+L(write_26bytes):
+ movl %eax, -26(%edx)
+L(write_22bytes):
+ movl %eax, -22(%edx)
+L(write_18bytes):
+ movl %eax, -18(%edx)
+L(write_14bytes):
+ movl %eax, -14(%edx)
+L(write_10bytes):
+ movl %eax, -10(%edx)
+L(write_6bytes):
+ movl %eax, -6(%edx)
+L(write_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_31bytes):
+ movl %eax, -31(%edx)
+L(write_27bytes):
+ movl %eax, -27(%edx)
+L(write_23bytes):
+ movl %eax, -23(%edx)
+L(write_19bytes):
+ movl %eax, -19(%edx)
+L(write_15bytes):
+ movl %eax, -15(%edx)
+L(write_11bytes):
+ movl %eax, -11(%edx)
+L(write_7bytes):
+ movl %eax, -7(%edx)
+L(write_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+/* ECX >= 32 and EDX is 4 byte aligned.  */
+L(32bytesormore):
+ /* Fill xmm0 with the pattern. */
+#ifdef USE_AS_BZERO
+ pxor %xmm0, %xmm0
+#else
+ movd %eax, %xmm0
+ pshufd $0, %xmm0, %xmm0
+#endif
+ testl $0xf, %edx
+ jz L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ and $-16, %edx
+ add $16, %edx
+ sub %edx, %eax
+ add %eax, %ecx
+ movd %xmm0, %eax
+
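L(not_aligned_16) handles a misaligned destination by storing one
unaligned 16-byte chunk at the start, rounding %edx up to the next
16-byte boundary, and shrinking %ecx by however many bytes that first
store already covered (the and/add/sub/add sequence above).  In C,
under the same assumption that at least 32 bytes remain (sketch):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Return the 16-byte-aligned continuation pointer and reduce *N
       by the bytes the unaligned head store already set.  Requires
       *n >= 32 so the head store cannot overrun.  */
    static unsigned char *
    align_head (unsigned char *p, uint32_t *n, __m128i pat)
    {
      _mm_storeu_si128 ((__m128i *) p, pat);	/* movdqu (%edx) */
      unsigned char *q =
        (unsigned char *) (((uintptr_t) p & ~(uintptr_t) 15) + 16);
      *n -= (uint32_t) (q - p);
      return q;
    }
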
+ ALIGN (4)
+L(aligned_16):
+ cmp $128, %ecx
+ jae L(128bytesormore)
+
+L(aligned_16_less128bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytesormore):
+#ifdef SHARED_CACHE_SIZE
+ PUSH (%ebx)
+ mov $SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+ PUSH (%ebx)
+ mov __x86_shared_cache_size, %ebx
+# endif
+#endif
+ cmp %ebx, %ecx
+ jae L(128bytesormore_nt_start)
+
+
+#ifdef DATA_CACHE_SIZE
+ POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+ cmp $DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+# define RESTORE_EBX_STATE
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+ POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+ cmp __x86_data_cache_size, %ecx
+# endif
+#endif
+
+ jae L(128bytes_L2_normal)
+ subl $128, %ecx
+L(128bytesormore_normal):
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jb L(128bytesless_normal)
+
+
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jae L(128bytesormore_normal)
+
+L(128bytesless_normal):
+ add $128, %ecx
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytes_L2_normal):
+ prefetcht0 0x380(%edx)
+ prefetcht0 0x3c0(%edx)
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm0, 0x10(%edx)
+ movaps %xmm0, 0x20(%edx)
+ movaps %xmm0, 0x30(%edx)
+ movaps %xmm0, 0x40(%edx)
+ movaps %xmm0, 0x50(%edx)
+ movaps %xmm0, 0x60(%edx)
+ movaps %xmm0, 0x70(%edx)
+ add $128, %edx
+ cmp $128, %ecx
+ jae L(128bytes_L2_normal)
+
+L(128bytesless_L2_normal):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ RESTORE_EBX_STATE
+L(128bytesormore_nt_start):
+ sub %ebx, %ecx
+ ALIGN (4)
+L(128bytesormore_shared_cache_loop):
+ prefetcht0 0x3c0(%edx)
+ prefetcht0 0x380(%edx)
+ sub $0x80, %ebx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ add $0x80, %edx
+ cmp $0x80, %ebx
+ jae L(128bytesormore_shared_cache_loop)
+ cmp $0x80, %ecx
+ jb L(shared_cache_loop_end)
+ ALIGN (4)
+L(128bytesormore_nt):
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm0, 0x10(%edx)
+ movntdq %xmm0, 0x20(%edx)
+ movntdq %xmm0, 0x30(%edx)
+ movntdq %xmm0, 0x40(%edx)
+ movntdq %xmm0, 0x50(%edx)
+ movntdq %xmm0, 0x60(%edx)
+ movntdq %xmm0, 0x70(%edx)
+ add $0x80, %edx
+ cmp $0x80, %ecx
+ jae L(128bytesormore_nt)
+ sfence
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+ POP (%ebx)
+#endif
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
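
The three store strategies above are chosen by size: a plain movdqa
loop while the request fits in the data cache, the same loop with
prefetcht0 up to the shared cache size, and, once the request exceeds
the shared cache, movntdq for the bytes past the first cache-sized
chunk, since filling those through the cache would only evict useful
lines; the sfence then orders the weakly-ordered streaming stores
before the tail dispatch.  A hedged intrinsics sketch of the
non-temporal inner loop (not the glibc code):

    #include <emmintrin.h>
    #include <stddef.h>

    /* P must be 16-byte aligned; the trailing n % 128 bytes are left
       to the caller, as the jump table handles them in the assembly.  */
    static size_t
    nt_fill (unsigned char *p, __m128i pat, size_t n)
    {
      for (; n >= 128; n -= 128, p += 128)
        for (int off = 0; off < 128; off += 16)
          _mm_stream_si128 ((__m128i *) (p + off), pat);
      _mm_sfence ();	/* make the streaming stores visible in order */
      return n;		/* remainder for the tail code */
    }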
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_16_128bytes):
+ .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+ .popsection
+
+ ALIGN (4)
+L(aligned_16_112bytes):
+ movdqa %xmm0, -112(%edx)
+L(aligned_16_96bytes):
+ movdqa %xmm0, -96(%edx)
+L(aligned_16_80bytes):
+ movdqa %xmm0, -80(%edx)
+L(aligned_16_64bytes):
+ movdqa %xmm0, -64(%edx)
+L(aligned_16_48bytes):
+ movdqa %xmm0, -48(%edx)
+L(aligned_16_32bytes):
+ movdqa %xmm0, -32(%edx)
+L(aligned_16_16bytes):
+ movdqa %xmm0, -16(%edx)
+L(aligned_16_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_113bytes):
+ movdqa %xmm0, -113(%edx)
+L(aligned_16_97bytes):
+ movdqa %xmm0, -97(%edx)
+L(aligned_16_81bytes):
+ movdqa %xmm0, -81(%edx)
+L(aligned_16_65bytes):
+ movdqa %xmm0, -65(%edx)
+L(aligned_16_49bytes):
+ movdqa %xmm0, -49(%edx)
+L(aligned_16_33bytes):
+ movdqa %xmm0, -33(%edx)
+L(aligned_16_17bytes):
+ movdqa %xmm0, -17(%edx)
+L(aligned_16_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_114bytes):
+ movdqa %xmm0, -114(%edx)
+L(aligned_16_98bytes):
+ movdqa %xmm0, -98(%edx)
+L(aligned_16_82bytes):
+ movdqa %xmm0, -82(%edx)
+L(aligned_16_66bytes):
+ movdqa %xmm0, -66(%edx)
+L(aligned_16_50bytes):
+ movdqa %xmm0, -50(%edx)
+L(aligned_16_34bytes):
+ movdqa %xmm0, -34(%edx)
+L(aligned_16_18bytes):
+ movdqa %xmm0, -18(%edx)
+L(aligned_16_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_115bytes):
+ movdqa %xmm0, -115(%edx)
+L(aligned_16_99bytes):
+ movdqa %xmm0, -99(%edx)
+L(aligned_16_83bytes):
+ movdqa %xmm0, -83(%edx)
+L(aligned_16_67bytes):
+ movdqa %xmm0, -67(%edx)
+L(aligned_16_51bytes):
+ movdqa %xmm0, -51(%edx)
+L(aligned_16_35bytes):
+ movdqa %xmm0, -35(%edx)
+L(aligned_16_19bytes):
+ movdqa %xmm0, -19(%edx)
+L(aligned_16_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_116bytes):
+ movdqa %xmm0, -116(%edx)
+L(aligned_16_100bytes):
+ movdqa %xmm0, -100(%edx)
+L(aligned_16_84bytes):
+ movdqa %xmm0, -84(%edx)
+L(aligned_16_68bytes):
+ movdqa %xmm0, -68(%edx)
+L(aligned_16_52bytes):
+ movdqa %xmm0, -52(%edx)
+L(aligned_16_36bytes):
+ movdqa %xmm0, -36(%edx)
+L(aligned_16_20bytes):
+ movdqa %xmm0, -20(%edx)
+L(aligned_16_4bytes):
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_117bytes):
+ movdqa %xmm0, -117(%edx)
+L(aligned_16_101bytes):
+ movdqa %xmm0, -101(%edx)
+L(aligned_16_85bytes):
+ movdqa %xmm0, -85(%edx)
+L(aligned_16_69bytes):
+ movdqa %xmm0, -69(%edx)
+L(aligned_16_53bytes):
+ movdqa %xmm0, -53(%edx)
+L(aligned_16_37bytes):
+ movdqa %xmm0, -37(%edx)
+L(aligned_16_21bytes):
+ movdqa %xmm0, -21(%edx)
+L(aligned_16_5bytes):
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_118bytes):
+ movdqa %xmm0, -118(%edx)
+L(aligned_16_102bytes):
+ movdqa %xmm0, -102(%edx)
+L(aligned_16_86bytes):
+ movdqa %xmm0, -86(%edx)
+L(aligned_16_70bytes):
+ movdqa %xmm0, -70(%edx)
+L(aligned_16_54bytes):
+ movdqa %xmm0, -54(%edx)
+L(aligned_16_38bytes):
+ movdqa %xmm0, -38(%edx)
+L(aligned_16_22bytes):
+ movdqa %xmm0, -22(%edx)
+L(aligned_16_6bytes):
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_119bytes):
+ movdqa %xmm0, -119(%edx)
+L(aligned_16_103bytes):
+ movdqa %xmm0, -103(%edx)
+L(aligned_16_87bytes):
+ movdqa %xmm0, -87(%edx)
+L(aligned_16_71bytes):
+ movdqa %xmm0, -71(%edx)
+L(aligned_16_55bytes):
+ movdqa %xmm0, -55(%edx)
+L(aligned_16_39bytes):
+ movdqa %xmm0, -39(%edx)
+L(aligned_16_23bytes):
+ movdqa %xmm0, -23(%edx)
+L(aligned_16_7bytes):
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_120bytes):
+ movdqa %xmm0, -120(%edx)
+L(aligned_16_104bytes):
+ movdqa %xmm0, -104(%edx)
+L(aligned_16_88bytes):
+ movdqa %xmm0, -88(%edx)
+L(aligned_16_72bytes):
+ movdqa %xmm0, -72(%edx)
+L(aligned_16_56bytes):
+ movdqa %xmm0, -56(%edx)
+L(aligned_16_40bytes):
+ movdqa %xmm0, -40(%edx)
+L(aligned_16_24bytes):
+ movdqa %xmm0, -24(%edx)
+L(aligned_16_8bytes):
+ movq %xmm0, -8(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_121bytes):
+ movdqa %xmm0, -121(%edx)
+L(aligned_16_105bytes):
+ movdqa %xmm0, -105(%edx)
+L(aligned_16_89bytes):
+ movdqa %xmm0, -89(%edx)
+L(aligned_16_73bytes):
+ movdqa %xmm0, -73(%edx)
+L(aligned_16_57bytes):
+ movdqa %xmm0, -57(%edx)
+L(aligned_16_41bytes):
+ movdqa %xmm0, -41(%edx)
+L(aligned_16_25bytes):
+ movdqa %xmm0, -25(%edx)
+L(aligned_16_9bytes):
+ movq %xmm0, -9(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_122bytes):
+ movdqa %xmm0, -122(%edx)
+L(aligned_16_106bytes):
+ movdqa %xmm0, -106(%edx)
+L(aligned_16_90bytes):
+ movdqa %xmm0, -90(%edx)
+L(aligned_16_74bytes):
+ movdqa %xmm0, -74(%edx)
+L(aligned_16_58bytes):
+ movdqa %xmm0, -58(%edx)
+L(aligned_16_42bytes):
+ movdqa %xmm0, -42(%edx)
+L(aligned_16_26bytes):
+ movdqa %xmm0, -26(%edx)
+L(aligned_16_10bytes):
+ movq %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_123bytes):
+ movdqa %xmm0, -123(%edx)
+L(aligned_16_107bytes):
+ movdqa %xmm0, -107(%edx)
+L(aligned_16_91bytes):
+ movdqa %xmm0, -91(%edx)
+L(aligned_16_75bytes):
+ movdqa %xmm0, -75(%edx)
+L(aligned_16_59bytes):
+ movdqa %xmm0, -59(%edx)
+L(aligned_16_43bytes):
+ movdqa %xmm0, -43(%edx)
+L(aligned_16_27bytes):
+ movdqa %xmm0, -27(%edx)
+L(aligned_16_11bytes):
+ movq %xmm0, -11(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_124bytes):
+ movdqa %xmm0, -124(%edx)
+L(aligned_16_108bytes):
+ movdqa %xmm0, -108(%edx)
+L(aligned_16_92bytes):
+ movdqa %xmm0, -92(%edx)
+L(aligned_16_76bytes):
+ movdqa %xmm0, -76(%edx)
+L(aligned_16_60bytes):
+ movdqa %xmm0, -60(%edx)
+L(aligned_16_44bytes):
+ movdqa %xmm0, -44(%edx)
+L(aligned_16_28bytes):
+ movdqa %xmm0, -28(%edx)
+L(aligned_16_12bytes):
+ movq %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_125bytes):
+ movdqa %xmm0, -125(%edx)
+L(aligned_16_109bytes):
+ movdqa %xmm0, -109(%edx)
+L(aligned_16_93bytes):
+ movdqa %xmm0, -93(%edx)
+L(aligned_16_77bytes):
+ movdqa %xmm0, -77(%edx)
+L(aligned_16_61bytes):
+ movdqa %xmm0, -61(%edx)
+L(aligned_16_45bytes):
+ movdqa %xmm0, -45(%edx)
+L(aligned_16_29bytes):
+ movdqa %xmm0, -29(%edx)
+L(aligned_16_13bytes):
+ movq %xmm0, -13(%edx)
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_126bytes):
+ movdqa %xmm0, -126(%edx)
+L(aligned_16_110bytes):
+ movdqa %xmm0, -110(%edx)
+L(aligned_16_94bytes):
+ movdqa %xmm0, -94(%edx)
+L(aligned_16_78bytes):
+ movdqa %xmm0, -78(%edx)
+L(aligned_16_62bytes):
+ movdqa %xmm0, -62(%edx)
+L(aligned_16_46bytes):
+ movdqa %xmm0, -46(%edx)
+L(aligned_16_30bytes):
+ movdqa %xmm0, -30(%edx)
+L(aligned_16_14bytes):
+ movq %xmm0, -14(%edx)
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_127bytes):
+ movdqa %xmm0, -127(%edx)
+L(aligned_16_111bytes):
+ movdqa %xmm0, -111(%edx)
+L(aligned_16_95bytes):
+ movdqa %xmm0, -95(%edx)
+L(aligned_16_79bytes):
+ movdqa %xmm0, -79(%edx)
+L(aligned_16_63bytes):
+ movdqa %xmm0, -63(%edx)
+L(aligned_16_47bytes):
+ movdqa %xmm0, -47(%edx)
+L(aligned_16_31bytes):
+ movdqa %xmm0, -31(%edx)
+L(aligned_16_15bytes):
+ movq %xmm0, -15(%edx)
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN_END
+
+END (__memset_sse2)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 0000000000..f601663a9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,75 @@
+/* Multiple versions of memset
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(memset)
+ .type memset, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_sse2)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_sse2_rep)
+2: ret
+END(memset)
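
The gnu_indirect_function body above runs once, at relocation time,
and leaves the chosen implementation's address in %eax.  Roughly the
same selection in C (a sketch: __builtin_cpu_supports is a real GCC
builtin, but Fast_Rep_String is a glibc-internal feature bit, so the
hypothetical fast_rep_string_p() stands in for HAS_ARCH_FEATURE here):

    #include <stddef.h>

    typedef void *(*memset_fn) (void *, int, size_t);

    extern void *__memset_ia32 (void *, int, size_t);
    extern void *__memset_sse2 (void *, int, size_t);
    extern void *__memset_sse2_rep (void *, int, size_t);
    extern int fast_rep_string_p (void);	/* hypothetical */

    static memset_fn
    pick_memset (void)
    {
      if (!__builtin_cpu_supports ("sse2"))
        return __memset_ia32;
      return fast_rep_string_p () ? __memset_sse2_rep : __memset_sse2;
    }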
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memset_ia32, @function; \
+ .globl __memset_ia32; \
+ .p2align 4; \
+ __memset_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memset_chk_ia32, @function; \
+ .globl __memset_chk_ia32; \
+ .p2align 4; \
+ __memset_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 0000000000..573cf4208a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,82 @@
+/* Multiple versions of __memset_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(__memset_chk)
+ .type __memset_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_chk_sse2)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)
+2: ret
+END(__memset_chk)
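
Every __memset_chk_* stub in this file performs the same check before
tailing into its memset: the fortified call site passes the
compiler-known destination size as a fourth argument, and the call
aborts if the request exceeds it.  In C terms (a sketch; __chk_fail is
the real glibc symbol the assembly jumps to):

    #include <string.h>

    extern void __chk_fail (void) __attribute__ ((__noreturn__));

    static void *
    memset_chk_sketch (void *dst, int c, size_t n, size_t dstlen)
    {
      if (dstlen < n)	/* cmpl %eax, 16(%esp); jb __chk_fail */
        __chk_fail ();
      return memset (dst, c, n);
    }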
+
+# ifdef SHARED
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+ .section .gnu.warning.__memset_zero_constant_len_parameter
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+ .text
+ .type __memset_chk_sse2, @function
+ .p2align 4;
+__memset_chk_sse2:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_sse2
+ cfi_endproc
+ .size __memset_chk_sse2, .-__memset_chk_sse2
+
+ .type __memset_chk_sse2_rep, @function
+ .p2align 4;
+__memset_chk_sse2_rep:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_sse2_rep
+ cfi_endproc
+ .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+ .type __memset_chk_ia32, @function
+ .p2align 4;
+__memset_chk_ia32:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_ia32
+ cfi_endproc
+ .size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 0000000000..88c0e5776c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2_bsf
+#include "memchr-sse2-bsf.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000000..038c74896b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2
+#include "memchr-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 0000000000..0a41d63ee8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of rawmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__rawmemchr)
+ .type __rawmemchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 3f
+
+ LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)
+ ret
+
+2: LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)
+ ret
+
+3: LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)
+ ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
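
The Slow_BSF split above exists because the SSE2 search kernel ends in
a bit scan: compare 16 bytes at once, extract a match mask, and bsf the
mask for the byte index; on CPUs where bsf is slow, the plain _sse2
variant is picked instead, keeping the scan out of the hot path.  The
kernel with intrinsics (a sketch; like rawmemchr it assumes the byte is
known to occur, and that P is 16-byte aligned so whole-vector loads
cannot fault before the match):

    #include <emmintrin.h>

    static const unsigned char *
    find_byte (const unsigned char *p, int c)
    {
      __m128i pat = _mm_set1_epi8 ((char) c);
      for (;; p += 16)
        {
          __m128i v = _mm_load_si128 ((const __m128i *) p);
          int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, pat));
          if (mask != 0)
            return p + __builtin_ctz (mask);	/* the bsf step */
        }
    }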
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __rawmemchr_ia32, @function; \
+ .globl __rawmemchr_ia32; \
+ .p2align 4; \
+ __rawmemchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_def(name) \
+ .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 0000000000..1aa5440644
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..2e9619f97c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fma.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+double
+__fma_fma (double x, double y, double z)
+{
+ asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+ return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..411ebb2ba9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fma.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma,
+ HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+#define __fma __fma_ia32
+
+#include <sysdeps/ieee754/ldbl-96/s_fma.c>
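
libm_ifunc resolves __fma the same way the assembly dispatchers do,
only from C; the vfmadd213sd in __fma_fma computes x*y + z in a single
rounding, which is exactly C's fma(), while __fma_ia32 falls back to
the generic ldbl-96 implementation included above.  A rough rendering
of the selection (a sketch; FMA_Usable is a glibc-internal feature bit,
approximated here by GCC's "fma" feature test):

    typedef double (*fma_fn) (double, double, double);

    extern double __fma_ia32 (double, double, double);
    extern double __fma_fma (double, double, double);

    static fma_fn
    pick_fma (void)
    {
      return __builtin_cpu_supports ("fma") ? __fma_fma : __fma_ia32;
    }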
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..ee57abfda2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fmaf.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+float
+__fmaf_fma (float x, float y, float z)
+{
+ asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+ return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..00b0fbcfc5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fmaf.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf,
+ HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+#define __fmaf __fmaf_ia32
+
+#include <sysdeps/ieee754/dbl-64/s_fmaf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..7db31b02f8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000000..46ca1b3074
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000000..ee81ab6ae3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,9 @@
+/* Multiple versions of stpcpy
+ All versions must be listed in ifunc-impl-list.c. */
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000000..37a703cb76
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000000..2698ca6a8c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,8 @@
+/* Multiple versions of stpncpy
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
new file mode 100644
index 0000000000..753c6ec84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
@@ -0,0 +1,12 @@
+#include <string.h>
+
+extern __typeof (strcasecmp) __strcasecmp_nonascii;
+
+#define __strcasecmp __strcasecmp_nonascii
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32)
+
+/* The needs of strcasecmp in libc are minimal; there is no need to
+   go through the IFUNC.  */
+strong_alias (__strcasecmp_nonascii, __GI___strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
new file mode 100644
index 0000000000..ec59276408
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strcasecmp.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY(__strcasecmp)
+ .type __strcasecmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_SSE4_2)
+ jnz 2f
+ LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)
+2: ret
+END(__strcasecmp)
+
+weak_alias (__strcasecmp, strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
new file mode 100644
index 0000000000..d4fcd2b4a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii;
+
+#define __strcasecmp_l __strcasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL 1
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32)
+
+/* The needs of strcasecmp_l in libc are minimal; there is no need to
+   go through the IFUNC.  */
+strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
new file mode 100644
index 0000000000..411d4153f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000000..a22b93c518
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000000..711c09b0dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strcasecmp_l
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCMP __strcasecmp_l
+#define USE_AS_STRCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
new file mode 100644
index 0000000000..6359c7330c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
@@ -0,0 +1,1245 @@
+/* strcat with SSE2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into ECX. */ \
+ SETUP_PIC_REG(cx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ecx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ecx,INDEX,SCALE), %ecx; \
+ /* We loaded the jump table and adjusted ECX. Go. */ \
+ jmp *%ecx
+# else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+# ifndef STRCAT
+# define STRCAT __strcat_sse2
+# endif
+
+# define PARMS 4
+# define STR1 PARMS+4
+# define STR2 STR1+4
+
+# ifdef USE_AS_STRNCAT
+# define LEN STR2+8
+# define STR3 STR1+4
+# else
+# define STR3 STR1
+# endif
+
+# define USE_AS_STRCAT
+# ifdef USE_AS_STRNCAT
+# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);
+# else
+# define RETURN POP(%esi); ret; CFI_PUSH(%esi);
+# endif
+
+.text
+ENTRY (STRCAT)
+ PUSH (%esi)
+ mov STR1(%esp), %eax
+ mov STR2(%esp), %esi
+# ifdef USE_AS_STRNCAT
+ PUSH (%ebx)
+ movl LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(ExitZero)
+# endif
+ cmpb $0, (%esi)
+ mov %esi, %ecx
+ mov %eax, %edx
+ jz L(ExitZero)
+
+ and $63, %ecx
+ and $63, %edx
+ cmp $32, %ecx
+ ja L(StrlenCore7_1)
+ cmp $48, %edx
+ ja L(alignment_prolog)
+
+ pxor %xmm0, %xmm0
+ pxor %xmm4, %xmm4
+ pxor %xmm7, %xmm7
+ movdqu (%eax), %xmm1
+ movdqu (%esi), %xmm5
+ pcmpeqb %xmm1, %xmm0
+ movdqu 16(%esi), %xmm6
+ pmovmskb %xmm0, %ecx
+ pcmpeqb %xmm5, %xmm4
+ pcmpeqb %xmm6, %xmm7
+ test %ecx, %ecx
+ jnz L(exit_less16_)
+ mov %eax, %ecx
+ and $-16, %eax
+ jmp L(loop_prolog)
+
+L(alignment_prolog):
+ pxor %xmm0, %xmm0
+ pxor %xmm4, %xmm4
+ mov %edx, %ecx
+ pxor %xmm7, %xmm7
+ and $15, %ecx
+ and $-16, %eax
+ pcmpeqb (%eax), %xmm0
+ movdqu (%esi), %xmm5
+ movdqu 16(%esi), %xmm6
+ pmovmskb %xmm0, %edx
+ pcmpeqb %xmm5, %xmm4
+ shr %cl, %edx
+ pcmpeqb %xmm6, %xmm7
+ test %edx, %edx
+ jnz L(exit_less16)
+ add %eax, %ecx
+
+ pxor %xmm0, %xmm0
+L(loop_prolog):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ .p2align 4
+L(align16_loop):
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(align16_loop)
+ bsf %edx, %edx
+ add %edx, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit16):
+ bsf %edx, %edx
+ lea 16(%eax, %edx), %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit32):
+ bsf %edx, %edx
+ lea 32(%eax, %edx), %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit48):
+ bsf %edx, %edx
+ lea 48(%eax, %edx), %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_less16):
+ bsf %edx, %edx
+ add %ecx, %eax
+ add %edx, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_less16_):
+ bsf %ecx, %ecx
+ add %ecx, %eax
+
+ .p2align 4
+L(StartStrcpyPart):
+ pmovmskb %xmm4, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ movdqu %xmm5, (%eax)
+ pmovmskb %xmm7, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes1)
+
+ mov %esi, %ecx
+ and $-16, %esi
+ and $15, %ecx
+ pxor %xmm0, %xmm0
+# ifdef USE_AS_STRNCAT
+ add %ecx, %ebx
+ sbb %edx, %edx
+ or %edx, %ebx
+# endif
+ sub %ecx, %eax
+ jmp L(Unalign16Both)
+
+L(StrlenCore7_1):
+ mov %eax, %ecx
+ pxor %xmm0, %xmm0
+ and $15, %ecx
+ and $-16, %eax
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ shr %cl, %edx
+ test %edx, %edx
+ jnz L(exit_less16_1)
+ add %eax, %ecx
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+
+ .p2align 4
+L(align16_loop_1):
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16_1)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32_1)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48_1)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(align16_loop_1)
+ bsf %edx, %edx
+ add %edx, %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit16_1):
+ bsf %edx, %edx
+ lea 16(%eax, %edx), %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit32_1):
+ bsf %edx, %edx
+ lea 32(%eax, %edx), %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit48_1):
+ bsf %edx, %edx
+ lea 48(%eax, %edx), %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit_less16_1):
+ bsf %edx, %edx
+ add %ecx, %eax
+ add %edx, %eax
+
+ .p2align 4
+L(StartStrcpyPart_1):
+ mov %esi, %ecx
+ and $15, %ecx
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+# ifdef USE_AS_STRNCAT
+ cmp $48, %ebx
+ ja L(BigN)
+# endif
+ pcmpeqb (%esi), %xmm1
+# ifdef USE_AS_STRNCAT
+ add %ecx, %ebx
+# endif
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%esi), %xmm0
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%eax)
+ sub %ecx, %eax
+
+ .p2align 4
+L(Unalign16Both):
+ mov $16, %ecx
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%eax, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+L(Unalign16BothBigN):
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%eax, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm4
+ movdqu %xmm3, (%eax, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm1
+ movdqu %xmm4, (%eax, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%eax, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%eax, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm3, (%eax, %ecx)
+ mov %esi, %edx
+ lea 16(%esi, %ecx), %esi
+ and $-0x40, %esi
+ sub %esi, %edx
+ sub %edx, %eax
+# ifdef USE_AS_STRNCAT
+ lea 128(%ebx, %edx), %ebx
+# endif
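+ /* Check 64 bytes per iteration with one compare: pminub folds the
+    four 16-byte vectors together, and the folded minimum contains a
+    zero byte iff at least one input did.  */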
+ movaps (%esi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%esi), %xmm5
+ movaps 32(%esi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(Unaligned64Leave)
+
+ .p2align 4
+L(Unaligned64Loop_start):
+ add $64, %eax
+ add $64, %esi
+ movdqu %xmm4, -64(%eax)
+ movaps (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%eax)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movdqu %xmm6, -32(%eax)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%eax)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %edx, %edx
+ jz L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %ecx, %edx
+ movdqu %xmm4, (%eax)
+ movdqu %xmm5, 16(%eax)
+ movdqu %xmm6, 32(%eax)
+ add $48, %esi
+ add $48, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+ .p2align 4
+L(BigN):
+ pcmpeqb (%esi), %xmm1
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%esi), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%eax)
+ sub %ecx, %eax
+ sub $48, %ebx
+ add %ecx, %ebx
+
+ mov $16, %ecx
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%eax, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+ jmp L(Unalign16BothBigN)
+# endif
+
+/*------------end of main part-------------------------------*/
+
+/* Case1 */
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %ecx, %eax
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %esi
+ add $16, %eax
+L(CopyFrom1To16BytesTail1):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ bsf %edx, %edx
+ add %ecx, %esi
+ add $16, %edx
+ sub %ecx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %ecx, %edx
+ movdqu %xmm4, (%eax)
+ add $16, %esi
+ add $16, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %edx, %edx
+ movdqu %xmm4, (%eax)
+ movdqu %xmm5, 16(%eax)
+ add $32, %esi
+ add $32, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %ecx, %eax
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ add $16, %edx
+ sub %ecx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %ecx, %eax
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %eax
+ add $16, %esi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+# endif
+
+# ifdef USE_AS_STRNCAT
+ .p2align 4
+L(StrncatExit0):
+ movb %bh, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+# endif
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit1):
+ movb %bh, 1(%eax)
+# endif
+L(Exit1):
+# ifdef USE_AS_STRNCAT
+ movb (%esi), %dh
+# endif
+ movb %dh, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit2):
+ movb %bh, 2(%eax)
+# endif
+L(Exit2):
+ movw (%esi), %dx
+ movw %dx, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit3):
+ movb %bh, 3(%eax)
+# endif
+L(Exit3):
+ movw (%esi), %cx
+ movw %cx, (%eax)
+# ifdef USE_AS_STRNCAT
+ movb 2(%esi), %dh
+# endif
+ movb %dh, 2(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit4):
+ movb %bh, 4(%eax)
+# endif
+L(Exit4):
+ movl (%esi), %edx
+ movl %edx, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit5):
+ movb %bh, 5(%eax)
+# endif
+L(Exit5):
+ movl (%esi), %ecx
+# ifdef USE_AS_STRNCAT
+ movb 4(%esi), %dh
+# endif
+ movb %dh, 4(%eax)
+ movl %ecx, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit6):
+ movb %bh, 6(%eax)
+# endif
+L(Exit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%eax)
+ movw %dx, 4(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit7):
+ movb %bh, 7(%eax)
+# endif
+L(Exit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%eax)
+ movl %edx, 3(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit8):
+ movb %bh, 8(%eax)
+# endif
+L(Exit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit9):
+ movb %bh, 9(%eax)
+# endif
+L(Exit9):
+ movlpd (%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+ movb 8(%esi), %dh
+# endif
+ movb %dh, 8(%eax)
+ movlpd %xmm0, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit10):
+ movb %bh, 10(%eax)
+# endif
+L(Exit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%eax)
+ movw %dx, 8(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit11):
+ movb %bh, 11(%eax)
+# endif
+L(Exit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%eax)
+ movl %edx, 7(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit12):
+ movb %bh, 12(%eax)
+# endif
+L(Exit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%eax)
+ movl %edx, 8(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit13):
+ movb %bh, 13(%eax)
+# endif
+L(Exit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%eax)
+ movlpd %xmm1, 5(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit14):
+ movb %bh, 14(%eax)
+# endif
+L(Exit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%eax)
+ movlpd %xmm1, 6(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit15):
+ movb %bh, 15(%eax)
+# endif
+L(Exit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%eax)
+ movlpd %xmm1, 7(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit16):
+ movb %bh, 16(%eax)
+# endif
+L(Exit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit17):
+ movb %bh, 17(%eax)
+# endif
+L(Exit17):
+ movdqu (%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+ movb 16(%esi), %dh
+# endif
+ movdqu %xmm0, (%eax)
+ movb %dh, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit18):
+ movb %bh, 18(%eax)
+# endif
+L(Exit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%eax)
+ movw %cx, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit19):
+ movb %bh, 19(%eax)
+# endif
+L(Exit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movl %ecx, 15(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit20):
+ movb %bh, 20(%eax)
+# endif
+L(Exit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movl %ecx, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit21):
+ movb %bh, 21(%eax)
+# endif
+L(Exit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+ movb 20(%esi), %dh
+# endif
+ movdqu %xmm0, (%eax)
+ movl %ecx, 16(%eax)
+ movb %dh, 20(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit22):
+ movb %bh, 22(%eax)
+# endif
+L(Exit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%eax)
+ movlpd %xmm3, 14(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit23):
+ movb %bh, 23(%eax)
+# endif
+L(Exit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%eax)
+ movlpd %xmm3, 15(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit24):
+ movb %bh, 24(%eax)
+# endif
+L(Exit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit25):
+ movb %bh, 25(%eax)
+# endif
+L(Exit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+# ifdef USE_AS_STRNCAT
+ movb 24(%esi), %dh
+# endif
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movb %dh, 24(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit26):
+ movb %bh, 26(%eax)
+# endif
+L(Exit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movw %cx, 24(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit27):
+ movb %bh, 27(%eax)
+# endif
+L(Exit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movl %ecx, 23(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit28):
+ movb %bh, 28(%eax)
+# endif
+L(Exit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movl %ecx, 24(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit29):
+ movb %bh, 29(%eax)
+# endif
+L(Exit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 13(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit30):
+ movb %bh, 30(%eax)
+# endif
+L(Exit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 14(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit31):
+ movb %bh, 31(%eax)
+# endif
+L(Exit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 15(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit32):
+ movb %bh, 32(%eax)
+# endif
+L(Exit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
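Each L(ExitN) block above finishes an N-byte copy with at most two loads and two stores by letting the accesses overlap: L(Exit7), for instance, is a 4-byte load at offset 0 plus a 4-byte load at offset 3. The same trick in C, a sketch only (memcpy stands in for the unaligned mov/movlpd accesses; the helper name is illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Copy exactly 7 bytes with two overlapping 4-byte accesses,
       as L(Exit7) does.  */
    static void
    copy7 (char *dst, const char *src)
    {
      uint32_t lo, hi;
      memcpy (&lo, src, 4);      /* bytes 0..3 */
      memcpy (&hi, src + 3, 4);  /* bytes 3..6, overlapping byte 3 */
      memcpy (dst, &lo, 4);
      memcpy (dst + 3, &hi, 4);
    }
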
+# ifdef USE_AS_STRNCAT
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %edx, %edx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%ebx), %ecx
+ and $-16, %ecx
+ add $48, %ebx
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%eax)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%eax)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%eax)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%eax)
+ xor %bh, %bh
+ movb %bh, 64(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %ecx, %ecx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm4, (%eax)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm5, 16(%eax)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm6, 32(%eax)
+ lea 16(%eax, %ecx), %eax
+ lea 16(%esi, %ecx), %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+# endif
+ .p2align 4
+L(ExitZero):
+ RETURN
+
+END (STRCAT)
+
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCAT
+L(ExitStrncatTable):
+ .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
+# endif
+#endif
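
The file above is built around a single 16-byte search kernel: pcmpeqb compares a vector against zero, pmovmskb compresses the result to a bitmask, and bsf locates the first NUL. A sketch of that kernel with SSE2 intrinsics (illustrative name; unlike the assembly, which first aligns the pointer so its loads can never cross into an unmapped page, this version uses unaligned loads throughout):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Offset of the first NUL byte, scanning 16 bytes per iteration.  */
    static size_t
    first_nul (const char *p)
    {
      for (size_t i = 0; ; i += 16)
        {
          __m128i v = _mm_loadu_si128 ((const __m128i *) (p + i));
          int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
          if (m != 0)
            return i + __builtin_ctz (m);  /* the bsf step */
        }
    }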
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..59ffbc60a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
@@ -0,0 +1,572 @@
+/* strcat with SSSE3
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCAT
+# define STRCAT __strcat_ssse3
+# endif
+
+# define PARMS 4
+# define STR1 PARMS+4
+# define STR2 STR1+4
+
+# ifdef USE_AS_STRNCAT
+# define LEN STR2+8
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+ PUSH (%edi)
+ mov STR1(%esp), %edi
+ mov %edi, %edx
+
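+ /* Splice in strlen: redefining RETURN turns its epilogue into a jump
+    to the copy phase, so the included body computes EAX = strlen (STR1)
+    and "returns" by falling through to L(StartStrcpyPart).  */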
+# define RETURN jmp L(StartStrcpyPart)
+# include "strlen-sse2.S"
+
+L(StartStrcpyPart):
+ mov STR2(%esp), %ecx
+ lea (%edi, %eax), %edx
+# ifdef USE_AS_STRNCAT
+ PUSH (%ebx)
+ mov LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(StrncatExit0)
+ cmp $8, %ebx
+ jbe L(StrncatExit8Bytes)
+# endif
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmpb $0, 7(%ecx)
+ jz L(Exit8)
+ cmpb $0, 8(%ecx)
+ jz L(Exit9)
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ jb L(StrncatExit15Bytes)
+# endif
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmpb $0, 14(%ecx)
+ jz L(Exit15)
+ cmpb $0, 15(%ecx)
+ jz L(Exit16)
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ je L(StrncatExit16)
+
+# define RETURN1 \
+ POP (%ebx); \
+ POP (%edi); \
+ ret; \
+ CFI_PUSH (%ebx); \
+ CFI_PUSH (%edi)
+# define USE_AS_STRNCPY
+# else
+# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+# include "strcpy-ssse3.S"
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit1):
+ movb %bh, 1(%edx)
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit2):
+ movb %bh, 2(%edx)
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit3):
+ movb %bh, 3(%edx)
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit4):
+ movb %bh, 4(%edx)
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit5):
+ movb %bh, 5(%edx)
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit6):
+ movb %bh, 6(%edx)
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit7):
+ movb %bh, 7(%edx)
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit8):
+ movb %bh, 8(%edx)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit9):
+ movb %bh, 9(%edx)
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit10):
+ movb %bh, 10(%edx)
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit11):
+ movb %bh, 11(%edx)
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit12):
+ movb %bh, 12(%edx)
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit13):
+ movb %bh, 13(%edx)
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit14):
+ movb %bh, 14(%edx)
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit15):
+ movb %bh, 15(%edx)
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit16):
+ movb %bh, 16(%edx)
+L(Exit16):
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ lea (%esi, %edx), %esi
+ lea -9(%ebx), %edx
+ and $1<<7, %dh
+ or %al, %dh
+ test %dh, %dh
+ lea (%esi), %edx
+ POP (%esi)
+ jz L(ExitHighCase2)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ xor %cl, %cl
+ movb %cl, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHighCase2):
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(StrncatExit15)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHighCase3)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb %bh, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHighCase3):
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ cmp $15, %ebx
+ je L(StrncatExit15)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+ movb %bh, 16(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit0):
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit15Bytes):
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ lea 14(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ movb %bh, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit8Bytes):
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ movb %bh, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+# endif
+END (STRCAT)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
new file mode 100644
index 0000000000..8412cb6f23
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
@@ -0,0 +1,92 @@
+/* Multiple versions of strcat
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+# define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3 __strncat_ssse3
+# define STRCAT_SSE2 __strncat_sse2
+# define STRCAT_IA32 __strncat_ia32
+# define __GI_STRCAT __GI_strncat
+#else
+# define STRCAT_SSSE3 __strcat_ssse3
+# define STRCAT_SSE2 __strcat_sse2
+# define STRCAT_IA32 __strcat_ia32
+# define __GI_STRCAT __GI_strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+ define multiple versions for strncat in the static library, since we
+ need strncat before initialization has happened.  */
+#if IS_IN (libc)
+
+ .text
+ENTRY(STRCAT)
+ .type STRCAT, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (STRCAT_IA32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
+2: ret
+END(STRCAT)
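
ENTRY(STRCAT) above is not strcat itself but an IFUNC resolver: the dynamic linker calls it once at relocation time, and whatever address it returns in EAX becomes the target of every later strcat call. Roughly the same dispatch written with GCC's ifunc attribute, as a sketch (note the real resolver also prefers the plain SSE2 version when the CPU has Fast_Unaligned_Load, a glibc-internal predicate with no __builtin_cpu_supports equivalent):

    typedef char *strcat_fn (char *, const char *);
    extern strcat_fn __strcat_ia32, __strcat_sse2, __strcat_ssse3;

    static strcat_fn *
    strcat_resolver (void)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_supports ("ssse3"))
        return __strcat_ssse3;
      if (__builtin_cpu_supports ("sse2"))
        return __strcat_sse2;
      return __strcat_ia32;
    }

    char *strcat (char *, const char *)
      __attribute__ ((ifunc ("strcat_resolver")));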
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCAT_IA32, @function; \
+ .align 16; \
+ .globl STRCAT_IA32; \
+ .hidden STRCAT_IA32; \
+ STRCAT_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+ The speedup we get from using SSSE3 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
+
+# endif
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../../strcat.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..95fd7c084e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,158 @@
+/* strchr with SSE2, using bsf
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH(%edi)
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ .text
+ENTRY (__strchr_sse2_bsf)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $15, %ecx
+ pshufd $0, %xmm1, %xmm1
+ je L(loop)
+
+/* Handle unaligned string. */
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ je L(unaligned_no_match)
+ /* Check which byte is a match. */
+ bsf %eax, %eax
+ /* Is there a NULL? */
+ test %edx, %edx
+ je L(unaligned_match)
+ bsf %edx, %edx
+ cmpl %edx, %eax
+ /* Return NULL if NULL comes first. */
+ ja L(return_null)
+L(unaligned_match):
+ add %edi, %eax
+ add %ecx, %eax
+ RETURN
+
+ .p2align 4
+L(unaligned_no_match):
+ test %edx, %edx
+ jne L(return_null)
+ pxor %xmm2, %xmm2
+
+ add $16, %edi
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ jmp L(loop)
+
+L(matches):
+ pmovmskb %xmm2, %edx
+ test %eax, %eax
+ jz L(return_null)
+ bsf %eax, %eax
+ /* There is a match. First find where NULL is. */
+ test %edx, %edx
+ je L(match)
+ bsf %edx, %ecx
+ /* Check if NULL comes first. */
+ cmpl %ecx, %eax
+ ja L(return_null)
+L(match):
+ sub $16, %edi
+ add %edi, %eax
+ RETURN
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+END (__strchr_sse2_bsf)
+#endif
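
The loop above keeps two pcmpeqb masks per 16 bytes, one for the target character and one for NUL, and a match only counts if its first set bit comes no later than the first NUL bit (equality covers strchr (s, '\0'), which must return the terminator). The same logic with SSE2 intrinsics, a sketch that omits the alignment prologue (the `sarl %cl' dance above is what makes loads near a page end safe):

    #include <emmintrin.h>

    static const char *
    strchr_sketch (const char *s, int c)   /* illustrative name */
    {
      const __m128i needle = _mm_set1_epi8 ((char) c);
      for (;; s += 16)
        {
          __m128i v = _mm_loadu_si128 ((const __m128i *) s);
          unsigned hit = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, needle));
          unsigned nul = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
          if ((hit | nul) == 0)
            continue;
          if (hit != 0 && (nul == 0 || __builtin_ctz (hit) <= __builtin_ctz (nul)))
            return s + __builtin_ctz (hit);
          return 0;   /* NUL came first: no match */
        }
    }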
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
new file mode 100644
index 0000000000..1f9e875b04
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -0,0 +1,348 @@
+/* strchr with SSE2, without bsf
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH(%edi)
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__strchr_sse2)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $15, %ecx
+ pshufd $0, %xmm1, %xmm1
+ je L(loop)
+
+/* Handle unaligned string. */
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ jz L(unaligned_no_match)
+ /* Check which byte is a match. */
+ /* Is there a NULL? */
+ add %ecx, %edi
+ test %edx, %edx
+ jz L(match_case1)
+ jmp L(match_case2)
+
+ .p2align 4
+L(unaligned_no_match):
+ test %edx, %edx
+ jne L(return_null)
+
+ pxor %xmm2, %xmm2
+ add $16, %edi
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+ jmp L(loop)
+
+L(matches):
+ /* There is a match. First find where NULL is. */
+ test %edx, %edx
+ jz L(match_case1)
+
+ .p2align 4
+L(match_case2):
+ test %al, %al
+ jz L(match_high_case2)
+
+ mov %al, %cl
+ and $15, %cl
+ jnz L(match_case2_4)
+
+ mov %dl, %ch
+ and $15, %ch
+ jnz L(return_null)
+
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x10, %dl
+ jnz L(return_null)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x20, %dl
+ jnz L(return_null)
+ test $0x40, %al
+ jnz L(Exit7)
+ test $0x40, %dl
+ jnz L(return_null)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_4):
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x01, %dl
+ jnz L(return_null)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x02, %dl
+ jnz L(return_null)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x04, %dl
+ jnz L(return_null)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_high_case2):
+ test %dl, %dl
+ jnz L(return_null)
+
+ mov %ah, %cl
+ and $15, %cl
+ jnz L(match_case2_12)
+
+ mov %dh, %ch
+ and $15, %ch
+ jnz L(return_null)
+
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x10, %dh
+ jnz L(return_null)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x20, %dh
+ jnz L(return_null)
+ test $0x40, %ah
+ jnz L(Exit15)
+ test $0x40, %dh
+ jnz L(return_null)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_12):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x01, %dh
+ jnz L(return_null)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x02, %dh
+ jnz L(return_null)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x04, %dh
+ jnz L(return_null)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1):
+ test %al, %al
+ jz L(match_high_case1)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_high_case1):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ lea (%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ lea 14(%edi), %eax
+ RETURN
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+END (__strchr_sse2)
+#endif
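
This bsf-free variant exists because bit-scan instructions are slow on Atom (hence atom_text_section, and the Slow_BSF check in the strchr.S dispatcher below): instead of one bsf it walks the pmovmskb mask through the L(match_*) / L(ExitN) test ladders, one bit per branch. The scalar equivalent, sketched in C:

    /* Lowest set bit of a nonzero 16-bit mask without a bit-scan
       instruction, one test per bit as the ladders above do.  */
    static int
    first_set_bit (unsigned mask)
    {
      for (int i = 0; i < 16; i++)
        if (mask & (1u << i))
          return i;
      return -1;   /* unreachable for nonzero mask */
    }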
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
new file mode 100644
index 0000000000..5b97b1c767
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(strchr)
+ .type strchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strchr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strchr_sse2)
+2: ret
+END(strchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strchr_ia32, @function; \
+ .globl __strchr_ia32; \
+ .p2align 4; \
+ __strchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strchr_ia32, .-__strchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+ they would be called without setting up EBX, which the PLT used by
+ IFUNC requires.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strchr; __GI_strchr = __strchr_ia32
+#endif
+
+#include "../../i586/strchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..cd26058671
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,804 @@
+/* strcmp with SSE4.2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+# define STRCMP __strncmp_sse4_2
+# endif
+# define STR1 8
+# define STR2 STR1+4
+# define CNT STR2+4
+# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define REM %ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strcasecmp_l_sse4_2
+# endif
+# ifdef PIC
+# define STR1 12
+# else
+# define STR1 8
+# endif
+# define STR2 STR1+4
+# define LOCALE 12 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (%edi); POP (%ebx); ret; \
+ .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+# define RETURN POP (%edi); ret; .p2align 4; CFI_PUSH (%edi)
+# endif
+# define NONASCII __strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strncasecmp_l_sse4_2
+# endif
+# ifdef PIC
+# define STR1 16
+# else
+# define STR1 12
+# endif
+# define STR2 STR1+4
+# define CNT STR2+4
+# define LOCALE 16 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (%edi); POP (REM); POP (%ebx); ret; \
+ .p2align 4; \
+ CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi)
+# else
+# define RETURN POP (%edi); POP (REM); ret; \
+ .p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi)
+# endif
+# define REM %ebp
+# define NONASCII __strncasecmp_nonascii
+#else
+# ifndef STRCMP
+# define STRCMP __strcmp_sse4_2
+# endif
+# define STR1 4
+# define STR2 STR1+4
+# define RETURN ret; .p2align 4
+#endif
+
+ .section .text.sse4.2,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_sse4_2)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strcasecmp_nonascii
+# else
+ jne __strcasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strcasecmp_sse4_2)
+#endif
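
The stub above fetches the thread's locale from TLS and, when its LC_CTYPE data flags the charset as needing non-ASCII case rules, tails off to __strcasecmp_nonascii; only pure-ASCII locales fall through to the vector code at L(ascii). In rough C (every helper below is hypothetical, standing in for the %gs loads and locale-structure offsets in the assembly):

    #include <locale.h>

    /* Hypothetical stand-ins for the assembly-level accesses.  */
    extern locale_t get_thread_locale (void);
    extern int ctype_nonascii_case (locale_t);
    extern int __strcasecmp_nonascii (const char *, const char *);
    extern int ascii_simd_compare (const char *, const char *);

    int
    strcasecmp_sse4_2_sketch (const char *a, const char *b)
    {
      if (ctype_nonascii_case (get_thread_locale ()))
        return __strcasecmp_nonascii (a, b);  /* generic slow path */
      return ascii_simd_compare (a, b);       /* the L(ascii) fast path */
    }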
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_sse4_2)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strncasecmp_nonascii
+# else
+ jne __strncasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strncasecmp_sse4_2)
+#endif
+
+ ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movl LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+ jne NONASCII
+
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+.Lbelowupper:
+ .quad 0x4040404040404040
+ .quad 0x4040404040404040
+.Ltopupper:
+ .quad 0x5b5b5b5b5b5b5b5b
+ .quad 0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+# ifdef PIC
+# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+# define UCLOW_reg .Lbelowupper
+# define UCHIGH_reg .Ltopupper
+# define LCQWORD_reg .Ltouppermask
+# endif
+#endif
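
The three 16-byte constants just defined feed the TOLOWER macro below: a byte is ASCII uppercase exactly when it is (signed) greater than 0x40 ('A' - 1) and 0x5b ('Z' + 1) is greater than it, and precisely those bytes get 0x20 ORed in. Since pcmpgtb compares signed bytes, values 0x80 and above are negative and never qualify, which is why this path runs only after the non-ASCII locale check. The same fold with SSE2 intrinsics (sketch; function name illustrative):

    #include <emmintrin.h>

    static __m128i
    tolower16 (__m128i v)
    {
      __m128i gt  = _mm_cmpgt_epi8 (v, _mm_set1_epi8 (0x40));  /* v > 'A'-1 */
      __m128i lt  = _mm_cmpgt_epi8 (_mm_set1_epi8 (0x5b), v);  /* 'Z'+1 > v */
      __m128i bit = _mm_and_si128 (_mm_and_si128 (gt, lt),
                                   _mm_set1_epi8 (0x20));
      return _mm_or_si128 (v, bit);  /* set the lowercase bit */
    }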
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ PUSH (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ PUSH (%edi)
+#endif
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ movl CNT(%esp), REM
+ test REM, REM
+ je L(eq)
+#endif
+ mov %dx, %cx
+ and $0xfff, %cx
+ cmp $0xff0, %cx
+ ja L(first4bytes)
+ movdqu (%edx), %xmm2
+ mov %eax, %ecx
+ and $0xfff, %ecx
+ cmp $0xff0, %ecx
+ ja L(first4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+ movdqa reg1, %xmm3; \
+ movdqa UCHIGH_reg, %xmm4; \
+ movdqa reg2, %xmm5; \
+ movdqa UCHIGH_reg, %xmm6; \
+ pcmpgtb UCLOW_reg, %xmm3; \
+ pcmpgtb reg1, %xmm4; \
+ pcmpgtb UCLOW_reg, %xmm5; \
+ pcmpgtb reg2, %xmm6; \
+ pand %xmm4, %xmm3; \
+ pand %xmm6, %xmm5; \
+ pand LCQWORD_reg, %xmm3; \
+ pand LCQWORD_reg, %xmm5; \
+ por %xmm3, reg1; \
+ por %xmm5, reg2
+
+ movdqu (%eax), %xmm1
+ TOLOWER (%xmm2, %xmm1)
+ movd %xmm2, %ecx
+ movd %xmm1, %edi
+ movdqa %xmm2, %xmm3
+ movdqa %xmm1, %xmm4
+ cmpl %edi, %ecx
+#else
+# define TOLOWER(reg1, reg)
+
+ movd %xmm2, %ecx
+ cmp (%eax), %ecx
+#endif
+ jne L(less4bytes)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ movdqu (%eax), %xmm1
+#endif
+ pxor %xmm2, %xmm1
+ pxor %xmm0, %xmm0
+ ptest %xmm1, %xmm0
+ jnc L(less16bytes)
+ pcmpeqb %xmm0, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, REM
+ jbe L(eq)
+#endif
+ add $16, %edx
+ add $16, %eax
+L(first4bytes):
+ movzbl (%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl (%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, (%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ je L(eq)
+#endif
+
+ movzbl 1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 1(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 1(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ je L(eq)
+#endif
+ movzbl 2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 2(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 2(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ je L(eq)
+#endif
+ movzbl 3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 3(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 3(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ je L(eq)
+#endif
+ movzbl 4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 4(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 4(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ je L(eq)
+#endif
+ movzbl 5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 5(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 5(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ je L(eq)
+#endif
+ movzbl 6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 6(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 6(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ je L(eq)
+#endif
+ movzbl 7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 7(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 7(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $8, REM
+ je L(eq)
+#endif
+ add $8, %eax
+ add $8, %edx
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ PUSH (%edi)
+#endif
+ PUSH (%esi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cfi_remember_state
+#endif
+ mov %edx, %edi
+ mov %eax, %esi
+ xorl %eax, %eax
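+	/* Walk both strings with one shared index: %edx is made negative
+	   so it reaches zero 16 bytes before whichever pointer is closer
+	   to its next 4 KiB boundary; the rest of that page is compared
+	   bytewise at L(crosspage).  */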
+L(check_offset):
+ movl %edi, %edx
+ movl %esi, %ecx
+ andl $0xfff, %edx
+ andl $0xfff, %ecx
+ cmpl %edx, %ecx
+ cmovl %edx, %ecx
+ lea -0xff0(%ecx), %edx
+ sub %edx, %edi
+ sub %edx, %esi
+ testl %edx, %edx
+ jg L(crosspage)
+L(loop):
+ movdqu (%esi,%edx), %xmm2
+ movdqu (%edi,%edx), %xmm1
+ TOLOWER (%xmm2, %xmm1)
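+	/* pcmpistri with imm8 0x1a performs an equal-each byte compare
+	   with negative polarity: %ecx receives the index of the first
+	   mismatch, CF is set on a mismatch and ZF when a terminating NUL
+	   is seen, so jbe leaves the loop in either case.  */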
+ pcmpistri $0x1a, %xmm2, %xmm1
+ jbe L(end)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, REM
+ jbe L(more16byteseq)
+#endif
+
+ add $16, %edx
+ jle L(loop)
+L(crosspage):
+ movzbl (%edi,%edx), %eax
+ movzbl (%esi,%edx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+ subl %ecx, %eax
+ jne L(ret)
+ testl %ecx, %ecx
+ je L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $1, REM
+ jbe L(more16byteseq)
+#endif
+ inc %edx
+ cmp $15, %edx
+ jle L(crosspage)
+ add %edx, %edi
+ add %edx, %esi
+ jmp L(check_offset)
+
+ .p2align 4
+L(end):
+ jnc L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub %ecx, REM
+ jbe L(more16byteseq)
+#endif
+ lea (%ecx,%edx), %ecx
+ movzbl (%edi,%ecx), %eax
+ movzbl (%esi,%ecx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+ subl %ecx, %eax
+L(ret):
+ POP (%esi)
+ POP (%edi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ POP (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ POP (%ebx)
+# endif
+#endif
+ ret
+
+ .p2align 4
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cfi_restore_state
+L(more16byteseq):
+ POP (%esi)
+# ifdef USE_AS_STRNCMP
+ POP (%edi)
+# endif
+#endif
+L(eq):
+ xorl %eax, %eax
+ RETURN
+
+L(neq):
+ mov $1, %eax
+ ja L(neq_bigger)
+ neg %eax
+L(neq_bigger):
+ RETURN
+
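+/* A 16-byte block differed or held a NUL.  The 0xfefefeff (that is,
+   -0x01010101) additions below are the carry-propagation trick for
+   locating a NUL byte inside a 4-byte group without a byte loop.  */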
+L(less16bytes):
+ add $0xfefefeff, %ecx
+ jnc L(less4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movd %xmm3, %edi
+ xor %edi, %ecx
+#else
+ xor (%edx), %ecx
+#endif
+ or $0xfefefeff, %ecx
+ add $1, %ecx
+ jnz L(less4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ jbe L(eq)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ psrldq $4, %xmm3
+ psrldq $4, %xmm4
+ movd %xmm3, %ecx
+ movd %xmm4, %edi
+ cmp %edi, %ecx
+ mov %ecx, %edi
+#else
+ mov 4(%edx), %ecx
+ cmp 4(%eax), %ecx
+#endif
+ jne L(more4bytes)
+ add $0xfefefeff, %ecx
+ jnc L(more4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ xor %edi, %ecx
+#else
+ xor 4(%edx), %ecx
+#endif
+ or $0xfefefeff, %ecx
+ add $1, %ecx
+ jnz L(more4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $8, REM
+ jbe L(eq)
+#endif
+
+ add $8, %edx
+ add $8, %eax
+L(less4bytes):
+ movzbl (%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl (%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, (%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ je L(eq)
+#endif
+ movzbl 1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 1(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 1(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ je L(eq)
+#endif
+
+ movzbl 2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 2(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 2(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ je L(eq)
+#endif
+ movzbl 3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 3(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 3(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+L(more4bytes):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ je L(eq)
+#endif
+ movzbl 4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 4(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 4(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ je L(eq)
+#endif
+ movzbl 5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 5(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 5(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ je L(eq)
+#endif
+ movzbl 6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 6(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 6(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ je L(eq)
+#endif
+ movzbl 7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 7(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 7(%edx)
+#endif
+ jne L(neq)
+ jmp L(eq)
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..b25cc3e068
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2810 @@
+/* strcmp with SSSE3
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+# define STRCMP __strncmp_ssse3
+# endif
+# define STR1 8
+# define STR2 STR1+4
+# define CNT STR2+4
+# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define UPDATE_STRNCMP_COUNTER \
+	/* calculate the number of bytes left to compare */ \
+ mov $16, %esi; \
+ sub %ecx, %esi; \
+ cmp %esi, REM; \
+ jbe L(more8byteseq); \
+ sub %esi, REM
+# define FLAGS %ebx
+# define REM %ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strcasecmp_l_ssse3
+# endif
+# ifdef PIC
+# define STR1 8
+# else
+# define STR1 4
+# endif
+# define STR2 STR1+4
+# define LOCALE 12 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx)
+# else
+# define RETURN ret; .p2align 4
+# endif
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS (%esp)
+# define NONASCII __strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strncasecmp_l_ssse3
+# endif
+# ifdef PIC
+# define STR1 12
+# else
+# define STR1 8
+# endif
+# define STR2 STR1+4
+# define CNT STR2+4
+# define LOCALE 16 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (REM); POP (%ebx); ret; \
+ .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM)
+# else
+# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# endif
+# define UPDATE_STRNCMP_COUNTER \
+	/* calculate the number of bytes left to compare */ \
+ mov $16, %esi; \
+ sub %ecx, %esi; \
+ cmp %esi, REM; \
+ jbe L(more8byteseq); \
+ sub %esi, REM
+# define FLAGS (%esp)
+# define REM %ebp
+# define NONASCII __strncasecmp_nonascii
+#else
+# ifndef STRCMP
+# define STRCMP __strcmp_ssse3
+# endif
+# define STR1 4
+# define STR2 STR1+4
+# define RETURN ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS %ebx
+#endif
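+/* One body serves strcmp, strncmp, strcasecmp_l and strncasecmp_l:
+   the USE_AS_* macros above pick the entry point, the argument
+   offsets and the epilogue; REM tracks the remaining byte count for
+   the counted variants, and FLAGS records the alignment case and
+   whether the operands were swapped.  */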
+
+ .section .text.ssse3,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
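+/* __strcasecmp_ssse3 fetches the thread's current locale from TSD and
+   defers to __strcasecmp_nonascii unless LC_CTYPE is plain ASCII.  */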
+ENTRY (__strcasecmp_ssse3)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strcasecmp_nonascii
+# else
+ jne __strcasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strcasecmp_ssse3)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_ssse3)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strncasecmp_nonascii
+# else
+ jne __strncasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strncasecmp_ssse3)
+#endif
+
+ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movl LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+ jne NONASCII
+
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+.Lbelowupper:
+ .quad 0x4040404040404040
+ .quad 0x4040404040404040
+.Ltopupper:
+ .quad 0x5b5b5b5b5b5b5b5b
+ .quad 0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
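+/* '@' (0x40) is one below 'A', '[' (0x5b) is one above 'Z', and 0x20
+   is the ASCII case bit, so TOLOWER below can fold a whole vector
+   with two compares, two ANDs and an OR per operand.  */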
+
+# ifdef PIC
+# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+# define UCLOW_reg .Lbelowupper
+# define UCHIGH_reg .Ltopupper
+# define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ PUSH (REM)
+#endif
+
+ movl STR1(%esp), %edx
+ movl STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ movl CNT(%esp), REM
+ cmp $16, REM
+ jb L(less16bytes_sncmp)
+#elif !defined USE_AS_STRCASECMP_L
+ movzbl (%eax), %ecx
+ cmpb %cl, (%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 1(%eax), %ecx
+ cmpb %cl, 1(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 2(%eax), %ecx
+ cmpb %cl, 2(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 3(%eax), %ecx
+ cmpb %cl, 3(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 4(%eax), %ecx
+ cmpb %cl, 4(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 5(%eax), %ecx
+ cmpb %cl, 5(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 6(%eax), %ecx
+ cmpb %cl, 6(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 7(%eax), %ecx
+ cmpb %cl, 7(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ add $8, %edx
+ add $8, %eax
+#endif
+ movl %edx, %ecx
+ and $0xfff, %ecx
+ cmp $0xff0, %ecx
+ ja L(crosspage)
+ mov %eax, %ecx
+ and $0xfff, %ecx
+ cmp $0xff0, %ecx
+ ja L(crosspage)
+ pxor %xmm0, %xmm0
+ movlpd (%eax), %xmm1
+ movlpd (%edx), %xmm2
+ movhpd 8(%eax), %xmm1
+ movhpd 8(%edx), %xmm2
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+ movdqa reg1, %xmm5; \
+ movdqa reg2, %xmm7; \
+ movdqa UCHIGH_reg, %xmm6; \
+ pcmpgtb UCLOW_reg, %xmm5; \
+ pcmpgtb UCLOW_reg, %xmm7; \
+ pcmpgtb reg1, %xmm6; \
+ pand %xmm6, %xmm5; \
+ movdqa UCHIGH_reg, %xmm6; \
+ pcmpgtb reg2, %xmm6; \
+ pand %xmm6, %xmm7; \
+ pand LCQWORD_reg, %xmm5; \
+ por %xmm5, reg1; \
+ pand LCQWORD_reg, %xmm7; \
+ por %xmm7, reg2
+ TOLOWER (%xmm1, %xmm2)
+#else
+# define TOLOWER(reg1, reg2)
+#endif
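+	/* After the subtraction a byte is 0xff exactly when the strings
+	   match there and the byte is not NUL, so the mask below is
+	   0xffff only while the comparison may continue.  */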
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %ecx
+ sub $0xffff, %ecx
+ jnz L(less16bytes)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(eq)
+#endif
+ add $16, %eax
+ add $16, %edx
+
+L(crosspage):
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ PUSH (FLAGS)
+#endif
+ PUSH (%edi)
+ PUSH (%esi)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cfi_remember_state
+#endif
+
+ movl %edx, %edi
+ movl %eax, %ecx
+ and $0xf, %ecx
+ and $0xf, %edi
+ xor %ecx, %eax
+ xor %edi, %edx
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ xor FLAGS, FLAGS
+#endif
+ cmp %edi, %ecx
+ je L(ashr_0)
+ ja L(bigger)
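+	/* Swap %edx/%eax so the code below always shifts the string with
+	   the larger page offset; bit 5 of FLAGS records the swap so
+	   L(ret2) can flip the result back to the caller's order.  */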
+ orl $0x20, FLAGS
+ xchg %edx, %eax
+ xchg %ecx, %edi
+L(bigger):
+ lea 15(%edi), %edi
+ sub %ecx, %edi
+ cmp $8, %edi
+ jle L(ashr_less_8)
+ cmp $14, %edi
+ je L(ashr_15)
+ cmp $13, %edi
+ je L(ashr_14)
+ cmp $12, %edi
+ je L(ashr_13)
+ cmp $11, %edi
+ je L(ashr_12)
+ cmp $10, %edi
+ je L(ashr_11)
+ cmp $9, %edi
+ je L(ashr_10)
+L(ashr_less_8):
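+	/* Entered via jle with the flags of cmp $8, %edi still live, so
+	   this je dispatches %edi == 8 without another compare.  */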
+ je L(ashr_9)
+ cmp $7, %edi
+ je L(ashr_8)
+ cmp $6, %edi
+ je L(ashr_7)
+ cmp $5, %edi
+ je L(ashr_6)
+ cmp $4, %edi
+ je L(ashr_5)
+ cmp $3, %edi
+ je L(ashr_4)
+ cmp $2, %edi
+ je L(ashr_3)
+ cmp $1, %edi
+ je L(ashr_2)
+ cmp $0, %edi
+ je L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(0~15) n(0~15) 15(15+ n-n) ashr_0
+ */
+ .p2align 4
+L(ashr_0):
+ mov $0xffff, %esi
+ movdqa (%eax), %xmm1
+ pxor %xmm0, %xmm0
+ pcmpeqb %xmm1, %xmm0
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movdqa (%edx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm2, %xmm1
+#else
+ pcmpeqb (%edx), %xmm1
+#endif
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ mov %ecx, %edi
+ jne L(less32bytes)
+ UPDATE_STRNCMP_COUNTER
+ movl $0x10, FLAGS
+ mov $0x10, %ecx
+ pxor %xmm0, %xmm0
+ .p2align 4
+L(loop_ashr_0):
+ movdqa (%eax, %ecx), %xmm1
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movdqa (%edx, %ecx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+#else
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb (%edx, %ecx), %xmm1
+#endif
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ jmp L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(15) n -15 0(15 +(n-15) - n) ashr_1
+ */
+ .p2align 4
+L(ashr_1):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $15, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -15(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $1, FLAGS
+ lea 1(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
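+	/* %edi counts bytes until the aligned 16-byte loads from %edx
+	   would cross a 4 KiB page: it goes positive just before the
+	   boundary, and L(nibble_ashr_1) then looks for a terminator in
+	   the bytes already loaded.  In the loop, palignr stitches the
+	   previous and current aligned blocks of %edx back into string
+	   order.  The same pattern repeats for ashr_2 through ashr_15
+	   with the shift amount adjusted.  */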
+ .p2align 4
+L(loop_ashr_1):
+ add $16, %edi
+ jg L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $1, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_1)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $1, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_1)
+
+ .p2align 4
+L(nibble_ashr_1):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfffe, %esi
+ jnz L(ashr_1_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $15, REM
+ jbe L(ashr_1_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_1)
+
+ .p2align 4
+L(ashr_1_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $1, %xmm0
+ psrldq $1, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
+ */
+ .p2align 4
+L(ashr_2):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $14, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -14(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $2, FLAGS
+ lea 2(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_2):
+ add $16, %edi
+ jg L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_2)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_2)
+
+ .p2align 4
+L(nibble_ashr_2):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfffc, %esi
+ jnz L(ashr_2_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $14, REM
+ jbe L(ashr_2_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_2)
+
+ .p2align 4
+L(ashr_2_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $2, %xmm0
+ psrldq $2, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
+ */
+ .p2align 4
+L(ashr_3):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $13, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -13(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $3, FLAGS
+ lea 3(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_3):
+ add $16, %edi
+ jg L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_3)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_3)
+
+ .p2align 4
+L(nibble_ashr_3):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfff8, %esi
+ jnz L(ashr_3_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $13, REM
+ jbe L(ashr_3_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_3)
+
+ .p2align 4
+L(ashr_3_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $3, %xmm0
+ psrldq $3, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
+ */
+ .p2align 4
+L(ashr_4):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $12, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -12(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $4, FLAGS
+ lea 4(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_4):
+ add $16, %edi
+ jg L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_4)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_4)
+
+ .p2align 4
+L(nibble_ashr_4):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfff0, %esi
+ jnz L(ashr_4_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $12, REM
+ jbe L(ashr_4_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_4)
+
+ .p2align 4
+L(ashr_4_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $4, %xmm0
+ psrldq $4, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(11~15) n -11 4(15 +(n-11) - n) ashr_5
+ */
+ .p2align 4
+L(ashr_5):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $11, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -11(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $5, FLAGS
+ lea 5(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_5):
+ add $16, %edi
+ jg L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_5)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_5)
+
+ .p2align 4
+L(nibble_ashr_5):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xffe0, %esi
+ jnz L(ashr_5_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $11, REM
+ jbe L(ashr_5_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_5)
+
+ .p2align 4
+L(ashr_5_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $5, %xmm0
+ psrldq $5, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(10~15) n -10 5(15 +(n-10) - n) ashr_6
+ */
+
+ .p2align 4
+L(ashr_6):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $10, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -10(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $6, FLAGS
+ lea 6(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_6):
+ add $16, %edi
+ jg L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_6)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_6)
+
+ .p2align 4
+L(nibble_ashr_6):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xffc0, %esi
+ jnz L(ashr_6_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $10, REM
+ jbe L(ashr_6_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_6)
+
+ .p2align 4
+L(ashr_6_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $6, %xmm0
+ psrldq $6, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7
+ */
+
+ .p2align 4
+L(ashr_7):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $9, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -9(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $7, FLAGS
+	lea	7(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_7):
+ add $16, %edi
+ jg L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_7)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_7)
+
+ .p2align 4
+L(nibble_ashr_7):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xff80, %esi
+ jnz L(ashr_7_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $9, REM
+ jbe L(ashr_7_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_7)
+
+ .p2align 4
+L(ashr_7_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $7, %xmm0
+ psrldq $7, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8
+ */
+ .p2align 4
+L(ashr_8):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $8, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -8(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $8, FLAGS
+ lea 8(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_8):
+ add $16, %edi
+ jg L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_8)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_8)
+
+ .p2align 4
+L(nibble_ashr_8):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xff00, %esi
+ jnz L(ashr_8_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $8, REM
+ jbe L(ashr_8_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_8)
+
+ .p2align 4
+L(ashr_8_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $8, %xmm0
+ psrldq $8, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9
+ */
+ .p2align 4
+L(ashr_9):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $7, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -7(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $9, FLAGS
+ lea 9(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_9):
+ add $16, %edi
+ jg L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_9)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_9)
+
+ .p2align 4
+L(nibble_ashr_9):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfe00, %esi
+ jnz L(ashr_9_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ jbe L(ashr_9_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_9)
+
+ .p2align 4
+L(ashr_9_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $9, %xmm0
+ psrldq $9, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10
+ */
+ .p2align 4
+L(ashr_10):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $6, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -6(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $10, FLAGS
+ lea 10(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_10):
+ add $16, %edi
+ jg L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_10)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_10)
+
+ .p2align 4
+L(nibble_ashr_10):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfc00, %esi
+ jnz L(ashr_10_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ jbe L(ashr_10_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_10)
+
+ .p2align 4
+L(ashr_10_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $10, %xmm0
+ psrldq $10, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11
+ */
+ .p2align 4
+L(ashr_11):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $5, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -5(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $11, FLAGS
+ lea 11(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_11):
+ add $16, %edi
+ jg L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_11)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_11)
+
+ .p2align 4
+L(nibble_ashr_11):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xf800, %esi
+ jnz L(ashr_11_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ jbe L(ashr_11_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_11)
+
+ .p2align 4
+L(ashr_11_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $11, %xmm0
+ psrldq $11, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12
+ */
+ .p2align 4
+L(ashr_12):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $4, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -4(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $12, FLAGS
+ lea 12(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_12):
+ add $16, %edi
+ jg L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_12)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_12)
+
+ .p2align 4
+L(nibble_ashr_12):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xf000, %esi
+ jnz L(ashr_12_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ jbe L(ashr_12_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_12)
+
+ .p2align 4
+L(ashr_12_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $12, %xmm0
+ psrldq $12, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13
+ */
+ .p2align 4
+L(ashr_13):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -3(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $13, FLAGS
+ lea 13(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_13):
+ add $16, %edi
+ jg L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_13)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_13)
+
+ .p2align 4
+L(nibble_ashr_13):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xe000, %esi
+ jnz L(ashr_13_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ jbe L(ashr_13_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_13)
+
+ .p2align 4
+L(ashr_13_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $13, %xmm0
+ psrldq $13, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14
+ */
+ .p2align 4
+L(ashr_14):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $2, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -2(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $14, FLAGS
+ lea 14(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_14):
+ add $16, %edi
+ jg L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_14)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_14)
+
+ .p2align 4
+L(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xc000, %esi
+ jnz L(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ jbe L(ashr_14_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_14)
+
+ .p2align 4
+L(ashr_14_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $14, %xmm0
+ psrldq $14, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15
+ */
+
+ .p2align 4
+L(ashr_15):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -1(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $15, FLAGS
+ lea 15(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_15):
+ add $16, %edi
+ jg L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_15)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_15)
+
+ .p2align 4
+L(nibble_ashr_15):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0x8000, %esi
+ jnz L(ashr_15_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ jbe L(ashr_15_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_15)
+
+ .p2align 4
+L(ashr_15_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $15, %xmm0
+ psrldq $15, %xmm3
+ jmp L(aftertail)
+
+ .p2align 4
+L(aftertail):
+ TOLOWER (%xmm1, %xmm3)
+ pcmpeqb %xmm3, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ not %esi
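+/* Reconstruct the offset of the mismatching byte: the low bits of
+   FLAGS hold the alignment case, %ecx the loop offset, and bit 0x20
+   tells L(less32bytes) to swap the pointers back.  */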
+L(exit):
+ mov FLAGS, %edi
+ and $0x1f, %edi
+ lea -16(%edi, %ecx), %edi
+L(less32bytes):
+ add %edi, %edx
+ add %ecx, %eax
+ testl $0x20, FLAGS
+ jz L(ret2)
+ xchg %eax, %edx
+
+ .p2align 4
+L(ret2):
+ mov %esi, %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+#endif
+ POP (%esi)
+ POP (%edi)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ POP (FLAGS)
+#endif
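+/* The lowest set bit of the mask now in %cl/%ch marks the first byte
+   that differs or terminates; dispatch it to the L(ByteN) stubs.  */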
+L(less16bytes):
+ test %cl, %cl
+ jz L(2next_8_bytes)
+
+ test $0x01, %cl
+ jnz L(Byte0)
+
+ test $0x02, %cl
+ jnz L(Byte1)
+
+ test $0x04, %cl
+ jnz L(Byte2)
+
+ test $0x08, %cl
+ jnz L(Byte3)
+
+ test $0x10, %cl
+ jnz L(Byte4)
+
+ test $0x20, %cl
+ jnz L(Byte5)
+
+ test $0x40, %cl
+ jnz L(Byte6)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ jbe L(eq)
+#endif
+
+ movzx 7(%eax), %ecx
+ movzx 7(%edx), %eax
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte0):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $0, REM
+ jbe L(eq)
+#endif
+ movzx (%eax), %ecx
+ movzx (%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte1):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ jbe L(eq)
+#endif
+ movzx 1(%eax), %ecx
+ movzx 1(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte2):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ jbe L(eq)
+#endif
+ movzx 2(%eax), %ecx
+ movzx 2(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte3):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ jbe L(eq)
+#endif
+ movzx 3(%eax), %ecx
+ movzx 3(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte4):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ jbe L(eq)
+#endif
+ movzx 4(%eax), %ecx
+ movzx 4(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte5):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ jbe L(eq)
+#endif
+ movzx 5(%eax), %ecx
+ movzx 5(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte6):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ jbe L(eq)
+#endif
+ movzx 6(%eax), %ecx
+ movzx 6(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
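+/* The first difference or NUL lies in bytes 8..15: advance both
+   pointers by 8 and test the high byte of the mask (CH) bit by bit.  */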
+L(2next_8_bytes):
+ add $8, %eax
+ add $8, %edx
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $8, REM
+ lea -8(REM), REM
+ jbe L(eq)
+#endif
+
+ test $0x01, %ch
+ jnz L(Byte0)
+
+ test $0x02, %ch
+ jnz L(Byte1)
+
+ test $0x04, %ch
+ jnz L(Byte2)
+
+ test $0x08, %ch
+ jnz L(Byte3)
+
+ test $0x10, %ch
+ jnz L(Byte4)
+
+ test $0x20, %ch
+ jnz L(Byte5)
+
+ test $0x40, %ch
+ jnz L(Byte6)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ jbe L(eq)
+#endif
+ movzx 7(%eax), %ecx
+ movzx 7(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
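+/* The flags from the last byte comparison are still live here: EAX is
+   set to 1 when the "above" condition holds and negated to -1
+   otherwise, producing the signed comparison result.  */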
+#ifdef USE_AS_STRNCMP
+L(neq_sncmp):
+#endif
+L(neq):
+ mov $1, %eax
+ ja L(neq_bigger)
+ neg %eax
+L(neq_bigger):
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ POP (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ POP (%ebx)
+# endif
+#endif
+ ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ .p2align 4
+ cfi_restore_state
+L(more8byteseq):
+
+# ifdef USE_AS_STRNCASECMP_L
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+# endif
+ POP (%esi)
+ POP (%edi)
+# ifdef USE_AS_STRNCMP
+ POP (FLAGS)
+# endif
+#endif
+
+#ifdef USE_AS_STRNCMP
+L(eq_sncmp):
+#endif
+L(eq):
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ POP (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ POP (%ebx)
+# endif
+#endif
+ xorl %eax, %eax
+ ret
+
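+/* strncmp/strncasecmp with fewer than 16 bytes left: compare one byte
+   at a time, stopping at a mismatch, at a NUL, or once REM bytes have
+   been compared.  */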
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ .p2align 4
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+ CFI_PUSH (%ebx)
+# endif
+ CFI_PUSH (REM)
+L(less16bytes_sncmp):
+# ifdef USE_AS_STRNCASECMP_L
+ PUSH (%esi)
+# endif
+ test REM, REM
+ jz L(eq_sncmp)
+
+ movzbl (%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl (%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, (%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $1, REM
+ je L(eq_sncmp)
+
+ movzbl 1(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 1(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 1(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $2, REM
+ je L(eq_sncmp)
+
+ movzbl 2(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 2(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 2(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $3, REM
+ je L(eq_sncmp)
+
+ movzbl 3(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 3(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 3(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $4, REM
+ je L(eq_sncmp)
+
+ movzbl 4(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 4(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 4(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $5, REM
+ je L(eq_sncmp)
+
+ movzbl 5(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 5(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 5(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $6, REM
+ je L(eq_sncmp)
+
+ movzbl 6(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 6(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 6(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $7, REM
+ je L(eq_sncmp)
+
+ movzbl 7(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 7(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 7(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+
+ cmp $8, REM
+ je L(eq_sncmp)
+
+ movzbl 8(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 8(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 8(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $9, REM
+ je L(eq_sncmp)
+
+ movzbl 9(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 9(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 9(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $10, REM
+ je L(eq_sncmp)
+
+ movzbl 10(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 10(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 10(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $11, REM
+ je L(eq_sncmp)
+
+ movzbl 11(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 11(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 11(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+
+ cmp $12, REM
+ je L(eq_sncmp)
+
+ movzbl 12(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 12(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 12(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $13, REM
+ je L(eq_sncmp)
+
+ movzbl 13(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 13(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 13(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $14, REM
+ je L(eq_sncmp)
+
+ movzbl 14(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 14(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 14(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $15, REM
+ je L(eq_sncmp)
+
+ movzbl 15(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 15(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 15(%edx)
+# endif
+ jne L(neq_sncmp)
+
+# ifdef USE_AS_STRNCASECMP_L
+L(eq_sncmp):
+ POP (%esi)
+# endif
+ POP (REM)
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+ POP (%ebx)
+# endif
+ xor %eax, %eax
+ ret
+
+# ifdef USE_AS_STRNCASECMP_L
+ .p2align 4
+# ifdef PIC
+ CFI_PUSH (%ebx)
+# endif
+ CFI_PUSH (REM)
+ CFI_PUSH (%esi)
+L(neq_sncmp):
+ mov $1, %eax
+ mov $-1, %edx
+ cmovna %edx, %eax
+ POP (%esi)
+ POP (REM)
+# ifdef PIC
+ POP (%ebx)
+# endif
+ ret
+# endif
+#endif
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..56de25a4b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strcmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRNCMP
+# define STRCMP strncmp
+# define __GI_STRCMP __GI_strncmp
+# define __STRCMP_IA32 __strncmp_ia32
+# define __STRCMP_SSSE3 __strncmp_ssse3
+# define __STRCMP_SSE4_2 __strncmp_sse4_2
+#elif defined USE_AS_STRCASECMP_L
+# define STRCMP __strcasecmp_l
+# define __GI_STRCMP __GI_strcasecmp_l
+# define __STRCMP_IA32 __strcasecmp_l_ia32
+# define __STRCMP_SSSE3 __strcasecmp_l_ssse3
+# define __STRCMP_SSE4_2 __strcasecmp_l_sse4_2
+#elif defined USE_AS_STRNCASECMP_L
+# define STRCMP __strncasecmp_l
+# define __GI_STRCMP __GI_strncasecmp_l
+# define __STRCMP_IA32 __strncasecmp_l_ia32
+# define __STRCMP_SSSE3 __strncasecmp_l_ssse3
+# define __STRCMP_SSE4_2 __strncasecmp_l_sse4_2
+#else
+# define STRCMP strcmp
+# define __GI_STRCMP __GI_strcmp
+# define __STRCMP_IA32 __strcmp_ia32
+# define __STRCMP_SSSE3 __strcmp_ssse3
+# define __STRCMP_SSE4_2 __strcmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in the static library, since
+   strncmp is needed before the initialization has happened.  */
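+/* Selection order below: default to the ia32 version; prefer SSSE3
+   when the CPU has it; use SSE4.2 only when it is present and the
+   CPU is not flagged Slow_SSE4_2.  */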
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
+ .text
+ENTRY(STRCMP)
+ .type STRCMP, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__STRCMP_IA32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_SSE4_2)
+ jnz 2f
+ LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)
+2: ret
+END(STRCMP)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __STRCMP_IA32, @function; \
+ .p2align 4; \
+ .globl __STRCMP_IA32; \
+ .hidden __STRCMP_IA32; \
+ __STRCMP_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which the PLT
+   used by IFUNC requires.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \
+ && !defined USE_AS_STRNCASECMP_L
+# include "../strcmp.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000000..ed627a5f62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2250 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 16
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \
+ CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+ jump table with relative offsets.
+   INDEX is a register containing the index into the jump table.
+ SCALE is the scale of INDEX. */
+
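+/* Illustrative walk-through (comment only, not assembled): with
+   TABLE = L(ExitTable), INDEX = %edx and SCALE = 4, the PIC variant
+   computes
+      ECX  = PC
+      ECX += L(ExitTable) - .          # ECX = &L(ExitTable)
+      ECX += L(ExitTable)[4 * %edx]    # entries are ExitN - ExitTable
+   leaving the absolute address of the selected L(ExitN) in ECX.  */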
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into ECX. */ \
+ SETUP_PIC_REG(cx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ecx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ecx,INDEX,SCALE), %ecx; \
+ /* We loaded the jump table and adjusted ECX. Go. */ \
+ jmp *%ecx
+# else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+ jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edi
+ mov STR2(%esp), %esi
+ movl LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(ExitZero)
+
+ mov %esi, %ecx
+# ifndef USE_AS_STPCPY
+ mov %edi, %eax /* save result */
+# endif
+ and $15, %ecx
+ jz L(SourceStringAlignmentZero)
+
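+/* Misaligned source: round ESI down to a 16-byte boundary, test the
+   whole aligned vector for NUL, then shift the mask right by the
+   misalignment in CL so bytes before the true start of the string are
+   ignored.  EBX (the remaining count) is biased up by the same
+   amount.  */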
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%esi), %xmm1
+ add %ecx, %ebx
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%esi), %xmm0
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%edi)
+
+ sub %ecx, %edi
+
+/* If source address alignment != destination address alignment */
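+/* Copy 16 bytes per step with aligned loads (movdqa/movaps) from the
+   source and unaligned stores (movdqu) to the destination, checking
+   each block for NUL, until the source reaches 64-byte alignment for
+   the main loop.  */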
+ .p2align 4
+L(Unalign16Both):
+ mov $16, %ecx
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movaps 16(%esi, %ecx), %xmm4
+ movdqu %xmm3, (%edi, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ movaps 16(%esi, %ecx), %xmm1
+ movdqu %xmm4, (%edi, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movdqu %xmm3, (%edi, %ecx)
+ mov %esi, %edx
+ lea 16(%esi, %ecx), %esi
+ and $-0x40, %esi
+ sub %esi, %edx
+ sub %edx, %edi
+ lea 128(%ebx, %edx), %ebx
+
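+/* Main loop: 64 bytes per iteration.  pminub folds the four source
+   vectors together, so a single pcmpeqb against zero detects a NUL
+   anywhere in the 64 bytes.  */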
+L(Unaligned64Loop):
+ movaps (%esi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%esi), %xmm5
+ movaps 32(%esi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+ test %edx, %edx
+ jnz L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+ add $64, %edi
+ add $64, %esi
+ movdqu %xmm4, -64(%edi)
+ movaps (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edi)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movdqu %xmm6, -32(%edi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edi)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+ test %edx, %edx
+ jz L(Unaligned64Loop_start)
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+ movdqu %xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+ lea 48(%edi, %edx), %eax
+# endif
+ movdqu %xmm7, 48(%edi)
+ add $15, %ebx
+ sub %edx, %ebx
+ lea 49(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+ pxor %xmm0, %xmm0
+ movdqa (%esi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb 16(%esi), %xmm0
+ movdqu %xmm1, (%edi)
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes1)
+
+ jmp L(Unalign16Both)
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %esi
+ add $16, %edi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ sub %ecx, %ebx
+ bsf %edx, %edx
+ add %ecx, %esi
+ add $16, %edx
+ sub %ecx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %edx, %edx
+# ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+# endif
+ movdqu %xmm4, (%edi)
+ add $63, %ebx
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi, %edx), %eax
+# endif
+ movdqu %xmm5, 16(%edi)
+ add $47, %ebx
+ sub %edx, %ebx
+ lea 17(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %edx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 32(%edi, %edx), %eax
+# endif
+ movdqu %xmm6, 32(%edi)
+ add $31, %ebx
+ sub %edx, %ebx
+ lea 33(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ add $16, %edx
+ sub %ecx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %edi
+ add $16, %esi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(Exit0):
+# ifdef USE_AS_STPCPY
+ mov %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ movb %dh, (%edi)
+# ifdef USE_AS_STPCPY
+ lea (%edi), %eax
+# endif
+ sub $1, %ebx
+ lea 1(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+# endif
+ sub $2, %ebx
+ lea 2(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ movw (%esi), %cx
+ movw %cx, (%edi)
+ movb %dh, 2(%edi)
+# ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+# endif
+ sub $3, %ebx
+ lea 3(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+# endif
+ sub $4, %ebx
+ lea 4(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ movl (%esi), %ecx
+ movb %dh, 4(%edi)
+ movl %ecx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+# endif
+ sub $5, %ebx
+ lea 5(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+# endif
+ sub $6, %ebx
+ lea 6(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+# endif
+ sub $7, %ebx
+ lea 7(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+# endif
+ sub $8, %ebx
+ lea 8(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ movlpd (%esi), %xmm0
+ movb %dh, 8(%edi)
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+# endif
+ sub $9, %ebx
+ lea 9(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+# endif
+ sub $10, %ebx
+ lea 10(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+# endif
+ sub $11, %ebx
+ lea 11(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+# endif
+ sub $12, %ebx
+ lea 12(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+# endif
+ sub $13, %ebx
+ lea 13(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+# endif
+ sub $14, %ebx
+ lea 14(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+# endif
+ sub $15, %ebx
+ lea 15(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+# endif
+ sub $16, %ebx
+ lea 16(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit17):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+ movb %dh, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+# endif
+ sub $17, %ebx
+ lea 17(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+# endif
+ sub $18, %ebx
+ lea 18(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+# endif
+ sub $19, %ebx
+ lea 19(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+# endif
+ sub $20, %ebx
+ lea 20(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+# endif
+ sub $21, %ebx
+ lea 21(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+# endif
+ sub $22, %ebx
+ lea 22(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+# endif
+ sub $23, %ebx
+ lea 23(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+# endif
+ sub $24, %ebx
+ lea 24(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+# endif
+ sub $25, %ebx
+ lea 25(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+# endif
+ sub $26, %ebx
+ lea 26(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+# endif
+ sub $27, %ebx
+ lea 27(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+# endif
+ sub $28, %ebx
+ lea 28(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+# endif
+ sub $29, %ebx
+ lea 29(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+# endif
+ sub $30, %ebx
+ lea 30(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+
+ .p2align 4
+L(Exit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+# endif
+ sub $31, %ebx
+ lea 31(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+# endif
+ sub $32, %ebx
+ lea 32(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(StrncpyExit1):
+ movb (%esi), %dl
+ movb %dl, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit3):
+ movw (%esi), %cx
+ movb 2(%esi), %dl
+ movw %cx, (%edi)
+ movb %dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit5):
+ movl (%esi), %ecx
+ movb 4(%esi), %dl
+ movl %ecx, (%edi)
+ movb %dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit9):
+ movlpd (%esi), %xmm0
+ movb 8(%esi), %dl
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%esi), %xmm0
+ movb 16(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movb %cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movb 20(%esi), %dl
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movb 24(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 32(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movb 32(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+ movb %cl, 32(%edi)
+ RETURN
+
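+/* L(FillN): store N zero bytes at EDI.  EDX and XMM0 are already zero
+   on entry (set in L(StrncpyFillTailWithZero)); the odd sizes (Fill3,
+   Fill7, Fill15) start one byte early, rewriting a byte that is
+   already zero.  */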
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movl %edx, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%edi)
+ movb %dl, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%edi)
+ movw %dx, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movlpd %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 5(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 6(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%edi, %ecx)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit):
+ bsf %edx, %edx
+ add $15, %ebx
+ add %ecx, %edi
+# ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+# endif
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+
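+/* Zero-fill the remainder of the n bytes.  EDX and XMM0 are cleared,
+   EDI is rounded down to 16-byte alignment (overlapping zeros already
+   stored), then zeros are written 64 bytes at a time, with L(FillN)
+   handling the final 0..16 bytes.  */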
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %edx, %edx
+ sub $16, %ebx
+ jbe L(StrncpyFillExit)
+
+ movdqu %xmm0, (%edi)
+ add $16, %edi
+
+ mov %edi, %esi
+ and $0xf, %esi
+ sub %esi, %edi
+ add %esi, %ebx
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ movdqa %xmm0, 32(%edi)
+ movdqa %xmm0, 48(%edi)
+ add $64, %edi
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %ebx
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ add $32, %edi
+ sub $16, %ebx
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+ add $16, %ebx
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %edx, %edx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%ebx), %ecx
+ and $-16, %ecx
+ add $48, %ebx
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+ lea 64(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %ecx, %ecx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm4, (%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm5, 16(%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm6, 32(%edi)
+ lea 16(%edi, %ecx), %edi
+ lea 16(%esi, %ecx), %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(ExitZero):
+ movl %edi, %eax
+ RETURN
+
+END (STRCPY)
+
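+/* L(ExitTable) is indexed by the bit position of the NUL in the
+   pcmpeqb mask: entry N copies N+1 bytes (the tail including its
+   terminator) and then zero-fills up to the requested length.  When
+   SHARED, the entries are offsets relative to the table itself (see
+   JMPTBL above).  */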
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable):
+ .int JMPTBL(L(Exit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+ .p2align 4
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+# else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
+# define RETURN1 ret
+
+ .text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+
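+/* Handle strings shorter than 16 bytes with plain byte probes, so the
+   common short-string case avoids the SSE path and any register
+   saves.  */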
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ PUSH (%ebx)
+
+ mov %edx, %edi
+ lea 16(%ecx), %ebx
+ and $-16, %ebx
+ pxor %xmm0, %xmm0
+ movdqu (%ecx), %xmm1
+ movdqu %xmm1, (%edx)
+ pcmpeqb (%ebx), %xmm0
+ pmovmskb %xmm0, %eax
+ sub %ecx, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %ecx, %eax
+ lea 16(%ecx), %ecx
+ and $-16, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+ xor %ebx, %ebx
+
+ .p2align 4
+ movdqa (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movdqu %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm3
+ movdqu %xmm2, (%edx, %ebx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm4
+ movdqu %xmm3, (%edx, %ebx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm1
+ movdqu %xmm4, (%edx, %ebx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm2
+ movdqu %xmm1, (%edx, %ebx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm3
+ movdqu %xmm2, (%edx, %ebx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm3, (%edx, %ebx)
+ mov %ecx, %eax
+ lea 16(%ecx, %ebx), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+
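+/* The source is now 64-byte aligned (the destination was adjusted by
+   the same delta): process 64 bytes per iteration, using pminub to
+   reduce the four vectors so one pcmpeqb against zero finds any
+   NUL.  */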
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps 32(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ add $64, %ecx
+ pminub %xmm7, %xmm3
+ add $64, %edx
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+L(Aligned64Loop_start):
+ movdqu %xmm4, -64(%edx)
+ movaps (%ecx), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edx)
+ movaps 16(%ecx), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%ecx), %xmm3
+ movdqu %xmm6, -32(%edx)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edx)
+ movaps 48(%ecx), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $64, %edx
+ add $64, %ecx
+ test %eax, %eax
+ jz L(Aligned64Loop_start)
+L(Aligned64Leave):
+ sub $0xa0, %ebx
+ pxor %xmm0, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%ebx), %ebx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%ebx), %ebx
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm6, -32(%edx)
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%ebx), %ebx
+
+/*-----------------End of main part---------------------------*/
+
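+/* EAX holds the pcmpeqb NUL mask for the final block: test the low
+   byte bit by bit for bytes 0..7, then the high byte in L(ExitHigh)
+   for bytes 8..15, and branch to the exact L(ExitN) copier.  */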
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %ebx, %edx
+ add %ebx, %ecx
+
+ POP (%ebx)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ /* Exit 8 */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ /* Exit 16 */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+# ifdef USE_AS_STPCPY
+ lea (%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ movl %edx, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail8):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail9):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail10):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail11):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail12):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail16):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+END (STRCPY)
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..effd85da94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3901 @@
+/* strcpy with SSSE3
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 8
+# define ENTRANCE PUSH (%ebx)
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx);
+# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN ret
+# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+
+# ifdef USE_AS_STPCPY
+# define SAVE_RESULT(n) lea n(%edx), %eax
+# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax
+# else
+# define SAVE_RESULT(n) movl %edi, %eax
+# define SAVE_RESULT_TAIL(n) movl %edx, %eax
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+/* In this code the following instructions are used for copying:
+ movb - 1 byte
+ movw - 2 bytes
+ movl - 4 bytes
+ movlpd - 8 bytes
+ movaps - 16 bytes - requires 16-byte alignment
+ of source and destination addresses.
+*/
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+# ifdef USE_AS_STRNCPY
+ movl LEN(%esp), %ebx
+ cmp $8, %ebx
+ jbe L(StrncpyExit8Bytes)
+# endif
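+/* Check the first 16 bytes one at a time; when the null terminator is found, copy through it and return via the matching L(ExitTailN). */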
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ jb L(StrncpyExit15Bytes)
+# endif
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ je L(ExitTail16)
+# endif
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+# endif
+ PUSH (%esi)
+# ifdef USE_AS_STRNCPY
+ mov %ecx, %esi
+ sub $16, %ebx
+ and $0xf, %esi
+
+/* Add the source alignment offset (ecx & 15) to the length counter. */
+
+ add %esi, %ebx
+# endif
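+/* Copy the first 16 bytes with unaligned 8-byte loads while probing the first aligned 16-byte block of the source for a null byte (%xmm0 is zero). */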
+ lea 16(%ecx), %esi
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ movlpd (%ecx), %xmm1
+ movlpd %xmm1, (%edx)
+
+ pcmpeqb (%esi), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+ and $-16, %edx
+ sub %edx, %eax
+
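+/* For strncpy, compensate the length counter for the difference between the source and destination alignment adjustments. */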
+# ifdef USE_AS_STRNCPY
+ add %eax, %esi
+ lea -1(%esi), %esi
+ and $1<<31, %esi
+ test %esi, %esi
+ jnz L(ContinueCopy)
+ lea 16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+ sub %eax, %ecx
+ mov %ecx, %eax
+ and $0xf, %eax
+ mov $0, %esi
+
+/* Case: the source and destination share the same offset within a 16-byte block. */
+
+ jz L(Align16Both)
+
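+/* The offsets differ: %eax = (source - destination) mod 16 selects one of the palignr-based L(ShlN) paths below. */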
+ cmp $8, %eax
+ jae L(ShlHigh8)
+ cmp $1, %eax
+ je L(Shl1)
+ cmp $2, %eax
+ je L(Shl2)
+ cmp $3, %eax
+ je L(Shl3)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $5, %eax
+ je L(Shl5)
+ cmp $6, %eax
+ je L(Shl6)
+ jmp L(Shl7)
+
+L(ShlHigh8):
+ je L(Shl8)
+ cmp $9, %eax
+ je L(Shl9)
+ cmp $10, %eax
+ je L(Shl10)
+ cmp $11, %eax
+ je L(Shl11)
+ cmp $12, %eax
+ je L(Shl12)
+ cmp $13, %eax
+ je L(Shl13)
+ cmp $14, %eax
+ je L(Shl14)
+ jmp L(Shl15)
+
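+/* Both pointers are now 16-byte aligned: copy 16 bytes per step, testing each source block for a null byte, until the source reaches a 64-byte boundary. */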
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ lea 112(%ebx, %eax), %ebx
+# endif
+ mov $-0x40, %esi
+
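+/* Main loop: 64 bytes per iteration. pminub folds the four 16-byte blocks together so one pcmpeqb/pmovmskb detects a null byte anywhere in them; the stores are issued only once the test passes. */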
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqb %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeaveCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+ lea 48(%ebx), %ebx
+# endif
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqb %xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+ jmp L(CopyFrom1To16Bytes)
+
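+/* L(ShlN): the source sits N bytes further into its 16-byte block than the destination. Aligned loads are recombined with palignr $N before each aligned store. */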
+ .p2align 4
+L(Shl1):
+ movaps -1(%ecx), %xmm1
+ movaps 15(%ecx), %xmm2
+L(Shl1Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 31(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -15(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -1(%ecx), %xmm1
+
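+/* 64-byte unrolled loop for the Shl1 case; the remaining L(ShlN) loops follow the same pattern with shift count N. */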
+L(Shl1LoopStart):
+ movaps 15(%ecx), %xmm2
+ movaps 31(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $1, %xmm3, %xmm4
+ jnz L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave1)
+# endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ mov $15, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl2):
+ movaps -2(%ecx), %xmm1
+ movaps 14(%ecx), %xmm2
+L(Shl2Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 30(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -14(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+ movaps 14(%ecx), %xmm2
+ movaps 30(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $2, %xmm3, %xmm4
+ jnz L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave2)
+# endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ mov $14, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl3):
+ movaps -3(%ecx), %xmm1
+ movaps 13(%ecx), %xmm2
+L(Shl3Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 29(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -13(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+ movaps 13(%ecx), %xmm2
+ movaps 29(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $3, %xmm3, %xmm4
+ jnz L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave3)
+# endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ mov $13, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave4)
+# endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ mov $12, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl5):
+ movaps -5(%ecx), %xmm1
+ movaps 11(%ecx), %xmm2
+L(Shl5Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 27(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -11(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+ movaps 11(%ecx), %xmm2
+ movaps 27(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $5, %xmm3, %xmm4
+ jnz L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave5)
+# endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 7(%edx)
+ mov $11, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl6):
+ movaps -6(%ecx), %xmm1
+ movaps 10(%ecx), %xmm2
+L(Shl6Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 26(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -10(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+ movaps 10(%ecx), %xmm2
+ movaps 26(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $6, %xmm3, %xmm4
+ jnz L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave6)
+# endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 6(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 6(%edx)
+ mov $10, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl7):
+ movaps -7(%ecx), %xmm1
+ movaps 9(%ecx), %xmm2
+L(Shl7Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 25(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -9(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+ movaps 9(%ecx), %xmm2
+ movaps 25(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $7, %xmm3, %xmm4
+ jnz L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave7)
+# endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 5(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 5(%edx)
+ mov $9, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave8)
+# endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $8, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9):
+ movaps -9(%ecx), %xmm1
+ movaps 7(%ecx), %xmm2
+L(Shl9Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 23(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -7(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+ movaps 7(%ecx), %xmm2
+ movaps 23(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $9, %xmm3, %xmm4
+ jnz L(Shl9Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave9)
+# endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+ movlpd -1(%ecx), %xmm0
+ movlpd %xmm0, -1(%edx)
+ mov $7, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10):
+ movaps -10(%ecx), %xmm1
+ movaps 6(%ecx), %xmm2
+L(Shl10Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 22(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -6(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+ movaps 6(%ecx), %xmm2
+ movaps 22(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $10, %xmm3, %xmm4
+ jnz L(Shl10Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave10)
+# endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+ movlpd -2(%ecx), %xmm0
+ movlpd %xmm0, -2(%edx)
+ mov $6, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11):
+ movaps -11(%ecx), %xmm1
+ movaps 5(%ecx), %xmm2
+L(Shl11Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 21(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -5(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+ movaps 5(%ecx), %xmm2
+ movaps 21(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $11, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $11, %xmm3, %xmm4
+ jnz L(Shl11Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave11)
+# endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+ movlpd -3(%ecx), %xmm0
+ movlpd %xmm0, -3(%edx)
+ mov $5, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave12)
+# endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13):
+ movaps -13(%ecx), %xmm1
+ movaps 3(%ecx), %xmm2
+L(Shl13Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 19(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -3(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+ movaps 3(%ecx), %xmm2
+ movaps 19(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $13, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $13, %xmm3, %xmm4
+ jnz L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave13)
+# endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+ movl -1(%ecx), %esi
+ movl %esi, -1(%edx)
+ mov $3, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14):
+ movaps -14(%ecx), %xmm1
+ movaps 2(%ecx), %xmm2
+L(Shl14Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 18(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -2(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+ movaps 2(%ecx), %xmm2
+ movaps 18(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $14, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $14, %xmm3, %xmm4
+ jnz L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave14)
+# endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+ movl -2(%ecx), %esi
+ movl %esi, -2(%edx)
+ mov $2, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15):
+ movaps -15(%ecx), %xmm1
+ movaps 1(%ecx), %xmm2
+L(Shl15Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 17(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -1(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+ movaps 1(%ecx), %xmm2
+ movaps 17(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $15, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $15, %xmm3, %xmm4
+ jnz L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave15)
+# endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+ movl -3(%ecx), %esi
+ movl %esi, -3(%edx)
+ mov $1, %esi
+# ifdef USE_AS_STRCAT
+ jmp L(CopyFrom1To16Bytes)
+# endif
+
+
+# ifndef USE_AS_STRCAT
+
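+/* Copy the final 1..16 bytes. %al/%ah hold the pmovmskb null-byte mask; the lowest set bit selects the matching L(ExitN). %esi is the offset of the current block. */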
+ .p2align 4
+L(CopyFrom1To16Bytes):
+# ifdef USE_AS_STRNCPY
+ add $16, %ebx
+# endif
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+ mov %al, %ah
+ and $15, %ah
+ jz L(ExitHigh4)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT (3)
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
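+/* cmpb sets CF exactly when the byte at %eax is the null terminator; sbb then leaves %eax pointing at the null, or advances it to the end of the buffer otherwise. */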
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4):
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+
+ .p2align 4
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT (7)
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8):
+ mov %ah, %al
+ and $15, %al
+ jz L(ExitHigh12)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT (11)
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12):
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+
+ .p2align 4
+L(Exit16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ SAVE_RESULT (15)
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
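+/* Case2: the current 16-byte block contains a null byte, but the n limit may fall first; exit at whichever comes earlier. */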
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ add %esi, %edx
+
+ POP (%esi)
+
+ test %al, %al
+ jz L(ExitHighCase2)
+
+ cmp $8, %ebx
+ ja L(CopyFrom1To16BytesLess8)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(Exit7)
+ jmp L(Exit8)
+
+ .p2align 4
+L(ExitHighCase2):
+ cmp $8, %ebx
+ jbe L(CopyFrom1To16BytesLess8Case3)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(Exit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(Exit15)
+ jmp L(Exit16)
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
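+/* Case3: n is exhausted and no null byte was found; copy exactly the remaining %ebx bytes. */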
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+ cmp $4, %ebx
+ ja L(ExitHigh4Case3)
+
+ cmp $1, %ebx
+ je L(Exit1)
+ cmp $2, %ebx
+ je L(Exit2)
+ cmp $3, %ebx
+ je L(Exit3)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT (4)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4Case3):
+ cmp $5, %ebx
+ je L(Exit5)
+ cmp $6, %ebx
+ je L(Exit6)
+ cmp $7, %ebx
+ je L(Exit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT (8)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8Case3):
+ cmp $12, %ebx
+ ja L(ExitHigh12Case3)
+
+ cmp $9, %ebx
+ je L(Exit9)
+ cmp $10, %ebx
+ je L(Exit10)
+ cmp $11, %ebx
+ je L(Exit11)
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT (12)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12Case3):
+ cmp $13, %ebx
+ je L(Exit13)
+ cmp $14, %ebx
+ je L(Exit14)
+ cmp $15, %ebx
+ je L(Exit15)
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ SAVE_RESULT (16)
+ RETURN1
+
+# endif
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ SAVE_RESULT (0)
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ SAVE_RESULT (1)
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ SAVE_RESULT (2)
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ SAVE_RESULT (4)
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ SAVE_RESULT (5)
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ SAVE_RESULT (6)
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movb 8(%ecx), %al
+ movlpd %xmm0, (%edx)
+ movb %al, 8(%edx)
+ SAVE_RESULT (8)
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movw 8(%ecx), %ax
+ movlpd %xmm0, (%edx)
+ movw %ax, 8(%edx)
+ SAVE_RESULT (9)
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 7(%edx)
+ SAVE_RESULT (10)
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ SAVE_RESULT (12)
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ SAVE_RESULT (13)
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ SAVE_RESULT (14)
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+CFI_POP (%edi)
+
+# ifdef USE_AS_STRNCPY
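+/* L(FillN): store N zero bytes at (%ecx). %edx and %xmm0 are zero on entry. */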
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movw %dx, (%ecx)
+ movb %dl, 2(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%ecx)
+ movb %dl, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%ecx)
+ movw %dx, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movl %edx, (%ecx)
+ movl %edx, 3(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%ecx)
+ movb %dl, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%ecx)
+ movw %dx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 5(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 6(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(StrncpyFillExit1):
+ lea 16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+ test %ebx, %ebx
+ jz L(Fill0)
+ cmp $16, %ebx
+ je L(Fill16)
+ cmp $8, %ebx
+ je L(Fill8)
+ jg L(FillMore8)
+ cmp $4, %ebx
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %ebx
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %ebx
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %ebx
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %ebx
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %ebx
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ CFI_PUSH(%edi)
+
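+/* Zero-fill the rest of the destination, as strncpy requires: align %ecx to 16 bytes, clear 64 bytes per iteration with movdqa, then finish via L(FillFrom1To16Bytes). */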
+ .p2align 4
+L(StrncpyFillTailWithZero1):
+ POP (%edi)
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %edx, %edx
+ sub $16, %ebx
+ jbe L(StrncpyFillExit1)
+
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+
+ lea 16(%ecx), %ecx
+
+ mov %ecx, %edx
+ and $0xf, %edx
+ sub %edx, %ecx
+ add %edx, %ebx
+ xor %edx, %edx
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ movdqa %xmm0, 32(%ecx)
+ movdqa %xmm0, 48(%ecx)
+ lea 64(%ecx), %ecx
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %ebx
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ lea 32(%ecx), %ecx
+ sub $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+# endif
+
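+/* ExitTail labels: the whole string fits in the first 16 bytes. Copy it, set the return value, and for strncpy zero-fill the remainder. */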
+ .p2align 4
+L(ExitTail1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ SAVE_RESULT_TAIL (0)
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ SAVE_RESULT_TAIL (1)
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ SAVE_RESULT_TAIL (2)
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT_TAIL (3)
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ SAVE_RESULT_TAIL (4)
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ SAVE_RESULT_TAIL (5)
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ SAVE_RESULT_TAIL (6)
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT_TAIL (7)
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail9):
+ movlpd (%ecx), %xmm0
+ movb 8(%ecx), %al
+ movlpd %xmm0, (%edx)
+ movb %al, 8(%edx)
+ SAVE_RESULT_TAIL (8)
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail10):
+ movlpd (%ecx), %xmm0
+ movw 8(%ecx), %ax
+ movlpd %xmm0, (%edx)
+ movw %ax, 8(%edx)
+ SAVE_RESULT_TAIL (9)
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail11):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 7(%edx)
+ SAVE_RESULT_TAIL (10)
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT_TAIL (11)
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail13):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ SAVE_RESULT_TAIL (12)
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ SAVE_RESULT_TAIL (13)
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail15):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ SAVE_RESULT_TAIL (14)
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ SAVE_RESULT_TAIL (15)
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+# endif
+
+# ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ CFI_PUSH (%esi)
+ CFI_PUSH (%edi)
+# endif
+ .p2align 4
+L(StrncpyLeaveCase2OrCase3):
+ test %eax, %eax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase3)
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
+ .p2align 4
+L(StrncpyExit1Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ mov $15, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit2Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ mov $14, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit3Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ mov $13, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit4Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ mov $12, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit5Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 7(%edx)
+ mov $11, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit6Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 6(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 6(%edx)
+ mov $10, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit7Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 5(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 5(%edx)
+ mov $9, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit8Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $8, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit9Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $7, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit10Case2OrCase3):
+ movlpd -1(%ecx), %xmm0
+ movlpd %xmm0, -1(%edx)
+ mov $6, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit11Case2OrCase3):
+ movlpd -2(%ecx), %xmm0
+ movlpd %xmm0, -2(%edx)
+ mov $5, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit12Case2OrCase3):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit13Case2OrCase3):
+ movl -1(%ecx), %esi
+ movl %esi, -1(%edx)
+ mov $3, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit14Case2OrCase3):
+ movl -2(%ecx), %esi
+ movl %esi, -2(%edx)
+ mov $2, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit15Case2OrCase3):
+ movl -3(%ecx), %esi
+ movl %esi, -3(%edx)
+ mov $1, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit1):
+ lea 15(%edx, %esi), %edx
+ lea 15(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit2):
+ lea 14(%edx, %esi), %edx
+ lea 14(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit3):
+ lea 13(%edx, %esi), %edx
+ lea 13(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit4):
+ lea 12(%edx, %esi), %edx
+ lea 12(%ecx, %esi), %ecx
+ movlpd -12(%ecx), %xmm0
+ movl -4(%ecx), %eax
+ movlpd %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit5):
+ lea 11(%edx, %esi), %edx
+ lea 11(%ecx, %esi), %ecx
+ movlpd -11(%ecx), %xmm0
+ movl -4(%ecx), %eax
+ movlpd %xmm0, -11(%edx)
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit6):
+ lea 10(%edx, %esi), %edx
+ lea 10(%ecx, %esi), %ecx
+
+ movlpd -10(%ecx), %xmm0
+ movw -2(%ecx), %ax
+ movlpd %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit7):
+ lea 9(%edx, %esi), %edx
+ lea 9(%ecx, %esi), %ecx
+
+ movlpd -9(%ecx), %xmm0
+ movb -1(%ecx), %ah
+ movlpd %xmm0, -9(%edx)
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit8):
+ lea 8(%edx, %esi), %edx
+ lea 8(%ecx, %esi), %ecx
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit9):
+ lea 7(%edx, %esi), %edx
+ lea 7(%ecx, %esi), %ecx
+
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit10):
+ lea 6(%edx, %esi), %edx
+ lea 6(%ecx, %esi), %ecx
+
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit11):
+ lea 5(%edx, %esi), %edx
+ lea 5(%ecx, %esi), %ecx
+ movl -5(%ecx), %esi
+ movb -1(%ecx), %ah
+ movl %esi, -5(%edx)
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit12):
+ lea 4(%edx, %esi), %edx
+ lea 4(%ecx, %esi), %ecx
+ movl -4(%ecx), %eax
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit13):
+ lea 3(%edx, %esi), %edx
+ lea 3(%ecx, %esi), %ecx
+
+ movl -4(%ecx), %eax
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit14):
+ lea 2(%edx, %esi), %edx
+ lea 2(%ecx, %esi), %ecx
+ movw -2(%ecx), %ax
+ movw %ax, -2(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit15):
+ lea 1(%edx, %esi), %edx
+ lea 1(%ecx, %esi), %ecx
+ movb -1(%ecx), %ah
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+# endif
+
+# ifndef USE_AS_STRCAT
+# ifdef USE_AS_STRNCPY
+ CFI_POP (%esi)
+ CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail0):
+ movl %edx, %eax
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15Bytes):
+ cmp $12, %ebx
+ jbe L(StrncpyExit12Bytes)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmp $13, %ebx
+ je L(ExitTail13)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmp $14, %ebx
+ je L(ExitTail14)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12Bytes):
+ cmp $9, %ebx
+ je L(ExitTail9)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmp $10, %ebx
+ je L(ExitTail10)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmp $11, %ebx
+ je L(ExitTail11)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT_TAIL (11)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8Bytes):
+ cmp $4, %ebx
+ jbe L(StrncpyExit4Bytes)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+
+ cmp $5, %ebx
+ je L(ExitTail5)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmp $6, %ebx
+ je L(ExitTail6)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmp $7, %ebx
+ je L(ExitTail7)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4Bytes):
+ test %ebx, %ebx
+ jz L(ExitTail0)
+ cmp $1, %ebx
+ je L(ExitTail1)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmp $2, %ebx
+ je L(ExitTail2)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmp $3, %ebx
+ je L(ExitTail3)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT_TAIL (3)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+ RETURN
+# endif
+
+END (STRCPY)
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
new file mode 100644
index 0000000000..ffbc03c6d5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
@@ -0,0 +1,116 @@
+/* Multiple versions of strcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define STRCPY_IA32 __stpncpy_ia32
+# define __GI_STRCPY __GI_stpncpy
+# define __GI___STRCPY __GI___stpncpy
+# else
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define STRCPY_IA32 __stpcpy_ia32
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define STRCPY_IA32 __strncpy_ia32
+# define __GI_STRCPY __GI_strncpy
+# else
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define STRCPY_IA32 __strcpy_ia32
+# define __GI_STRCPY __GI_strcpy
+# endif
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncpy in the static library, since we
+   need strncpy before the initialization has happened.  */
+#if IS_IN (libc)
+
+ .text
+ENTRY(STRCPY)
+ .type STRCPY, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (STRCPY_IA32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCPY_SSE2)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)
+2: ret
+END(STRCPY)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCPY_IA32, @function; \
+ .align 16; \
+ .globl STRCPY_IA32; \
+ .hidden STRCPY_IA32; \
+ STRCPY_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32
+
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# include "../../stpncpy.S"
+# else
+# include "../../i586/stpcpy.S"
+# endif
+#else
+# ifndef USE_AS_STRNCPY
+# include "../../i586/strcpy.S"
+# endif
+#endif
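The ENTRY(STRCPY) block above is a hand-written IFUNC resolver: the dynamic
linker calls it once at relocation time, it leaves the address of the chosen
implementation in EAX, and all subsequent calls go directly to that
function.  A minimal C sketch of the same dispatch idea using GCC's ifunc
attribute; the *_impl functions are hypothetical stand-ins for the
_ia32/_sse2/_ssse3 variants, and glibc-internal criteria such as
Fast_Unaligned_Load have no public __builtin_cpu_supports equivalent, so
the selection order here is only illustrative:

    static char *
    strcpy_generic_impl (char *d, const char *s)
    {
      char *r = d;
      while ((*d++ = *s++) != '\0')
        ;
      return r;
    }

    /* Hypothetical stand-ins for the per-CPU variants.  */
    static char *strcpy_ia32_impl (char *d, const char *s)
    { return strcpy_generic_impl (d, s); }
    static char *strcpy_sse2_impl (char *d, const char *s)
    { return strcpy_generic_impl (d, s); }
    static char *strcpy_ssse3_impl (char *d, const char *s)
    { return strcpy_generic_impl (d, s); }

    /* The resolver runs once; its return value is patched into the
       GOT entry for my_strcpy.  */
    static char *(*resolve_strcpy (void)) (char *, const char *)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_supports ("ssse3"))
        return strcpy_ssse3_impl;
      if (__builtin_cpu_supports ("sse2"))
        return strcpy_sse2_impl;
      return strcpy_ia32_impl;
    }

    char *my_strcpy (char *, const char *)
      __attribute__ ((ifunc ("resolve_strcpy")));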
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32
+#include <sysdeps/x86_64/multiarch/strcspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..21e5093924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,75 @@
+/* Multiple versions of strcspn
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42 __strpbrk_sse42
+#define STRCSPN_IA32 __strpbrk_ia32
+#define __GI_STRCSPN __GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN strcspn
+#define STRCSPN_SSE42 __strcspn_sse42
+#define STRCSPN_IA32 __strcspn_ia32
+#define __GI_STRCSPN __GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in the static library, since we
+   need strpbrk before the initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
+ .text
+ENTRY(STRCSPN)
+ .type STRCSPN, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (STRCSPN_IA32)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)
+2: ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCSPN_IA32, @function; \
+ .globl STRCSPN_IA32; \
+ .p2align 4; \
+ STRCSPN_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif
+
+#ifdef USE_AS_STRPBRK
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
new file mode 100644
index 0000000000..d3ea864bab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -0,0 +1,125 @@
+/* strlen with SSE2 and BSF
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined SHARED && IS_IN (libc)
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+#define PARMS 4 + 8 /* Preserve ESI and EDI. */
+#define STR PARMS
+#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state
+#define RETURN POP (%edi); POP (%esi); ret; \
+ cfi_restore_state; cfi_remember_state
+
+ .text
+ENTRY (__strlen_sse2_bsf)
+ ENTRANCE
+ mov STR(%esp), %edi
+ xor %eax, %eax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%edi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %edi, %eax
+ and $-16, %eax
+ jmp L(align16_start)
+L(next):
+
+ mov %edi, %eax
+ and $-16, %eax
+ pcmpeqb (%eax), %xmm0
+ mov $-1, %esi
+ sub %eax, %ecx
+ shl %cl, %esi
+ pmovmskb %xmm0, %edx
+ and %esi, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ .p2align 4
+L(align16_loop):
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(align16_loop)
+L(exit):
+ sub %edi, %eax
+L(exit_less16):
+ bsf %edx, %edx
+ add %edx, %eax
+ RETURN
+L(exit16):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $16, %eax
+ RETURN
+L(exit32):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $32, %eax
+ RETURN
+L(exit48):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $48, %eax
+ POP (%edi)
+ POP (%esi)
+ ret
+
+END (__strlen_sse2_bsf)
+
+#endif
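The prolog at L(next) above shows a standard way to start a vector scan on
an unaligned string: round the pointer down to 16 bytes, do an aligned load
(which cannot cross a page boundary), and shift an all-ones mask left by the
misalignment so that pcmpeqb result bits belonging to bytes before the start
of the string are discarded.  A sketch of that step with SSE2 intrinsics,
under the same assumptions:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Bitmask of NUL positions in the first aligned 16-byte chunk,
       with bits before the actual start of the string masked off.  */
    static unsigned int
    first_chunk_nul_mask (const char *s)
    {
      uintptr_t misalign = (uintptr_t) s & 15;
      const __m128i *base = (const __m128i *) ((uintptr_t) s - misalign);
      unsigned int mask =
        _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 (base),
                                           _mm_setzero_si128 ()));
      return mask & (~0u << misalign);
    }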
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
new file mode 100644
index 0000000000..36fc1469d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -0,0 +1,695 @@
+/* strlen with SSE2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* For strlen, only the SHARED version is optimized; for strcat, strncat
+   and strnlen, both the STATIC and SHARED versions are optimized.  */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+
+# include <sysdep.h>
+# define PARMS 4
+# define STR PARMS
+# define RETURN ret
+
+# ifdef USE_AS_STRNLEN
+# define LEN PARMS + 8
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+# undef RETURN
+# define RETURN POP (%edi); CFI_PUSH (%edi); ret
+# endif
+
+# ifndef STRLEN
+# define STRLEN __strlen_sse2
+# endif
+
+ atom_text_section
+ENTRY (STRLEN)
+ mov STR(%esp), %edx
+# ifdef USE_AS_STRNLEN
+ PUSH (%edi)
+ movl LEN(%esp), %edi
+ sub $4, %edi
+ jbe L(len_less4_prolog)
+# endif
+# endif
+ xor %eax, %eax
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less8_prolog)
+# endif
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less12_prolog)
+# endif
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less16_prolog)
+# endif
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+
+ pxor %xmm0, %xmm0
+ lea 16(%edx), %eax
+ mov %eax, %ecx
+ and $-16, %eax
+
+# ifdef USE_AS_STRNLEN
+ and $15, %edx
+ add %edx, %edi
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ mov %eax, %edx
+ and $63, %edx
+ add %edx, %edi
+# endif
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqb %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 64(%eax), %eax
+ jz L(aligned_64_loop)
+
+ pcmpeqb -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 48(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqb -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqb %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+L(exit):
+ sub %ecx, %eax
+ test %dl, %dl
+ jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_8)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+ test $0x02, %dl
+ jnz L(exit_tail1)
+ test $0x04, %dl
+ jnz L(exit_tail2)
+ add $3, %eax
+ RETURN
+
+ .p2align 4
+L(exit_8):
+ test $0x10, %dl
+ jnz L(exit_tail4)
+ test $0x20, %dl
+ jnz L(exit_tail5)
+ test $0x40, %dl
+ jnz L(exit_tail6)
+ add $7, %eax
+ RETURN
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_high_8)
+ test $0x01, %dh
+ jnz L(exit_tail8)
+ test $0x02, %dh
+ jnz L(exit_tail9)
+ test $0x04, %dh
+ jnz L(exit_tail10)
+ add $11, %eax
+ RETURN
+
+ .p2align 4
+L(exit_high_8):
+ test $0x10, %dh
+ jnz L(exit_tail12)
+ test $0x20, %dh
+ jnz L(exit_tail13)
+ test $0x40, %dh
+ jnz L(exit_tail14)
+ add $15, %eax
+L(exit_tail0):
+ RETURN
+
+# ifdef USE_AS_STRNLEN
+
+ .p2align 4
+L(len_less64):
+ pxor %xmm0, %xmm0
+ add $64, %edi
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ movl LEN(%esp), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit):
+ sub %ecx, %eax
+
+ test %dl, %dl
+ jz L(strnlen_exit_high)
+ mov %dl, %cl
+ and $15, %cl
+ jz L(strnlen_exit_8)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+ test $0x02, %dl
+ jnz L(strnlen_exit_tail1)
+ test $0x04, %dl
+ jnz L(strnlen_exit_tail2)
+ sub $4, %edi
+ jb L(return_start_len)
+ lea 3(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_8):
+ test $0x10, %dl
+ jnz L(strnlen_exit_tail4)
+ test $0x20, %dl
+ jnz L(strnlen_exit_tail5)
+ test $0x40, %dl
+ jnz L(strnlen_exit_tail6)
+ sub $8, %edi
+ jb L(return_start_len)
+ lea 7(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(strnlen_exit_high_8)
+ test $0x01, %dh
+ jnz L(strnlen_exit_tail8)
+ test $0x02, %dh
+ jnz L(strnlen_exit_tail9)
+ test $0x04, %dh
+ jnz L(strnlen_exit_tail10)
+ sub $12, %edi
+ jb L(return_start_len)
+ lea 11(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_high_8):
+ test $0x10, %dh
+ jnz L(strnlen_exit_tail12)
+ test $0x20, %dh
+ jnz L(strnlen_exit_tail13)
+ test $0x40, %dh
+ jnz L(strnlen_exit_tail14)
+ sub $16, %edi
+ jb L(return_start_len)
+ lea 15(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail1):
+ sub $2, %edi
+ jb L(return_start_len)
+ lea 1(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail2):
+ sub $3, %edi
+ jb L(return_start_len)
+ lea 2(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail4):
+ sub $5, %edi
+ jb L(return_start_len)
+ lea 4(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail5):
+ sub $6, %edi
+ jb L(return_start_len)
+ lea 5(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail6):
+ sub $7, %edi
+ jb L(return_start_len)
+ lea 6(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail8):
+ sub $9, %edi
+ jb L(return_start_len)
+ lea 8(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail9):
+ sub $10, %edi
+ jb L(return_start_len)
+ lea 9(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail10):
+ sub $11, %edi
+ jb L(return_start_len)
+ lea 10(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail12):
+ sub $13, %edi
+ jb L(return_start_len)
+ lea 12(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail13):
+ sub $14, %edi
+ jb L(return_start_len)
+ lea 13(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail14):
+ sub $15, %edi
+ jb L(return_start_len)
+ lea 14(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(return_start_len):
+ movl LEN(%esp), %eax
+ RETURN
+
+/* For the prolog only.  */
+
+ .p2align 4
+L(len_less4_prolog):
+ xor %eax, %eax
+
+ add $4, %edi
+ jz L(exit_tail0)
+
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmp $1, %edi
+ je L(exit_tail1)
+
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmp $2, %edi
+ je L(exit_tail2)
+
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmp $3, %edi
+ je L(exit_tail3)
+
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+ mov $4, %eax
+ RETURN
+
+ .p2align 4
+L(len_less8_prolog):
+ add $4, %edi
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmp $1, %edi
+ je L(exit_tail5)
+
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmp $2, %edi
+ je L(exit_tail6)
+
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmp $3, %edi
+ je L(exit_tail7)
+
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+ mov $8, %eax
+ RETURN
+
+
+ .p2align 4
+L(len_less12_prolog):
+ add $4, %edi
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmp $1, %edi
+ je L(exit_tail9)
+
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmp $2, %edi
+ je L(exit_tail10)
+
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmp $3, %edi
+ je L(exit_tail11)
+
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+ mov $12, %eax
+ RETURN
+
+ .p2align 4
+L(len_less16_prolog):
+ add $4, %edi
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmp $1, %edi
+ je L(exit_tail13)
+
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmp $2, %edi
+ je L(exit_tail14)
+
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmp $3, %edi
+ je L(exit_tail15)
+
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+ mov $16, %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(exit_tail1):
+ add $1, %eax
+ RETURN
+
+L(exit_tail2):
+ add $2, %eax
+ RETURN
+
+L(exit_tail3):
+ add $3, %eax
+ RETURN
+
+L(exit_tail4):
+ add $4, %eax
+ RETURN
+
+L(exit_tail5):
+ add $5, %eax
+ RETURN
+
+L(exit_tail6):
+ add $6, %eax
+ RETURN
+
+L(exit_tail7):
+ add $7, %eax
+ RETURN
+
+L(exit_tail8):
+ add $8, %eax
+ RETURN
+
+L(exit_tail9):
+ add $9, %eax
+ RETURN
+
+L(exit_tail10):
+ add $10, %eax
+ RETURN
+
+L(exit_tail11):
+ add $11, %eax
+ RETURN
+
+L(exit_tail12):
+ add $12, %eax
+ RETURN
+
+L(exit_tail13):
+ add $13, %eax
+ RETURN
+
+L(exit_tail14):
+ add $14, %eax
+ RETURN
+
+L(exit_tail15):
+ add $15, %eax
+# ifndef USE_AS_STRCAT
+ RETURN
+END (STRLEN)
+# endif
+#endif
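The scan above is built from pcmpeqb against an all-zero register followed
by pmovmskb: each bit of the mask marks a NUL byte, and the exit paths turn
the mask into an offset.  The aligned 64-byte loop first folds four chunks
together with pminub, so a single compare detects a zero anywhere in the 64
bytes.  A sketch of the basic 16-byte step with intrinsics; unlike the real
code, which aligns the pointer first, this illustrative version uses
unaligned loads and so may read past the end of the string's page:

    #include <emmintrin.h>
    #include <stddef.h>

    /* Scan 16 bytes at a time for the terminating NUL.  */
    static size_t
    strlen_sse2_sketch (const char *s)
    {
      const __m128i zero = _mm_setzero_si128 ();
      for (size_t i = 0; ; i += 16)
        {
          __m128i chunk = _mm_loadu_si128 ((const __m128i *) (s + i));
          int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));
          if (mask != 0)
            return i + (size_t) __builtin_ctz (mask);
        }
    }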
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..77cf6bcdb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,60 @@
+/* Multiple versions of strlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need strlen before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+ .text
+ENTRY(strlen)
+ .type strlen, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strlen_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strlen_sse2)
+2: ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strlen_ia32, @function; \
+ .globl __strlen_ia32; \
+ .p2align 4; \
+ __strlen_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
new file mode 100644
index 0000000000..76581eb62b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
@@ -0,0 +1,8 @@
+#include <string.h>
+
+extern __typeof (strncasecmp) __strncasecmp_nonascii;
+
+#define __strncasecmp __strncasecmp_nonascii
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
new file mode 100644
index 0000000000..a56e63a566
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strncasecmp.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY(__strncasecmp)
+ .type __strncasecmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_SSE4_2)
+ jnz 2f
+ LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
+2: ret
+END(__strncasecmp)
+
+weak_alias (__strncasecmp, strncasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
new file mode 100644
index 0000000000..7e601af271
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii;
+
+#define __strncasecmp_l __strncasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL 1
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32)
+
+/* The needs of strncasecmp_l in libc are minimal, so there is no need
+   to go through the IFUNC.  */
+strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
new file mode 100644
index 0000000000..557210832e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
new file mode 100644
index 0000000000..d438a1ae35
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
new file mode 100644
index 0000000000..8a74ee8574
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strncasecmp_l
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCMP __strncasecmp_l
+#define USE_AS_STRNCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strncasecmp_l, strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
new file mode 100644
index 0000000000..132a000545
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_ia32
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+ __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
+#endif
+
+#include "string/strncat.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
new file mode 100644
index 0000000000..f1045b72b8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
@@ -0,0 +1,4 @@
+#define STRCAT __strncat_sse2
+#define USE_AS_STRNCAT
+
+#include "strcat-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..625b90a978
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
@@ -0,0 +1,4 @@
+#define STRCAT __strncat_ssse3
+#define USE_AS_STRNCAT
+
+#include "strcat-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
new file mode 100644
index 0000000000..5c1bf41453
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncat
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP __strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP __strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..150d4786d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncmp
+ All versions must be listed in ifunc-impl-list.c. */
+#define USE_AS_STRNCMP
+#define STRCMP strncmp
+#include "strcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..201e3f98b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_ia32
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32);
+#endif
+
+#include "string/strncpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
new file mode 100644
index 0000000000..bdd99239a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
new file mode 100644
index 0000000000..9c257efc6e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncpy
+ All versions must be listed in ifunc-impl-list.c. */
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "strcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 0000000000..351e939a93
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,10 @@
+#define STRNLEN __strnlen_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \
+ strong_alias (__strnlen_ia32, __strnlen_ia32_1); \
+ __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1);
+#endif
+
+#include "string/strnlen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 0000000000..56b6ae2a5c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 0000000000..d241522c70
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of strnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__strnlen)
+ .type __strnlen, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strnlen_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strnlen_sse2)
+2: ret
+END(__strnlen)
+
+weak_alias (__strnlen, strnlen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..7201d6376f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strpbrk
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
new file mode 100644
index 0000000000..39a7c8825b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
@@ -0,0 +1,282 @@
+/* strrchr with SSE2 with bsf and bsr
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ .text
+ENTRY (__strrchr_sse2_bsf)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ PUSH (%edi)
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ pshufd $0, %xmm1, %xmm1
+ ja L(crosscache)
+
+/* unaligned string. */
+ movdqu (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ and $-16, %edi
+ add $16, %edi
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_return_value1):
+ bsf %edx, %ecx
+ mov $2, %edx
+ shl %cl, %edx
+ sub $1, %edx
+ and %edx, %eax
+ jz L(return_null)
+ bsr %eax, %eax
+ add %edi, %eax
+ POP (%edi)
+ ret
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_match1):
+ test %edx, %edx
+ jnz L(unaligned_return_value1)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ lea 16(%edi), %esi
+ and $-16, %edi
+ add $16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string. */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ add $16, %edi
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_return_value):
+ add %ecx, %edi
+ bsf %edx, %ecx
+ mov $2, %edx
+ shl %cl, %edx
+ sub $1, %edx
+ and %edx, %eax
+ jz L(return_null)
+ bsr %eax, %eax
+ add %edi, %eax
+ POP (%edi)
+ ret
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(unaligned_return_value)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ add $16, %edi
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %ebx, %ebx
+ jz L(return_null_1)
+ bsr %ebx, %eax
+ add %esi, %eax
+
+ POP (%ebx)
+ POP (%esi)
+
+ sub $16, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(return_value_1)
+ mov %eax, %ebx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(return_value_1):
+ bsf %ecx, %ecx
+ mov $2, %edx
+ shl %cl, %edx
+ sub $1, %edx
+ and %edx, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+
+ bsr %eax, %eax
+ add %edi, %eax
+ sub $16, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+/* Return NULL. */
+ .p2align 4
+L(return_null_1):
+ POP (%ebx)
+ POP (%esi)
+ POP (%edi)
+ xor %eax, %eax
+ ret
+
+END (__strrchr_sse2_bsf)
+#endif
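
The return paths above all use the same bit trick: bsf on the NUL mask finds the end of the string within the 16-byte block, (2 << bsf) - 1 discards match bits past it, and bsr picks the last surviving match. A scalar C sketch of that arithmetic, operating on the 16-bit pmovmskb masks (demo name, hypothetical):

/* match_mask / null_mask are pmovmskb results for "byte equals c" and
   "byte is NUL".  Returns the byte index of the last match before the
   terminator, or -1 when the function should return NULL.  */
static int last_match_before_nul_demo (unsigned int match_mask,
                                       unsigned int null_mask)
{
  if (null_mask != 0)
    match_mask &= (2u << __builtin_ctz (null_mask)) - 1;  /* bsf */
  if (match_mask == 0)
    return -1;
  return 31 - __builtin_clz (match_mask);                 /* bsr */
}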
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
new file mode 100644
index 0000000000..20934288be
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -0,0 +1,708 @@
+/* strrchr SSE2 without bsf and bsr
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH(%edi);
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__strrchr_sse2)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ pshufd $0, %xmm1, %xmm1
+ ja L(crosscache)
+
+/* unaligned string. */
+ movdqu (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %ecx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %ecx, %ecx
+ jnz L(return_null)
+
+ and $-16, %edi
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_match1):
+ test %ecx, %ecx
+ jnz L(prolog_find_zero_1)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ mov %edi, %esi
+ and $-16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(crosscache):
+/* Hancle unaligned string. */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(prolog_find_zero)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %ebx, %ebx
+ jz L(return_null_1)
+ mov %ebx, %eax
+ mov %esi, %edi
+
+ POP (%ebx)
+ POP (%esi)
+
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(return_null_1):
+ POP (%ebx)
+ POP (%esi)
+
+ xor %eax, %eax
+ RETURN
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(find_zero)
+ mov %eax, %ebx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test %cl, %cl
+ jz L(find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(find_zero_8)
+ test $0x01, %cl
+ jnz L(FindZeroExit1)
+ test $0x02, %cl
+ jnz L(FindZeroExit2)
+ test $0x04, %cl
+ jnz L(FindZeroExit3)
+ and $1 << 4 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_8):
+ test $0x10, %cl
+ jnz L(FindZeroExit5)
+ test $0x20, %cl
+ jnz L(FindZeroExit6)
+ test $0x40, %cl
+ jnz L(FindZeroExit7)
+ and $1 << 8 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(find_zero_high_8)
+ test $0x01, %ch
+ jnz L(FindZeroExit9)
+ test $0x02, %ch
+ jnz L(FindZeroExit10)
+ test $0x04, %ch
+ jnz L(FindZeroExit11)
+ and $1 << 12 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_high_8):
+ test $0x10, %ch
+ jnz L(FindZeroExit13)
+ test $0x20, %ch
+ jnz L(FindZeroExit14)
+ test $0x40, %ch
+ jnz L(FindZeroExit15)
+ and $1 << 16 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit1):
+ and $1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit2):
+ and $1 << 2 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit3):
+ and $1 << 3 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit5):
+ and $1 << 5 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit6):
+ and $1 << 6 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit7):
+ and $1 << 7 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit9):
+ and $1 << 9 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit10):
+ and $1 << 10 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit11):
+ and $1 << 11 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit13):
+ and $1 << 13 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit14):
+ and $1 << 14 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit15):
+ and $1 << 15 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+
+ .p2align 4
+L(match_exit):
+ test %ah, %ah
+ jnz L(match_exit_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(match_exit_8)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x02, %al
+ jnz L(Exit2)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_exit_8):
+ test $0x80, %al
+ jnz L(Exit8)
+ test $0x40, %al
+ jnz L(Exit7)
+ test $0x20, %al
+ jnz L(Exit6)
+ lea -12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_exit_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(match_exit_high_8)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x02, %ah
+ jnz L(Exit10)
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_exit_high_8):
+ test $0x80, %ah
+ jnz L(Exit16)
+ test $0x40, %ah
+ jnz L(Exit15)
+ test $0x20, %ah
+ jnz L(Exit14)
+ lea -4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ lea -15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ lea -14(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ lea -13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ lea -11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ lea -10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ lea -9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ lea -7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ lea -6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ lea -5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ lea -3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ lea -2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ lea -1(%edi), %eax
+ RETURN
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero):
+ add %ecx, %edi
+ mov %edx, %ecx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(prolog_find_zero_8)
+ test $0x01, %cl
+ jnz L(PrologFindZeroExit1)
+ test $0x02, %cl
+ jnz L(PrologFindZeroExit2)
+ test $0x04, %cl
+ jnz L(PrologFindZeroExit3)
+ and $1 << 4 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_8):
+ test $0x10, %cl
+ jnz L(PrologFindZeroExit5)
+ test $0x20, %cl
+ jnz L(PrologFindZeroExit6)
+ test $0x40, %cl
+ jnz L(PrologFindZeroExit7)
+ and $1 << 8 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(prolog_find_zero_high_8)
+ test $0x01, %ch
+ jnz L(PrologFindZeroExit9)
+ test $0x02, %ch
+ jnz L(PrologFindZeroExit10)
+ test $0x04, %ch
+ jnz L(PrologFindZeroExit11)
+ and $1 << 12 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_high_8):
+ test $0x10, %ch
+ jnz L(PrologFindZeroExit13)
+ test $0x20, %ch
+ jnz L(PrologFindZeroExit14)
+ test $0x40, %ch
+ jnz L(PrologFindZeroExit15)
+ and $1 << 16 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit1):
+ and $1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit2):
+ and $1 << 2 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit3):
+ and $1 << 3 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit5):
+ and $1 << 5 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit6):
+ and $1 << 6 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit7):
+ and $1 << 7 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit9):
+ and $1 << 9 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit10):
+ and $1 << 10 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit11):
+ and $1 << 11 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit13):
+ and $1 << 13 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit14):
+ and $1 << 14 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit15):
+ and $1 << 15 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+END (__strrchr_sse2)
+#endif
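
This variant avoids bsf/bsr by narrowing the first NUL down with a branch ladder and masking earlier matches with (1 << (k + 1)) - 1; note that GAS binds << tighter than -, so `$1 << 4 - 1` assembles to 15. A scalar C sketch of the ladder (demo name, hypothetical; assumes the caller already checked that the NUL mask is nonzero, as the jump to L(find_zero) does):

/* Keep only the match bits at or before the first NUL byte, without
   using bsf: locate the first set bit of null_mask by nibbles and
   single bits, then mask with (1 << (k + 1)) - 1.  */
static unsigned int matches_before_nul_demo (unsigned int match_mask,
                                             unsigned int null_mask)
{
  int k = 0;
  if ((null_mask & 0x00ff) == 0) { null_mask >>= 8; k += 8; }
  if ((null_mask & 0x000f) == 0) { null_mask >>= 4; k += 4; }
  if ((null_mask & 0x0003) == 0) { null_mask >>= 2; k += 2; }
  if ((null_mask & 0x0001) == 0) k += 1;
  return match_mask & ((1u << (k + 1)) - 1);
}

The ifunc selector that follows prefers the bsf/bsr version but falls back to this one when the Slow_BSF feature bit is set, which records CPUs (such as early Atom cores) where those instructions are microcoded and slow.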
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
new file mode 100644
index 0000000000..d9281eaeae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(strrchr)
+ .type strrchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strrchr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strrchr_sse2)
+2: ret
+END(strrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strrchr_ia32, @function; \
+ .globl __strrchr_ia32; \
+ .p2align 4; \
+ __strrchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library,
+   since they are called without setting up EBX, which the PLT code
+   used by IFUNC requires.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strrchr; __GI_strrchr = __strrchr_ia32
+#endif
+
+#include "../../strrchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32
+#include <sysdeps/x86_64/multiarch/strspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..1269062381
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strspn
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(strspn)
+ .type strspn, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strspn_ia32)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strspn_sse42)
+2: ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strspn_ia32, @function; \
+ .globl __strspn_ia32; \
+ .p2align 4; \
+__strspn_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library,
+   since they are called without setting up EBX, which the PLT code
+   used by IFUNC requires.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strspn; __GI_strspn = __strspn_ia32
+#endif
+
+#include "../../strspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
new file mode 100644
index 0000000000..593cfec273
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/test-multiarch.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
new file mode 100644
index 0000000000..7760b966e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
new file mode 100644
index 0000000000..7c72c70d67
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000000..38d41d04de
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,22 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# undef libc_hidden_weak
+# define libc_hidden_weak(name)
+
+# undef weak_alias
+# define weak_alias(name,alias)
+
+# ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \
+ strong_alias (__wcschr_ia32, __wcschr_ia32_1); \
+ __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1);
+# endif
+#endif
+
+extern __typeof (wcschr) __wcschr_ia32;
+
+#define WCSCHR __wcschr_ia32
+#include <wcsmbs/wcschr.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000000..9ff6c3b8d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,219 @@
+/* wcschr with SSE2, without using bsf instructions
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__wcschr_sse2)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %eax
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+ and $63, %eax
+ cmp $48, %eax
+ ja L(cross_cache)
+
+ movdqu (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ and $-16, %ecx
+ jmp L(loop)
+
+ .p2align 4
+L(cross_cache):
+ PUSH (%edi)
+ mov %ecx, %edi
+ mov %eax, %ecx
+ and $-16, %edi
+ and $15, %ecx
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ jz L(unaligned_no_match)
+
+ add %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jz L(match_case1)
+ test %al, %al
+ jz L(match_high_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_no_match):
+ mov %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ pxor %xmm2, %xmm2
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ add $16, %ecx
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ pmovmskb %xmm2, %edx
+ test %eax, %eax
+ jz L(return_null)
+ test %edx, %edx
+ jz L(match_case1)
+
+ .p2align 4
+L(match_case2):
+ test %al, %al
+ jz L(match_high_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_4):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(match_high_case2):
+ test %dl, %dl
+ jnz L(return_null)
+ test $15, %ah
+ jnz L(match_case2_12)
+ test $15, %dh
+ jnz L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_12):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case1):
+ test %al, %al
+ jz L(match_high_case1)
+
+ test $0x01, %al
+ jnz L(exit0)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_high_case1):
+ test $0x01, %ah
+ jnz L(exit3)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit0):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit3):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+END (__wcschr_sse2)
+#endif
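
Because wchar_t is 4 bytes on this target, pcmpeqd produces an all-ones dword per matching element, and pmovmskb therefore sets four identical mask bits per element; the %al/%ah and $15/$0x01 tests above are simply asking which of the four dwords in the 16-byte block hit first. A C sketch of that mask decoding (demo name, hypothetical):

/* byte_mask is a 16-bit pmovmskb result over a pcmpeqd comparison.
   Each wchar_t element owns one nibble of the mask.  */
static int first_matching_dword_demo (unsigned int byte_mask)
{
  if (byte_mask & 0x000f) return 0;   /* bytes  0-3  -> element 0 */
  if (byte_mask & 0x00f0) return 1;   /* bytes  4-7  -> element 1 */
  if (byte_mask & 0x0f00) return 2;   /* bytes  8-11 -> element 2 */
  if (byte_mask & 0xf000) return 3;   /* bytes 12-15 -> element 3 */
  return -1;                          /* no element matched */
}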
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000000..d3c65a6436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcschr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__wcschr)
+ .type __wcschr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcschr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcschr_sse2)
+2: ret
+END(__wcschr)
+weak_alias (__wcschr, wcschr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
new file mode 100644
index 0000000000..e3337d77e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
@@ -0,0 +1,14 @@
+#include <wchar.h>
+
+#define WCSCMP __wcscmp_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32);
+#endif
+#undef weak_alias
+#define weak_alias(name, alias)
+
+extern __typeof (wcscmp) __wcscmp_ia32;
+
+#include "wcsmbs/wcscmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
new file mode 100644
index 0000000000..a464b58204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -0,0 +1,1018 @@
+/* wcscmp with SSE2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define ENTRANCE PUSH(%esi); PUSH(%edi)
+# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. */
+
+ .text
+ENTRY (__wcscmp_sse2)
+/* This implementation uses SSE2 to compare up to 16 bytes at a time.  */
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %eax
+
+ mov (%eax), %ecx
+ cmp %ecx, (%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 4(%eax), %ecx
+ cmp %ecx, 4(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 8(%eax), %ecx
+ cmp %ecx, 8(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 12(%eax), %ecx
+ cmp %ecx, 12(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ ENTRANCE
+ add $16, %eax
+ add $16, %edx
+
+ mov %eax, %esi
+ mov %edx, %edi
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ mov %al, %ch
+ mov %dl, %cl
+ and $63, %eax /* esi alignment in cache line */
+ and $63, %edx /* edi alignment in cache line */
+ and $15, %cl
+ jz L(continue_00)
+ cmp $16, %edx
+ jb L(continue_0)
+ cmp $32, %edx
+ jb L(continue_16)
+ cmp $48, %edx
+ jb L(continue_32)
+
+L(continue_48):
+ and $15, %ch
+ jz L(continue_48_00)
+ cmp $16, %eax
+ jb L(continue_0_48)
+ cmp $32, %eax
+ jb L(continue_16_48)
+ cmp $48, %eax
+ jb L(continue_32_48)
+
+ .p2align 4
+L(continue_48_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_48_48)
+
+L(continue_0):
+ and $15, %ch
+ jz L(continue_0_00)
+ cmp $16, %eax
+ jb L(continue_0_0)
+ cmp $32, %eax
+ jb L(continue_0_16)
+ cmp $48, %eax
+ jb L(continue_0_32)
+
+ .p2align 4
+L(continue_0_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ mov 48(%esi), %ecx
+ cmp %ecx, 48(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 52(%esi), %ecx
+ cmp %ecx, 52(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 56(%esi), %ecx
+ cmp %ecx, 56(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 60(%esi), %ecx
+ cmp %ecx, 60(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_0_48)
+
+ .p2align 4
+L(continue_00):
+ and $15, %ch
+ jz L(continue_00_00)
+ cmp $16, %eax
+ jb L(continue_00_0)
+ cmp $32, %eax
+ jb L(continue_00_16)
+ cmp $48, %eax
+ jb L(continue_00_32)
+
+ .p2align 4
+L(continue_00_48):
+ pcmpeqd (%edi), %xmm0
+ mov (%edi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%esi), %eax
+ jne L(nequal)
+
+ mov 4(%edi), %eax
+ cmp 4(%esi), %eax
+ jne L(nequal)
+
+ mov 8(%edi), %eax
+ cmp 8(%esi), %eax
+ jne L(nequal)
+
+ mov 12(%edi), %eax
+ cmp 12(%esi), %eax
+ jne L(nequal)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_32):
+ and $15, %ch
+ jz L(continue_32_00)
+ cmp $16, %eax
+ jb L(continue_0_32)
+ cmp $32, %eax
+ jb L(continue_16_32)
+ cmp $48, %eax
+ jb L(continue_32_32)
+
+ .p2align 4
+L(continue_32_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 16(%esi), %ecx
+ cmp %ecx, 16(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 20(%esi), %ecx
+ cmp %ecx, 20(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 24(%esi), %ecx
+ cmp %ecx, 24(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 28(%esi), %ecx
+ cmp %ecx, 28(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results */
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_16):
+ and $15, %ch
+ jz L(continue_16_00)
+ cmp $16, %eax
+ jb L(continue_0_16)
+ cmp $32, %eax
+ jb L(continue_16_16)
+ cmp $48, %eax
+ jb L(continue_16_32)
+
+ .p2align 4
+L(continue_16_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ mov 32(%esi), %ecx
+ cmp %ecx, 32(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 36(%esi), %ecx
+ cmp %ecx, 36(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 40(%esi), %ecx
+ cmp %ecx, 40(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 44(%esi), %ecx
+ cmp %ecx, 44(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_00_00):
+ movdqa (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqa 16(%edi), %xmm3
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqa 32(%edi), %xmm5
+ pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm5 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm5 /* packed sub of comparison results*/
+ pmovmskb %xmm5, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqa 48(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_00_00)
+
+ .p2align 4
+L(continue_00_32):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_16):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_0):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_48_00):
+ pcmpeqd (%esi), %xmm0
+ mov (%edi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%esi), %eax
+ jne L(nequal)
+
+ mov 4(%edi), %eax
+ cmp 4(%esi), %eax
+ jne L(nequal)
+
+ mov 8(%edi), %eax
+ cmp 8(%esi), %eax
+ jne L(nequal)
+
+ mov 12(%edi), %eax
+ cmp 12(%esi), %eax
+ jne L(nequal)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_16_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_0_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_16_16):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm3
+ movdqu 16(%esi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_0):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm3
+ movdqu 16(%esi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_16):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_0_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_16_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(less4_double_words1):
+ cmp (%esi), %eax
+ jne L(nequal)
+ test %eax, %eax
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(less4_double_words):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_16):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_16)
+ and $15, %dl
+ jz L(second_double_word_16)
+ mov 16(%esi), %ecx
+ cmp %ecx, 16(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_16):
+ mov 20(%esi), %ecx
+ cmp %ecx, 20(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_16):
+ and $15, %dh
+ jz L(fourth_double_word_16)
+ mov 24(%esi), %ecx
+ cmp %ecx, 24(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_16):
+ mov 28(%esi), %ecx
+ cmp %ecx, 28(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_32):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_32)
+ and $15, %dl
+ jz L(second_double_word_32)
+ mov 32(%esi), %ecx
+ cmp %ecx, 32(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_32):
+ mov 36(%esi), %ecx
+ cmp %ecx, 36(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_32):
+ and $15, %dh
+ jz L(fourth_double_word_32)
+ mov 40(%esi), %ecx
+ cmp %ecx, 40(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_32):
+ mov 44(%esi), %ecx
+ cmp %ecx, 44(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_48):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_48)
+ and $15, %dl
+ jz L(second_double_word_48)
+ mov 48(%esi), %ecx
+ cmp %ecx, 48(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_48):
+ mov 52(%esi), %ecx
+ cmp %ecx, 52(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_48):
+ and $15, %dh
+ jz L(fourth_double_word_48)
+ mov 56(%esi), %ecx
+ cmp %ecx, 56(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_48):
+ mov 60(%esi), %ecx
+ cmp %ecx, 60(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(return)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(return):
+ RETURN
+
+ .p2align 4
+L(equal):
+ xorl %eax, %eax
+ RETURN
+
+ CFI_POP (%edi)
+ CFI_POP (%esi)
+
+ .p2align 4
+L(neq):
+ mov $1, %eax
+ jg L(neq_bigger)
+ neg %eax
+
+L(neq_bigger):
+ ret
+
+ .p2align 4
+L(eq):
+ xorl %eax, %eax
+ ret
+
+END (__wcscmp_sse2)
+#endif
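
Each 16-byte block above is screened with one pattern: pcmpeqd against zero builds a NUL mask, pcmpeqd against the other string builds an equality mask, and psubb combines the two vectors so that pmovmskb reads 0xffff exactly when all four dwords compare equal and none is NUL; a single `sub $0xffff` then covers both "strings differ" and "string ends" with one branch. A scalar C sketch of the predicate over the 16-bit masks (demo name, hypothetical):

/* eq_mask / nul_mask are pmovmskb results.  The block can be skipped
   only when every byte position compared equal and no element was the
   terminating NUL; otherwise the less4_double_words paths determine
   which of the four dwords differs or ends the string.  */
static int block_needs_slow_path_demo (unsigned int eq_mask,
                                       unsigned int nul_mask)
{
  return ((eq_mask & ~nul_mask) & 0xffff) != 0xffff;
}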
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
new file mode 100644
index 0000000000..7118bdd4db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -0,0 +1,39 @@
+/* Multiple versions of wcscmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+ DSO. In static binaries, we need wcscmp before the initialization
+ happened. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcscmp)
+ .type __wcscmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcscmp_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcscmp_sse2)
+2: ret
+END(__wcscmp)
+weak_alias (__wcscmp, wcscmp)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..fb3000392b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcscpy __wcscpy_ia32
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..6280ba92ab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,600 @@
+/* wcscpy with SSSE3
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+ atom_text_section
+ENTRY (__wcscpy_ssse3)
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+
+ cmp $0, (%ecx)
+ jz L(ExitTail4)
+ cmp $0, 4(%ecx)
+ jz L(ExitTail8)
+ cmp $0, 8(%ecx)
+ jz L(ExitTail12)
+ cmp $0, 12(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+ PUSH (%esi)
+ lea 16(%ecx), %esi
+
+ and $-16, %esi
+
+ pxor %xmm0, %xmm0
+ pcmpeqd (%esi), %xmm0
+ movdqu (%ecx), %xmm1
+ movdqu %xmm1, (%edx)
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+ and $-16, %edx
+ sub %edx, %eax
+
+ sub %eax, %ecx
+ mov %ecx, %eax
+ and $0xf, %eax
+ mov $0, %esi
+
+ jz L(Align16Both)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $8, %eax
+ je L(Shl8)
+ jmp L(Shl12)
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+
+ mov $-0x40, %esi
+
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqd %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqd %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ mov $-0x40, %esi
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ POP (%esi)
+ add $12, %edx
+ add $12, %ecx
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ POP (%esi)
+ add $8, %edx
+ add $8, %ecx
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start)
+
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit12)
+L(Exit16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edi, %eax
+ RETURN
+
+CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
+END (__wcscpy_ssse3)
+#endif
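Throughout the routine the pcmpeqd/pmovmskb pair tests four wide characters at once: comparing a 16-byte block against an all-zero register turns each equal dword lane into all ones, and pmovmskb collapses that into a 16-bit mask whose nonzero nibbles locate the terminator. An intrinsics sketch of the probe, assuming only SSE2 and a 16-byte-aligned pointer p:

#include <emmintrin.h>
#include <wchar.h>

/* Nonzero iff one of the four wchar_t values in the aligned 16-byte
   block at P is L'\0'; each zero lane contributes an 0xf nibble to
   the mask, giving the terminator's position.  */
static inline int
block_nul_mask (const wchar_t *p)
{
  __m128i chunk = _mm_load_si128 ((const __m128i *) p);
  __m128i eq = _mm_cmpeq_epi32 (chunk, _mm_setzero_si128 ()); /* pcmpeqd */
  return _mm_movemask_epi8 (eq);                              /* pmovmskb */
}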
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000000..cfc97dd87c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcscpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(wcscpy)
+ .type wcscpy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcscpy_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)
+2: ret
+END(wcscpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 0000000000..a335dc0f7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WCSLEN __wcslen_ia32
+#endif
+
+extern __typeof (wcslen) __wcslen_ia32;
+
+#include "wcsmbs/wcslen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 0000000000..bd3fc4c79b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,193 @@
+/* wcslen with SSE2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define STR 4
+
+ .text
+ENTRY (__wcslen_sse2)
+ mov STR(%esp), %edx
+
+ cmp $0, (%edx)
+ jz L(exit_tail0)
+ cmp $0, 4(%edx)
+ jz L(exit_tail1)
+ cmp $0, 8(%edx)
+ jz L(exit_tail2)
+ cmp $0, 12(%edx)
+ jz L(exit_tail3)
+ cmp $0, 16(%edx)
+ jz L(exit_tail4)
+ cmp $0, 20(%edx)
+ jz L(exit_tail5)
+ cmp $0, 24(%edx)
+ jz L(exit_tail6)
+ cmp $0, 28(%edx)
+ jz L(exit_tail7)
+
+ pxor %xmm0, %xmm0
+
+ lea 32(%edx), %eax
+ lea 16(%edx), %ecx
+ and $-16, %eax
+
+ pcmpeqd (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqd %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 64(%eax), %eax
+ jz L(aligned_64_loop)
+
+ pcmpeqd -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 48(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ jmp L(aligned_64_loop)
+
+ .p2align 4
+L(exit):
+ sub %ecx, %eax
+ shr $2, %eax
+ test %dl, %dl
+ jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_1)
+ ret
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_3)
+ add $2, %eax
+ ret
+
+ .p2align 4
+L(exit_1):
+ add $1, %eax
+ ret
+
+ .p2align 4
+L(exit_3):
+ add $3, %eax
+ ret
+
+ .p2align 4
+L(exit_tail0):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_tail1):
+ mov $1, %eax
+ ret
+
+ .p2align 4
+L(exit_tail2):
+ mov $2, %eax
+ ret
+
+ .p2align 4
+L(exit_tail3):
+ mov $3, %eax
+ ret
+
+ .p2align 4
+L(exit_tail4):
+ mov $4, %eax
+ ret
+
+ .p2align 4
+L(exit_tail5):
+ mov $5, %eax
+ ret
+
+ .p2align 4
+L(exit_tail6):
+ mov $6, %eax
+ ret
+
+ .p2align 4
+L(exit_tail7):
+ mov $7, %eax
+ ret
+
+END (__wcslen_sse2)
+#endif
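The aligned_64_loop leans on the pminub trick: the byte-wise minimum of four 16-byte blocks can contain a zero byte only if some input byte was zero, so a single pcmpeqd/pmovmskb on the folded value screens 16 wide characters per iteration. A hit can be a false positive, since zero bytes from different blocks may combine into a zero dword, which is why the code after the loop re-tests the four blocks one by one. A rough intrinsics rendering of the screen, assuming SSE2 and a 64-byte-aligned pointer:

#include <emmintrin.h>
#include <wchar.h>

/* Fast screen over 64 bytes: nonzero means some dword lane of the
   folded minimum is zero, so a terminator *may* be present and the
   four blocks must be rechecked individually.  */
static inline int
block64_nul_screen (const wchar_t *p)
{
  const __m128i *v = (const __m128i *) p;
  __m128i m01 = _mm_min_epu8 (_mm_load_si128 (v), _mm_load_si128 (v + 1));
  __m128i m23 = _mm_min_epu8 (_mm_load_si128 (v + 2), _mm_load_si128 (v + 3));
  __m128i m = _mm_min_epu8 (m01, m23);		/* pminub folding */
  __m128i eq = _mm_cmpeq_epi32 (m, _mm_setzero_si128 ());
  return _mm_movemask_epi8 (eq);
}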
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 0000000000..6ef9b6e7b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of wcslen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__wcslen)
+ .type __wcslen, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcslen_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcslen_sse2)
+2: ret
+END(__wcslen)
+
+weak_alias (__wcslen, wcslen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000000..8d8a335b5b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcsrchr __wcsrchr_ia32
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000000..1a9b60e55e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,354 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH (%edi);
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %edi
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ ja L(crosscache)
+
+/* Unaligned string.  */
+ movdqu (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+/* Find where the NUL terminator is.  */
+ pmovmskb %xmm2, %ecx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %ecx, %ecx
+ jnz L(return_null)
+
+ and $-16, %edi
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match1):
+ test %ecx, %ecx
+ jnz L(prolog_find_zero_1)
+
+ PUSH (%esi)
+
+/* Save current match.  */
+ mov %eax, %edx
+ mov %edi, %esi
+ and $-16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm3
+ pcmpeqd %xmm1, %xmm0
+/* Find where the NUL terminator is.  */
+ pmovmskb %xmm3, %edx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(prolog_find_zero)
+
+ PUSH (%esi)
+
+ mov %eax, %edx
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm3
+ pcmpeqd %xmm3, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm4
+ pcmpeqd %xmm4, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm4
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm4, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm5
+ pcmpeqd %xmm5, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm5
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm5, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %edx, %edx
+ jz L(return_null_1)
+ mov %edx, %eax
+ mov %esi, %edi
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(return_null_1):
+ POP (%esi)
+
+ xor %eax, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(find_zero)
+/* Save match info.  */
+ mov %eax, %edx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test %cl, %cl
+ jz L(find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_fourth_wchar):
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match_second_wchar):
+ lea -12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_or_fourth_wchar):
+ test $15 << 4, %ah
+ jnz L(match_fourth_wchar)
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_wchar):
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_fourth_wchar):
+ lea -4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero):
+ add %ecx, %edi
+ mov %edx, %ecx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(prolog_find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_null)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_null)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(prolog_find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_null)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+END (__wcsrchr_sse2)
+#endif
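Stripped of the SIMD bookkeeping, the routine is a single forward scan that remembers the most recent match and stops at the terminator; the final block is then picked apart nibble by nibble to decide which of its four wide characters matched last before the NUL. A scalar sketch of that shape (the real code keeps the match bitmask in EDX and the block address in ESI rather than a pointer):

#include <stddef.h>
#include <wchar.h>

/* Scalar shape of wcsrchr: one forward pass, last match wins.  */
static wchar_t *
wcsrchr_sketch (const wchar_t *s, wchar_t c)
{
  const wchar_t *last = NULL;
  for (;; s++)
    {
      if (*s == c)
	last = s;		/* later matches overwrite earlier ones */
      if (*s == L'\0')
	return (wchar_t *) last;
    }
}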
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000000..cf67333995
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,35 @@
+/* Multiple versions of wcsrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(wcsrchr)
+ .type wcsrchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)
+2: ret
+END(wcsrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..75ab4b94c1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WMEMCMP __wmemcmp_ia32
+#endif
+
+extern __typeof (wmemcmp) __wmemcmp_ia32;
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..1a857c7e21
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..1b9a54a413
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,40 @@
+/* Multiple versions of wmemcmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+ .text
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)
+2: ret
+END(wmemcmp)
+#endif
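Unlike the two-way dispatchers above, this resolver falls through in increasing order of preference: each LOAD_FUNC_GOT_EAX overwrites the previous choice, so EAX ends up holding the best implementation whose feature check passed. The same cascade in C form, with hypothetical cpu_has_* probes standing in for HAS_CPU_FEATURE:

#include <stddef.h>
#include <wchar.h>

extern int __wmemcmp_ia32 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_ssse3 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_sse4_2 (const wchar_t *, const wchar_t *, size_t);
extern int cpu_has_ssse3 (void);	/* hypothetical probes */
extern int cpu_has_sse4_2 (void);

static void *
wmemcmp_resolver (void)
{
  if (!cpu_has_ssse3 ())
    return (void *) __wmemcmp_ia32;	/* first jz 2f */
  if (!cpu_has_sse4_2 ())
    return (void *) __wmemcmp_ssse3;	/* second jz 2f */
  return (void *) __wmemcmp_sse4_2;	/* both features present */
}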
diff --git a/REORG.TODO/sysdeps/i386/i686/nptl/tls.h b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
new file mode 100644
index 0000000000..5b527af9d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
@@ -0,0 +1,35 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _TLS_H
+
+/* Additional definitions for <tls.h> on i686 and up. */
+
+
+/* Macros to load from and store into segment registers. We can use
+ the 32-bit instructions. */
+#define TLS_GET_GS() \
+ ({ int __seg; __asm ("movl %%gs, %0" : "=q" (__seg)); __seg; })
+#define TLS_SET_GS(val) \
+ __asm ("movl %0, %%gs" :: "q" (val))
+
+
+/* Get the full set of definitions. */
+#include_next <tls.h>
+
+#endif /* tls.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
new file mode 100644
index 0000000000..ce9c94d41a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
@@ -0,0 +1,20 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define HAVE_CMOV 1
+#include <sysdeps/i386/pthread_spin_trylock.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
new file mode 100644
index 0000000000..9b5a1b0d47
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
@@ -0,0 +1,23 @@
+/* Define macros for stack address aliasing issues for NPTL. i686 version.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The goal is to avoid the 64k aliasing problem, which reliably
+   occurs if all stacks use sizes that are a multiple of 64k.  Tell
+   the stack allocator to break the pattern by allocating one more
+   page if necessary.  */
+#define MULTI_PAGE_ALIASING 65536
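Concretely, the allocator can use this constant to stagger any requested stack size that is an exact multiple of 64k. A hypothetical sketch of the adjustment; the real logic lives in NPTL's stack allocation code, and adjust_stack_size with its exact condition is illustrative only:

#include <stddef.h>

#define MULTI_PAGE_ALIASING 65536

/* If every thread asks for a stack size that is a multiple of 64k,
   the hot top-of-stack addresses alias in a 64k-indexed cache;
   adding one page staggers them.  Illustrative only.  */
static size_t
adjust_stack_size (size_t size, size_t pagesize)
{
  if (size % MULTI_PAGE_ALIASING == 0)
    size += pagesize;
  return size;
}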
diff --git a/REORG.TODO/sysdeps/i386/i686/strcmp.S b/REORG.TODO/sysdeps/i386/i686/strcmp.S
new file mode 100644
index 0000000000..1ae305912e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/strcmp.S
@@ -0,0 +1,52 @@
+/* Highly optimized version for ix86, x>=6.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define STR1 PARMS
+#define STR2 STR1+4
+
+ .text
+ENTRY (strcmp)
+
+ movl STR1(%esp), %ecx
+ movl STR2(%esp), %edx
+
+L(oop): movb (%ecx), %al
+ cmpb (%edx), %al
+ jne L(neq)
+ incl %ecx
+ incl %edx
+ testb %al, %al
+ jnz L(oop)
+
+ xorl %eax, %eax
+	/* When the strings are equal, both pointers rest one byte
+	   beyond their NUL terminators.  */
+ ret
+
+L(neq): movl $1, %eax
+ movl $-1, %ecx
+ cmovbl %ecx, %eax
+
+ ret
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
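In C terms the loop is the classic byte-wise scan, and the closing movl $1/cmovbl pair converts the flags from the last cmpb into a branch-free ±1. A sketch of the equivalent logic; note the unsigned-byte comparison, which is what cmpb's carry flag reflects:

/* Equivalent logic to the assembly above; names are illustrative.  */
static int
strcmp_sketch (const char *p1, const char *p2)
{
  const unsigned char *s1 = (const unsigned char *) p1;
  const unsigned char *s2 = (const unsigned char *) p2;
  unsigned char c1, c2;
  do
    {
      c1 = *s1++;
      c2 = *s2++;
    }
  while (c1 == c2 && c1 != '\0');
  if (c1 == c2)
    return 0;			/* both hit NUL together: the xorl path */
  return c1 < c2 ? -1 : 1;	/* the movl $1 / cmovbl pair */
}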
diff --git a/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
new file mode 100644
index 0000000000..51f03fe77b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
@@ -0,0 +1,44 @@
+/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <stdint.h>
+#ifndef __SSE__
+#include_next <tst-stack-align.h>
+#else
+#include <xmmintrin.h>
+
+#define TEST_STACK_ALIGN() \
+ ({ \
+ __m128 _m; \
+ double _d = 12.0; \
+ long double _ld = 15.0; \
+ int _ret = 0; \
+ printf ("__m128: %p %zu\n", &_m, __alignof (__m128)); \
+ if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
+ if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
+ if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
+ _ret = 1; \
+ _ret; \
+ })
+#endif