Diffstat (limited to 'REORG.TODO/sysdeps/x86_64'): 588 files changed, 79900 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/x86_64/64/Implies-after b/REORG.TODO/sysdeps/x86_64/64/Implies-after new file mode 100644 index 0000000000..a8cae95f9d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/64/Implies-after @@ -0,0 +1 @@ +wordsize-64 diff --git a/REORG.TODO/sysdeps/x86_64/Implies b/REORG.TODO/sysdeps/x86_64/Implies new file mode 100644 index 0000000000..811c19a8f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/Implies @@ -0,0 +1,5 @@ +x86 +ieee754/ldbl-96 +ieee754/dbl-64/wordsize-64 +ieee754/dbl-64 +ieee754/flt-32 diff --git a/REORG.TODO/sysdeps/x86_64/Makefile b/REORG.TODO/sysdeps/x86_64/Makefile new file mode 100644 index 0000000000..5075c91277 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/Makefile @@ -0,0 +1,124 @@ +# The i387 `long double' is a distinct type we support. +long-double-fcts = yes + +ifeq ($(subdir),csu) +gen-as-const-headers += link-defines.sym +endif + +ifeq ($(subdir),gmon) +sysdep_routines += _mcount +# We cannot compile _mcount.S with -pg because that would create +# recursive calls when ENTRY is used. Just copy the normal static +# object. +sysdep_noprof += _mcount +endif + +ifeq ($(subdir),malloc) +tests += tst-mallocalign1 +endif + +ifeq ($(subdir),string) +sysdep_routines += cacheinfo strcasecmp_l-nonascii strncase_l-nonascii +gen-as-const-headers += locale-defines.sym +endif + +ifeq ($(subdir),elf) +# There is no good reason to use MMX in x86-64 ld.so with GCC. +CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\ + -mno-mmx) + +sysdep-dl-routines += tlsdesc dl-tlsdesc + +tests += ifuncmain8 +modules-names += ifuncmod8 + +$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so + +tests += tst-quad1 tst-quad2 +modules-names += tst-quadmod1 tst-quadmod2 + +$(objpfx)tst-quad1: $(objpfx)tst-quadmod1.so +$(objpfx)tst-quad2: $(objpfx)tst-quadmod2.so + +quad-pie-test += tst-quad1pie tst-quad2pie +tests += $(quad-pie-test) +tests-pie += $(quad-pie-test) +test-extras += tst-quadmod1pie tst-quadmod2pie +extra-test-objs += tst-quadmod1pie.o tst-quadmod2pie.o + +$(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o +$(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o + +CFLAGS-tst-quad1pie.c = $(PIE-ccflag) +CFLAGS-tst-quad2pie.c = $(PIE-ccflag) + +tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 \ + tst-audit10 tst-sse tst-avx tst-avx512 +test-extras += tst-audit4-aux tst-audit10-aux \ + tst-avx-aux tst-avx512-aux +extra-test-objs += tst-audit4-aux.o tst-audit10-aux.o \ + tst-avx-aux.o tst-avx512-aux.o + +tests += tst-split-dynreloc +LDFLAGS-tst-split-dynreloc = -Wl,-T,$(..)sysdeps/x86_64/tst-split-dynreloc.lds +tst-split-dynreloc-ENV = LD_BIND_NOW=1 + +modules-names += tst-auditmod3a tst-auditmod3b \ + tst-auditmod4a tst-auditmod4b \ + tst-auditmod5a tst-auditmod5b \ + tst-auditmod6a tst-auditmod6b tst-auditmod6c \ + tst-auditmod7a tst-auditmod7b \ + tst-auditmod10a tst-auditmod10b \ + tst-ssemod tst-avxmod tst-avx512mod + +$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so +$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so +tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so + +$(objpfx)tst-audit4: $(objpfx)tst-audit4-aux.o $(objpfx)tst-auditmod4a.so +$(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so +tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so + +$(objpfx)tst-audit5: $(objpfx)tst-auditmod5a.so +$(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so +tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so + +$(objpfx)tst-audit6: $(objpfx)tst-auditmod6a.so +$(objpfx)tst-audit6.out: $(objpfx)tst-auditmod6b.so \ + $(objpfx)tst-auditmod6c.so 
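[Editorial illustration, not part of the commit: the tst-audit* rules here exercise the dynamic linker's LD_AUDIT interface. As a minimal sketch of what such a module looks like (this is not one of the test modules), an audit shared object only has to export la_version to be accepted by ld.so:]

#define _GNU_SOURCE
#include <link.h>

/* Minimal rtld-audit module: la_version is the only mandatory hook.
   Returning LAV_CURRENT tells the dynamic linker which audit ABI
   this module speaks.  */
unsigned int
la_version (unsigned int version)
{
  return LAV_CURRENT;
}

[Built with "gcc -shared -fPIC -o auditmod.so auditmod.c" and activated through the LD_AUDIT environment variable, exactly as the tst-audit*-ENV settings in this Makefile do for the test modules.]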
+tst-audit6-ENV = LD_AUDIT=$(objpfx)tst-auditmod6b.so:$(objpfx)tst-auditmod6c.so + +$(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so +$(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so +tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so + +$(objpfx)tst-audit10: $(objpfx)tst-audit10-aux.o $(objpfx)tst-auditmod10a.so +$(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so +tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so + +$(objpfx)tst-sse: $(objpfx)tst-ssemod.so +$(objpfx)tst-avx: $(objpfx)tst-avx-aux.o $(objpfx)tst-avxmod.so +$(objpfx)tst-avx512: $(objpfx)tst-avx512-aux.o $(objpfx)tst-avx512mod.so + +AVX-CFLAGS=-mavx -mno-vzeroupper +CFLAGS-tst-audit4-aux.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS) +CFLAGS-tst-avx-aux.c += $(AVX-CFLAGS) +CFLAGS-tst-avxmod.c += $(AVX-CFLAGS) +ifeq (yes,$(config-cflags-avx512)) +AVX512-CFLAGS = -mavx512f +CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS) +CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS) +CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS) +CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS) +CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS) +endif +endif + +ifeq ($(subdir),csu) +gen-as-const-headers += tlsdesc.sym +endif diff --git a/REORG.TODO/sysdeps/x86_64/Versions b/REORG.TODO/sysdeps/x86_64/Versions new file mode 100644 index 0000000000..a437f85e6e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/Versions @@ -0,0 +1,12 @@ +libc { + GLIBC_2.14 { + memcpy; + } +} +libm { + GLIBC_2.1 { + # A generic bug got this omitted from other configurations' version + # sets, but we always had it. + exp2l; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/____longjmp_chk.S b/REORG.TODO/sysdeps/x86_64/____longjmp_chk.S new file mode 100644 index 0000000000..0910861a9d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/____longjmp_chk.S @@ -0,0 +1 @@ +#error "OS-specific version needed" diff --git a/REORG.TODO/sysdeps/x86_64/__longjmp.S b/REORG.TODO/sysdeps/x86_64/__longjmp.S new file mode 100644 index 0000000000..350b6b1bf6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/__longjmp.S @@ -0,0 +1,68 @@ +/* Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <asm-syntax.h> +#include <stap-probe.h> + +/* Jump to the position specified by ENV, causing the + setjmp call there to return VAL, or 1 if VAL is 0. + void __longjmp (__jmp_buf env, int val). */ + .text +ENTRY(__longjmp) + /* Restore registers. 
*/ + mov (JB_RSP*8)(%rdi),%R8_LP + mov (JB_RBP*8)(%rdi),%R9_LP + mov (JB_PC*8)(%rdi),%RDX_LP +#ifdef PTR_DEMANGLE + PTR_DEMANGLE (%R8_LP) + PTR_DEMANGLE (%R9_LP) + PTR_DEMANGLE (%RDX_LP) +# ifdef __ILP32__ + /* We ignored the high bits of the %rbp value because only the low + bits are mangled. But we cannot presume that %rbp is being used + as a pointer and truncate it, so recover the high bits. */ + movl (JB_RBP*8 + 4)(%rdi), %eax + shlq $32, %rax + orq %rax, %r9 +# endif +#endif + LIBC_PROBE (longjmp, 3, LP_SIZE@%RDI_LP, -4@%esi, LP_SIZE@%RDX_LP) + /* We add unwind information for the target here. */ + cfi_def_cfa(%rdi, 0) + cfi_register(%rsp,%r8) + cfi_register(%rbp,%r9) + cfi_register(%rip,%rdx) + cfi_offset(%rbx,JB_RBX*8) + cfi_offset(%r12,JB_R12*8) + cfi_offset(%r13,JB_R13*8) + cfi_offset(%r14,JB_R14*8) + cfi_offset(%r15,JB_R15*8) + movq (JB_RBX*8)(%rdi),%rbx + movq (JB_R12*8)(%rdi),%r12 + movq (JB_R13*8)(%rdi),%r13 + movq (JB_R14*8)(%rdi),%r14 + movq (JB_R15*8)(%rdi),%r15 + /* Set return value for setjmp. */ + mov %esi, %eax + mov %R8_LP,%RSP_LP + movq %r9,%rbp + LIBC_PROBE (longjmp_target, 3, + LP_SIZE@%RDI_LP, -4@%eax, LP_SIZE@%RDX_LP) + jmpq *%rdx +END (__longjmp) diff --git a/REORG.TODO/sysdeps/x86_64/_mcount.S b/REORG.TODO/sysdeps/x86_64/_mcount.S new file mode 100644 index 0000000000..bcf0957752 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/_mcount.S @@ -0,0 +1,125 @@ +/* Machine-specific calling sequence for `mcount' profiling function. x86-64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + Contributed by Andreas Jaeger <aj@suse.de>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Assembly stub to invoke _mcount(). Compiler generated code calls + this stub after executing a function's prologue and without saving any + registers. It is therefore necessary to preserve %rcx, %rdx, %rsi, %rdi, + %r8, %r9 as they may contain function arguments. */ + +#include <sysdep.h> + +ENTRY(_mcount) + /* Allocate space for 7 registers. */ + subq $56,%rsp + cfi_adjust_cfa_offset (56) + movq %rax,(%rsp) + cfi_rel_offset (rax, 0) + movq %rcx,8(%rsp) + cfi_rel_offset (rcx, 8) + movq %rdx,16(%rsp) + cfi_rel_offset (rdx, 16) + movq %rsi,24(%rsp) + cfi_rel_offset (rsi, 24) + movq %rdi,32(%rsp) + cfi_rel_offset (rdi, 32) + movq %r8,40(%rsp) + cfi_rel_offset (r8, 40) + movq %r9,48(%rsp) + cfi_rel_offset (r9, 48) + + /* Setup parameter for __mcount_internal. */ + /* selfpc is the return address on the stack. */ + movq 56(%rsp),%rsi + /* Get frompc via the frame pointer. */ + movq 8(%rbp),%rdi + call C_SYMBOL_NAME(__mcount_internal) + /* Pop the saved registers. Please note that `mcount' has no + return value. 
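[Editorial illustration: __longjmp above is the internal target of the public longjmp/siglongjmp entry points. A small user-level example of the contract it implements; note that a val of 0 is reported as 1 at the setjmp return site:]

#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

static void
fail (void)
{
  longjmp (env, 0);   /* setjmp reports 1, never 0, for val == 0 */
}

int
main (void)
{
  int r = setjmp (env);
  if (r == 0)
    {
      puts ("direct return from setjmp");
      fail ();
    }
  printf ("return via longjmp, val=%d\n", r);
  return 0;
}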
*/ + movq 48(%rsp),%r9 + cfi_restore (r9) + movq 40(%rsp),%r8 + cfi_restore (r8) + movq 32(%rsp),%rdi + cfi_restore (rdi) + movq 24(%rsp),%rsi + cfi_restore (rsi) + movq 16(%rsp),%rdx + cfi_restore (rdx) + movq 8(%rsp),%rcx + cfi_restore (rcx) + movq (%rsp),%rax + cfi_restore (rax) + addq $56,%rsp + cfi_adjust_cfa_offset (-56) + ret +END(_mcount) + +#undef mcount +weak_alias (_mcount, mcount) + +/* __fentry__ is different from _mcount in that it is called before + function prolog. This means (among other things) that it has non-standard + stack alignment on entry: (%RSP & 0xF) == 0. */ + +ENTRY(__fentry__) + /* Allocate space for 7 registers + (+8 bytes for proper stack alignment). */ + subq $64,%rsp + cfi_adjust_cfa_offset (64) + movq %rax,(%rsp) + cfi_rel_offset (rax, 0) + movq %rcx,8(%rsp) + cfi_rel_offset (rcx, 8) + movq %rdx,16(%rsp) + cfi_rel_offset (rdx, 16) + movq %rsi,24(%rsp) + cfi_rel_offset (rsi, 24) + movq %rdi,32(%rsp) + cfi_rel_offset (rdi, 32) + movq %r8,40(%rsp) + cfi_rel_offset (r8, 40) + movq %r9,48(%rsp) + cfi_rel_offset (r9, 48) + + /* Setup parameter for __mcount_internal. */ + /* selfpc is the return address on the stack. */ + movq 64(%rsp),%rsi + /* caller is the return address above it */ + movq 72(%rsp),%rdi + call C_SYMBOL_NAME(__mcount_internal) + /* Pop the saved registers. Please note that `__fentry__' has no + return value. */ + movq 48(%rsp),%r9 + cfi_restore (r9) + movq 40(%rsp),%r8 + cfi_restore (r8) + movq 32(%rsp),%rdi + cfi_restore (rdi) + movq 24(%rsp),%rsi + cfi_restore (rsi) + movq 16(%rsp),%rdx + cfi_restore (rdx) + movq 8(%rsp),%rcx + cfi_restore (rcx) + movq (%rsp),%rax + cfi_restore (rax) + addq $64,%rsp + cfi_adjust_cfa_offset (-64) + ret +END(__fentry__) diff --git a/REORG.TODO/sysdeps/x86_64/abort-instr.h b/REORG.TODO/sysdeps/x86_64/abort-instr.h new file mode 100644 index 0000000000..810f10379b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/abort-instr.h @@ -0,0 +1,2 @@ +/* An instruction which should crash any program is `hlt'. */ +#define ABORT_INSTRUCTION asm ("hlt") diff --git a/REORG.TODO/sysdeps/x86_64/add_n.S b/REORG.TODO/sysdeps/x86_64/add_n.S new file mode 100644 index 0000000000..4ba83c0bdb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/add_n.S @@ -0,0 +1,100 @@ +/* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + Copyright (C) 2006-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
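[Editorial illustration: neither _mcount nor __fentry__ is called by hand; the compiler inserts the calls when profiling is enabled. A sketch of how the stubs above get exercised; file and command names are illustrative:]

/* Compile as: gcc -pg demo.c && ./a.out && gprof ./a.out gmon.out
   With -pg, GCC emits a call to mcount after each function prologue
   (or to __fentry__ before the prologue when -mfentry is used).  */
#include <stddef.h>

static size_t
work (size_t n)
{
  size_t acc = 0;
  for (size_t i = 0; i < n; i++)
    acc += i * i;
  return acc;
}

int
main (void)
{
  return work (1 << 20) == 0;   /* nonzero result, so exit status 0 */
}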
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define vp %rdx +#define n %rcx +#define cy %r8 + +#ifndef func +# define func __mpn_add_n +# define ADCSBB adc +#endif + + .text +ENTRY (func) + xor %r8, %r8 + mov (up), %r10 + mov (vp), %r11 + + lea -8(up,n,8), up + lea -8(vp,n,8), vp + lea -16(rp,n,8), rp + mov %ecx, %eax + neg n + and $3, %eax + je L(b00) + add %rax, n /* clear low rcx bits for jrcxz */ + cmp $2, %eax + jl L(b01) + je L(b10) + +L(b11): shr %r8 /* set cy */ + jmp L(e11) + +L(b00): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + lea 4(n), n + jmp L(e00) + +L(b01): shr %r8 /* set cy */ + jmp L(e01) + +L(b10): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + jmp L(e10) + +L(end): ADCSBB %r11, %r10 + mov %r10, 8(rp) + mov %ecx, %eax /* clear eax, ecx contains 0 */ + adc %eax, %eax + ret + + .p2align 4 +L(top): + mov -24(up,n,8), %r8 + mov -24(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -24(rp,n,8) +L(e00): + mov -16(up,n,8), %r10 + mov -16(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -16(rp,n,8) +L(e11): + mov -8(up,n,8), %r8 + mov -8(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -8(rp,n,8) +L(e10): + mov (up,n,8), %r10 + mov (vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, (rp,n,8) +L(e01): + jrcxz L(end) + lea 4(n), n + jmp L(top) +END (func) diff --git a/REORG.TODO/sysdeps/x86_64/addmul_1.S b/REORG.TODO/sysdeps/x86_64/addmul_1.S new file mode 100644 index 0000000000..faccdfdbc4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/addmul_1.S @@ -0,0 +1,114 @@ +/* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
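[Editorial illustration: the unrolled, jrcxz-driven loop above implements a multi-limb add with carry propagation. A plain-C reference model of the same function contract (a sketch, assuming 64-bit limbs on x86-64; not glibc code) makes the semantics explicit:]

typedef unsigned long mp_limb_t;

/* Reference model of __mpn_add_n: rp[0..n-1] = up[] + vp[],
   returning the carry out of the most significant limb.  */
static mp_limb_t
mpn_add_n_ref (mp_limb_t *rp, const mp_limb_t *up,
               const mp_limb_t *vp, long n)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      mp_limb_t u = up[i];
      mp_limb_t s = u + vp[i] + cy;
      /* Carry out iff the sum wrapped (careful when cy == 1).  */
      cy = (s < u) || (cy && s == u);
      rp[i] = s;
    }
  return cy;
}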
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n %rdx +#define v0 %rcx + +#ifndef func +# define func __mpn_addmul_1 +# define ADDSUB add +#endif + + .text +ENTRY (func) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + + bt $0, %ebx + jc L(odd) + + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + mul %rcx + add $2, %rbx + jns L(n2) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + jmp L(mid) + +L(odd): add $1, %rbx + jns L(n1) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + mul %rcx + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + jmp L(e) + + .p2align 4 +L(top): mul %rcx + ADDSUB %r8, %r10 + lea (%rax), %r8 + mov (up,%rbx,8), %rax + adc %r9, %r11 + mov %r10, -8(rp,%rbx,8) + mov (rp,%rbx,8), %r10 + lea (%rdx), %r9 + adc $0, %rbp +L(mid): mul %rcx + ADDSUB %r11, %r10 + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + adc %rbp, %r8 + mov %r10, (rp,%rbx,8) + mov 8(rp,%rbx,8), %r10 + lea (%rdx), %rbp + adc $0, %r9 +L(e): add $2, %rbx + js L(top) + + mul %rcx + ADDSUB %r8, %r10 + adc %r9, %r11 + mov %r10, -8(rp) + adc $0, %rbp +L(n2): mov (rp), %r10 + ADDSUB %r11, %r10 + adc %rbp, %rax + mov %r10, (rp) + adc $0, %rdx +L(n1): mov 8(rp), %r10 + ADDSUB %rax, %r10 + mov %r10, 8(rp) + mov %ebx, %eax /* zero rax */ + adc %rdx, %rax + pop %rbp + pop %rbx + ret +END (func) diff --git a/REORG.TODO/sysdeps/x86_64/atomic-machine.h b/REORG.TODO/sysdeps/x86_64/atomic-machine.h new file mode 100644 index 0000000000..c454734001 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/atomic-machine.h @@ -0,0 +1,482 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86_64_ATOMIC_MACHINE_H +#define _X86_64_ATOMIC_MACHINE_H 1 + +#include <stdint.h> +#include <tls.h> /* For tcbhead_t. */ +#include <libc-pointer-arith.h> /* For cast_to_integer. 
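[Editorial illustration: likewise for __mpn_addmul_1, which multiplies an n-limb vector by a single limb and adds the product into rp, returning the final carry limb. A reference model using GCC's unsigned __int128 (a sketch, not the real implementation):]

typedef unsigned long mp_limb_t;

static mp_limb_t
mpn_addmul_1_ref (mp_limb_t *rp, const mp_limb_t *up,
                  long n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      /* up[i]*v0 + rp[i] + cy always fits in 128 bits.  */
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (mp_limb_t) t;
      cy = (mp_limb_t) (t >> 64);
    }
  return cy;
}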
*/ + +typedef int8_t atomic8_t; +typedef uint8_t uatomic8_t; +typedef int_fast8_t atomic_fast8_t; +typedef uint_fast8_t uatomic_fast8_t; + +typedef int16_t atomic16_t; +typedef uint16_t uatomic16_t; +typedef int_fast16_t atomic_fast16_t; +typedef uint_fast16_t uatomic_fast16_t; + +typedef int32_t atomic32_t; +typedef uint32_t uatomic32_t; +typedef int_fast32_t atomic_fast32_t; +typedef uint_fast32_t uatomic_fast32_t; + +typedef int64_t atomic64_t; +typedef uint64_t uatomic64_t; +typedef int_fast64_t atomic_fast64_t; +typedef uint_fast64_t uatomic_fast64_t; + +typedef intptr_t atomicptr_t; +typedef uintptr_t uatomicptr_t; +typedef intmax_t atomic_max_t; +typedef uintmax_t uatomic_max_t; + + +#ifndef LOCK_PREFIX +# ifdef UP +# define LOCK_PREFIX /* nothing */ +# else +# define LOCK_PREFIX "lock;" +# endif +#endif + +#define __HAVE_64B_ATOMICS 1 +#define USE_ATOMIC_COMPILER_BUILTINS 1 +#define ATOMIC_EXCHANGE_USES_CAS 0 + +#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \ + __sync_val_compare_and_swap (mem, oldval, newval) +#define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \ + (! __sync_bool_compare_and_swap (mem, oldval, newval)) + + +#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgb %b2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgw %w2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgl %2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgq %q2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" ((atomic64_t) cast_to_integer (newval)), \ + "m" (*mem), \ + "0" ((atomic64_t) cast_to_integer (oldval)), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + + +/* Note that we need no lock prefix. 
*/ +#define atomic_exchange_acq(mem, newvalue) \ + ({ __typeof (*mem) result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile ("xchgb %b0, %1" \ + : "=q" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile ("xchgw %w0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile ("xchgl %0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else \ + __asm __volatile ("xchgq %q0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" ((atomic64_t) cast_to_integer (newvalue)), \ + "m" (*mem)); \ + result; }) + + +#define __arch_exchange_and_add_body(lock, mem, value) \ + ({ __typeof (*mem) result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "xaddb %b0, %1" \ + : "=q" (result), "=m" (*mem) \ + : "0" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "xaddw %w0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "xaddl %0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "xaddq %q0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + result; }) + +#define atomic_exchange_and_add(mem, value) \ + __sync_fetch_and_add (mem, value) + +#define __arch_exchange_and_add_cprefix \ + "cmpl $0, %%fs:%P4\n\tje 0f\n\tlock\n0:\t" + +#define catomic_exchange_and_add(mem, value) \ + __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, mem, value) + + +#define __arch_add_body(lock, pfx, mem, value) \ + do { \ + if (__builtin_constant_p (value) && (value) == 1) \ + pfx##_increment (mem); \ + else if (__builtin_constant_p (value) && (value) == -1) \ + pfx##_decrement (mem); \ + else if (sizeof (*mem) == 1) \ + __asm __volatile (lock "addb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "addw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "addl %1, %0" \ + : "=m" (*mem) \ + : "ir" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "addq %q1, %0" \ + : "=m" (*mem) \ + : "ir" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_add(mem, value) \ + __arch_add_body (LOCK_PREFIX, atomic, mem, value) + +#define __arch_add_cprefix \ + "cmpl $0, %%fs:%P3\n\tje 0f\n\tlock\n0:\t" + +#define catomic_add(mem, value) \ + __arch_add_body (__arch_add_cprefix, catomic, mem, value) + + +#define atomic_add_negative(mem, value) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "iq" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1" \ + : "=m" (*mem), 
"=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "addq %q2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem)); \ + __result; }) + + +#define atomic_add_zero(mem, value) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "iq" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "addq %q2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem)); \ + __result; }) + + +#define __arch_increment_body(lock, mem) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "incb %b0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "incw %w0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "incl %0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "incq %q0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, mem) + +#define __arch_increment_cprefix \ + "cmpl $0, %%fs:%P2\n\tje 0f\n\tlock\n0:\t" + +#define catomic_increment(mem) \ + __arch_increment_body (__arch_increment_cprefix, mem) + + +#define atomic_increment_and_test(mem) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "incb %b0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "incw %w0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "incl %0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "incq %q0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + __result; }) + + +#define __arch_decrement_body(lock, mem) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "decb %b0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "decw %w0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "decl %0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "decq %q0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, mem) + +#define __arch_decrement_cprefix \ + "cmpl $0, %%fs:%P2\n\tje 0f\n\tlock\n0:\t" + +#define catomic_decrement(mem) \ + __arch_decrement_body (__arch_decrement_cprefix, mem) + + +#define atomic_decrement_and_test(mem) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + 
__asm __volatile (LOCK_PREFIX "decb %b0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "decw %w0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "decl %0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "decq %q0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + __result; }) + + +#define atomic_bit_set(mem, bit) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "orb %b2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "iq" (1L << (bit))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "orw %w2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "ir" (1L << (bit))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "orl %2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "ir" (1L << (bit))); \ + else if (__builtin_constant_p (bit) && (bit) < 32) \ + __asm __volatile (LOCK_PREFIX "orq %2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "i" (1L << (bit))); \ + else \ + __asm __volatile (LOCK_PREFIX "orq %q2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "r" (1UL << (bit))); \ + } while (0) + + +#define atomic_bit_test_set(mem, bit) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "iq" (bit)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + else \ + __asm __volatile (LOCK_PREFIX "btsq %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + __result; }) + + +#define atomic_spin_nop() asm ("rep; nop") + + +#define __arch_and_body(lock, mem, mask) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "andb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "andw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "andl %1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "andq %q1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define __arch_cprefix \ + "cmpl $0, %%fs:%P3\n\tje 0f\n\tlock\n0:\t" + +#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask) + +#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask) + + +#define __arch_or_body(lock, mem, mask) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "orb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "orw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "orl %1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm 
__volatile (lock "orq %q1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask) + +#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask) + +/* We don't use mfence because it is supposedly slower due to having to + provide stronger guarantees (e.g., regarding self-modifying code). */ +#define atomic_full_barrier() \ + __asm __volatile (LOCK_PREFIX "orl $0, (%%rsp)" ::: "memory") +#define atomic_read_barrier() __asm ("" ::: "memory") +#define atomic_write_barrier() __asm ("" ::: "memory") + +#endif /* atomic-machine.h */ diff --git a/REORG.TODO/sysdeps/x86_64/backtrace.c b/REORG.TODO/sysdeps/x86_64/backtrace.c new file mode 100644 index 0000000000..15f425b410 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/backtrace.c @@ -0,0 +1,133 @@ +/* Return backtrace of current program state. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libc-lock.h> +#include <dlfcn.h> +#include <execinfo.h> +#include <stdlib.h> +#include <unwind.h> + +struct trace_arg +{ + void **array; + _Unwind_Word cfa; + int cnt; + int size; +}; + +#ifdef SHARED +static _Unwind_Reason_Code (*unwind_backtrace) (_Unwind_Trace_Fn, void *); +static _Unwind_Ptr (*unwind_getip) (struct _Unwind_Context *); +static _Unwind_Word (*unwind_getcfa) (struct _Unwind_Context *); +static void *libgcc_handle; + + +/* Dummy version in case libgcc_s does not contain the real code. */ +static _Unwind_Word +dummy_getcfa (struct _Unwind_Context *ctx __attribute__ ((unused))) +{ + return 0; +} + + +static void +init (void) +{ + libgcc_handle = __libc_dlopen ("libgcc_s.so.1"); + + if (libgcc_handle == NULL) + return; + + unwind_backtrace = __libc_dlsym (libgcc_handle, "_Unwind_Backtrace"); + unwind_getip = __libc_dlsym (libgcc_handle, "_Unwind_GetIP"); + if (unwind_getip == NULL) + unwind_backtrace = NULL; + unwind_getcfa = (__libc_dlsym (libgcc_handle, "_Unwind_GetCFA") + ?: dummy_getcfa); +} +#else +# define unwind_backtrace _Unwind_Backtrace +# define unwind_getip _Unwind_GetIP +# define unwind_getcfa _Unwind_GetCFA +#endif + +static _Unwind_Reason_Code +backtrace_helper (struct _Unwind_Context *ctx, void *a) +{ + struct trace_arg *arg = a; + + /* We are first called with address in the __backtrace function. + Skip it. */ + if (arg->cnt != -1) + { + arg->array[arg->cnt] = (void *) unwind_getip (ctx); + + /* Check whether we make any progress. 
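[Editorial illustration: the macros defined in atomic-machine.h above are glibc-internal, reached through the library's own <atomic.h> wrapper rather than any public API. A hedged sketch of a typical call site inside the library:]

#include <atomic.h>   /* glibc-internal wrapper pulling in atomic-machine.h */

static volatile int refcount;

void
ref_get (void)
{
  /* catomic_* variants skip the lock prefix in single-threaded
     processes by testing tcbhead_t.multiple_threads.  */
  catomic_increment (&refcount);
}

int
ref_put (void)
{
  /* LOCK decl; sete under the hood: true once the count hits zero.  */
  return atomic_decrement_and_test (&refcount);
}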
*/ + _Unwind_Word cfa = unwind_getcfa (ctx); + + if (arg->cnt > 0 && arg->array[arg->cnt - 1] == arg->array[arg->cnt] + && cfa == arg->cfa) + return _URC_END_OF_STACK; + arg->cfa = cfa; + } + if (++arg->cnt == arg->size) + return _URC_END_OF_STACK; + return _URC_NO_REASON; +} + +int +__backtrace (void **array, int size) +{ + struct trace_arg arg = { .array = array, .cfa = 0, .size = size, .cnt = -1 }; + + if (size <= 0) + return 0; + +#ifdef SHARED + __libc_once_define (static, once); + + __libc_once (once, init); + if (unwind_backtrace == NULL) + return 0; +#endif + + unwind_backtrace (backtrace_helper, &arg); + + /* _Unwind_Backtrace seems to put NULL address above + _start. Fix it up here. */ + if (arg.cnt > 1 && arg.array[arg.cnt - 1] == NULL) + --arg.cnt; + return arg.cnt != -1 ? arg.cnt : 0; +} +weak_alias (__backtrace, backtrace) +libc_hidden_def (__backtrace) + + +#ifdef SHARED +/* Free all resources if necessary. */ +libc_freeres_fn (free_mem) +{ + unwind_backtrace = NULL; + if (libgcc_handle != NULL) + { + __libc_dlclose (libgcc_handle); + libgcc_handle = NULL; + } +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/bsd-_setjmp.S b/REORG.TODO/sysdeps/x86_64/bsd-_setjmp.S new file mode 100644 index 0000000000..bc40a88938 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/bsd-_setjmp.S @@ -0,0 +1,37 @@ +/* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'. x86-64 version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This just does a tail-call to `__sigsetjmp (ARG, 0)'. + We cannot do it in C because it must be a tail-call, so frame-unwinding + in setjmp doesn't clobber the state restored by longjmp. */ + +#include <sysdep.h> +#define _ASM +#define _SETJMP_H +#include <bits/setjmp.h> + +ENTRY (_setjmp) + /* Set up arguments, we only need to set the second arg. */ + xorl %esi, %esi +#ifdef PIC + jmp HIDDEN_JUMPTARGET (__sigsetjmp) +#else + jmp __sigsetjmp +#endif +END (_setjmp) +libc_hidden_def (_setjmp) diff --git a/REORG.TODO/sysdeps/x86_64/bsd-setjmp.S b/REORG.TODO/sysdeps/x86_64/bsd-setjmp.S new file mode 100644 index 0000000000..45ee1234b9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/bsd-setjmp.S @@ -0,0 +1,36 @@ +/* BSD `setjmp' entry point to `sigsetjmp (..., 1)'. x86-64 version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
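[Editorial illustration: the __backtrace implementation above backs the public <execinfo.h> interface. A self-contained usage example:]

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  void *frames[64];
  int n = backtrace (frames, 64);            /* fill in return addresses */
  char **names = backtrace_symbols (frames, n);
  if (names != NULL)
    {
      for (int i = 0; i < n; i++)
        puts (names[i]);
      free (names);                          /* one malloc'd block */
    }
  return 0;
}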
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This just does a tail-call to `__sigsetjmp (ARG, 1)'. + We cannot do it in C because it must be a tail-call, so frame-unwinding + in setjmp doesn't clobber the state restored by longjmp. */ + +#include <sysdep.h> +#define _ASM +#define _SETJMP_H +#include <bits/setjmp.h> + +ENTRY (setjmp) + /* Set up arguments, we only need to set the 2nd arg. */ + movl $1, %esi +#ifdef PIC + jmp HIDDEN_JUMPTARGET (__sigsetjmp) +#else + jmp __sigsetjmp +#endif +END (setjmp) diff --git a/REORG.TODO/sysdeps/x86_64/bzero.S b/REORG.TODO/sysdeps/x86_64/bzero.S new file mode 100644 index 0000000000..f96d567fd8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/bzero.S @@ -0,0 +1 @@ +/* Implemented in memset.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/configure b/REORG.TODO/sysdeps/x86_64/configure new file mode 100644 index 0000000000..2d14c344df --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/configure @@ -0,0 +1,156 @@ +# This file is generated from configure.ac by Autoconf. DO NOT EDIT! + # Local configure fragment for sysdeps/x86_64. + +for ac_prog in $AS +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AS+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$AS"; then + ac_cv_prog_AS="$AS" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AS="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +AS=$ac_cv_prog_AS +if test -n "$AS"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AS" >&5 +$as_echo "$AS" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$AS" && break +done + +if test -z "$AS"; then + ac_verc_fail=yes +else + # Found it, now check the version. + { $as_echo "$as_me:${as_lineno-$LINENO}: checking version of $AS" >&5 +$as_echo_n "checking version of $AS... " >&6; } + ac_prog_version=`$AS --version 2>&1 | sed -n 's/^.*GNU assembler.* \([0-9]*\.[0-9.]*\).*$/\1/p'` + case $ac_prog_version in + '') ac_prog_version="v. ?.??, bad"; ac_verc_fail=yes;; + 2.2[4-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*) + ac_prog_version="$ac_prog_version, ok"; ac_verc_fail=no;; + *) ac_prog_version="$ac_prog_version, bad"; ac_verc_fail=yes;; + + esac + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_prog_version" >&5 +$as_echo "$ac_prog_version" >&6; } +fi +if test $ac_verc_fail = yes; then + critic_missing="$critic_missing The program AS is required in version >= 2.24 for target x86_64." +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX512DQ support in assembler" >&5 +$as_echo_n "checking for AVX512DQ support in assembler... 
" >&6; } +if ${libc_cv_asm_avx512dq+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <<\EOF + vandpd (%rax), %zmm6, %zmm1 +EOF +if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + libc_cv_asm_avx512dq=yes +else + libc_cv_asm_avx512dq=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_avx512dq" >&5 +$as_echo "$libc_cv_asm_avx512dq" >&6; } +if test $libc_cv_asm_avx512dq = yes; then + $as_echo "#define HAVE_AVX512DQ_ASM_SUPPORT 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX512 support" >&5 +$as_echo_n "checking for AVX512 support... " >&6; } +if ${libc_cv_cc_avx512+:} false; then : + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx512f -xc /dev/null -S -o /dev/null' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + libc_cv_cc_avx512=$libc_cv_asm_avx512dq +else + libc_cv_cc_avx512=no +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx512" >&5 +$as_echo "$libc_cv_cc_avx512" >&6; } +if test $libc_cv_cc_avx512 = yes; then + $as_echo "#define HAVE_AVX512_SUPPORT 1" >>confdefs.h + +fi +config_vars="$config_vars +config-cflags-avx512 = $libc_cv_cc_avx512" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5 +$as_echo_n "checking for Intel MPX support... " >&6; } +if ${libc_cv_asm_mpx+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <<\EOF + bndmov %bnd0,(%rsp) +EOF +if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + libc_cv_asm_mpx=yes +else + libc_cv_asm_mpx=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5 +$as_echo "$libc_cv_asm_mpx" >&6; } +if test $libc_cv_asm_mpx = yes; then + $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h + +fi + +if test x"$build_mathvec" = xnotset; then + build_mathvec=yes +fi + +$as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h + + +test -n "$critic_missing" && as_fn_error $? " +*** $critic_missing" "$LINENO" 5 diff --git a/REORG.TODO/sysdeps/x86_64/configure.ac b/REORG.TODO/sysdeps/x86_64/configure.ac new file mode 100644 index 0000000000..7d8aaafc0c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/configure.ac @@ -0,0 +1,58 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local configure fragment for sysdeps/x86_64. + +dnl Accept as 2.24 or newer for AVX512 load and store. +AC_CHECK_PROG_VER(AS, $AS, --version, + [GNU assembler.* \([0-9]*\.[0-9.]*\)], + [2.2[4-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*], + critic_missing="$critic_missing The program AS is required in version >= 2.24 for target x86_64.") + +dnl Check if asm supports AVX512DQ. 
+AC_CACHE_CHECK(for AVX512DQ support in assembler, libc_cv_asm_avx512dq, [dnl +cat > conftest.s <<\EOF + vandpd (%rax), %zmm6, %zmm1 +EOF +if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then + libc_cv_asm_avx512dq=yes +else + libc_cv_asm_avx512dq=no +fi +rm -f conftest*]) +if test $libc_cv_asm_avx512dq = yes; then + AC_DEFINE(HAVE_AVX512DQ_ASM_SUPPORT) +fi + +dnl Check if -mavx512f works. +AC_CACHE_CHECK(for AVX512 support, libc_cv_cc_avx512, [dnl +LIBC_TRY_CC_OPTION([-mavx512f], [libc_cv_cc_avx512=$libc_cv_asm_avx512dq], [libc_cv_cc_avx512=no]) +]) +if test $libc_cv_cc_avx512 = yes; then + AC_DEFINE(HAVE_AVX512_SUPPORT) +fi +LIBC_CONFIG_VAR([config-cflags-avx512], [$libc_cv_cc_avx512]) + +dnl Check whether asm supports Intel MPX +AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl +cat > conftest.s <<\EOF + bndmov %bnd0,(%rsp) +EOF +if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then + libc_cv_asm_mpx=yes +else + libc_cv_asm_mpx=no +fi +rm -f conftest*]) +if test $libc_cv_asm_mpx = yes; then + AC_DEFINE(HAVE_MPX_SUPPORT) +fi + +if test x"$build_mathvec" = xnotset; then + build_mathvec=yes +fi + +dnl It is always possible to access static and hidden symbols in an +dnl position independent way. +AC_DEFINE(PI_STATIC_AND_HIDDEN) + +test -n "$critic_missing" && AC_MSG_ERROR([ +*** $critic_missing]) diff --git a/REORG.TODO/sysdeps/x86_64/crti.S b/REORG.TODO/sysdeps/x86_64/crti.S new file mode 100644 index 0000000000..2687f35cb7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/crti.S @@ -0,0 +1,80 @@ +/* Special .init and .fini section support for x86-64. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* crti.S puts a function prologue at the beginning of the .init and + .fini sections and defines global symbols for those addresses, so + they can be called as functions. 
The symbols _init and _fini are + magic and cause the linker to emit DT_INIT and DT_FINI. */ + +#include <libc-symbols.h> +#include <sysdep.h> + +#ifndef PREINIT_FUNCTION +# define PREINIT_FUNCTION __gmon_start__ +#endif + +#ifndef PREINIT_FUNCTION_WEAK +# define PREINIT_FUNCTION_WEAK 1 +#endif + +#if PREINIT_FUNCTION_WEAK + weak_extern (PREINIT_FUNCTION) +#else + .hidden PREINIT_FUNCTION +#endif + + .section .init,"ax",@progbits + .p2align 2 + .globl _init + .type _init, @function +_init: + /* Maintain 16-byte stack alignment for called functions. */ + subq $8, %rsp +#if PREINIT_FUNCTION_WEAK + movq PREINIT_FUNCTION@GOTPCREL(%rip), %rax + testq %rax, %rax + je .Lno_weak_fn + call *%rax +.Lno_weak_fn: +#else + call PREINIT_FUNCTION +#endif + + .section .fini,"ax",@progbits + .p2align 2 + .globl _fini + .type _fini, @function +_fini: + subq $8, %rsp diff --git a/REORG.TODO/sysdeps/x86_64/crtn.S b/REORG.TODO/sysdeps/x86_64/crtn.S new file mode 100644 index 0000000000..29e3b85300 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/crtn.S @@ -0,0 +1,45 @@ +/* Special .init and .fini section support for x86-64. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* crtn.S puts function epilogues in the .init and .fini sections + corresponding to the prologues in crti.S. */ + + .section .init,"ax",@progbits + addq $8, %rsp + ret + + .section .fini,"ax",@progbits + addq $8, %rsp + ret diff --git a/REORG.TODO/sysdeps/x86_64/dl-irel.h b/REORG.TODO/sysdeps/x86_64/dl-irel.h new file mode 100644 index 0000000000..5f9967abe5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-irel.h @@ -0,0 +1,51 @@ +/* Machine-dependent ELF indirect relocation inline functions. + x86-64 version. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
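[Editorial illustration: user code never calls _init/_fini directly; ELF constructors and destructors reach the same initialization and finalization machinery portably. A small example (output strings are illustrative):]

#include <stdio.h>

__attribute__ ((constructor))
static void
setup (void)
{
  puts ("before main, via the DT_INIT/.init_array machinery");
}

__attribute__ ((destructor))
static void
teardown (void)
{
  puts ("after main returns or exit() is called");
}

int
main (void)
{
  puts ("main");
  return 0;
}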
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _DL_IREL_H +#define _DL_IREL_H + +#include <stdio.h> +#include <unistd.h> + +#define ELF_MACHINE_IRELA 1 + +static inline ElfW(Addr) +__attribute ((always_inline)) +elf_ifunc_invoke (ElfW(Addr) addr) +{ + return ((ElfW(Addr) (*) (void)) (addr)) (); +} + +static inline void +__attribute ((always_inline)) +elf_irela (const ElfW(Rela) *reloc) +{ + ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset; + const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); + + if (__glibc_likely (r_type == R_X86_64_IRELATIVE)) + { + ElfW(Addr) value = elf_ifunc_invoke(reloc->r_addend); + *reloc_addr = value; + } + else + __libc_fatal ("unexpected reloc type in static binary"); +} + +#endif /* dl-irel.h */ diff --git a/REORG.TODO/sysdeps/x86_64/dl-lookupcfg.h b/REORG.TODO/sysdeps/x86_64/dl-lookupcfg.h new file mode 100644 index 0000000000..47b534a059 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-lookupcfg.h @@ -0,0 +1,32 @@ +/* Configuration of lookup functions. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define DL_UNMAP_IS_SPECIAL + +#include_next <dl-lookupcfg.h> + +/* Address of protected data defined in the shared library may be + external due to copy relocation. */ +#define DL_EXTERN_PROTECTED_DATA + +struct link_map; + +extern void _dl_unmap (struct link_map *map) + internal_function attribute_hidden; + +#define DL_UNMAP(map) _dl_unmap (map) diff --git a/REORG.TODO/sysdeps/x86_64/dl-machine.h b/REORG.TODO/sysdeps/x86_64/dl-machine.h new file mode 100644 index 0000000000..0015db4d6a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-machine.h @@ -0,0 +1,601 @@ +/* Machine-dependent ELF dynamic relocation inline functions. x86-64 version. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
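[Editorial illustration: elf_irela above applies R_X86_64_IRELATIVE relocations by invoking the IFUNC resolver and storing its result. At the source level such a relocation typically originates from GCC's ifunc attribute; a hypothetical example with invented names:]

typedef int (*myfunc_t) (void);

static int impl_generic (void) { return 0; }
static int impl_avx2 (void)    { return 1; }

/* Runs during relocation (as R_X86_64_IRELATIVE in static links);
   the returned address is what callers of myfunc reach.  */
static myfunc_t
resolve_myfunc (void)
{
  return __builtin_cpu_supports ("avx2") ? impl_avx2 : impl_generic;
}

int myfunc (void) __attribute__ ((ifunc ("resolve_myfunc")));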
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef dl_machine_h +#define dl_machine_h + +#define ELF_MACHINE_NAME "x86_64" + +#include <sys/param.h> +#include <sysdep.h> +#include <tls.h> +#include <dl-tlsdesc.h> +#include <cpu-features.c> + +/* Return nonzero iff ELF header is compatible with the running host. */ +static inline int __attribute__ ((unused)) +elf_machine_matches_host (const ElfW(Ehdr) *ehdr) +{ + return ehdr->e_machine == EM_X86_64; +} + + +/* Return the link-time address of _DYNAMIC. Conveniently, this is the + first element of the GOT. This must be inlined in a function which + uses global data. */ +static inline ElfW(Addr) __attribute__ ((unused)) +elf_machine_dynamic (void) +{ + /* This produces an IP-relative reloc which is resolved at link time. */ + extern const ElfW(Addr) _GLOBAL_OFFSET_TABLE_[] attribute_hidden; + return _GLOBAL_OFFSET_TABLE_[0]; +} + + +/* Return the run-time load address of the shared object. */ +static inline ElfW(Addr) __attribute__ ((unused)) +elf_machine_load_address (void) +{ + /* Compute the difference between the runtime address of _DYNAMIC as seen + by an IP-relative reference, and the link-time address found in the + special unrelocated first GOT entry. */ + extern ElfW(Dyn) _DYNAMIC[] attribute_hidden; + return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic (); +} + +/* Set up the loaded object described by L so its unrelocated PLT + entries will jump to the on-demand fixup code in dl-runtime.c. */ + +static inline int __attribute__ ((unused, always_inline)) +elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) +{ + Elf64_Addr *got; + extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; + + if (l->l_info[DT_JMPREL] && lazy) + { + /* The GOT entries for functions in the PLT have not yet been filled + in. Their initial contents will arrange when called to push an + offset into the .rel.plt section, push _GLOBAL_OFFSET_TABLE_[1], + and then jump to _GLOBAL_OFFSET_TABLE_[2]. */ + got = (Elf64_Addr *) D_PTR (l, l_info[DT_PLTGOT]); + /* If a library is prelinked but we have to relocate anyway, + we have to be able to undo the prelinking of .got.plt. + The prelinker saved us here address of .plt + 0x16. */ + if (got[1]) + { + l->l_mach.plt = got[1] + l->l_addr; + l->l_mach.gotplt = (ElfW(Addr)) &got[3]; + } + /* Identify this shared object. */ + *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l; + + /* The got[2] entry contains the address of a function which gets + called to get the address of a so far unresolved function and + jump to it. 
The profiling extension of the dynamic linker allows
+	 the calls to be intercepted so that information can be
+	 collected.  In this case we don't store the address in the
+	 GOT so that all future calls also end in this function.  */
+      if (__glibc_unlikely (profile))
+	{
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
+
+	  if (GLRO(dl_profile) != NULL
+	      && _dl_name_match_p (GLRO(dl_profile), l))
+	    /* This is the object we are looking for.  Say that we really
+	       want profiling and the timers are started.  */
+	    GL(dl_profile_map) = l;
+	}
+      else
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	    }
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
+	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	    }
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
+    }
+
+  if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
+    *(ElfW(Addr)*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_GOT)]) + l->l_addr)
+      = (ElfW(Addr)) &_dl_tlsdesc_resolve_rela;
+
+  return lazy;
+}
+
+/* Initial entry point code for the dynamic linker.
+   The C function `_dl_start' is the real entry point;
+   its return value is the user program's entry point.  */
+#define RTLD_START asm ("\n\
+.text\n\
+	.align 16\n\
+.globl _start\n\
+.globl _dl_start_user\n\
+_start:\n\
+	movq %rsp, %rdi\n\
+	call _dl_start\n\
+_dl_start_user:\n\
+	# Save the user entry point address in %r12.\n\
+	movq %rax, %r12\n\
+	# See if we were run as a command with the executable file\n\
+	# name as an extra leading argument.\n\
+	movl _dl_skip_args(%rip), %eax\n\
+	# Pop the original argument count.\n\
+	popq %rdx\n\
+	# Adjust the stack pointer to skip _dl_skip_args words.\n\
+	leaq (%rsp,%rax,8), %rsp\n\
+	# Subtract _dl_skip_args from argc.\n\
+	subl %eax, %edx\n\
+	# Push argc back on the stack.\n\
+	pushq %rdx\n\
+	# Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\
+	# argc -> rsi\n\
+	movq %rdx, %rsi\n\
+	# Save %rsp value in %r13.\n\
+	movq %rsp, %r13\n\
+	# And align stack for the _dl_init call.\n\
+	andq $-16, %rsp\n\
+	# _dl_loaded -> rdi\n\
+	movq _rtld_local(%rip), %rdi\n\
+	# env -> rcx\n\
+	leaq 16(%r13,%rdx,8), %rcx\n\
+	# argv -> rdx\n\
+	leaq 8(%r13), %rdx\n\
+	# Clear %rbp to mark outermost frame obviously even for constructors.\n\
+	xorl %ebp, %ebp\n\
+	# Call the function to run the initializers.\n\
+	call _dl_init\n\
+	# Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\
+	leaq _dl_fini(%rip), %rdx\n\
+	# And make sure %rsp points to argc stored on the stack.\n\
+	movq %r13, %rsp\n\
+	# Jump to the user's entry point.\n\
+	jmp *%r12\n\
+.previous\n\
+");
+
+/* ELF_RTYPE_CLASS_PLT iff TYPE describes a relocation of a PLT entry or
+   a TLS variable, so undefined references should not be allowed to
+   define the value.
+   ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
+   of the main executable's symbols, as for a COPY reloc.
+   ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA iff TYPE describes a relocation
+   against protected data whose address may be external due to copy
+   relocation.  */
+#define elf_machine_type_class(type) \
+  ((((type) == R_X86_64_JUMP_SLOT \
+     || (type) == R_X86_64_DTPMOD64 \
+     || (type) == R_X86_64_DTPOFF64 \
+     || (type) == R_X86_64_TPOFF64 \
+     || (type) == R_X86_64_TLSDESC) \
+    * ELF_RTYPE_CLASS_PLT) \
+   | (((type) == R_X86_64_COPY) * ELF_RTYPE_CLASS_COPY) \
+   | (((type) == R_X86_64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+
+/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries.  */
+#define ELF_MACHINE_JMP_SLOT	R_X86_64_JUMP_SLOT
+
+/* The relative ifunc relocation.  */
+// XXX This is a work-around for a broken linker.  Remove!
+#define ELF_MACHINE_IRELATIVE	R_X86_64_IRELATIVE
+
+/* The x86-64 never uses Elf64_Rel/Elf32_Rel relocations.  */
+#define ELF_MACHINE_NO_REL 1
+#define ELF_MACHINE_NO_RELA 0
+
+/* We define an initialization function.  This is called very early in
+   _dl_sysdep_start.  */
+#define DL_PLATFORM_INIT dl_platform_init ()
+
+static inline void __attribute__ ((unused))
+dl_platform_init (void)
+{
+#if IS_IN (rtld)
+  /* init_cpu_features has been called early from __libc_start_main in
+     a static executable.  */
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
+#else
+  if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
+    /* Avoid an empty string which would disturb us.  */
+    GLRO(dl_platform) = NULL;
+#endif
+}
+
+static inline ElfW(Addr)
+elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+		       const ElfW(Rela) *reloc,
+		       ElfW(Addr) *reloc_addr, ElfW(Addr) value)
+{
+  return *reloc_addr = value;
+}
+
+/* Return the final value of a PLT relocation.  On x86-64 the
+   JUMP_SLOT relocation ignores the addend.  */
+static inline ElfW(Addr)
+elf_machine_plt_value (struct link_map *map, const ElfW(Rela) *reloc,
+		       ElfW(Addr) value)
+{
+  return value;
+}
+
+
+/* Names of the architecture-specific auditing callback functions.  */
+#define ARCH_LA_PLTENTER x86_64_gnu_pltenter
+#define ARCH_LA_PLTEXIT x86_64_gnu_pltexit
+
+#endif /* !dl_machine_h */
+
+#ifdef RESOLVE_MAP
+
+/* Perform the relocation specified by RELOC and SYM (which is fully resolved).
+   MAP is the object containing the reloc.
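
The elf_machine_type_class macro defined above is branch-free: each boolean test is multiplied by a class flag and the products are OR-ed together. A quick harness using the R_X86_64_* constants from <elf.h>; the 1/2/4 flag values are assumptions mirroring ld.so's internal definitions:

    #include <elf.h>
    #include <stdio.h>

    /* Assumed class flag values (ld.so defines these elsewhere).  */
    #define CLASS_PLT    1
    #define CLASS_COPY   2
    #define CLASS_EXTERN 4

    static int
    type_class (unsigned long type)
    {
      return (((type == R_X86_64_JUMP_SLOT
		|| type == R_X86_64_DTPMOD64
		|| type == R_X86_64_DTPOFF64
		|| type == R_X86_64_TPOFF64
		|| type == R_X86_64_TLSDESC) * CLASS_PLT)
	      | ((type == R_X86_64_COPY) * CLASS_COPY)
	      | ((type == R_X86_64_GLOB_DAT) * CLASS_EXTERN));
    }

    int
    main (void)
    {
      printf ("JUMP_SLOT -> %d\n", type_class (R_X86_64_JUMP_SLOT)); /* 1 */
      printf ("COPY      -> %d\n", type_class (R_X86_64_COPY));      /* 2 */
      printf ("GLOB_DAT  -> %d\n", type_class (R_X86_64_GLOB_DAT));  /* 4 */
      printf ("64        -> %d\n", type_class (R_X86_64_64));        /* 0 */
      return 0;
    }
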
*/ + +auto inline void +__attribute__ ((always_inline)) +elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, + const ElfW(Sym) *sym, const struct r_found_version *version, + void *const reloc_addr_arg, int skip_ifunc) +{ + ElfW(Addr) *const reloc_addr = reloc_addr_arg; + const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); + +# if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC + if (__glibc_unlikely (r_type == R_X86_64_RELATIVE)) + { +# if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC + /* This is defined in rtld.c, but nowhere in the static libc.a; + make the reference weak so static programs can still link. + This declaration cannot be done when compiling rtld.c + (i.e. #ifdef RTLD_BOOTSTRAP) because rtld.c contains the + common defn for _dl_rtld_map, which is incompatible with a + weak decl in the same file. */ +# ifndef SHARED + weak_extern (GL(dl_rtld_map)); +# endif + if (map != &GL(dl_rtld_map)) /* Already done in rtld itself. */ +# endif + *reloc_addr = map->l_addr + reloc->r_addend; + } + else +# endif +# if !defined RTLD_BOOTSTRAP + /* l_addr + r_addend may be > 0xffffffff and R_X86_64_RELATIVE64 + relocation updates the whole 64-bit entry. */ + if (__glibc_unlikely (r_type == R_X86_64_RELATIVE64)) + *(Elf64_Addr *) reloc_addr = (Elf64_Addr) map->l_addr + reloc->r_addend; + else +# endif + if (__glibc_unlikely (r_type == R_X86_64_NONE)) + return; + else + { +# ifndef RTLD_BOOTSTRAP + const ElfW(Sym) *const refsym = sym; +# endif + struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type); + ElfW(Addr) value = (sym == NULL ? 0 + : (ElfW(Addr)) sym_map->l_addr + sym->st_value); + + if (sym != NULL + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, + 0) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) + && __builtin_expect (!skip_ifunc, 1)) + { +# ifndef RTLD_BOOTSTRAP + if (sym_map != map + && sym_map->l_type != lt_executable + && !sym_map->l_relocated) + { + const char *strtab + = (const char *) D_PTR (map, l_info[DT_STRTAB]); + _dl_error_printf ("\ +%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); + } +# endif + value = ((ElfW(Addr) (*) (void)) value) (); + } + + switch (r_type) + { +# ifndef RTLD_BOOTSTRAP +# ifdef __ILP32__ + case R_X86_64_SIZE64: + /* Set to symbol size plus addend. */ + *(Elf64_Addr *) (uintptr_t) reloc_addr + = (Elf64_Addr) sym->st_size + reloc->r_addend; + break; + + case R_X86_64_SIZE32: +# else + case R_X86_64_SIZE64: +# endif + /* Set to symbol size plus addend. */ + value = sym->st_size; +# endif + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: + *reloc_addr = value + reloc->r_addend; + break; + +# ifndef RESOLVE_CONFLICT_FIND_MAP + case R_X86_64_DTPMOD64: +# ifdef RTLD_BOOTSTRAP + /* During startup the dynamic linker is always the module + with index 1. + XXX If this relocation is necessary move before RESOLVE + call. */ + *reloc_addr = 1; +# else + /* Get the information from the link map returned by the + resolve function. */ + if (sym_map != NULL) + *reloc_addr = sym_map->l_tls_modid; +# endif + break; + case R_X86_64_DTPOFF64: +# ifndef RTLD_BOOTSTRAP + /* During relocation all TLS symbols are defined and used. + Therefore the offset is already correct. */ + if (sym != NULL) + { + value = sym->st_value + reloc->r_addend; +# ifdef __ILP32__ + /* This relocation type computes a signed offset that is + usually negative. 
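
Stripped of the bootstrap conditionals, the two simplest cases of the switch above reduce to "store base plus addend" and "store resolved symbol plus addend". A self-contained sketch with hypothetical toy types in place of the real ElfW machinery:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t Addr;

    struct toy_rela { Addr r_offset; int r_type; int64_t r_addend; };
    enum { TOY_RELATIVE, TOY_GLOB_DAT };

    static void
    apply (unsigned char *image, Addr l_addr, const struct toy_rela *r,
	   Addr symval)
    {
      Addr *slot = (Addr *) (image + r->r_offset);
      switch (r->r_type)
	{
	case TOY_RELATIVE:
	  *slot = l_addr + r->r_addend;	/* load bias + addend */
	  break;
	case TOY_GLOB_DAT:
	  *slot = symval + r->r_addend;	/* resolved symbol + addend */
	  break;
	}
    }

    int
    main (void)
    {
      _Alignas (Addr) unsigned char image[16] = { 0 };
      struct toy_rela rel = { 0, TOY_RELATIVE, 0x100 };
      struct toy_rela glob = { 8, TOY_GLOB_DAT, 0 };
      apply (image, 0x7f0000000000, &rel, 0);
      apply (image, 0x7f0000000000, &glob, 0x401000);
      printf ("%#llx %#llx\n", (unsigned long long) *(Addr *) image,
	      (unsigned long long) *(Addr *) (image + 8));
      return 0;
    }
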
The symbol and addend values are 32 + bits but the GOT entry is 64 bits wide and the whole + 64-bit entry is used as a signed quantity, so we need + to sign-extend the computed value to 64 bits. */ + *(Elf64_Sxword *) reloc_addr = (Elf64_Sxword) (Elf32_Sword) value; +# else + *reloc_addr = value; +# endif + } +# endif + break; + case R_X86_64_TLSDESC: + { + struct tlsdesc volatile *td = + (struct tlsdesc volatile *)reloc_addr; + +# ifndef RTLD_BOOTSTRAP + if (! sym) + { + td->arg = (void*)reloc->r_addend; + td->entry = _dl_tlsdesc_undefweak; + } + else +# endif + { +# ifndef RTLD_BOOTSTRAP +# ifndef SHARED + CHECK_STATIC_TLS (map, sym_map); +# else + if (!TRY_STATIC_TLS (map, sym_map)) + { + td->arg = _dl_make_tlsdesc_dynamic + (sym_map, sym->st_value + reloc->r_addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif +# endif + { + td->arg = (void*)(sym->st_value - sym_map->l_tls_offset + + reloc->r_addend); + td->entry = _dl_tlsdesc_return; + } + } + break; + } + case R_X86_64_TPOFF64: + /* The offset is negative, forward from the thread pointer. */ +# ifndef RTLD_BOOTSTRAP + if (sym != NULL) +# endif + { +# ifndef RTLD_BOOTSTRAP + CHECK_STATIC_TLS (map, sym_map); +# endif + /* We know the offset of the object the symbol is contained in. + It is a negative value which will be added to the + thread pointer. */ + value = (sym->st_value + reloc->r_addend + - sym_map->l_tls_offset); +# ifdef __ILP32__ + /* The symbol and addend values are 32 bits but the GOT + entry is 64 bits wide and the whole 64-bit entry is used + as a signed quantity, so we need to sign-extend the + computed value to 64 bits. */ + *(Elf64_Sxword *) reloc_addr = (Elf64_Sxword) (Elf32_Sword) value; +# else + *reloc_addr = value; +# endif + } + break; +# endif + +# ifndef RTLD_BOOTSTRAP + case R_X86_64_64: + /* value + r_addend may be > 0xffffffff and R_X86_64_64 + relocation updates the whole 64-bit entry. */ + *(Elf64_Addr *) reloc_addr = (Elf64_Addr) value + reloc->r_addend; + break; +# ifndef __ILP32__ + case R_X86_64_SIZE32: + /* Set to symbol size plus addend. */ + value = sym->st_size; +# endif + case R_X86_64_32: + value += reloc->r_addend; + *(unsigned int *) reloc_addr = value; + + const char *fmt; + if (__glibc_unlikely (value > UINT_MAX)) + { + const char *strtab; + + fmt = "\ +%s: Symbol `%s' causes overflow in R_X86_64_32 relocation\n"; +# ifndef RESOLVE_CONFLICT_FIND_MAP + print_err: +# endif + strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); + + _dl_error_printf (fmt, RTLD_PROGNAME, strtab + refsym->st_name); + } + break; +# ifndef RESOLVE_CONFLICT_FIND_MAP + /* Not needed for dl-conflict.c. */ + case R_X86_64_PC32: + value += reloc->r_addend - (ElfW(Addr)) reloc_addr; + *(unsigned int *) reloc_addr = value; + if (__glibc_unlikely (value != (int) value)) + { + fmt = "\ +%s: Symbol `%s' causes overflow in R_X86_64_PC32 relocation\n"; + goto print_err; + } + break; + case R_X86_64_COPY: + if (sym == NULL) + /* This can happen in trace mode if an object could not be + found. 
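
For the COPY case just below, the interesting detail is the size clamp: only MIN(sym->st_size, refsym->st_size) bytes are copied, with a diagnostic when the definition and the copy slot disagree. Schematically, with hypothetical buffers standing in for the executable's copy slot and the shared object's definition:

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char def_in_so[16] = "shared-object";	/* definition, size 16 */
      char copy_in_exe[8];			/* COPY slot, size 8 */
      size_t sym_size = sizeof def_in_so, ref_size = sizeof copy_in_exe;

      /* Copy the smaller of the two sizes, as the code below does.  */
      memcpy (copy_in_exe, def_in_so,
	      sym_size < ref_size ? sym_size : ref_size);
      if (sym_size != ref_size)
	fprintf (stderr, "symbol has different size in shared object,"
		 " consider re-linking\n");
      return 0;
    }
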
*/ + break; + memcpy (reloc_addr_arg, (void *) value, + MIN (sym->st_size, refsym->st_size)); + if (__builtin_expect (sym->st_size > refsym->st_size, 0) + || (__builtin_expect (sym->st_size < refsym->st_size, 0) + && GLRO(dl_verbose))) + { + fmt = "\ +%s: Symbol `%s' has different size in shared object, consider re-linking\n"; + goto print_err; + } + break; +# endif + case R_X86_64_IRELATIVE: + value = map->l_addr + reloc->r_addend; + value = ((ElfW(Addr) (*) (void)) value) (); + *reloc_addr = value; + break; + default: + _dl_reloc_bad_type (map, r_type, 0); + break; +# endif + } + } +} + +auto inline void +__attribute ((always_inline)) +elf_machine_rela_relative (ElfW(Addr) l_addr, const ElfW(Rela) *reloc, + void *const reloc_addr_arg) +{ + ElfW(Addr) *const reloc_addr = reloc_addr_arg; +#if !defined RTLD_BOOTSTRAP + /* l_addr + r_addend may be > 0xffffffff and R_X86_64_RELATIVE64 + relocation updates the whole 64-bit entry. */ + if (__glibc_unlikely (ELFW(R_TYPE) (reloc->r_info) == R_X86_64_RELATIVE64)) + *(Elf64_Addr *) reloc_addr = (Elf64_Addr) l_addr + reloc->r_addend; + else +#endif + { + assert (ELFW(R_TYPE) (reloc->r_info) == R_X86_64_RELATIVE); + *reloc_addr = l_addr + reloc->r_addend; + } +} + +auto inline void +__attribute ((always_inline)) +elf_machine_lazy_rel (struct link_map *map, + ElfW(Addr) l_addr, const ElfW(Rela) *reloc, + int skip_ifunc) +{ + ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); + + /* Check for unexpected PLT reloc type. */ + if (__glibc_likely (r_type == R_X86_64_JUMP_SLOT)) + { + if (__builtin_expect (map->l_mach.plt, 0) == 0) + *reloc_addr += l_addr; + else + *reloc_addr = + map->l_mach.plt + + (((ElfW(Addr)) reloc_addr) - map->l_mach.gotplt) * 2; + } + else if (__glibc_likely (r_type == R_X86_64_TLSDESC)) + { + struct tlsdesc volatile * __attribute__((__unused__)) td = + (struct tlsdesc volatile *)reloc_addr; + + td->arg = (void*)reloc; + td->entry = (void*)(D_PTR (map, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + + map->l_addr); + } + else if (__glibc_unlikely (r_type == R_X86_64_IRELATIVE)) + { + ElfW(Addr) value = map->l_addr + reloc->r_addend; + if (__glibc_likely (!skip_ifunc)) + value = ((ElfW(Addr) (*) (void)) value) (); + *reloc_addr = value; + } + else + _dl_reloc_bad_type (map, r_type, 1); +} + +#endif /* RESOLVE_MAP */ diff --git a/REORG.TODO/sysdeps/x86_64/dl-procinfo.c b/REORG.TODO/sysdeps/x86_64/dl-procinfo.c new file mode 100644 index 0000000000..17ae800a37 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-procinfo.c @@ -0,0 +1,45 @@ +/* Data for x86-64 version of processor capability information. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
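
In elf_machine_lazy_rel's prelink-undo path above, each .got.plt slot is 8 bytes while each PLT entry is 16, so the slot's byte offset from .got.plt is doubled and added to l_mach.plt (which points at the push instruction of the first PLT entry, .plt + 0x16). A toy check of that arithmetic with made-up addresses:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Hypothetical layout: got[1] held .plt + 0x16, gotplt is &got[3].  */
      uintptr_t plt_push0 = 0x401016;
      uintptr_t gotplt = 0x403018;
      for (int k = 0; k < 3; ++k)
	{
	  uintptr_t reloc_addr = gotplt + 8 * k;	/* the GOT slot */
	  /* 8-byte slot offset doubled -> 16-byte PLT entry offset.  */
	  uintptr_t entry = plt_push0 + (reloc_addr - gotplt) * 2;
	  printf ("got slot %d -> PLT push at %#lx\n", k,
		  (unsigned long) entry);
	}
      return 0;
    }
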
*/ + +/* If anything should be added here check whether the size of each string + is still ok with the given array size. + + All the #ifdefs in the definitions are quite irritating but + necessary if we want to avoid duplicating the information. There + are three different modes: + + - PROCINFO_DECL is defined. This means we are only interested in + declarations. + + - PROCINFO_DECL is not defined: + + + if SHARED is defined the file is included in an array + initializer. The .element = { ... } syntax is needed. + + + if SHARED is not defined a normal array initialization is + needed. + */ + +#ifndef PROCINFO_CLASS +# define PROCINFO_CLASS +#endif + +#include <sysdeps/x86/dl-procinfo.c> + +#undef PROCINFO_DECL +#undef PROCINFO_CLASS diff --git a/REORG.TODO/sysdeps/x86_64/dl-runtime.c b/REORG.TODO/sysdeps/x86_64/dl-runtime.c new file mode 100644 index 0000000000..b625d1e882 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-runtime.c @@ -0,0 +1,9 @@ +/* The ABI calls for the PLT stubs to pass the index of the relocation + and not its offset. In _dl_profile_fixup and _dl_call_pltexit we + also use the index. Therefore it is wasteful to compute the offset + in the trampoline just to reverse the operation immediately + afterwards. */ +#define reloc_offset reloc_arg * sizeof (PLTREL) +#define reloc_index reloc_arg + +#include <elf/dl-runtime.c> diff --git a/REORG.TODO/sysdeps/x86_64/dl-tls.h b/REORG.TODO/sysdeps/x86_64/dl-tls.h new file mode 100644 index 0000000000..4a59d2a924 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-tls.h @@ -0,0 +1,29 @@ +/* Thread-local storage handling in the ELF dynamic linker. x86-64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> + +/* Type used for the representation of TLS information in the GOT. */ +typedef struct dl_tls_index +{ + uint64_t ti_module; + uint64_t ti_offset; +} tls_index; + + +extern void *__tls_get_addr (tls_index *ti); diff --git a/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.S b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.S new file mode 100644 index 0000000000..be3a780a1a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.S @@ -0,0 +1,245 @@ +/* Thread-local storage handling in the ELF dynamic linker. x86_64 version. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
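
The reloc_offset redefinition in dl-runtime.c above exists because x86-64 PLT stubs push a relocation index, while the generic dl-runtime code wants a byte offset into .rela.plt; the conversion is a single multiply by the entry size:

    #include <elf.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* sizeof (Elf64_Rela) is 24: r_offset, r_info, r_addend.  */
      for (unsigned long reloc_arg = 0; reloc_arg < 4; ++reloc_arg)
	printf ("index %lu -> offset %zu\n",
		reloc_arg, reloc_arg * sizeof (Elf64_Rela));
      return 0;
    }
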
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <tls.h> +#include "tlsdesc.h" + + .text + + /* This function is used to compute the TP offset for symbols in + Static TLS, i.e., whose TP offset is the same for all + threads. + + The incoming %rax points to the TLS descriptor, such that + 0(%rax) points to _dl_tlsdesc_return itself, and 8(%rax) holds + the TP offset of the symbol corresponding to the object + denoted by the argument. */ + + .hidden _dl_tlsdesc_return + .global _dl_tlsdesc_return + .type _dl_tlsdesc_return,@function + cfi_startproc + .align 16 +_dl_tlsdesc_return: + movq 8(%rax), %rax + ret + cfi_endproc + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return + + /* This function is used for undefined weak TLS symbols, for + which the base address (i.e., disregarding any addend) should + resolve to NULL. + + %rax points to the TLS descriptor, such that 0(%rax) points to + _dl_tlsdesc_undefweak itself, and 8(%rax) holds the addend. + We return the addend minus the TP, such that, when the caller + adds TP, it gets the addend back. If that's zero, as usual, + that's most likely a NULL pointer. */ + + .hidden _dl_tlsdesc_undefweak + .global _dl_tlsdesc_undefweak + .type _dl_tlsdesc_undefweak,@function + cfi_startproc + .align 16 +_dl_tlsdesc_undefweak: + movq 8(%rax), %rax + subq %fs:0, %rax + ret + cfi_endproc + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak + +#ifdef SHARED + .hidden _dl_tlsdesc_dynamic + .global _dl_tlsdesc_dynamic + .type _dl_tlsdesc_dynamic,@function + + /* %rax points to the TLS descriptor, such that 0(%rax) points to + _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct + tlsdesc_dynamic_arg object. It must return in %rax the offset + between the thread pointer and the object denoted by the + argument, without clobbering any registers. + + The assembly code that follows is a rendition of the following + C code, hand-optimized a little bit. + +ptrdiff_t +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax")) +{ + struct tlsdesc_dynamic_arg *td = tdp->arg; + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); + if (__builtin_expect (td->gen_count <= dtv[0].counter + && (dtv[td->tlsinfo.ti_module].pointer.val + != TLS_DTV_UNALLOCATED), + 1)) + return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset + - __thread_pointer; + + return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer; +} +*/ + cfi_startproc + .align 16 +_dl_tlsdesc_dynamic: + /* Preserve call-clobbered registers that we modify. + We need two scratch regs anyway. */ + movq %rsi, -16(%rsp) + movq %fs:DTV_OFFSET, %rsi + movq %rdi, -8(%rsp) + movq TLSDESC_ARG(%rax), %rdi + movq (%rsi), %rax + cmpq %rax, TLSDESC_GEN_COUNT(%rdi) + ja .Lslow + movq TLSDESC_MODID(%rdi), %rax + salq $4, %rax + movq (%rax,%rsi), %rax + cmpq $-1, %rax + je .Lslow + addq TLSDESC_MODOFF(%rdi), %rax +.Lret: + movq -16(%rsp), %rsi + subq %fs:0, %rax + movq -8(%rsp), %rdi + ret +.Lslow: + /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9, + r10 and r11. Also, align the stack, that's off by 8 bytes. */ + subq $72, %rsp + cfi_adjust_cfa_offset (72) + movq %rdx, 8(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + /* %rdi already points to the tlsinfo data structure. 
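
The C rendition quoted above compiles almost as-is once the thread-pointer plumbing is faked out. A minimal stand-alone model of the DTV fast path (toy types, stubbed slow path):

    #include <stddef.h>
    #include <stdio.h>

    #define TLS_DTV_UNALLOCATED ((void *) -1)

    struct toy_tls_index { size_t ti_module; size_t ti_offset; };
    union toy_dtv { size_t counter; void *pointer; };

    static char module1_block[64];

    /* If the DTV is new enough and the module's block is allocated,
       the address comes straight out of the DTV; otherwise fall back
       (the real code calls __tls_get_addr).  */
    static void *
    toy_tls_get_addr (const struct toy_tls_index *ti, union toy_dtv *dtv,
		      size_t gen_count)
    {
      if (gen_count <= dtv[0].counter
	  && dtv[ti->ti_module].pointer != TLS_DTV_UNALLOCATED)
	return (char *) dtv[ti->ti_module].pointer + ti->ti_offset;
      return NULL;	/* slow path stub */
    }

    int
    main (void)
    {
      union toy_dtv dtv[2];
      dtv[0].counter = 1;		/* generation count */
      dtv[1].pointer = module1_block;	/* module 1's TLS block */
      struct toy_tls_index ti = { 1, 16 };
      printf ("%p\n", toy_tls_get_addr (&ti, dtv, 1));
      return 0;
    }
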
*/ + call HIDDEN_JUMPTARGET (__tls_get_addr) + movq 8(%rsp), %rdx + movq 16(%rsp), %rcx + movq 24(%rsp), %r8 + movq 32(%rsp), %r9 + movq 40(%rsp), %r10 + movq 48(%rsp), %r11 + addq $72, %rsp + cfi_adjust_cfa_offset (-72) + jmp .Lret + cfi_endproc + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic +#endif /* SHARED */ + + /* This function is a wrapper for a lazy resolver for TLS_DESC + RELA relocations. The incoming 0(%rsp) points to the caller's + link map, pushed by the dynamic object's internal lazy TLS + resolver front-end before tail-calling us. We need to pop it + ourselves. %rax points to a TLS descriptor, such that 0(%rax) + holds the address of the internal resolver front-end (unless + some other thread beat us to resolving it) and 8(%rax) holds a + pointer to the relocation. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. */ + + .hidden _dl_tlsdesc_resolve_rela + .global _dl_tlsdesc_resolve_rela + .type _dl_tlsdesc_resolve_rela,@function + cfi_startproc + .align 16 + /* The PLT entry will have pushed the link_map pointer. */ +_dl_tlsdesc_resolve_rela: + cfi_adjust_cfa_offset (8) + /* Save all call-clobbered registers. Add 8 bytes for push in + the PLT entry to align the stack. */ + subq $80, %rsp + cfi_adjust_cfa_offset (80) + movq %rax, (%rsp) + movq %rdi, 8(%rsp) + movq %rax, %rdi /* Pass tlsdesc* in %rdi. */ + movq %rsi, 16(%rsp) + movq 80(%rsp), %rsi /* Pass link_map* in %rsi. */ + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %rdx, 56(%rsp) + movq %rcx, 64(%rsp) + call _dl_tlsdesc_resolve_rela_fixup + movq (%rsp), %rax + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 24(%rsp), %r8 + movq 32(%rsp), %r9 + movq 40(%rsp), %r10 + movq 48(%rsp), %r11 + movq 56(%rsp), %rdx + movq 64(%rsp), %rcx + addq $88, %rsp + cfi_adjust_cfa_offset (-88) + jmp *(%rax) + cfi_endproc + .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela + + /* This function is a placeholder for lazy resolving of TLS + relocations. Once some thread starts resolving a TLS + relocation, it sets up the TLS descriptor to use this + resolver, such that other threads that would attempt to + resolve it concurrently may skip the call to the original lazy + resolver and go straight to a condition wait. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. */ + + .hidden _dl_tlsdesc_resolve_hold + .global _dl_tlsdesc_resolve_hold + .type _dl_tlsdesc_resolve_hold,@function + cfi_startproc + .align 16 +_dl_tlsdesc_resolve_hold: +0: + /* Save all call-clobbered registers. */ + subq $72, %rsp + cfi_adjust_cfa_offset (72) + movq %rax, (%rsp) + movq %rdi, 8(%rsp) + movq %rax, %rdi /* Pass tlsdesc* in %rdi. */ + movq %rsi, 16(%rsp) + /* Pass _dl_tlsdesc_resolve_hold's address in %rsi. */ + leaq . 
- _dl_tlsdesc_resolve_hold(%rip), %rsi + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %rdx, 56(%rsp) + movq %rcx, 64(%rsp) + call _dl_tlsdesc_resolve_hold_fixup +1: + movq (%rsp), %rax + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 24(%rsp), %r8 + movq 32(%rsp), %r9 + movq 40(%rsp), %r10 + movq 48(%rsp), %r11 + movq 56(%rsp), %rdx + movq 64(%rsp), %rcx + addq $72, %rsp + cfi_adjust_cfa_offset (-72) + jmp *(%rax) + cfi_endproc + .size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold diff --git a/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.h b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.h new file mode 100644 index 0000000000..14019a2610 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.h @@ -0,0 +1,70 @@ +/* Thread-local storage descriptor handling in the ELF dynamic linker. + x86_64 version. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> + +#ifndef _X86_64_DL_TLSDESC_H +# define _X86_64_DL_TLSDESC_H 1 + +/* Type used to represent a TLS descriptor in the GOT. */ +struct tlsdesc +{ + /* Anonymous union is used here to ensure that GOT entry slot is always + 8 bytes for both x32 and x86-64. */ + union + { + ptrdiff_t (*entry) (struct tlsdesc *on_rax); + uint64_t entry_slot; + }; + union + { + void *arg; + uint64_t arg_slot; + }; +}; + +typedef struct dl_tls_index +{ + uint64_t ti_module; + uint64_t ti_offset; +} tls_index; + +/* Type used as the argument in a TLS descriptor for a symbol that + needs dynamic TLS offsets. */ +struct tlsdesc_dynamic_arg +{ + tls_index tlsinfo; + size_t gen_count; +}; + +extern ptrdiff_t attribute_hidden + _dl_tlsdesc_return(struct tlsdesc *on_rax), + _dl_tlsdesc_undefweak(struct tlsdesc *on_rax), + _dl_tlsdesc_resolve_rela(struct tlsdesc *on_rax), + _dl_tlsdesc_resolve_hold(struct tlsdesc *on_rax); + +# ifdef SHARED +extern void *_dl_make_tlsdesc_dynamic (struct link_map *map, + size_t ti_offset) + internal_function attribute_hidden; + +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic(struct tlsdesc *); +# endif + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/dl-trampoline.S b/REORG.TODO/sysdeps/x86_64/dl-trampoline.S new file mode 100644 index 0000000000..c14c61aa58 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-trampoline.S @@ -0,0 +1,147 @@ +/* PLT trampolines. x86-64 version. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
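
A detail of struct tlsdesc worth making explicit: the anonymous unions pin both GOT slots at 8 bytes even on x32, where raw pointers are only 4 bytes wide. A local mirror of the layout with that invariant as compile-time checks (C11; the mirror is not the real declaration):

    #include <stddef.h>
    #include <stdint.h>

    struct tlsdesc_mirror
    {
      union { ptrdiff_t (*entry) (void *); uint64_t entry_slot; };
      union { void *arg; uint64_t arg_slot; };
    };

    _Static_assert (sizeof (struct tlsdesc_mirror) == 16,
		    "TLS descriptor must occupy two 8-byte GOT slots");
    _Static_assert (offsetof (struct tlsdesc_mirror, arg_slot) == 8,
		    "arg must live in the second slot");

    int main (void) { return 0; }
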
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <cpu-features.h> +#include <link-defines.h> + +#ifndef DL_STACK_ALIGNMENT +/* Due to GCC bug: + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 + + __tls_get_addr may be called with 8-byte stack alignment. Although + this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume + that stack will be always aligned at 16 bytes. We use unaligned + 16-byte move to load and store SSE registers, which has no penalty + on modern processors if stack is 16-byte aligned. */ +# define DL_STACK_ALIGNMENT 8 +#endif + +#ifndef DL_RUNTIME_UNALIGNED_VEC_SIZE +/* The maximum size in bytes of unaligned vector load and store in the + dynamic linker. Since SSE optimized memory/string functions with + aligned SSE register load and store are used in the dynamic linker, + we must set this to 8 so that _dl_runtime_resolve_sse will align the + stack before calling _dl_fixup. */ +# define DL_RUNTIME_UNALIGNED_VEC_SIZE 8 +#endif + +/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */ +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ + (VEC_SIZE > DL_STACK_ALIGNMENT \ + && VEC_SIZE > DL_RUNTIME_UNALIGNED_VEC_SIZE) + +/* Align vector register save area to 16 bytes. */ +#define REGISTER_SAVE_VEC_OFF 0 + +/* Area on stack to save and restore registers used for parameter + passing when calling _dl_fixup. */ +#ifdef __ILP32__ +# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) +# define PRESERVE_BND_REGS_PREFIX +#else +/* Align bound register save area to 16 bytes. 
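
With the default 8-byte values above, DL_RUNTIME_RESOLVE_REALIGN_STACK evaluates true for all three instantiated vector sizes; it would only drop to false if DL_RUNTIME_UNALIGNED_VEC_SIZE were raised to permit unaligned saves of the given width. The predicate, evaluated directly:

    #include <stdio.h>

    #define DL_STACK_ALIGNMENT 8
    #define DL_RUNTIME_UNALIGNED_VEC_SIZE 8
    #define REALIGN(vec) ((vec) > DL_STACK_ALIGNMENT \
			  && (vec) > DL_RUNTIME_UNALIGNED_VEC_SIZE)

    int
    main (void)
    {
      int sizes[] = { 16, 32, 64 };	/* SSE, AVX, AVX-512 */
      for (int i = 0; i < 3; ++i)
	printf ("VEC_SIZE=%d -> realign=%d\n", sizes[i], REALIGN (sizes[i]));
      return 0;	/* prints 1 for all three with the defaults */
    }
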
*/ +# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) +# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16) +# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16) +# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16) +# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16) +# ifdef HAVE_MPX_SUPPORT +# define PRESERVE_BND_REGS_PREFIX bnd +# else +# define PRESERVE_BND_REGS_PREFIX .byte 0xf2 +# endif +#endif +#define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8) +#define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) +#define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8) +#define REGISTER_SAVE_RDI (REGISTER_SAVE_RSI + 8) +#define REGISTER_SAVE_R8 (REGISTER_SAVE_RDI + 8) +#define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8) + +#define RESTORE_AVX + +#define VEC_SIZE 64 +#define VMOVA vmovdqa64 +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa64 +#else +# define VMOV vmovdqu64 +#endif +#define VEC(i) zmm##i +#define _dl_runtime_resolve _dl_runtime_resolve_avx512 +#define _dl_runtime_profile _dl_runtime_profile_avx512 +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_profile +#undef VEC +#undef VMOV +#undef VMOVA +#undef VEC_SIZE + +#define VEC_SIZE 32 +#define VMOVA vmovdqa +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa +#else +# define VMOV vmovdqu +#endif +#define VEC(i) ymm##i +#define _dl_runtime_resolve _dl_runtime_resolve_avx +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx_opt +#define _dl_runtime_profile _dl_runtime_profile_avx +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_resolve_opt +#undef _dl_runtime_profile +#undef VEC +#undef VMOV +#undef VMOVA +#undef VEC_SIZE + +/* movaps/movups is 1-byte shorter. */ +#define VEC_SIZE 16 +#define VMOVA movaps +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV movaps +#else +# define VMOV movups +#endif +#define VEC(i) xmm##i +#define _dl_runtime_resolve _dl_runtime_resolve_sse +#define _dl_runtime_profile _dl_runtime_profile_sse +#undef RESTORE_AVX +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_profile +#undef VMOV +#undef VMOVA + +/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt + to preserve the full vector registers with zero upper bits. */ +#define VMOVA vmovdqa +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa +#else +# define VMOV vmovdqu +#endif +#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt +#include "dl-trampoline.h" diff --git a/REORG.TODO/sysdeps/x86_64/dl-trampoline.h b/REORG.TODO/sysdeps/x86_64/dl-trampoline.h new file mode 100644 index 0000000000..8db24c16ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-trampoline.h @@ -0,0 +1,647 @@ +/* PLT trampolines. x86-64 version. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
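
The offset macros above chain together, so the whole save-area layout follows from VEC_SIZE. Reproducing the arithmetic for the AVX-512 instantiation (VEC_SIZE = 64, non-x32, so four 16-byte bound-register slots precede the GPRs):

    #include <stdio.h>

    int
    main (void)
    {
      int vec_size = 64;
      int bnd0 = 0 + vec_size * 8;	/* REGISTER_SAVE_VEC_OFF == 0 */
      int rax = bnd0 + 16 * 4;		/* after BND0..BND3 */
      int rcx = rax + 8, rdx = rcx + 8, rsi = rdx + 8;
      int rdi = rsi + 8, r8 = rdi + 8, r9 = r8 + 8;
      /* 8 * 7 GPRs + 16 * 4 bounds + 64 * 8 vectors = 632 raw bytes.  */
      printf ("BND0=%d RAX=%d R9=%d raw total=%d\n", bnd0, rax, r9, r9 + 8);
      return 0;
    }
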
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#undef REGISTER_SAVE_AREA_RAW +#ifdef __ILP32__ +/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to + VEC7. */ +# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8) +#else +/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as + BND0, BND1, BND2, BND3 and VEC0 to VEC7. */ +# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8) +#endif + +#undef REGISTER_SAVE_AREA +#undef LOCAL_STORAGE_AREA +#undef BASE +#if DL_RUNTIME_RESOLVE_REALIGN_STACK +# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8) +/* Local stack area before jumping to function address: RBX. */ +# define LOCAL_STORAGE_AREA 8 +# define BASE rbx +# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0 +# error REGISTER_SAVE_AREA must be multples of VEC_SIZE +# endif +#else +# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW +/* Local stack area before jumping to function address: All saved + registers. */ +# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA +# define BASE rsp +# if (REGISTER_SAVE_AREA % 16) != 8 +# error REGISTER_SAVE_AREA must be odd multples of 8 +# endif +#endif + + .text +#ifdef _dl_runtime_resolve_opt +/* Use the smallest vector registers to preserve the full YMM/ZMM + registers to avoid SSE transition penalty. */ + +# if VEC_SIZE == 32 +/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero + and preserve %xmm0 - %xmm7 registers with the zero upper bits. Since + there is no SSE transition penalty on AVX512 processors which don't + support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't + provided. */ + .globl _dl_runtime_resolve_avx_slow + .hidden _dl_runtime_resolve_avx_slow + .type _dl_runtime_resolve_avx_slow, @function + .align 16 +_dl_runtime_resolve_avx_slow: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + vorpd %ymm0, %ymm1, %ymm8 + vorpd %ymm2, %ymm3, %ymm9 + vorpd %ymm4, %ymm5, %ymm10 + vorpd %ymm6, %ymm7, %ymm11 + vorpd %ymm8, %ymm9, %ymm9 + vorpd %ymm10, %ymm11, %ymm10 + vpcmpeqd %xmm8, %xmm8, %xmm8 + vorpd %ymm9, %ymm10, %ymm10 + vptest %ymm10, %ymm8 + # Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any + # %ymm0 - %ymm7 registers aren't zero. + PRESERVE_BND_REGS_PREFIX + jnc _dl_runtime_resolve_avx + # Use vzeroupper to avoid SSE transition penalty. + vzeroupper + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits + # when the upper 128 bits of %ymm0 - %ymm7 registers are zero. + PRESERVE_BND_REGS_PREFIX + jmp _dl_runtime_resolve_sse_vex + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow +# endif + +/* Use XGETBV with ECX == 1 to check which bits in vector registers are + non-zero and only preserve the non-zero lower bits with zero upper + bits. 
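
The heart of _dl_runtime_resolve_avx_slow above is a vptest asking "are any upper 128 bits live?". The same test for a single register, written with intrinsics (compile with -mavx2; the assembly first ORs all eight argument registers together before one combined test):

    #include <immintrin.h>
    #include <stdio.h>

    static int
    upper_bits_in_use (__m256i v)
    {
      __m128i hi = _mm256_extracti128_si256 (v, 1);	/* upper 128 bits */
      return !_mm_test_all_zeros (hi, hi);		/* SSE4.1 ptest */
    }

    int
    main (void)
    {
      __m256i zero_upper = _mm256_set_m128i (_mm_setzero_si128 (),
					     _mm_set1_epi32 (-1));
      __m256i full = _mm256_set1_epi32 (-1);
      printf ("%d %d\n", upper_bits_in_use (zero_upper),
	      upper_bits_in_use (full));	/* prints 0 1 */
      return 0;
    }
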
*/ + .globl _dl_runtime_resolve_opt + .hidden _dl_runtime_resolve_opt + .type _dl_runtime_resolve_opt, @function + .align 16 +_dl_runtime_resolve_opt: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + pushq %rax + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rax, 0) + pushq %rcx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rcx, 0) + pushq %rdx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rdx, 0) + movl $1, %ecx + xgetbv + movl %eax, %r11d + popq %rdx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rdx) + popq %rcx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rcx) + popq %rax + cfi_adjust_cfa_offset(-8) + cfi_restore (%rax) +# if VEC_SIZE == 32 + # For YMM registers, check if YMM state is in use. + andl $bit_YMM_state, %r11d + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if + # YMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + jz _dl_runtime_resolve_sse_vex +# elif VEC_SIZE == 16 + # For ZMM registers, check if YMM state and ZMM state are in + # use. + andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d + cmpl $bit_YMM_state, %r11d + # Preserve %zmm0 - %zmm7 registers if ZMM state is in use. + PRESERVE_BND_REGS_PREFIX + jg _dl_runtime_resolve_avx512 + # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if + # ZMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + je _dl_runtime_resolve_avx + # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if + # neither YMM state nor ZMM state are in use. +# else +# error Unsupported VEC_SIZE! +# endif + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt +#endif + .globl _dl_runtime_resolve + .hidden _dl_runtime_resolve + .type _dl_runtime_resolve, @function + .align 16 + cfi_startproc +_dl_runtime_resolve: + cfi_adjust_cfa_offset(16) # Incorporate PLT +#if DL_RUNTIME_RESOLVE_REALIGN_STACK +# if LOCAL_STORAGE_AREA != 8 +# error LOCAL_STORAGE_AREA must be 8 +# endif + pushq %rbx # push subtracts stack by 8. + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rbx, 0) + mov %RSP_LP, %RBX_LP + cfi_def_cfa_register(%rbx) + and $-VEC_SIZE, %RSP_LP +#endif + sub $REGISTER_SAVE_AREA, %RSP_LP +#if !DL_RUNTIME_RESOLVE_REALIGN_STACK + cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) +#endif + # Preserve registers otherwise clobbered. + movq %rax, REGISTER_SAVE_RAX(%rsp) + movq %rcx, REGISTER_SAVE_RCX(%rsp) + movq %rdx, REGISTER_SAVE_RDX(%rsp) + movq %rsi, REGISTER_SAVE_RSI(%rsp) + movq %rdi, REGISTER_SAVE_RDI(%rsp) + movq %r8, REGISTER_SAVE_R8(%rsp) + movq %r9, REGISTER_SAVE_R9(%rsp) + VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp) + VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp) + VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp) + VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp) + VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp) + VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp) + VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp) + VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp) +#ifndef __ILP32__ + # We also have to preserve bound registers. These are nops if + # Intel MPX isn't available or disabled. 
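
The XGETBV(ECX = 1) dispatch above reduces to a small decision function over the XINUSE mask. The bit values below follow the XSAVE state-component numbering (YMM is component 2, the ZMM0-15 high halves component 6) as mirrored from glibc's cpu-features.h; treat them as assumptions, not checked here:

    #include <stdio.h>

    #define bit_YMM_state     (1 << 2)
    #define bit_ZMM0_15_state (1 << 6)

    static const char *
    pick_resolver (unsigned int xinuse)
    {
      unsigned int masked = xinuse & (bit_YMM_state | bit_ZMM0_15_state);
      if (masked > bit_YMM_state)
	return "_dl_runtime_resolve_avx512";	/* ZMM state live */
      if (masked == bit_YMM_state)
	return "_dl_runtime_resolve_avx";	/* only YMM state live */
      return "_dl_runtime_resolve_sse_vex";	/* neither in use */
    }

    int
    main (void)
    {
      printf ("%s\n", pick_resolver (0));
      printf ("%s\n", pick_resolver (bit_YMM_state));
      printf ("%s\n", pick_resolver (bit_YMM_state | bit_ZMM0_15_state));
      return 0;
    }
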
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
+	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
+	call _dl_fixup		# Call resolver.
+	mov %RAX_LP, %R11_LP	# Save return value.
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	mov %RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq (%rsp), %rbx
+	cfi_restore(%rbx)
+#endif
+	# Adjust stack (PLT did 2 pushes).
+	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
+	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
+   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
+   But we don't need another _dl_runtime_profile for XMM registers.  */
+#if !defined PROF && defined _dl_runtime_profile
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
+#  error LR_VECTOR_OFFSET must be a multiple of VEC_SIZE
+# endif
+
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth parameter must be VEC_SIZE-byte aligned.  This must
+	   be explicitly enforced.  We have to set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	sub $32, %RSP_LP	# Allocate the local storage.
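
Stripped of all the register juggling, the lazy-binding contract the resolver trampoline implements is: the first call lands in a stub, the stub computes the target and patches the slot, and every later call is direct. The function-pointer analogue in portable C:

    #include <stdio.h>

    static int resolver_stub (int x);

    /* The "GOT slot": starts out pointing at the stub.  */
    static int (*slot) (int) = resolver_stub;

    static int target (int x) { return x + 1; }

    static int
    resolver_stub (int x)
    {
      slot = target;	/* what _dl_fixup's return value accomplishes */
      return slot (x);	/* the 'jmp *%r11' */
    }

    int
    main (void)
    {
      printf ("%d\n", slot (1));	/* resolves, then calls */
      printf ("%d\n", slot (2));	/* direct */
      return 0;
    }
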
+ cfi_adjust_cfa_offset(32) + movq %rbx, (%rsp) + cfi_rel_offset(%rbx, 0) + + /* On the stack: + 56(%rbx) parameter #1 + 48(%rbx) return address + + 40(%rbx) reloc index + 32(%rbx) link_map + + 24(%rbx) La_x86_64_regs pointer + 16(%rbx) framesize + 8(%rbx) rax + (%rbx) rbx + */ + + movq %rax, 8(%rsp) + mov %RSP_LP, %RBX_LP + cfi_def_cfa_register(%rbx) + + /* Actively align the La_x86_64_regs structure. */ + and $-VEC_SIZE, %RSP_LP + /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers + to detect if any xmm0-xmm7 registers are changed by audit + module. */ + sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP + movq %rsp, 24(%rbx) + + /* Fill the La_x86_64_regs structure. */ + movq %rdx, LR_RDX_OFFSET(%rsp) + movq %r8, LR_R8_OFFSET(%rsp) + movq %r9, LR_R9_OFFSET(%rsp) + movq %rcx, LR_RCX_OFFSET(%rsp) + movq %rsi, LR_RSI_OFFSET(%rsp) + movq %rdi, LR_RDI_OFFSET(%rsp) + movq %rbp, LR_RBP_OFFSET(%rsp) + + lea 48(%rbx), %RAX_LP + movq %rax, LR_RSP_OFFSET(%rsp) + + /* We always store the XMM registers even if AVX is available. + This is to provide backward binary compatibility for existing + audit modules. */ + movaps %xmm0, (LR_XMM_OFFSET)(%rsp) + movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov %bnd0, (LR_BND_OFFSET)(%rsp) # Preserve bound + bndmov %bnd1, (LR_BND_OFFSET + BND_SIZE)(%rsp) # registers. Nops if + bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp) # MPX not available + bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp) # or disabled. +# else + .byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET) + .byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE) + .byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2) + .byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3) +# endif +# endif + +# ifdef RESTORE_AVX + /* This is to support AVX audit modules. */ + VMOVA %VEC(0), (LR_VECTOR_OFFSET)(%rsp) + VMOVA %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + + /* Save xmm0-xmm7 registers to detect if any of them are + changed by audit module. */ + vmovdqa %xmm0, (LR_SIZE)(%rsp) + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) + vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) + vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) + vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) + vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) + vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) + vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) +# endif + + mov %RSP_LP, %RCX_LP # La_x86_64_regs pointer to %rcx. + mov 48(%rbx), %RDX_LP # Load return address if needed. + mov 40(%rbx), %RSI_LP # Copy args pushed by PLT in register. + mov 32(%rbx), %RDI_LP # %rdi: link_map, %rsi: reloc_index + lea 16(%rbx), %R8_LP # Address of framesize + call _dl_profile_fixup # Call resolver. + + mov %RAX_LP, %R11_LP # Save return value. + + movq 8(%rbx), %rax # Get back register content. 
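
The framesize protocol visible above (lea 16(%rbx), %r8 passes the framesize address to _dl_profile_fixup) is driven by the audit module's pltenter hook: writing a non-negative framesize requests the post-return _dl_call_pltexit path. A minimal sketch of such a module; the la_x86_64_gnu_pltenter name matches ARCH_LA_PLTENTER from dl-machine.h, but flag handling and error checking are elided:

    /* Build: gcc -shared -fPIC audit.c -o audit.so
       Run:   LD_AUDIT=./audit.so ./some-program  */
    #define _GNU_SOURCE
    #include <link.h>
    #include <stdio.h>
    #include <string.h>

    unsigned int
    la_version (unsigned int version)
    {
      return version;
    }

    unsigned int
    la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie)
    {
      /* Ask for PLT tracking in both directions.  */
      return LA_FLG_BINDTO | LA_FLG_BINDFROM;
    }

    Elf64_Addr
    la_x86_64_gnu_pltenter (Elf64_Sym *sym, unsigned int ndx,
			    uintptr_t *refcook, uintptr_t *defcook,
			    La_x86_64_regs *regs, unsigned int *flags,
			    const char *symname, long int *framesizep)
    {
      if (strcmp (symname, "malloc") == 0)
	{
	  fprintf (stderr, "pltenter: %s(%lu)\n", symname,
		   (unsigned long) regs->lr_rdi);
	  *framesizep = 128;	/* request the pltexit call */
	}
      return sym->st_value;
    }
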
+ movq LR_RDX_OFFSET(%rsp), %rdx + movq LR_R8_OFFSET(%rsp), %r8 + movq LR_R9_OFFSET(%rsp), %r9 + + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + +# ifdef RESTORE_AVX + /* Check if any xmm0-xmm7 registers are changed by audit + module. */ + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0) + vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1) + vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2) + vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3) + vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4) + vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5) + vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6) + vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7) + vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +1: +# endif + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov (LR_BND_OFFSET)(%rsp), %bnd0 # Restore bound + bndmov (LR_BND_OFFSET + BND_SIZE)(%rsp), %bnd1 # registers. + bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2 + bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3 +# else + .byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET) + .byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE) + .byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2) + .byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3) +# endif +# endif + + mov 16(%rbx), %R10_LP # Anything in framesize? 
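
Each comparison block above is the same idiom: compare the saved copy against the live register with pcmpeqq, collapse with pmovmskb, and test for 0xffff (all 16 byte lanes equal). One round of it with intrinsics (SSE4.1; the assembly uses the VEX-encoded vpcmpeqq):

    #include <smmintrin.h>
    #include <stdio.h>

    /* All 16 byte-compare bits set means the register still equals
       its saved copy.  */
    static int
    unchanged (__m128i saved, __m128i now)
    {
      __m128i eq = _mm_cmpeq_epi64 (saved, now);
      return _mm_movemask_epi8 (eq) == 0xffff;
    }

    int
    main (void)
    {
      __m128i a = _mm_set_epi64x (1, 2);
      __m128i b = _mm_set_epi64x (1, 3);
      printf ("%d %d\n", unchanged (a, a), unchanged (a, b)); /* 1 0 */
      return 0;
    }
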
+ test %R10_LP, %R10_LP + PRESERVE_BND_REGS_PREFIX + jns 3f + + /* There's nothing in the frame size, so there + will be no call to the _dl_call_pltexit. */ + + /* Get back registers content. */ + movq LR_RCX_OFFSET(%rsp), %rcx + movq LR_RSI_OFFSET(%rsp), %rsi + movq LR_RDI_OFFSET(%rsp), %rdi + + mov %RBX_LP, %RSP_LP + movq (%rsp), %rbx + cfi_restore(%rbx) + cfi_def_cfa_register(%rsp) + + add $48, %RSP_LP # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + PRESERVE_BND_REGS_PREFIX + jmp *%r11 # Jump to function address. + +3: + cfi_adjust_cfa_offset(48) + cfi_rel_offset(%rbx, 0) + cfi_def_cfa_register(%rbx) + + /* At this point we need to prepare new stack for the function + which has to be called. We copy the original stack to a + temporary buffer of the size specified by the 'framesize' + returned from _dl_profile_fixup */ + + lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack + add $8, %R10_LP + and $-16, %R10_LP + mov %R10_LP, %RCX_LP + sub %R10_LP, %RSP_LP + mov %RSP_LP, %RDI_LP + shr $3, %RCX_LP + rep + movsq + + movq 24(%rdi), %rcx # Get back register content. + movq 32(%rdi), %rsi + movq 40(%rdi), %rdi + + PRESERVE_BND_REGS_PREFIX + call *%r11 + + mov 24(%rbx), %RSP_LP # Drop the copied stack content + + /* Now we have to prepare the La_x86_64_retval structure for the + _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, + so we just need to allocate the sizeof(La_x86_64_retval) space on + the stack, since the alignment has already been taken care of. */ +# ifdef RESTORE_AVX + /* sizeof(La_x86_64_retval). Need extra space for 2 SSE + registers to detect if xmm0/xmm1 registers are changed + by audit module. */ + sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP +# else + sub $LRV_SIZE, %RSP_LP # sizeof(La_x86_64_retval) +# endif + mov %RSP_LP, %RCX_LP # La_x86_64_retval argument to %rcx. + + /* Fill in the La_x86_64_retval structure. */ + movq %rax, LRV_RAX_OFFSET(%rcx) + movq %rdx, LRV_RDX_OFFSET(%rcx) + + movaps %xmm0, LRV_XMM0_OFFSET(%rcx) + movaps %xmm1, LRV_XMM1_OFFSET(%rcx) + +# ifdef RESTORE_AVX + /* This is to support AVX audit modules. */ + VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx) + VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx) + + /* Save xmm0/xmm1 registers to detect if they are changed + by audit module. */ + vmovdqa %xmm0, (LRV_SIZE)(%rcx) + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) +# endif + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov %bnd0, LRV_BND0_OFFSET(%rcx) # Preserve returned bounds. + bndmov %bnd1, LRV_BND1_OFFSET(%rcx) +# else + .byte 0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET) + .byte 0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET) +# endif +# endif + + fstpt LRV_ST0_OFFSET(%rcx) + fstpt LRV_ST1_OFFSET(%rcx) + + movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. + movq 40(%rbx), %rsi # Copy args pushed by PLT in register. + movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index + call _dl_call_pltexit + + /* Restore return registers. */ + movq LRV_RAX_OFFSET(%rsp), %rax + movq LRV_RDX_OFFSET(%rsp), %rdx + + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 + +# ifdef RESTORE_AVX + /* Check if xmm0/xmm1 registers are changed by audit module. 
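
The frame copy above sizes the temporary stack with "add $8; and $-16": add room for a return address, then mask down to a 16-byte multiple so the copied frame keeps the ABI stack alignment. The arithmetic by itself:

    #include <stdio.h>

    /* 'add $8, %r10; and $-16, %r10' from the copy path above.  */
    static long
    frame_alloc (long framesize)
    {
      return (framesize + 8) & -16;
    }

    int
    main (void)
    {
      long sizes[] = { 0, 8, 100, 128 };
      for (int i = 0; i < 4; ++i)
	printf ("%ld -> %ld\n", sizes[i], frame_alloc (sizes[i]));
      return 0;	/* 0->0, 8->16, 100->96, 128->128 */
    }
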
*/ + vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + jne 1f + VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0) + +1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + jne 1f + VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1) + +1: +# endif + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov LRV_BND0_OFFSET(%rsp), %bnd0 # Restore bound registers. + bndmov LRV_BND1_OFFSET(%rsp), %bnd1 +# else + .byte 0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET) + .byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET) +# endif +# endif + + fldt LRV_ST1_OFFSET(%rsp) + fldt LRV_ST0_OFFSET(%rsp) + + mov %RBX_LP, %RSP_LP + movq (%rsp), %rbx + cfi_restore(%rbx) + cfi_def_cfa_register(%rsp) + + add $48, %RSP_LP # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + PRESERVE_BND_REGS_PREFIX + retq + + cfi_endproc + .size _dl_runtime_profile, .-_dl_runtime_profile +#endif diff --git a/REORG.TODO/sysdeps/x86_64/ffs.c b/REORG.TODO/sysdeps/x86_64/ffs.c new file mode 100644 index 0000000000..132812c488 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ffs.c @@ -0,0 +1,39 @@ +/* ffs -- find first set bit in a word, counted from least significant end. + For AMD x86-64. + This file is part of the GNU C Library. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <string.h> + +#undef ffs + +int +__ffs (int x) +{ + int cnt; + int tmp; + + asm ("bsfl %2,%0\n" /* Count low bits in X and store in %1. */ + "cmovel %1,%0\n" /* If number was zero, use -1 as result. */ + : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1)); + + return cnt + 1; +} +weak_alias (__ffs, ffs) +libc_hidden_def (__ffs) +libc_hidden_builtin_def (ffs) diff --git a/REORG.TODO/sysdeps/x86_64/ffsll.c b/REORG.TODO/sysdeps/x86_64/ffsll.c new file mode 100644 index 0000000000..47111ce61b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ffsll.c @@ -0,0 +1,42 @@ +/* ffsll -- find first set bit in a word, counted from least significant end. + For AMD x86-64. + This file is part of the GNU C Library. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
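
The bsf/cmov pair in __ffs above computes the index of the least-significant set bit counting from 1, with the cmov substituting -1 on zero input so the final +1 yields 0. A portable reference using the GCC/Clang ctz builtin:

    #include <limits.h>
    #include <stdio.h>
    #include <strings.h>

    static int
    ffs_ref (int x)
    {
      /* __builtin_ctz is undefined for 0, so guard it explicitly.  */
      return x == 0 ? 0 : __builtin_ctz ((unsigned int) x) + 1;
    }

    int
    main (void)
    {
      int probes[] = { 0, 1, 8, INT_MIN };
      for (int i = 0; i < 4; ++i)
	printf ("ffs(%d) = %d (libc: %d)\n", probes[i],
		ffs_ref (probes[i]), ffs (probes[i]));
      return 0;
    }
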
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef ffsll
+
+int
+ffsll (long long int x)
+{
+ long long int cnt;
+ long long int tmp;
+
+ asm ("bsfq %2,%0\n" /* Count low bits in X and store in %0. */
+ "cmoveq %1,%0\n" /* If number was zero, use -1 as result. */
+ : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1));
+
+ return cnt + 1;
+}
+
+#ifndef __ILP32__
+#undef ffsl
+weak_alias (ffsll, ffsl)
+#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/Implies b/REORG.TODO/sysdeps/x86_64/fpu/Implies new file mode 100644 index 0000000000..2b745a34fb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/Implies @@ -0,0 +1 @@ +x86/fpu diff --git a/REORG.TODO/sysdeps/x86_64/fpu/Makefile b/REORG.TODO/sysdeps/x86_64/fpu/Makefile new file mode 100644 index 0000000000..2b7d69bb50 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/Makefile @@ -0,0 +1,239 @@ +ifeq ($(subdir),mathvec)
+libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
+ svml_d_cos4_core svml_d_cos8_core \
+ svml_d_sin2_core svml_d_sin4_core_avx \
+ svml_d_sin4_core svml_d_sin8_core svml_d_trig_data \
+ svml_s_cosf4_core svml_s_cosf8_core_avx \
+ svml_s_cosf8_core svml_s_cosf16_core svml_s_trig_data \
+ svml_s_sinf4_core svml_s_sinf8_core_avx \
+ svml_s_sinf8_core svml_s_sinf16_core \
+ svml_d_sincos2_core svml_d_sincos4_core_avx \
+ svml_d_sincos4_core svml_d_sincos8_core \
+ svml_d_log2_core svml_d_log4_core_avx svml_d_log4_core \
+ svml_d_log8_core svml_d_log_data svml_s_logf4_core \
+ svml_s_logf8_core_avx svml_s_logf8_core svml_s_logf16_core \
+ svml_s_logf_data svml_d_exp2_core svml_d_exp4_core_avx \
+ svml_d_exp4_core svml_d_exp8_core svml_d_exp_data \
+ svml_s_expf4_core svml_s_expf8_core_avx svml_s_expf8_core \
+ svml_s_expf16_core svml_s_expf_data svml_d_pow2_core \
+ svml_d_pow4_core_avx svml_d_pow4_core svml_d_pow8_core \
+ svml_d_pow_data svml_s_powf4_core svml_s_powf8_core_avx \
+ svml_s_powf8_core svml_s_powf16_core svml_s_powf_data \
+ svml_s_sincosf4_core svml_s_sincosf8_core_avx \
+ svml_s_sincosf8_core svml_s_sincosf16_core svml_finite_alias
+
+libmvec-static-only-routines = svml_finite_alias
+endif
+
+# Variables for libmvec tests. 
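Both ffs.c and ffsll.c above rely on the same trick: bsf sets ZF when its source is zero (leaving the destination undefined), the cmove then substitutes the preloaded -1, and the final "+ 1" turns the 0-based bit index into the 1-based result, or 0 for a zero argument. A minimal caller, as a sketch only (not part of this patch) that assumes nothing beyond the documented ffs/ffsll contract:

    /* Sketch: exercises the ffs/ffsll contract implemented above.
       Build (assumption): gcc -O2 demo-ffs.c && ./a.out  */
    #define _GNU_SOURCE		/* glibc declares ffsll in <string.h>.  */
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      printf ("%d\n", ffs (0));		/* 0: no bit set.  */
      printf ("%d\n", ffs (1));		/* 1: bit 0 is set.  */
      printf ("%d\n", ffs (0x8000));	/* 16: bit 15 is set.  */
      printf ("%d\n", ffsll (1LL << 40));	/* 41: bit 40 is set.  */
      return 0;
    }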
+ifeq ($(subdir),math) +ifeq ($(build-mathvec),yes) +libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2 \ + float-vlen4 float-vlen8 float-vlen8-avx2 +tests += test-double-libmvec-alias test-double-libmvec-alias-avx \ + test-double-libmvec-alias-avx2 test-double-libmvec-alias-main \ + test-double-libmvec-alias-avx-main test-double-libmvec-alias-avx2-main \ + test-float-libmvec-alias test-float-libmvec-alias-avx \ + test-float-libmvec-alias-avx2 test-float-libmvec-alias-main \ + test-float-libmvec-alias-avx-main test-float-libmvec-alias-avx2-main \ + test-double-libmvec-sincos test-double-libmvec-sincos-avx \ + test-double-libmvec-sincos-avx2 test-float-libmvec-sincosf \ + test-float-libmvec-sincosf-avx test-float-libmvec-sincosf-avx2 +modules-names += test-double-libmvec-alias-mod \ + test-double-libmvec-alias-avx-mod \ + test-double-libmvec-alias-avx2-mod \ + test-float-libmvec-alias-mod \ + test-float-libmvec-alias-avx-mod \ + test-float-libmvec-alias-avx2-mod +modules-names-tests += test-double-libmvec-alias-mod \ + test-double-libmvec-alias-avx-mod \ + test-double-libmvec-alias-avx2-mod \ + test-float-libmvec-alias-mod \ + test-float-libmvec-alias-avx-mod \ + test-float-libmvec-alias-avx2-mod +extra-test-objs += test-double-libmvec-sincos-avx-main.o \ + test-double-libmvec-sincos-avx2-main.o \ + test-double-libmvec-sincos-main.o \ + test-float-libmvec-sincosf-avx-main.o \ + test-float-libmvec-sincosf-avx2-main.o\ + test-float-libmvec-sincosf-main.o +test-double-libmvec-alias-mod.so-no-z-defs = yes +test-double-libmvec-alias-avx-mod.so-no-z-defs = yes +test-double-libmvec-alias-avx2-mod.so-no-z-defs = yes +test-float-libmvec-alias-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx2-mod.so-no-z-defs = yes + +$(objpfx)test-double-libmvec-alias: \ + $(objpfx)test-double-libmvec-alias-mod.so +$(objpfx)test-double-libmvec-alias-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx: \ + $(objpfx)test-double-libmvec-alias-avx-mod.so +$(objpfx)test-double-libmvec-alias-avx-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx2: \ + $(objpfx)test-double-libmvec-alias-avx2-mod.so +$(objpfx)test-double-libmvec-alias-avx2-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-main: \ + $(objpfx)test-double-libmvec-alias-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx-main: \ + $(objpfx)test-double-libmvec-alias-avx-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx2-main: \ + $(objpfx)test-double-libmvec-alias-avx2-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias: \ + $(objpfx)test-float-libmvec-alias-mod.so +$(objpfx)test-float-libmvec-alias-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx: \ + $(objpfx)test-float-libmvec-alias-avx-mod.so +$(objpfx)test-float-libmvec-alias-avx-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx2: \ + $(objpfx)test-float-libmvec-alias-avx2-mod.so +$(objpfx)test-float-libmvec-alias-avx2-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-main: \ + $(objpfx)test-float-libmvec-alias-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + 
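The rules above link the alias and sincos tests against libmvec so that vectorized calls resolve at load time. Outside this harness, such calls normally arise from GCC auto-vectorization: with fast-math semantics plus OpenMP SIMD (the same ingredients as the libmvec-sincos-cflags defined further down in this Makefile), a scalar loop over sin may be compiled into calls to vector kernels such as _ZGVbN2v_sin or _ZGVdN4v_sin from the Versions list below. A stand-alone sketch, under the assumption of a libmvec-enabled glibc and a build line roughly like gcc -O2 -ffast-math -fopenmp-simd demo-mvec.c -lmvec -lm:

    /* Sketch only: whether the loop actually becomes _ZGV* calls
       depends on the compiler version and target flags; the results
       are the same either way.  */
    #include <math.h>
    #include <stdio.h>

    #define N 1024
    static double in[N], out[N];

    int
    main (void)
    {
      for (int i = 0; i < N; i++)
        in[i] = i * 0.001;

    #pragma omp simd
      for (int i = 0; i < N; i++)
        out[i] = sin (in[i]);	/* candidate for vectorization */

      printf ("%.6f\n", out[N - 1]);
      return 0;
    }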
+$(objpfx)test-float-libmvec-alias-avx-main: \ + $(objpfx)test-float-libmvec-alias-avx-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx2-main: \ + $(objpfx)test-float-libmvec-alias-avx2-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-sincos: \ + $(objpfx)test-double-libmvec-sincos.o \ + $(objpfx)test-double-libmvec-sincos-main.o $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx: \ + $(objpfx)test-double-libmvec-sincos-avx.o \ + $(objpfx)test-double-libmvec-sincos-avx-main.o $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx2: \ + $(objpfx)test-double-libmvec-sincos-avx2.o \ + $(objpfx)test-double-libmvec-sincos-avx2-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf: \ + $(objpfx)test-float-libmvec-sincosf.o \ + $(objpfx)test-float-libmvec-sincosf-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx: \ + $(objpfx)test-float-libmvec-sincosf-avx.o \ + $(objpfx)test-float-libmvec-sincosf-avx-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx2: \ + $(objpfx)test-float-libmvec-sincosf-avx2.o \ + $(objpfx)test-float-libmvec-sincosf-avx2-main.o $(libmvec) + +ifeq (yes,$(config-cflags-avx512)) +libmvec-tests += double-vlen8 float-vlen16 +tests += test-double-libmvec-alias-avx512 \ + test-float-libmvec-alias-avx512 \ + test-double-libmvec-alias-avx512-main \ + test-float-libmvec-alias-avx512-main \ + test-double-libmvec-sincos-avx512 \ + test-float-libmvec-sincosf-avx512 +modules-names += test-double-libmvec-alias-avx512-mod \ + test-float-libmvec-alias-avx512-mod +modules-names-tests += test-double-libmvec-alias-avx512-mod \ + test-float-libmvec-alias-avx512-mod +extra-test-objs += test-double-libmvec-sincos-avx512-main.o \ + test-float-libmvec-sincosf-avx512-main.o +test-double-libmvec-alias-avx512-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx512-mod.so-no-z-defs = yes + +$(objpfx)test-double-libmvec-alias-avx512: \ + $(objpfx)test-double-libmvec-alias-avx512-mod.so +$(objpfx)test-double-libmvec-alias-avx512-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx512-main: \ + $(objpfx)test-double-libmvec-alias-avx512-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx512: \ + $(objpfx)test-float-libmvec-alias-avx512-mod.so +$(objpfx)test-float-libmvec-alias-avx512-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx512-main: \ + $(objpfx)test-float-libmvec-alias-avx512-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx512: \ + $(objpfx)test-double-libmvec-sincos-avx512.o \ + $(objpfx)test-double-libmvec-sincos-avx512-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx512: \ + $(objpfx)test-float-libmvec-sincosf-avx512.o \ + $(objpfx)test-float-libmvec-sincosf-avx512-main.o $(libmvec) +endif + +double-vlen2-funcs = cos exp log pow sin sincos +double-vlen4-funcs = cos exp log pow sin sincos +double-vlen4-avx2-funcs = cos exp log pow sin sincos +double-vlen8-funcs = cos exp log pow sin sincos +float-vlen4-funcs = cos exp log pow sin sincos +float-vlen8-funcs = cos exp log pow sin sincos +float-vlen8-avx2-funcs = cos exp log pow sin sincos +float-vlen16-funcs = cos exp log pow sin sincos + +double-vlen4-arch-ext-cflags = -mavx +double-vlen4-arch-ext2-cflags = -mavx2 +double-vlen8-arch-ext-cflags = -mavx512f + +float-vlen8-arch-ext-cflags = -mavx 
+float-vlen8-arch-ext2-cflags = -mavx2 +float-vlen16-arch-ext-cflags = -mavx512f + +libmvec-sincos-cflags = $(libm-test-fast-math-cflags) -fno-inline -fopenmp -Wno-unknown-pragmas +libmvec-alias-cflags = $(libmvec-sincos-cflags) -ffloat-store -ffinite-math-only + +CFLAGS-test-double-libmvec-alias-mod.c = $(libmvec-alias-cflags) +CFLAGS-test-double-libmvec-alias-avx-mod.c = $(double-vlen4-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX +CFLAGS-test-double-libmvec-alias-avx2-mod.c = $(double-vlen4-arch-ext2-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX2 +CFLAGS-test-double-libmvec-alias-avx512-mod.c = $(double-vlen8-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX512F + +CFLAGS-test-float-libmvec-alias-mod.c = $(libmvec-alias-cflags) +CFLAGS-test-float-libmvec-alias-avx-mod.c = $(double-vlen4-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX +CFLAGS-test-float-libmvec-alias-avx2-mod.c = $(double-vlen4-arch-ext2-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX2 +CFLAGS-test-float-libmvec-alias-avx512-mod.c = $(double-vlen8-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX512F + +CFLAGS-test-double-vlen4-avx2-wrappers.c = $(double-vlen4-arch-ext2-cflags) + +CFLAGS-test-float-vlen8-avx2-wrappers.c = $(float-vlen8-arch-ext2-cflags) + +CFLAGS-test-double-libmvec-sincos-main.c = $(libmvec-sincos-cflags) +CFLAGS-test-double-libmvec-sincos-avx.c = -DREQUIRE_AVX +CFLAGS-test-double-libmvec-sincos-avx-main.c = $(libmvec-sincos-cflags) $(double-vlen4-arch-ext-cflags) +CFLAGS-test-double-libmvec-sincos-avx2.c = -DREQUIRE_AVX2 +CFLAGS-test-double-libmvec-sincos-avx2-main.c = $(libmvec-sincos-cflags) $(double-vlen4-arch-ext2-cflags) +CFLAGS-test-double-libmvec-sincos-avx512.c = -DREQUIRE_AVX512F +CFLAGS-test-double-libmvec-sincos-avx512-main.c = $(libmvec-sincos-cflags) $(double-vlen8-arch-ext-cflags) + +CFLAGS-test-float-libmvec-sincosf-main.c = $(libmvec-sincos-cflags) +CFLAGS-test-float-libmvec-sincosf-avx.c = -DREQUIRE_AVX +CFLAGS-test-float-libmvec-sincosf-avx-main.c = $(libmvec-sincos-cflags) $(float-vlen8-arch-ext-cflags) +CFLAGS-test-float-libmvec-sincosf-avx2.c = -DREQUIRE_AVX2 +CFLAGS-test-float-libmvec-sincosf-avx2-main.c = $(libmvec-sincos-cflags) $(float-vlen8-arch-ext2-cflags) +CFLAGS-test-float-libmvec-sincosf-avx512.c = -DREQUIRE_AVX512F +CFLAGS-test-float-libmvec-sincosf-avx512-main.c = $(libmvec-sincos-cflags) $(float-vlen16-arch-ext-cflags) +endif +endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/Versions b/REORG.TODO/sysdeps/x86_64/fpu/Versions new file mode 100644 index 0000000000..08132045d6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/Versions @@ -0,0 +1,16 @@ +libmvec { + GLIBC_2.22 { + _ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos; + _ZGVbN2v_sin; _ZGVcN4v_sin; _ZGVdN4v_sin; _ZGVeN8v_sin; + _ZGVbN2vvv_sincos; _ZGVcN4vvv_sincos; _ZGVdN4vvv_sincos; _ZGVeN8vvv_sincos; + _ZGVbN2v_log; _ZGVcN4v_log; _ZGVdN4v_log; _ZGVeN8v_log; + _ZGVbN2v_exp; _ZGVcN4v_exp; _ZGVdN4v_exp; _ZGVeN8v_exp; + _ZGVbN2vv_pow; _ZGVcN4vv_pow; _ZGVdN4vv_pow; _ZGVeN8vv_pow; + _ZGVbN4v_cosf; _ZGVcN8v_cosf; _ZGVdN8v_cosf; _ZGVeN16v_cosf; + _ZGVbN4v_sinf; _ZGVcN8v_sinf; _ZGVdN8v_sinf; _ZGVeN16v_sinf; + _ZGVbN4v_logf; _ZGVcN8v_logf; _ZGVdN8v_logf; _ZGVeN16v_logf; + _ZGVbN4v_expf; _ZGVcN8v_expf; _ZGVdN8v_expf; _ZGVeN16v_expf; + _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf; + _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_acosl.c b/REORG.TODO/sysdeps/x86_64/fpu/e_acosl.c new file mode 
100644 index 0000000000..1ef6d3c94a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_acosl.c @@ -0,0 +1 @@ +#include "sysdeps/i386/fpu/e_acosl.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_atan2l.c b/REORG.TODO/sysdeps/x86_64/fpu/e_atan2l.c new file mode 100644 index 0000000000..bbd549f307 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_atan2l.c @@ -0,0 +1,2 @@ +#include "sysdeps/i386/fpu/e_atan2l.c" + diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_exp10l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_exp10l.S new file mode 100644 index 0000000000..d843e2b5e8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_exp10l.S @@ -0,0 +1,2 @@ +#define USE_AS_EXP10L +#include <e_expl.S> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_exp2l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_exp2l.S new file mode 100644 index 0000000000..0e059b7565 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_exp2l.S @@ -0,0 +1,58 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <x86_64-math-asm.h>
+
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##(%rip)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2l)
+ fldt 8(%rsp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ movzwl 8+8(%rsp), %eax
+ andl $0x7fff, %eax
+ cmpl $0x3fbe, %eax
+ jge 3f
+ /* Argument's exponent below -65, result rounds to 1. */
+ fld1
+ faddp
+ ret
+3: fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_expf.S b/REORG.TODO/sysdeps/x86_64/fpu/e_expf.S new file mode 100644 index 0000000000..4fd2bb1fb5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_expf.S @@ -0,0 +1,339 @@ +/* Optimized __ieee754_expf function.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ * Let K = 64 (table size). 
+ * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) + * where + * x = m*log(2)/K + y, y in [0.0..log(2)/K] + * m = n*K + j, m,n,j - signed integer, j in [0..K-1] + * values of 2^(j/K) are tabulated as T[j]. + * + * P(y) is a minimax polynomial approximation of expf(x)-1 + * on small interval [0.0..log(2)/K]. + * + * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as + * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y + * + * Special cases: + * expf(NaN) = NaN + * expf(+INF) = +INF + * expf(-INF) = 0 + * expf(x) = 1 for subnormals + * for finite argument, only expf(0)=1 is exact + * expf(x) overflows if x>88.7228317260742190 + * expf(x) underflows if x<-103.972076416015620 + */ + + .text +ENTRY(__ieee754_expf) + /* Input: single precision x in %xmm0 */ + cvtss2sd %xmm0, %xmm1 /* Convert x to double precision */ + movd %xmm0, %ecx /* Copy x */ + movsd L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */ + movsd L(DP_P2)(%rip), %xmm3 /* DP P2 */ + movl %ecx, %eax /* x */ + mulsd %xmm1, %xmm2 /* DP x*K/log(2) */ + andl $0x7fffffff, %ecx /* |x| */ + lea L(DP_T)(%rip), %rsi /* address of table T[j] */ + cmpl $0x42ad496b, %ecx /* |x|<125*log(2) ? */ + movsd L(DP_P3)(%rip), %xmm4 /* DP P3 */ + addsd L(DP_RS)(%rip), %xmm2 /* DP x*K/log(2)+RS */ + jae L(special_paths) + + /* Here if |x|<125*log(2) */ + cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */ + jb L(small_arg) + + /* Main path: here if 2^(-28)<=|x|<125*log(2) */ + cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ + movd %xmm2, %eax /* bits of n*K+j with trash */ + subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ + movl %eax, %edx /* n*K+j with trash */ + cvtss2sd %xmm2, %xmm2 /* DP t */ + andl $0x3f, %eax /* bits of j */ + mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ + andl $0xffffffc0, %edx /* bits of n */ +#ifdef __AVX__ + vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ + vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ +#else + addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ + movaps %xmm2, %xmm0 /* DP y */ + mulsd %xmm2, %xmm2 /* DP z=y*y */ +#endif + mulsd %xmm2, %xmm4 /* DP P3*z */ + addl $0x1fc0, %edx /* bits of n + SP exponent bias */ + mulsd %xmm2, %xmm3 /* DP P2*z */ + shll $17, %edx /* SP 2^n */ + addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ + addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ + movd %edx, %xmm1 /* SP 2^n */ + mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ + mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ + addsd %xmm4, %xmm0 /* DP P(y) */ + mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ + addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ + cvtsd2ss %xmm0, %xmm0 /* SP T[j]*(P(y)+1) */ + mulss %xmm1, %xmm0 /* SP result=2^n*(T[j]*(P(y)+1)) */ + ret + + .p2align 4 +L(small_arg): + /* Here if 0<=|x|<2^(-28) */ + addss L(SP_ONE)(%rip), %xmm0 /* 1.0 + x */ + /* Return 1.0 with inexact raised, except for x==0 */ + ret + + .p2align 4 +L(special_paths): + /* Here if 125*log(2)<=|x| */ + shrl $31, %eax /* Get sign bit of x, and depending on it: */ + lea L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */ + cmpl (%rdx,%rax,4), %ecx /* |x|<under/overflow bound ? */ + jbe L(near_under_or_overflow) + + /* Here if |x|>under/overflow bound */ + cmpl $0x7f800000, %ecx /* |x| is finite ? */ + jae L(arg_inf_or_nan) + + /* Here if |x|>under/overflow bound, and x is finite */ + testq %rax, %rax /* sign of x nonzero ? 
*/ + je L(res_overflow) + + /* Here if -inf<x<underflow bound (x<0) */ + movss L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */ + mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */ + ret + + .p2align 4 +L(res_overflow): + /* Here if overflow bound<x<inf (x>0) */ + movss L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */ + mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */ + ret + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(arg_nan) /* |x| is Inf ? */ + + /* Here if |x| is Inf */ + lea L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */ + movss (%rdx,%rax,4), %xmm0 /* return zero or Inf */ + ret + + .p2align 4 +L(arg_nan): + /* Here if |x| is NaN */ + addss %xmm0, %xmm0 /* Return x+x (raise invalid) */ + ret + + .p2align 4 +L(near_under_or_overflow): + /* Here if 125*log(2)<=|x|<under/overflow bound */ + cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ + movd %xmm2, %eax /* bits of n*K+j with trash */ + subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ + movl %eax, %edx /* n*K+j with trash */ + cvtss2sd %xmm2, %xmm2 /* DP t */ + andl $0x3f, %eax /* bits of j */ + mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ + andl $0xffffffc0, %edx /* bits of n */ +#ifdef __AVX__ + vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ + vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ +#else + addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ + movaps %xmm2, %xmm0 /* DP y */ + mulsd %xmm2, %xmm2 /* DP z=y*y */ +#endif + mulsd %xmm2, %xmm4 /* DP P3*z */ + addl $0xffc0, %edx /* bits of n + DP exponent bias */ + mulsd %xmm2, %xmm3 /* DP P2*z */ + shlq $46, %rdx /* DP 2^n */ + addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ + addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ + movd %rdx, %xmm1 /* DP 2^n */ + mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ + mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ + addsd %xmm4, %xmm0 /* DP P(y) */ + mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ + addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ + mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */ + cvtsd2ss %xmm0, %xmm0 /* convert result to single precision */ + ret +END(__ieee754_expf) + + .section .rodata, "a" + .p2align 3 +L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */ + .long 0x00000000, 0x3ff00000 + .long 0x3e778061, 0x3ff02c9a + .long 0xd3158574, 0x3ff059b0 + .long 0x18759bc8, 0x3ff08745 + .long 0x6cf9890f, 0x3ff0b558 + .long 0x32d3d1a2, 0x3ff0e3ec + .long 0xd0125b51, 0x3ff11301 + .long 0xaea92de0, 0x3ff1429a + .long 0x3c7d517b, 0x3ff172b8 + .long 0xeb6fcb75, 0x3ff1a35b + .long 0x3168b9aa, 0x3ff1d487 + .long 0x88628cd6, 0x3ff2063b + .long 0x6e756238, 0x3ff2387a + .long 0x65e27cdd, 0x3ff26b45 + .long 0xf51fdee1, 0x3ff29e9d + .long 0xa6e4030b, 0x3ff2d285 + .long 0x0a31b715, 0x3ff306fe + .long 0xb26416ff, 0x3ff33c08 + .long 0x373aa9cb, 0x3ff371a7 + .long 0x34e59ff7, 0x3ff3a7db + .long 0x4c123422, 0x3ff3dea6 + .long 0x21f72e2a, 0x3ff4160a + .long 0x6061892d, 0x3ff44e08 + .long 0xb5c13cd0, 0x3ff486a2 + .long 0xd5362a27, 0x3ff4bfda + .long 0x769d2ca7, 0x3ff4f9b2 + .long 0x569d4f82, 0x3ff5342b + .long 0x36b527da, 0x3ff56f47 + .long 0xdd485429, 0x3ff5ab07 + .long 0x15ad2148, 0x3ff5e76f + .long 0xb03a5585, 0x3ff6247e + .long 0x82552225, 0x3ff66238 + .long 0x667f3bcd, 0x3ff6a09e + .long 0x3c651a2f, 0x3ff6dfb2 + .long 0xe8ec5f74, 0x3ff71f75 + .long 0x564267c9, 0x3ff75feb + .long 0x73eb0187, 0x3ff7a114 + .long 0x36cf4e62, 0x3ff7e2f3 + .long 0x994cce13, 0x3ff82589 + .long 0x9b4492ed, 0x3ff868d9 + .long 0x422aa0db, 0x3ff8ace5 + .long 0x99157736, 
0x3ff8f1ae + .long 0xb0cdc5e5, 0x3ff93737 + .long 0x9fde4e50, 0x3ff97d82 + .long 0x82a3f090, 0x3ff9c491 + .long 0x7b5de565, 0x3ffa0c66 + .long 0xb23e255d, 0x3ffa5503 + .long 0x5579fdbf, 0x3ffa9e6b + .long 0x995ad3ad, 0x3ffae89f + .long 0xb84f15fb, 0x3ffb33a2 + .long 0xf2fb5e47, 0x3ffb7f76 + .long 0x904bc1d2, 0x3ffbcc1e + .long 0xdd85529c, 0x3ffc199b + .long 0x2e57d14b, 0x3ffc67f1 + .long 0xdcef9069, 0x3ffcb720 + .long 0x4a07897c, 0x3ffd072d + .long 0xdcfba487, 0x3ffd5818 + .long 0x03db3285, 0x3ffda9e6 + .long 0x337b9b5f, 0x3ffdfc97 + .long 0xe78b3ff6, 0x3ffe502e + .long 0xa2a490da, 0x3ffea4af + .long 0xee615a27, 0x3ffefa1b + .long 0x5b6e4540, 0x3fff5076 + .long 0x819e90d8, 0x3fffa7c1 + .type L(DP_T), @object + ASM_SIZE_DIRECTIVE(L(DP_T)) + + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 +L(DP_KLN2): /* double precision K/log(2) */ + .long 0x652b82fe, 0x40571547 + .type L(DP_KLN2), @object + ASM_SIZE_DIRECTIVE(L(DP_KLN2)) + + .p2align 3 +L(DP_NLN2K): /* double precision -log(2)/K */ + .long 0xfefa39ef, 0xbf862e42 + .type L(DP_NLN2K), @object + ASM_SIZE_DIRECTIVE(L(DP_NLN2K)) + + .p2align 3 +L(DP_RS): /* double precision 2^23+2^22 */ + .long 0x00000000, 0x41680000 + .type L(DP_RS), @object + ASM_SIZE_DIRECTIVE(L(DP_RS)) + + .p2align 3 +L(DP_P3): /* double precision polynomial coefficient P3 */ + .long 0xeb78fa85, 0x3fa56420 + .type L(DP_P3), @object + ASM_SIZE_DIRECTIVE(L(DP_P3)) + + .p2align 3 +L(DP_P1): /* double precision polynomial coefficient P1 */ + .long 0x008d6118, 0x3fe00000 + .type L(DP_P1), @object + ASM_SIZE_DIRECTIVE(L(DP_P1)) + + .p2align 3 +L(DP_P2): /* double precision polynomial coefficient P2 */ + .long 0xda752d4f, 0x3fc55550 + .type L(DP_P2), @object + ASM_SIZE_DIRECTIVE(L(DP_P2)) + + .p2align 3 +L(DP_P0): /* double precision polynomial coefficient P0 */ + .long 0xffffe7c6, 0x3fefffff + .type L(DP_P0), @object + ASM_SIZE_DIRECTIVE(L(DP_P0)) + + .p2align 2 +L(SP_RANGE): /* single precision overflow/underflow bounds */ + .long 0x42b17217 /* if x>this bound, then result overflows */ + .long 0x42cff1b4 /* if x<this bound, then result underflows */ + .type L(SP_RANGE), @object + ASM_SIZE_DIRECTIVE(L(SP_RANGE)) + + .p2align 2 +L(SP_INF_0): + .long 0x7f800000 /* single precision Inf */ + .long 0 /* single precision zero */ + .type L(SP_INF_0), @object + ASM_SIZE_DIRECTIVE(L(SP_INF_0)) + + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 +L(SP_RS): /* single precision 2^23+2^22 */ + .long 0x4b400000 + .type L(SP_RS), @object + ASM_SIZE_DIRECTIVE(L(SP_RS)) + + .p2align 2 +L(SP_SMALL): /* single precision small value 2^(-100) */ + .long 0x0d800000 + .type L(SP_SMALL), @object + ASM_SIZE_DIRECTIVE(L(SP_SMALL)) + + .p2align 2 +L(SP_LARGE): /* single precision large value 2^100 */ + .long 0x71800000 + .type L(SP_LARGE), @object + ASM_SIZE_DIRECTIVE(L(SP_LARGE)) + + .p2align 2 +L(SP_ONE): /* single precision 1.0 */ + .long 0x3f800000 + .type L(SP_ONE), @object + ASM_SIZE_DIRECTIVE(L(SP_ONE)) + +strong_alias (__ieee754_expf, __expf_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_expl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_expl.S new file mode 100644 index 0000000000..a4ef023b2b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_expl.S @@ -0,0 +1,219 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. 
+ */ + +/* + * The 8087 method for the exponential function is to calculate + * exp(x) = 2^(x log2(e)) + * after separating integer and fractional parts + * x log2(e) = i + f, |f| <= .5 + * 2^i is immediate but f needs to be precise for long double accuracy. + * Suppress range reduction error in computing f by the following. + * Separate x into integer and fractional parts + * x = xi + xf, |xf| <= .5 + * Separate log2(e) into the sum of an exact number c0 and small part c1. + * c0 + c1 = log2(e) to extra precision + * Then + * f = (c0 xi - i) + c0 xf + c1 x + * where c0 xi is exact and so also is (c0 xi - i). + * -- moshier@na-net.ornl.gov + */ + +#include <machine/asm.h> +#include <x86_64-math-asm.h> + +#ifdef USE_AS_EXP10L +# define IEEE754_EXPL __ieee754_exp10l +# define EXPL_FINITE __exp10l_finite +# define FLDLOG fldl2t +#elif defined USE_AS_EXPM1L +# define IEEE754_EXPL __expm1l +# undef EXPL_FINITE +# define FLDLOG fldl2e +#else +# define IEEE754_EXPL __ieee754_expl +# define EXPL_FINITE __expl_finite +# define FLDLOG fldl2e +#endif + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 4 +#ifdef USE_AS_EXP10L + .type c0,@object +c0: .byte 0, 0, 0, 0, 0, 0, 0x9a, 0xd4, 0x00, 0x40 + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c0) + .type c1,@object +c1: .byte 0x58, 0x92, 0xfc, 0x15, 0x37, 0x9a, 0x97, 0xf0, 0xef, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c1) +#else + .type c0,@object +c0: .byte 0, 0, 0, 0, 0, 0, 0xaa, 0xb8, 0xff, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c0) + .type c1,@object +c1: .byte 0x20, 0xfa, 0xee, 0xc2, 0x5f, 0x70, 0xa5, 0xec, 0xed, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c1) +#endif +#ifndef USE_AS_EXPM1L + .type csat,@object +csat: .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x0e, 0x40 + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(csat) +DEFINE_LDBL_MIN +#endif + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(IEEE754_EXPL) +#ifdef USE_AS_EXPM1L + movzwl 8+8(%rsp), %eax + xorb $0x80, %ah // invert sign bit (now 1 is "positive") + cmpl $0xc006, %eax // is num positive and exp >= 6 (number is >= 128.0)? + jae HIDDEN_JUMPTARGET (__expl) // (if num is denormal, it is at least >= 64.0) +#endif + fldt 8(%rsp) +/* I added the following ugly construct because expl(+-Inf) resulted + in NaN. The ugliness results from the bright minds at Intel. + For the i686 the code can be written better. + -- drepper@cygnus.com. */ + fxam /* Is NaN or +-Inf? */ +#ifdef USE_AS_EXPM1L + xorb $0x80, %ah + cmpl $0xc006, %eax + fstsw %ax + movb $0x45, %dh + jb 4f + + /* Below -64.0 (may be -NaN or -Inf). */ + andb %ah, %dh + cmpb $0x01, %dh + je 6f /* Is +-NaN, jump. */ + jmp 1f /* -large, possibly -Inf. */ + +4: /* In range -64.0 to 64.0 (may be +-0 but not NaN or +-Inf). */ + /* Test for +-0 as argument. */ + andb %ah, %dh + cmpb $0x40, %dh + je 2f + + /* Test for arguments that are small but not subnormal. */ + movzwl 8+8(%rsp), %eax + andl $0x7fff, %eax + cmpl $0x3fbf, %eax + jge 3f + /* Argument's exponent below -64; avoid spurious underflow if + normal. */ + cmpl $0x0001, %eax + jge 2f + /* Force underflow and return the argument, to avoid wrong signs + of zero results from the code below in some rounding modes. */ + fld %st + fmul %st + fstp %st + jmp 2f +#else + movzwl 8+8(%rsp), %eax + andl $0x7fff, %eax + cmpl $0x400d, %eax + jg 5f + cmpl $0x3fbc, %eax + jge 3f + /* Argument's exponent below -67, result rounds to 1. */ + fld1 + faddp + jmp 2f +5: /* Overflow, underflow or infinity or NaN as argument. 
*/
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x01, %dh
+ je 6f /* Is +-NaN, jump. */
+ /* Overflow or underflow; saturate. */
+ fstp %st
+ fldt MO(csat)
+ andb $2, %ah
+ jz 3f
+ fchs
+#endif
+3: FLDLOG /* 1 log2(base) */
+ fmul %st(1), %st /* 1 x log2(base) */
+ /* Set round-to-nearest temporarily. */
+ fstcw -4(%rsp)
+ movl $0xf3ff, %edx
+ andl -4(%rsp), %edx
+ movl %edx, -8(%rsp)
+ fldcw -8(%rsp)
+ frndint /* 1 i */
+ fld %st(1) /* 2 x */
+ frndint /* 2 xi */
+ fldcw -4(%rsp)
+ fld %st(1) /* 3 i */
+ fldt MO(c0) /* 4 c0 */
+ fld %st(2) /* 5 xi */
+ fmul %st(1), %st /* 5 c0 xi */
+ fsubp %st, %st(2) /* 4 f = c0 xi - i */
+ fld %st(4) /* 5 x */
+ fsub %st(3), %st /* 5 xf = x - xi */
+ fmulp %st, %st(1) /* 4 c0 xf */
+ faddp %st, %st(1) /* 3 f = f + c0 xf */
+ fldt MO(c1) /* 4 */
+ fmul %st(4), %st /* 4 c1 * x */
+ faddp %st, %st(1) /* 3 f = f + c1 * x */
+ f2xm1 /* 3 2^(fract(x * log2(base))) - 1 */
+#ifdef USE_AS_EXPM1L
+ fstp %st(1) /* 2 */
+ fscale /* 2 scale factor is st(1); base^x - 2^i */
+ fxch /* 2 i */
+ fld1 /* 3 1.0 */
+ fscale /* 3 2^i */
+ fld1 /* 4 1.0 */
+ fsubrp %st, %st(1) /* 3 2^i - 1.0 */
+ fstp %st(1) /* 2 */
+ faddp %st, %st(1) /* 1 base^x - 1.0 */
+#else
+ fld1 /* 4 1.0 */
+ faddp /* 3 2^(fract(x * log2(base))) */
+ fstp %st(1) /* 2 */
+ fscale /* 2 scale factor is st(1); base^x */
+ fstp %st(1) /* 1 */
+ LDBL_CHECK_FORCE_UFLOW_NONNEG
+#endif
+ fstp %st(1) /* 0 */
+ jmp 2f
+1:
+#ifdef USE_AS_EXPM1L
+ /* For expm1l, only negative sign gets here. */
+ fstp %st
+ fld1
+ fchs
+#else
+ testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+#endif
+2: ret
+6: /* NaN argument. */
+ fadd %st
+ ret
+END(IEEE754_EXPL)
+#ifdef USE_AS_EXPM1L
+libm_hidden_def (__expm1l)
+weak_alias (__expm1l, expm1l)
+#else
+strong_alias (IEEE754_EXPL, EXPL_FINITE)
+#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_fmodl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_fmodl.S new file mode 100644 index 0000000000..07c50df8d1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_fmodl.S @@ -0,0 +1,23 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ieee754_fmodl)
+ fldt 24(%rsp)
+ fldt 8(%rsp)
+1: fprem
+ fstsw %ax
+ and $04,%ah
+ jnz 1b
+ fstp %st(1)
+ ret
+END (__ieee754_fmodl)
+strong_alias (__ieee754_fmodl, __fmodl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_ilogbl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_ilogbl.S new file mode 100644 index 0000000000..ae6c0fe6f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_ilogbl.S @@ -0,0 +1,39 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_ilogbl)
+ fldt 8(%rsp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+ required to return INT_MAX in ISO C99.
+ -- jakub@redhat.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x40, %dh
+ je 2f /* Is +-0, jump. 
*/ + + fxtract + fstp %st + + fistpl -4(%rsp) + fwait + movl -4(%rsp),%eax + + ret + +1: fstp %st + movl $0x7fffffff, %eax + ret +2: fstp %st + movl $0x80000000, %eax /* FP_ILOGB0 */ + ret +END (__ieee754_ilogbl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_log10l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_log10l.S new file mode 100644 index 0000000000..e0cb88e32e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_log10l.S @@ -0,0 +1,92 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log10l) + fldlg2 // log10(2) + fldt 8(%rsp) // x : log10(2) + fxam + fnstsw + fld %st // x : x : log10(2) + testb $1, %ah + jnz 3f // in case x is NaN or ±Inf +4: fsubl MO(one) // x-1 : x : log10(2) + fld %st // x-1 : x-1 : x : log10(2) + fabs // |x-1| : x-1 : x : log10(2) + fcompl MO(limit) // x-1 : x : log10(2) + fnstsw // x-1 : x : log10(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log10(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log10(2) + fyl2xp1 // log10(x) + ret + +2: fstp %st(0) // x : log10(2) + fyl2x // log10(x) + ret + +3: testb $4, %ah + jnz 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END(__ieee754_log10l) + + +ENTRY(__log10l_finite) + fldlg2 // log10(2) + fldt 8(%rsp) // x : log10(2) + fld %st // x : x : log10(2) +4: fsubl MO(one) // x-1 : x : log10(2) + fld %st // x-1 : x-1 : x : log10(2) + fabs // |x-1| : x-1 : x : log10(2) + fcompl MO(limit) // x-1 : x : log10(2) + fnstsw // x-1 : x : log10(2) + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 6f + fabs // log10(1) is +0 in all rounding modes. +6: fstp %st(1) // x-1 : log10(2) + fyl2xp1 // log10(x) + ret +END(__log10l_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_log2l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_log2l.S new file mode 100644 index 0000000000..023ec29164 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_log2l.S @@ -0,0 +1,91 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>. + * Public domain. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. 
*/ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log2l) + fldl MO(one) + fldt 8(%rsp) // x : 1 + fxam + fnstsw + fld %st // x : x : 1 + testb $1, %ah + jnz 3f // in case x is NaN or ±Inf +4: fsub %st(2), %st // x-1 : x : 1 + fld %st // x-1 : x-1 : x : 1 + fabs // |x-1| : x-1 : x : 1 + fcompl MO(limit) // x-1 : x : 1 + fnstsw // x-1 : x : 1 + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log2(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : 1 + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : 1 + fyl2x // log(x) + ret + +3: testb $4, %ah + jnz 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END (__ieee754_log2l) + + +ENTRY(__log2l_finite) + fldl MO(one) + fldt 8(%rsp) // x : 1 + fld %st // x : x : 1 + fsub %st(2), %st // x-1 : x : 1 + fld %st // x-1 : x-1 : x : 1 + fabs // |x-1| : x-1 : x : 1 + fcompl MO(limit) // x-1 : x : 1 + fnstsw // x-1 : x : 1 + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 6f + fabs // log2(1) is +0 in all rounding modes. +6: fstp %st(1) // x-1 : 1 + fyl2xp1 // log(x) + ret +END (__log2l_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_logl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_logl.S new file mode 100644 index 0000000000..0d3576f48b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_logl.S @@ -0,0 +1,94 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_logl) + fldln2 // log(2) + fldt 8(%rsp) // x : log(2) + fxam + fnstsw + fld %st // x : x : log(2) + testb $1, %ah + jnz 3f // in case x is NaN or +-Inf + movzwl 8+8(%rsp), %eax + cmpl $0xc000, %eax + jae 6f // x <= -2, avoid overflow from -LDBL_MAX - 1. +4: fsubl MO(one) // x-1 : x : log(2) +6: fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : log(2) + fyl2x // log(x) + ret + +3: testb $4, %ah + jnz 4b // in case x is +-Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END (__ieee754_logl) + + +ENTRY(__logl_finite) + fldln2 // log(2) + fldt 8(%rsp) // x : log(2) + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 7f + fabs // log(1) is +0 in all rounding modes. 
+7: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END (__logl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_powl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_powl.S new file mode 100644 index 0000000000..571c0a18d5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_powl.S @@ -0,0 +1,433 @@ +/* ix87 specific implementation of pow function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <x86_64-math-asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type p2,@object +p2: .byte 0, 0, 0, 0, 0, 0, 0x10, 0x40 + ASM_SIZE_DIRECTIVE(p2) + .type p63,@object +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) + .type p64,@object +p64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43 + ASM_SIZE_DIRECTIVE(p64) + .type p78,@object +p78: .byte 0, 0, 0, 0, 0, 0, 0xd0, 0x44 + ASM_SIZE_DIRECTIVE(p78) + .type pm79,@object +pm79: .byte 0, 0, 0, 0, 0, 0, 0, 0x3b + ASM_SIZE_DIRECTIVE(pm79) + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 3 + .type infinity,@object +inf_zero: +infinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f + ASM_SIZE_DIRECTIVE(infinity) + .type zero,@object +zero: .double 0.0 + ASM_SIZE_DIRECTIVE(zero) + .type minf_mzero,@object +minf_mzero: +minfinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff +mzero: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + ASM_SIZE_DIRECTIVE(minf_mzero) +DEFINE_LDBL_MIN + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_powl) + fldt 24(%rsp) // y + fxam + + + fnstsw + movb %ah, %dl + andb $0x45, %ah + cmpb $0x40, %ah // is y == 0 ? + je 11f + + cmpb $0x05, %ah // is y == ±inf ? + je 12f + + cmpb $0x01, %ah // is y == NaN ? + je 30f + + fldt 8(%rsp) // x : y + + fxam + fnstsw + movb %ah, %dh + andb $0x45, %ah + cmpb $0x40, %ah + je 20f // x is ±0 + + cmpb $0x05, %ah + je 15f // x is ±inf + + cmpb $0x01, %ah + je 31f // x is NaN + + fxch // y : x + + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fldl MO(p63) // 1L<<63 : y : x + fld %st(1) // y : 1L<<63 : y : x + fabs // |y| : 1L<<63 : y : x + fcomip %st(1), %st // 1L<<63 : y : x + fstp %st(0) // y : x + jnc 2f + + /* First see whether `y' is a natural number. In this case we + can use a more precise algorithm. */ + fld %st // y : y : x + fistpll -8(%rsp) // y : x + fildll -8(%rsp) // int(y) : y : x + fucomip %st(1),%st // y : x + je 9f + + // If y has absolute value at most 0x1p-79, then any finite + // nonzero x will result in 1. Saturate y to those bounds to + // avoid underflow in the calculation of y*log2(x). 
+ fldl MO(pm79) // 0x1p-79 : y : x
+ fld %st(1) // y : 0x1p-79 : y : x
+ fabs // |y| : 0x1p-79 : y : x
+ fcomip %st(1), %st // 0x1p-79 : y : x
+ fstp %st(0) // y : x
+ jnc 3f
+ fstp %st(0) // pop y
+ fldl MO(pm79) // 0x1p-79 : x
+ testb $2, %dl
+ jz 3f // y > 0
+ fchs // -0x1p-79 : x
+ jmp 3f
+
+9: /* OK, we have an integer value for y. Unless very small
+ (we use < 4), use the algorithm for real exponent to avoid
+ accumulation of errors. */
+ fldl MO(p2) // 4 : y : x
+ fld %st(1) // y : 4 : y : x
+ fabs // |y| : 4 : y : x
+ fcomip %st(1), %st // 4 : y : x
+ fstp %st(0) // y : x
+ jnc 3f
+ mov -8(%rsp),%eax
+ mov -4(%rsp),%edx
+ orl $0, %edx
+ fstp %st(0) // x
+ jns 4f // y >= 0, jump
+ fdivrl MO(one) // 1/x (now referred to as x)
+ negl %eax
+ adcl $0, %edx
+ negl %edx
+4: fldl MO(one) // 1 : x
+ fxch
+
+ /* If y is even, take the absolute value of x. Otherwise,
+ ensure all intermediate values that might overflow have the
+ sign of x. */
+ testb $1, %al
+ jnz 6f
+ fabs
+
+6: shrdl $1, %edx, %eax
+ jnc 5f
+ fxch
+ fabs
+ fmul %st(1) // x : ST*x
+ fxch
+5: fld %st // x : x : ST*x
+ fabs // |x| : x : ST*x
+ fmulp // |x|*x : ST*x
+ shrl $1, %edx
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jnz 6b
+ fstp %st(0) // ST*x
+ LDBL_CHECK_FORCE_UFLOW_NONNAN
+ ret
+
+ /* y is ±NAN */
+30: fldt 8(%rsp) // x : y
+ fldl MO(one) // 1.0 : x : y
+ fucomip %st(1),%st // x : y
+ je 32f
+31: /* At least one argument NaN, and result should be NaN. */
+ faddp
+ ret
+32: jc 31b
+ /* pow (1, NaN); check whether the NaN is signaling. */
+ testb $0x40, 31(%rsp)
+ jz 31b
+ fstp %st(1)
+ ret
+
+ .align ALIGNARG(4)
+2: // y is a large integer (absolute value at least 1L<<63).
+ // If y has absolute value at least 1L<<78, then any finite
+ // nonzero x will result in 0 (underflow), 1 or infinity (overflow).
+ // Saturate y to those bounds to avoid overflow in the calculation
+ // of y*log2(x).
+ fldl MO(p78) // 1L<<78 : y : x
+ fld %st(1) // y : 1L<<78 : y : x
+ fabs // |y| : 1L<<78 : y : x
+ fcomip %st(1), %st // 1L<<78 : y : x
+ fstp %st(0) // y : x
+ jc 3f
+ fstp %st(0) // pop y
+ fldl MO(p78) // 1L<<78 : x
+ testb $2, %dl
+ jz 3f // y > 0
+ fchs // -(1L<<78) : x
+ .align ALIGNARG(4)
+3: /* y is a real number. */
+ subq $40, %rsp
+ cfi_adjust_cfa_offset (40)
+ fstpt 16(%rsp) // x
+ fstpt (%rsp) // <empty>
+ call HIDDEN_JUMPTARGET (__powl_helper) // <result>
+ addq $40, %rsp
+ cfi_adjust_cfa_offset (-40)
+ ret
+
+ // pow(x,±0) = 1, unless x is sNaN
+ .align ALIGNARG(4)
+11: fstp %st(0) // pop y
+ fldt 8(%rsp) // x
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 112f // x is NaN
+111: fstp %st(0)
+ fldl MO(one)
+ ret
+
+112: testb $0x40, 15(%rsp)
+ jnz 111b
+ fadd %st(0)
+ ret
+
+ // y == ±inf
+ .align ALIGNARG(4)
+12: fstp %st(0) // pop y
+ fldl MO(one) // 1
+ fldt 8(%rsp) // x : 1
+ fabs // abs(x) : 1
+ fucompp // < 1, == 1, or > 1
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x45, %ah
+ je 13f // jump if x is NaN
+
+ cmpb $0x40, %ah
+ je 14f // jump if |x| == 1
+
+ shlb $1, %ah
+ xorb %ah, %dl
+ andl $2, %edx
+#ifdef PIC
+ lea inf_zero(%rip),%rcx
+ fldl (%rcx, %rdx, 4)
+#else
+ fldl inf_zero(,%rdx, 4)
+#endif
+ ret
+
+ .align ALIGNARG(4)
+14: fldl MO(one)
+ ret
+
+ .align ALIGNARG(4)
+13: fldt 8(%rsp) // load x == NaN
+ fadd %st(0)
+ ret
+
+ .align ALIGNARG(4)
+ // x is ±inf
+15: fstp %st(0) // y
+ testb $2, %dh
+ jz 16f // jump if x == +inf
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, but y
+ // may be odd unless we know |y| >= 1L<<64. 
+ fldl MO(p64) // 1L<<64 : y + fld %st(1) // y : 1L<<64 : y + fabs // |y| : 1L<<64 : y + fcomip %st(1), %st // 1L<<64 : y + fstp %st(0) // y + jnc 16f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + // We must find out whether y is an odd integer. + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 17f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp), %eax + mov -4(%rsp), %edx + andb $1, %al + jz 18f // jump if not odd + // It's an odd integer. + shrl $31, %edx +#ifdef PIC + lea minf_mzero(%rip),%rcx + fldl (%rcx, %rdx, 8) +#else + fldl minf_mzero(,%rdx, 8) +#endif + ret + + .align ALIGNARG(4) +16: fcompl MO(zero) + fnstsw + shrl $5, %eax + andl $8, %eax +#ifdef PIC + lea inf_zero(%rip),%rcx + fldl (%rcx, %rax, 1) +#else + fldl inf_zero(,%rax, 1) +#endif + ret + + .align ALIGNARG(4) +17: shll $30, %edx // sign bit for y in right position +18: shrl $31, %edx +#ifdef PIC + lea inf_zero(%rip),%rcx + fldl (%rcx, %rdx, 8) +#else + fldl inf_zero(,%rdx, 8) +#endif + ret + + .align ALIGNARG(4) + // x is ±0 +20: fstp %st(0) // y + testb $2, %dl + jz 21f // y > 0 + + // x is ±0 and y is < 0. We must find out whether y is an odd integer. + testb $2, %dh + jz 25f + + // fistpll raises invalid exception for |y| >= 1L<<63, but y + // may be odd unless we know |y| >= 1L<<64. + fldl MO(p64) // 1L<<64 : y + fld %st(1) // y : 1L<<64 : y + fabs // |y| : 1L<<64 : y + fcomip %st(1), %st // 1L<<64 : y + fstp %st(0) // y + jnc 25f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 26f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp),%eax + mov -4(%rsp),%edx + andb $1, %al + jz 27f // jump if not odd + // It's an odd integer. + // Raise divide-by-zero exception and get minus infinity value. + fldl MO(one) + fdivl MO(zero) + fchs + ret + +25: fstp %st(0) +26: +27: // Raise divide-by-zero exception and get infinity value. + fldl MO(one) + fdivl MO(zero) + ret + + .align ALIGNARG(4) + // x is ±0 and y is > 0. We must find out whether y is an odd integer. +21: testb $2, %dh + jz 22f + + // fistpll raises invalid exception for |y| >= 1L<<63, but y + // may be odd unless we know |y| >= 1L<<64. + fldl MO(p64) // 1L<<64 : y + fxch // y : 1L<<64 + fcomi %st(1), %st // y : 1L<<64 + fstp %st(1) // y + jnc 22f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 23f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp),%eax + mov -4(%rsp),%edx + andb $1, %al + jz 24f // jump if not odd + // It's an odd integer. + fldl MO(mzero) + ret + +22: fstp %st(0) +23: +24: fldl MO(zero) + ret + +END(__ieee754_powl) +strong_alias (__ieee754_powl, __powl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_remainderl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_remainderl.S new file mode 100644 index 0000000000..4ee0910912 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_remainderl.S @@ -0,0 +1,21 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. 
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderl)
+ fldt 24(%rsp)
+ fldt 8(%rsp)
+1: fprem1
+ fstsw %ax
+ testl $0x400,%eax
+ jnz 1b
+ fstp %st(1)
+ ret
+END (__ieee754_remainderl)
+strong_alias (__ieee754_remainderl, __remainderl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_scalbl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_scalbl.S new file mode 100644 index 0000000000..2982dc3b9e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_scalbl.S @@ -0,0 +1,89 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type zero_nan,@object
+zero_nan:
+ .double 0.0
+nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ ASM_SIZE_DIRECTIVE(zero_nan)
+
+
+#ifdef PIC
+# define MO(op) op##(%rip)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_scalbl)
+ fldt 24(%rsp)
+ fxam
+ fnstsw
+ fldt 8(%rsp)
+ andl $0x4700, %eax
+ cmpl $0x0700, %eax
+ je 1f
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fxam
+ fnstsw
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fld %st(1)
+ frndint
+ fcomip %st(2), %st
+ jne 4f
+ fscale
+ fstp %st(1)
+ ret
+
+ /* y is -inf */
+1: fxam
+ fnstsw
+ movl 16(%rsp), %edx
+ shrl $5, %eax
+ fstp %st
+ fstp %st
+ andl $0x8000, %edx
+ andl $0x0228, %eax
+ cmpl $0x0028, %eax
+ je 4f
+ andl $8, %eax
+ shrl $11, %edx
+ addl %edx, %eax
+#ifdef PIC
+ lea zero_nan(%rip),%rdx
+ fldl (%rdx,%rax,1)
+#else
+ fldl zero_nan(,%rax,1)
+#endif
+ ret
+
+ /* The result is NaN; raise an exception for sNaN arguments. */
+2: faddp
+ ret
+
+ /* Return NaN and raise the invalid exception. */
+4: fstp %st
+ fstp %st
+ fldz
+ fdiv %st
+ ret
+END(__ieee754_scalbl)
+strong_alias (__ieee754_scalbl, __scalbl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_sqrt.c b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrt.c new file mode 100644 index 0000000000..33b59f67c1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrt.c @@ -0,0 +1,31 @@ +/* Square root of floating point number.
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <math_private.h>
+
+#undef __ieee754_sqrt
+double
+__ieee754_sqrt (double x)
+{
+ double res;
+
+ asm ("sqrtsd %1, %0" : "=x" (res) : "xm" (x));
+
+ return res;
+}
+strong_alias (__ieee754_sqrt, __sqrt_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtf.c b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtf.c new file mode 100644 index 0000000000..386b903c43 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtf.c @@ -0,0 +1,31 @@ +/* Square root of floating point number. 
+ Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math_private.h> + +#undef __ieee754_sqrtf +float +__ieee754_sqrtf (float x) +{ + float res; + + asm ("sqrtss %1, %0" : "=x" (res) : "xm" (x)); + + return res; +} +strong_alias (__ieee754_sqrtf, __sqrtf_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtl.c b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtl.c new file mode 100644 index 0000000000..90e4e164e5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtl.c @@ -0,0 +1 @@ +#include "sysdeps/i386/fpu/e_sqrtl.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fclrexcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/fclrexcpt.c new file mode 100644 index 0000000000..93bf0d341f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fclrexcpt.c @@ -0,0 +1,52 @@ +/* Clear given exceptions in current floating-point environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +feclearexcept (int excepts) +{ + fenv_t temp; + unsigned int mxcsr; + + /* Mask out unsupported bits/exceptions. */ + excepts &= FE_ALL_EXCEPT; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ ("fnstenv %0" : "=m" (*&temp)); + + /* Clear the relevant bits. */ + temp.__status_word &= excepts ^ FE_ALL_EXCEPT; + + /* Put the new data in effect. */ + __asm__ ("fldenv %0" : : "m" (*&temp)); + + /* And the same procedure for SSE. */ + __asm__ ("stmxcsr %0" : "=m" (*&mxcsr)); + + /* Clear the relevant bits. */ + mxcsr &= ~excepts; + + /* And put them into effect. */ + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + /* Success. */ + return 0; +} +libm_hidden_def (feclearexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fedisblxcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/fedisblxcpt.c new file mode 100644 index 0000000000..512987bd03 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fedisblxcpt.c @@ -0,0 +1,46 @@ +/* Disable floating-point exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fedisableexcept (int excepts) +{ + unsigned short int new_exc, old_exc; + unsigned int new; + + excepts &= FE_ALL_EXCEPT; + + /* Get the current control word of the x87 FPU. */ + __asm__ ("fstcw %0" : "=m" (*&new_exc)); + + old_exc = (~new_exc) & FE_ALL_EXCEPT; + + new_exc |= excepts; + __asm__ ("fldcw %0" : : "m" (*&new_exc)); + + /* And now the same for the SSE MXCSR register. */ + __asm__ ("stmxcsr %0" : "=m" (*&new)); + + /* The SSE exception masks are shifted by 7 bits. */ + new |= excepts << 7; + __asm__ ("ldmxcsr %0" : : "m" (*&new)); + + return old_exc; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/feenablxcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/feenablxcpt.c new file mode 100644 index 0000000000..0985d71a00 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/feenablxcpt.c @@ -0,0 +1,46 @@ +/* Enable floating-point exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +feenableexcept (int excepts) +{ + unsigned short int new_exc, old_exc; + unsigned int new; + + excepts &= FE_ALL_EXCEPT; + + /* Get the current control word of the x87 FPU. */ + __asm__ ("fstcw %0" : "=m" (*&new_exc)); + + old_exc = (~new_exc) & FE_ALL_EXCEPT; + + new_exc &= ~excepts; + __asm__ ("fldcw %0" : : "m" (*&new_exc)); + + /* And now the same for the SSE MXCSR register. */ + __asm__ ("stmxcsr %0" : "=m" (*&new)); + + /* The SSE exception masks are shifted by 7 bits. */ + new &= ~(excepts << 7); + __asm__ ("ldmxcsr %0" : : "m" (*&new)); + + return old_exc; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetenv.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetenv.c new file mode 100644 index 0000000000..af7642e990 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetenv.c @@ -0,0 +1,35 @@ +/* Store current floating-point environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
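
The shift by 7 in fedisableexcept encodes the register layouts: the exception mask bits sit in bits 0-5 of the x87 control word but in bits 7-12 of MXCSR, while the sticky flag bits occupy bits 0-5 of both status registers. A sketch making the correspondence visible (hypothetical; the numeric FE_* values quoted are x86-specific):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  /* On x86: FE_INVALID=0x01, FE_DIVBYZERO=0x04, FE_OVERFLOW=0x08,
     FE_UNDERFLOW=0x10, FE_INEXACT=0x20; bit 0x02 is the x86-only
     "denormal operand" exception, excluded from FE_ALL_EXCEPT. */
  printf ("x87 control-word mask bits: %#x\n", FE_ALL_EXCEPT);
  printf ("MXCSR mask bits:            %#x\n", FE_ALL_EXCEPT << 7);
  return 0;
}
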
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__fegetenv (fenv_t *envp) +{ + __asm__ ("fnstenv %0\n" + /* fnstenv changes the exception mask, so load back the + stored environment. */ + "fldenv %0\n" + "stmxcsr %1" : "=m" (*envp), "=m" (envp->__mxcsr)); + + /* Success. */ + return 0; +} +libm_hidden_def (__fegetenv) +weak_alias (__fegetenv, fegetenv) +libm_hidden_weak (fegetenv) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetexcept.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetexcept.c new file mode 100644 index 0000000000..7dbf40401e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetexcept.c @@ -0,0 +1,31 @@ +/* Get enabled floating-point exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fegetexcept (void) +{ + unsigned short int exc; + + /* Get the current control word. */ + __asm__ ("fstcw %0" : "=m" (*&exc)); + + return (~exc) & FE_ALL_EXCEPT; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetmode.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetmode.c new file mode 100644 index 0000000000..4513f80c85 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetmode.c @@ -0,0 +1,28 @@ +/* Store current floating-point control modes. x86_64 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
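
feenableexcept unmasks exceptions in both units, so a later exceptional operation traps (SIGFPE on Linux) instead of quietly setting a flag, and fegetexcept reports the currently unmasked set. A round-trip sketch that never actually traps (hypothetical; the three functions are GNU extensions, hence _GNU_SOURCE):

#define _GNU_SOURCE
#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  feenableexcept (FE_DIVBYZERO);
  printf ("enabled: %#x\n", fegetexcept ());   /* shows FE_DIVBYZERO */
  fedisableexcept (FE_DIVBYZERO);
  printf ("enabled: %#x\n", fegetexcept ());   /* back to 0 */
  return 0;
}
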
*/ + +#include <fenv.h> +#include <fpu_control.h> + +int +fegetmode (femode_t *modep) +{ + _FPU_GETCW (modep->__control_word); + __asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr)); + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetround.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetround.c new file mode 100644 index 0000000000..bff3eae102 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetround.c @@ -0,0 +1,35 @@ +/* Return current rounding direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__fegetround (void) +{ + int cw; + /* We only check the x87 FPU unit. The SSE unit should be the same + - and if it's not the same there's no way to signal it. */ + + __asm__ ("fnstcw %0" : "=m" (*&cw)); + + return cw & 0xc00; +} +libm_hidden_def (__fegetround) +weak_alias (__fegetround, fegetround) +libm_hidden_weak (fegetround) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/feholdexcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/feholdexcpt.c new file mode 100644 index 0000000000..0a6c836f4f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/feholdexcpt.c @@ -0,0 +1,41 @@ +/* Store current floating-point environment and clear exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__feholdexcept (fenv_t *envp) +{ + unsigned int mxcsr; + + /* Store the environment. Recall that fnstenv has a side effect of + masking all exceptions. Then clear all exceptions. */ + __asm__ ("fnstenv %0\n\t" + "stmxcsr %1\n\t" + "fnclex" + : "=m" (*envp), "=m" (envp->__mxcsr)); + + /* Set the SSE MXCSR register. 
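
__fegetround can return the masked control word unchanged because glibc's x86 bits/fenv.h defines the rounding-mode macros as exactly those control-word bit patterns. A dispatch sketch (hypothetical; the values are x86-specific: FE_TONEAREST=0, FE_DOWNWARD=0x400, FE_UPWARD=0x800, FE_TOWARDZERO=0xc00):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  switch (fegetround ())
    {
    case FE_TONEAREST:  puts ("to nearest");  break;
    case FE_DOWNWARD:   puts ("downward");    break;
    case FE_UPWARD:     puts ("upward");      break;
    case FE_TOWARDZERO: puts ("toward zero"); break;
    }
  return 0;
}
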
*/ + mxcsr = (envp->__mxcsr | 0x1f80) & ~0x3f; + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + return 0; +} +libm_hidden_def (__feholdexcept) +weak_alias (__feholdexcept, feholdexcept) +libm_hidden_weak (feholdexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetenv.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetenv.c new file mode 100644 index 0000000000..90164bf3d3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetenv.c @@ -0,0 +1,114 @@ +/* Install given floating-point environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> +#include <assert.h> + + +/* All exceptions, including the x86-specific "denormal operand" + exception. */ +#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM) + + +int +__fesetenv (const fenv_t *envp) +{ + fenv_t temp; + + /* Install the environment specified by ENVP. But there are a few + values which we do not want to come from the saved environment. + Therefore, we get the current environment and replace the values + we want to use from the environment specified by the parameter. */ + __asm__ ("fnstenv %0\n" + "stmxcsr %1" : "=m" (*&temp), "=m" (*&temp.__mxcsr)); + + if (envp == FE_DFL_ENV) + { + temp.__control_word |= FE_ALL_EXCEPT_X86; + temp.__control_word &= ~FE_TOWARDZERO; + temp.__control_word |= _FPU_EXTENDED; + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + temp.__eip = 0; + temp.__cs_selector = 0; + temp.__opcode = 0; + temp.__data_offset = 0; + temp.__data_selector = 0; + /* Clear SSE exceptions. */ + temp.__mxcsr &= ~FE_ALL_EXCEPT_X86; + /* Set mask for SSE MXCSR. */ + temp.__mxcsr |= (FE_ALL_EXCEPT_X86 << 7); + /* Set rounding to FE_TONEAREST. */ + temp.__mxcsr &= ~ 0x6000; + temp.__mxcsr |= (FE_TONEAREST << 3); + /* Clear the FZ and DAZ bits. */ + temp.__mxcsr &= ~0x8040; + } + else if (envp == FE_NOMASK_ENV) + { + temp.__control_word &= ~(FE_ALL_EXCEPT | FE_TOWARDZERO); + /* Keep the "denormal operand" exception masked. */ + temp.__control_word |= __FE_DENORM; + temp.__control_word |= _FPU_EXTENDED; + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + temp.__eip = 0; + temp.__cs_selector = 0; + temp.__opcode = 0; + temp.__data_offset = 0; + temp.__data_selector = 0; + /* Clear SSE exceptions. */ + temp.__mxcsr &= ~FE_ALL_EXCEPT_X86; + /* Set mask for SSE MXCSR. */ + /* Set rounding to FE_TONEAREST. */ + temp.__mxcsr &= ~ 0x6000; + temp.__mxcsr |= (FE_TONEAREST << 3); + /* Do not mask exceptions. */ + temp.__mxcsr &= ~(FE_ALL_EXCEPT << 7); + /* Keep the "denormal operand" exception masked. */ + temp.__mxcsr |= (__FE_DENORM << 7); + /* Clear the FZ and DAZ bits. 
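
__feholdexcept's MXCSR arithmetic (set the 0x1f80 mask bits, clear the 0x3f flag bits) is the SSE mirror of what fnclex plus fnstenv's masking side effect achieve on the x87, leaving the caller in non-stop mode with clean flags. The canonical pairing with feupdateenv, as a sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fenv_t env;
  volatile double zero = 0.0, r;

  feholdexcept (&env);      /* save env, clear flags, mask everything */
  r = 1.0 / zero;           /* sets FE_DIVBYZERO; cannot trap now */
  (void) r;
  if (fetestexcept (FE_DIVBYZERO))
    puts ("divide-by-zero flagged");
  feupdateenv (&env);       /* restore env, then re-raise saved flags */
  return 0;
}
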
*/ + temp.__mxcsr &= ~0x8040; + } + else + { + temp.__control_word &= ~(FE_ALL_EXCEPT_X86 + | FE_TOWARDZERO + | _FPU_EXTENDED); + temp.__control_word |= (envp->__control_word + & (FE_ALL_EXCEPT_X86 + | FE_TOWARDZERO + | _FPU_EXTENDED)); + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + temp.__status_word |= envp->__status_word & FE_ALL_EXCEPT_X86; + temp.__eip = envp->__eip; + temp.__cs_selector = envp->__cs_selector; + temp.__opcode = envp->__opcode; + temp.__data_offset = envp->__data_offset; + temp.__data_selector = envp->__data_selector; + temp.__mxcsr = envp->__mxcsr; + } + + __asm__ ("fldenv %0\n" + "ldmxcsr %1" : : "m" (temp), "m" (temp.__mxcsr)); + + /* Success. */ + return 0; +} +libm_hidden_def (__fesetenv) +weak_alias (__fesetenv, fesetenv) +libm_hidden_weak (fesetenv) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetexcept.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetexcept.c new file mode 100644 index 0000000000..65683b5697 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetexcept.c @@ -0,0 +1,31 @@ +/* Set given exception flags. x86_64 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fesetexcept (int excepts) +{ + unsigned int mxcsr; + + __asm__ ("stmxcsr %0" : "=m" (*&mxcsr)); + mxcsr |= excepts & FE_ALL_EXCEPT; + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetmode.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetmode.c new file mode 100644 index 0000000000..27429f7887 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetmode.c @@ -0,0 +1,50 @@ +/* Install given floating-point control modes. x86_64 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +/* All exceptions, including the x86-specific "denormal operand" + exception. */ +#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM) + +int +fesetmode (const femode_t *modep) +{ + fpu_control_t cw; + unsigned int mxcsr; + __asm__ ("stmxcsr %0" : "=m" (mxcsr)); + /* Preserve SSE exception flags but restore other state in + MXCSR. 
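
Besides environments captured with fegetenv, __fesetenv accepts the two magic pointers special-cased above: FE_DFL_ENV (the ISO C default: everything masked, round to nearest, flags clear) and the GNU extension FE_NOMASK_ENV (the same, but with exceptions unmasked so they trap). A reset sketch (hypothetical; _GNU_SOURCE also exposes fesetexcept, the TS 18661-1 function added by this patch series):

#define _GNU_SOURCE
#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fesetround (FE_UPWARD);
  fesetexcept (FE_INEXACT);      /* set a flag without raising a trap */

  fesetenv (FE_DFL_ENV);         /* back to the default environment */
  printf ("nearest: %d\n", fegetround () == FE_TONEAREST);  /* 1 */
  printf ("flags:   %#x\n", fetestexcept (FE_ALL_EXCEPT));  /* 0 */
  return 0;
}
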
*/ + mxcsr &= FE_ALL_EXCEPT_X86; + if (modep == FE_DFL_MODE) + { + cw = _FPU_DEFAULT; + /* Default MXCSR state has all bits zero except for those + masking exceptions. */ + mxcsr |= FE_ALL_EXCEPT_X86 << 7; + } + else + { + cw = modep->__control_word; + mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86; + } + _FPU_SETCW (cw); + __asm__ ("ldmxcsr %0" : : "m" (mxcsr)); + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetround.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetround.c new file mode 100644 index 0000000000..939297252a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetround.c @@ -0,0 +1,48 @@ +/* Set current rounding direction. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__fesetround (int round) +{ + unsigned short int cw; + int mxcsr; + + if ((round & ~0xc00) != 0) + /* ROUND is not a valid rounding mode. */ + return 1; + + /* First set the x87 FPU. */ + asm ("fnstcw %0" : "=m" (*&cw)); + cw &= ~0xc00; + cw |= round; + asm ("fldcw %0" : : "m" (*&cw)); + + /* And now the MXCSR register for SSE; the rounding mode bits are at + different positions in the two units, so we need to shift the value + left by 3 bits. */ + asm ("stmxcsr %0" : "=m" (*&mxcsr)); + mxcsr &= ~ 0x6000; + mxcsr |= round << 3; + asm ("ldmxcsr %0" : : "m" (*&mxcsr)); + + return 0; +} +libm_hidden_def (__fesetround) +weak_alias (__fesetround, fesetround) +libm_hidden_weak (fesetround) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/feupdateenv.c b/REORG.TODO/sysdeps/x86_64/fpu/feupdateenv.c new file mode 100644 index 0000000000..3bc110ce48 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/feupdateenv.c @@ -0,0 +1,52 @@ +/* Install given floating-point environment and raise exceptions. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__feupdateenv (const fenv_t *envp) +{ + fexcept_t temp; + unsigned int xtemp; + + /* Save current exceptions. */ + __asm__ ("fnstsw %0\n\tstmxcsr %1" : "=m" (*&temp), "=m" (xtemp)); + temp = (temp | xtemp) & FE_ALL_EXCEPT; + + /* Install new environment. 
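
Keeping the two rounding fields in sync matters because compiled code may use x87 instructions for long double and SSE for float/double within the same expression. The effect of __fesetround is directly observable; a sketch (hypothetical; the volatile operands, plus -frounding-math, keep the compiler from folding the divisions at translation time):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  volatile double one = 1.0, three = 3.0;

  fesetround (FE_DOWNWARD);
  printf ("down: %.17g\n", one / three);   /* 0.33333333333333331 */
  fesetround (FE_UPWARD);
  printf ("up:   %.17g\n", one / three);   /* 0.33333333333333337 */
  fesetround (FE_TONEAREST);
  return 0;
}
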
*/ + __fesetenv (envp); + + /* Raise the saved exception. Incidentally, for us the implementation + defined format of the values in objects of type fexcept_t is the + same as the ones specified using the FE_* constants. */ + __feraiseexcept ((int) temp); + + /* Success. */ + return 0; +} + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2) +strong_alias (__feupdateenv, __old_feupdateenv) +compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1); +#endif + +libm_hidden_def (__feupdateenv) +libm_hidden_ver (__feupdateenv, feupdateenv) +versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2); diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fgetexcptflg.c b/REORG.TODO/sysdeps/x86_64/fpu/fgetexcptflg.c new file mode 100644 index 0000000000..c1a0c2f872 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fgetexcptflg.c @@ -0,0 +1,35 @@ +/* Store current representation for exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fegetexceptflag (fexcept_t *flagp, int excepts) +{ + fexcept_t temp; + unsigned int mxscr; + + /* Get the current exceptions for the x87 FPU and SSE unit. */ + __asm__ ("fnstsw %0\n" + "stmxcsr %1" : "=m" (*&temp), "=m" (*&mxscr)); + + *flagp = (temp | mxscr) & FE_ALL_EXCEPT & excepts; + + /* Success. */ + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fraiseexcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/fraiseexcpt.c new file mode 100644 index 0000000000..13eb4af331 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fraiseexcpt.c @@ -0,0 +1,121 @@ +/* Raise given exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <math.h> + +int +__feraiseexcept (int excepts) +{ + /* Raise exceptions represented by EXCEPTS. But we must raise only + one signal at a time. It is important that if the overflow/underflow + exception and the inexact exception are given at the same time, + the overflow/underflow exception follows the inexact exception. */ + + /* First: invalid exception. 
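
As the feupdateenv comment notes, fexcept_t on this port is just the familiar FE_* bit mask, and fegetexceptflag ORs the x87 and SSE flags into it. A capture sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fexcept_t saved;
  volatile double one = 1.0, three = 3.0, r;

  r = one / three;   /* 1/3 is not exactly representable: FE_INEXACT */
  (void) r;
  fegetexceptflag (&saved, FE_ALL_EXCEPT);
  printf ("saved: %#x\n", (unsigned int) saved);   /* FE_INEXACT set */
  return 0;
}
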
*/ + if ((FE_INVALID & excepts) != 0) + { + /* One example of an invalid operation is 0.0 / 0.0. */ + float f = 0.0; + + __asm__ __volatile__ ("divss %0, %0 " : : "x" (f)); + (void) &f; + } + + /* Next: division by zero. */ + if ((FE_DIVBYZERO & excepts) != 0) + { + float f = 1.0; + float g = 0.0; + + __asm__ __volatile__ ("divss %1, %0" : : "x" (f), "x" (g)); + (void) &f; + } + + /* Next: overflow. */ + if ((FE_OVERFLOW & excepts) != 0) + { + /* XXX: Is it ok to only set the x87 FPU? */ + /* There is no way to raise only the overflow flag. Do it the + hard way. */ + fenv_t temp; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp)); + + /* Set the relevant bits. */ + temp.__status_word |= FE_OVERFLOW; + + /* Put the new data in effect. */ + __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp)); + + /* And raise the exception. */ + __asm__ __volatile__ ("fwait"); + } + + /* Next: underflow. */ + if ((FE_UNDERFLOW & excepts) != 0) + { + /* XXX: Is it ok to only set the x87 FPU? */ + /* There is no way to raise only the underflow flag. Do it the + hard way. */ + fenv_t temp; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp)); + + /* Set the relevant bits. */ + temp.__status_word |= FE_UNDERFLOW; + + /* Put the new data in effect. */ + __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp)); + + /* And raise the exception. */ + __asm__ __volatile__ ("fwait"); + } + + /* Last: inexact. */ + if ((FE_INEXACT & excepts) != 0) + { + /* XXX: Is it ok to only set the x87 FPU? */ + /* There is no way to raise only the inexact flag. Do it the + hard way. */ + fenv_t temp; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp)); + + /* Set the relevant bits. */ + temp.__status_word |= FE_INEXACT; + + /* Put the new data in effect. */ + __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp)); + + /* And raise the exception. */ + __asm__ __volatile__ ("fwait"); + } + + /* Success. */ + return 0; +} +libm_hidden_def (__feraiseexcept) +weak_alias (__feraiseexcept, feraiseexcept) +libm_hidden_weak (feraiseexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fsetexcptflg.c b/REORG.TODO/sysdeps/x86_64/fpu/fsetexcptflg.c new file mode 100644 index 0000000000..ffc44dcad5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fsetexcptflg.c @@ -0,0 +1,53 @@ +/* Set floating-point environment exception handling. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
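
__feraiseexcept above uses two strategies: for FE_INVALID and FE_DIVBYZERO it executes a genuinely faulting divss, so an unmasked trap fires exactly as it would for user arithmetic, while for overflow, underflow and inexact it can only write the status-word bits and fwait. Either way the flags become visible through fetestexcept; a sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  feclearexcept (FE_ALL_EXCEPT);
  feraiseexcept (FE_DIVBYZERO);   /* the real-division path */
  feraiseexcept (FE_OVERFLOW);    /* the status-word path */
  printf ("flags: %#x\n", fetestexcept (FE_DIVBYZERO | FE_OVERFLOW));
  return 0;
}
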
*/ + +#include <fenv.h> +#include <math.h> + +int +fesetexceptflag (const fexcept_t *flagp, int excepts) +{ + fenv_t temp; + unsigned int mxcsr; + + /* XXX: Do we really need to set the exception in both units? + Shouldn't it be enough to set only the SSE unit? */ + + /* Get the current x87 FPU environment. We have to do this since we + cannot separately set the status word. */ + __asm__ ("fnstenv %0" : "=m" (*&temp)); + + temp.__status_word &= ~(excepts & FE_ALL_EXCEPT); + temp.__status_word |= *flagp & excepts & FE_ALL_EXCEPT; + + /* Store the new status word (along with the rest of the environment). + Possibly new exceptions are set, but they won't be acted on until + the next floating-point instruction. */ + __asm__ ("fldenv %0" : : "m" (*&temp)); + + /* And now the same for SSE. */ + __asm__ ("stmxcsr %0" : "=m" (*&mxcsr)); + + mxcsr &= ~(excepts & FE_ALL_EXCEPT); + mxcsr |= *flagp & excepts & FE_ALL_EXCEPT; + + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + /* Success. */ + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/ftestexcept.c b/REORG.TODO/sysdeps/x86_64/fpu/ftestexcept.c new file mode 100644 index 0000000000..502bdb2c42 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/ftestexcept.c @@ -0,0 +1,33 @@ +/* Test exception in current environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fetestexcept (int excepts) +{ + int temp; + unsigned int mxscr; + + /* Get current exceptions. 
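
fesetexceptflag is the restoring half of fegetexceptflag: it copies saved flag bits back into both status registers without executing any floating-point arithmetic, so nothing traps during the restore itself. A save/clear/restore sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fexcept_t saved;

  feraiseexcept (FE_UNDERFLOW);
  fegetexceptflag (&saved, FE_ALL_EXCEPT);   /* snapshot the flags */

  feclearexcept (FE_ALL_EXCEPT);
  fesetexceptflag (&saved, FE_ALL_EXCEPT);   /* put them back, trap-free */

  printf ("restored: %#x\n", fetestexcept (FE_ALL_EXCEPT));
  return 0;
}
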
*/ + __asm__ ("fnstsw %0\n" + "stmxcsr %1" : "=m" (*&temp), "=m" (*&mxscr)); + + return (temp | mxscr) & excepts & FE_ALL_EXCEPT; +} +libm_hidden_def (fetestexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps new file mode 100644 index 0000000000..61da961a57 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps @@ -0,0 +1,2368 @@ +# Begin of automatic generation + +# Maximal error of functions: +Function: "acos": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "acos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "acos_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "acos_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "acosh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "acosh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "acosh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "acosh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "asin": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "asin_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "asinh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "asinh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "asinh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "atan": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan2_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan2_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "atanh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "atanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "atanh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "cabs": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacos": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cacos": 
+double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacos_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "cacos_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "cacos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_upward": +double: 5 +float: 7 +idouble: 5 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "cacosh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacosh": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacosh_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cacosh_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacosh_upward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "carg": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "carg_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "carg_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "casin": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "casin_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "casin_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "casin_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casin_upward": +double: 5 +float: 7 +idouble: 5 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "casinh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casinh_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Imaginary part of "casinh_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of 
"casinh_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_upward": +double: 5 +float: 7 +idouble: 5 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "casinh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "catan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "catanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_upward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt_downward": +double: 4 +float: 1 +idouble: 4 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt_upward": +double: 5 +float: 1 +idouble: 5 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary 
part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccosh_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccosh_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_towardzero": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccosh_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccosh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cexp": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cexp": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cexp_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_downward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_towardzero": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cexp_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "clog": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog10": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog10": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "clog10_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_downward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_towardzero": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_towardzero": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_upward": +double: 6 +float: 5 +idouble: 6 +ifloat: 5 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_upward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog_downward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_towardzero": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_towardzero": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_upward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "cos": +ildouble: 1 +ldouble: 1 + +Function: "cos_downward": +double: 1 
+idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "cos_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cos_upward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cos_vlen16": +float: 1 + +Function: "cos_vlen2": +double: 2 + +Function: "cos_vlen4": +double: 2 +float: 1 + +Function: "cos_vlen4_avx2": +double: 2 + +Function: "cos_vlen8": +double: 1 +float: 1 + +Function: "cos_vlen8_avx2": +float: 1 + +Function: "cosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 3 + +Function: "cosh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 3 + +Function: Real part of "cpow": +double: 2 +float: 5 +idouble: 2 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cpow": +float: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "cpow_downward": +double: 4 +float: 8 +idouble: 4 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cpow_towardzero": +double: 4 +float: 8 +idouble: 4 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cpow_upward": +double: 4 +float: 1 +idouble: 4 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cpow_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "csin_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_upward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "csinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "csinh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_upward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csqrt": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "csqrt": +double: 2 +float: 2 +idouble: 2 
+ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csqrt_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "csqrt_downward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "csqrt_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_upward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "csqrt_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ctan": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ctan_downward": +double: 6 +float: 5 +idouble: 6 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_upward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctan_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ctanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ctanh_downward": +double: 4 +float: 2 +idouble: 4 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_downward": +double: 6 +float: 5 +idouble: 6 +ifloat: 5 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctanh_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: "erf": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erfc": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "erfc_downward": +double: 5 +float: 6 +idouble: 5 +ifloat: 6 +ildouble: 4 +ldouble: 4 + +Function: "erfc_towardzero": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "erfc_upward": +double: 5 +float: 6 +idouble: 5 +ifloat: 6 +ildouble: 5 +ldouble: 5 + +Function: "exp": +ildouble: 1 +ldouble: 1 + +Function: "exp10": +double: 2 +idouble: 2 +ildouble: 1 +ldouble: 1 + +Function: "exp10_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_towardzero": 
+double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp2": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_vlen16": +float: 1 + +Function: "exp_vlen2": +double: 1 + +Function: "exp_vlen4": +double: 1 +float: 1 + +Function: "exp_vlen4_avx2": +double: 1 + +Function: "exp_vlen8": +double: 1 +float: 1 + +Function: "exp_vlen8_avx2": +float: 1 + +Function: "expm1": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "expm1_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "expm1_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "expm1_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "gamma": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "gamma_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "gamma_towardzero": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "gamma_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 6 +ldouble: 6 + +Function: "hypot": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "j0": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "j0_downward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "j0_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "j0_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "j1": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "j1_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "j1_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "j1_upward": +double: 3 +float: 5 +idouble: 3 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: "jn": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "jn_downward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 4 +ldouble: 4 + +Function: "jn_towardzero": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "jn_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "lgamma": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "lgamma_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "lgamma_towardzero": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 
+ildouble: 7 +ldouble: 7 + +Function: "lgamma_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 6 +ldouble: 6 + +Function: "log": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log10": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log10_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 2 +ldouble: 2 + +Function: "log10_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "log10_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log1p": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "log1p_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "log1p_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "log1p_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "log2": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log2_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 1 +ldouble: 1 + +Function: "log2_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log2_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 1 +ldouble: 1 + +Function: "log_downward": +float: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "log_towardzero": +float: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "log_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log_vlen16": +float: 3 + +Function: "log_vlen2": +double: 1 + +Function: "log_vlen4": +double: 1 +float: 3 + +Function: "log_vlen4_avx2": +double: 1 + +Function: "log_vlen8": +double: 1 +float: 3 + +Function: "log_vlen8_avx2": +float: 2 + +Function: "pow": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow10": +double: 2 +idouble: 2 +ildouble: 1 +ldouble: 1 + +Function: "pow10_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_vlen16": +float: 3 + +Function: "pow_vlen2": +double: 1 + +Function: "pow_vlen4": +double: 1 +float: 3 + +Function: "pow_vlen4_avx2": +double: 1 + +Function: "pow_vlen8": +double: 1 +float: 3 + +Function: "pow_vlen8_avx2": +float: 3 + +Function: "sin": +ildouble: 1 +ldouble: 1 + +Function: "sin_downward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sin_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "sin_upward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sin_vlen16": +float: 1 + +Function: "sin_vlen2": +double: 2 + +Function: "sin_vlen4": +double: 2 +float: 1 + +Function: "sin_vlen4_avx2": +double: 2 + +Function: "sin_vlen8": +double: 2 +float: 1 + +Function: "sin_vlen8_avx2": +float: 1 + +Function: "sincos": +ildouble: 1 +ldouble: 1 + +Function: "sincos_downward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos_towardzero": +double: 1 +idouble: 1 
+ildouble: 2 +ldouble: 2 + +Function: "sincos_upward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos_vlen16": +float: 1 + +Function: "sincos_vlen2": +double: 2 + +Function: "sincos_vlen4": +double: 2 +float: 1 + +Function: "sincos_vlen4_avx2": +double: 2 + +Function: "sincos_vlen8": +double: 1 +float: 1 + +Function: "sincos_vlen8_avx2": +float: 1 + +Function: "sinh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "sinh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "sinh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "sinh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "tan": +float: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "tan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "tan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "tanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tanh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "tanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tanh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "tgamma": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_downward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_towardzero": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "y0": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "y0_downward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "y0_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "y0_upward": +double: 3 +float: 5 +idouble: 3 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: "y1": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "y1_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "y1_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y1_upward": +double: 7 +float: 2 +idouble: 7 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "yn": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "yn_downward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "yn_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "yn_upward": +double: 4 +float: 5 +idouble: 4 +ifloat: 5 +ildouble: 4 +ldouble: 4 + +# end of automatic generation diff --git a/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps-name b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps-name new file mode 100644 index 0000000000..1c09346681 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps-name @@ -0,0 +1 @@ +x86_64 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/math-tests-arch.h b/REORG.TODO/sysdeps/x86_64/fpu/math-tests-arch.h new file mode 100644 index 0000000000..9278e3440b --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/math-tests-arch.h @@ -0,0 +1,53 @@ +/* Runtime architecture check for math tests. x86_64 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpu-features.h> + +#if defined REQUIRE_AVX + +# define INIT_ARCH_EXT +# define CHECK_ARCH_EXT \ + do \ + { \ + if (!HAS_ARCH_FEATURE (AVX_Usable)) return; \ + } \ + while (0) + +#elif defined REQUIRE_AVX2 + +# define INIT_ARCH_EXT +# define CHECK_ARCH_EXT \ + do \ + { \ + if (!HAS_ARCH_FEATURE (AVX2_Usable)) return; \ + } \ + while (0) + +#elif defined REQUIRE_AVX512F + +# define INIT_ARCH_EXT +# define CHECK_ARCH_EXT \ + do \ + { \ + if (!HAS_ARCH_FEATURE (AVX512F_Usable)) return; \ + } \ + while (0) + +#else +# include <sysdeps/generic/math-tests-arch.h> +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/math_ldbl.h b/REORG.TODO/sysdeps/x86_64/fpu/math_ldbl.h new file mode 100644 index 0000000000..6c5bc13455 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/math_ldbl.h @@ -0,0 +1,100 @@ +/* Manipulation of the bit representation of 'long double' quantities. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _MATH_LDBL_H_ +#define _MATH_LDBL_H_ 1 + +#include <stdint.h> + +/* A union which permits us to convert between a long double and + three 32 bit ints. */ + +typedef union +{ + long double value; + struct + { + uint32_t lsw; + uint32_t msw; + int sign_exponent:16; + unsigned int empty1:16; + unsigned int empty0:32; + } parts; +} ieee_long_double_shape_type; + +/* Get three 32 bit ints from a double. */ + +#define GET_LDOUBLE_WORDS(exp,ix0,ix1,d) \ +do { \ + ieee_long_double_shape_type ew_u; \ + ew_u.value = (d); \ + (exp) = ew_u.parts.sign_exponent; \ + (ix0) = ew_u.parts.msw; \ + (ix1) = ew_u.parts.lsw; \ +} while (0) + +/* Set a double from two 32 bit ints. */ + +#define SET_LDOUBLE_WORDS(d,exp,ix0,ix1) \ +do { \ + ieee_long_double_shape_type iw_u; \ + iw_u.parts.sign_exponent = (exp); \ + iw_u.parts.msw = (ix0); \ + iw_u.parts.lsw = (ix1); \ + (d) = iw_u.value; \ +} while (0) + +/* Get the more significant 32 bits of a long double mantissa. 
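(bits 32-63 of the 64-bit mantissa; the x86 80-bit extended format keeps an explicit integer bit, so the MSB of this word is set for every normalized value).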
*/ + +#define GET_LDOUBLE_MSW(v,d) \ +do { \ + ieee_long_double_shape_type sh_u; \ + sh_u.value = (d); \ + (v) = sh_u.parts.msw; \ +} while (0) + +/* Set the more significant 32 bits of a long double mantissa from an int. */ + +#define SET_LDOUBLE_MSW(d,v) \ +do { \ + ieee_long_double_shape_type sh_u; \ + sh_u.value = (d); \ + sh_u.parts.msw = (v); \ + (d) = sh_u.value; \ +} while (0) + +/* Get int from the exponent of a long double. */ + +#define GET_LDOUBLE_EXP(exp,d) \ +do { \ + ieee_long_double_shape_type ge_u; \ + ge_u.value = (d); \ + (exp) = ge_u.parts.sign_exponent; \ +} while (0) + +/* Set exponent of a long double from an int. */ + +#define SET_LDOUBLE_EXP(d,exp) \ +do { \ + ieee_long_double_shape_type se_u; \ + se_u.value = (d); \ + se_u.parts.sign_exponent = (exp); \ + (d) = se_u.value; \ +} while (0) + +#endif /* math_ldbl.h */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/math_private.h b/REORG.TODO/sysdeps/x86_64/fpu/math_private.h new file mode 100644 index 0000000000..027a6a3a4d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/math_private.h @@ -0,0 +1,133 @@ +#ifndef X86_64_MATH_PRIVATE_H +#define X86_64_MATH_PRIVATE_H 1 + +/* We can do a few things better on x86-64. */ + +#if defined __AVX__ || defined SSE2AVX +# define MOVD "vmovd" +# define MOVQ "vmovq" +#else +# define MOVD "movd" +# define MOVQ "movq" +#endif + +/* Direct movement of float into integer register. */ +#define EXTRACT_WORDS64(i, d) \ + do { \ + int64_t i_; \ + asm (MOVQ " %1, %0" : "=rm" (i_) : "x" ((double) (d))); \ + (i) = i_; \ + } while (0) + +/* And the reverse. */ +#define INSERT_WORDS64(d, i) \ + do { \ + int64_t i_ = i; \ + double d__; \ + asm (MOVQ " %1, %0" : "=x" (d__) : "rm" (i_)); \ + d = d__; \ + } while (0) + +/* Direct movement of float into integer register. */ +#define GET_FLOAT_WORD(i, d) \ + do { \ + int i_; \ + asm (MOVD " %1, %0" : "=rm" (i_) : "x" ((float) (d))); \ + (i) = i_; \ + } while (0) + +/* And the reverse. 
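Move the 32 bit pattern from an integer register or memory straight into the low word of an XMM register, again without a store/load through the stack.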
*/ +#define SET_FLOAT_WORD(f, i) \ + do { \ + int i_ = i; \ + float f__; \ + asm (MOVD " %1, %0" : "=x" (f__) : "rm" (i_)); \ + f = f__; \ + } while (0) + +#include <sysdeps/i386/fpu/fenv_private.h> +#include_next <math_private.h> + +extern __always_inline double +__ieee754_sqrt (double d) +{ + double res; +#if defined __AVX__ || defined SSE2AVX + asm ("vsqrtsd %1, %0, %0" : "=x" (res) : "xm" (d)); +#else + asm ("sqrtsd %1, %0" : "=x" (res) : "xm" (d)); +#endif + return res; +} + +extern __always_inline float +__ieee754_sqrtf (float d) +{ + float res; +#if defined __AVX__ || defined SSE2AVX + asm ("vsqrtss %1, %0, %0" : "=x" (res) : "xm" (d)); +#else + asm ("sqrtss %1, %0" : "=x" (res) : "xm" (d)); +#endif + return res; +} + +extern __always_inline long double +__ieee754_sqrtl (long double d) +{ + long double res; + asm ("fsqrt" : "=t" (res) : "0" (d)); + return res; +} + +#ifdef __SSE4_1__ +extern __always_inline double +__rint (double d) +{ + double res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundsd $4, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundsd $4, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} + +extern __always_inline float +__rintf (float d) +{ + float res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundss $4, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundss $4, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} + +extern __always_inline double +__floor (double d) +{ + double res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundsd $1, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundsd $1, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} + +extern __always_inline float +__floorf (float d) +{ + float res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundss $1, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundss $1, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} +#endif /* __SSE4_1__ */ + +#endif /* X86_64_MATH_PRIVATE_H */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile new file mode 100644 index 0000000000..34542155aa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile @@ -0,0 +1,70 @@ +ifeq ($(subdir),math) +libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ + s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c + +libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ + e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ + mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \ + sincos32-fma4 doasin-fma4 dosincos-fma4 \ + halfulp-fma4 mpexp-fma4 \ + mpatan2-fma4 mpatan-fma4 mpsqrt-fma4 mptan-fma4 + +CFLAGS-doasin-fma4.c = -mfma4 +CFLAGS-dosincos-fma4.c = -mfma4 +CFLAGS-e_asin-fma4.c = -mfma4 +CFLAGS-e_atan2-fma4.c = -mfma4 +CFLAGS-e_exp-fma4.c = -mfma4 +CFLAGS-e_log-fma4.c = -mfma4 +CFLAGS-e_pow-fma4.c = -mfma4 $(config-cflags-nofma) +CFLAGS-halfulp-fma4.c = -mfma4 +CFLAGS-mpa-fma4.c = -mfma4 +CFLAGS-mpatan-fma4.c = -mfma4 +CFLAGS-mpatan2-fma4.c = -mfma4 +CFLAGS-mpexp-fma4.c = -mfma4 +CFLAGS-mplog-fma4.c = -mfma4 +CFLAGS-mpsqrt-fma4.c = -mfma4 +CFLAGS-mptan-fma4.c = -mfma4 +CFLAGS-s_atan-fma4.c = -mfma4 +CFLAGS-sincos32-fma4.c = -mfma4 +CFLAGS-slowexp-fma4.c = -mfma4 +CFLAGS-slowpow-fma4.c = -mfma4 +CFLAGS-s_sin-fma4.c = -mfma4 +CFLAGS-s_tan-fma4.c = -mfma4 + +libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \ + e_atan2-avx s_sin-avx s_tan-avx \ + mplog-avx mpa-avx slowexp-avx \ + mpexp-avx + +CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX 
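+# -msse2avx makes GCC emit the VEX-encoded (AVX) forms of ordinary SSE
+# instructions, and the SSE2AVX define switches the inline asm in
+# math_private.h to the matching vmov*/vsqrt*/vround* spellings, so these
+# variants never mix legacy-SSE and AVX encodings within one function.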
+CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mpa-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mpexp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mplog-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX +CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX +endif + +ifeq ($(subdir),mathvec) +libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ + svml_d_cos8_core_avx512 svml_d_sin2_core_sse4 \ + svml_d_sin4_core_avx2 svml_d_sin8_core_avx512 \ + svml_d_log2_core_sse4 svml_d_log4_core_avx2 \ + svml_d_log8_core_avx512 svml_d_sincos2_core_sse4 \ + svml_d_sincos4_core_avx2 svml_d_sincos8_core_avx512 \ + svml_s_cosf4_core_sse4 svml_s_cosf8_core_avx2 \ + svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \ + svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \ + svml_s_logf4_core_sse4 svml_s_logf8_core_avx2 \ + svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \ + svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 \ + svml_s_expf4_core_sse4 svml_s_expf8_core_avx2 \ + svml_s_expf16_core_avx512 svml_d_pow2_core_sse4 \ + svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 \ + svml_s_powf4_core_sse4 svml_s_powf8_core_avx2 \ + svml_s_powf16_core_avx512 svml_s_sincosf4_core_sse4 \ + svml_s_sincosf8_core_avx2 svml_s_sincosf16_core_avx512 +endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c new file mode 100644 index 0000000000..53eb419472 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c @@ -0,0 +1,4 @@ +#define __doasin __doasin_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/doasin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c new file mode 100644 index 0000000000..1578b2fce0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c @@ -0,0 +1,6 @@ +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/dosincos.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c new file mode 100644 index 0000000000..2657c31f49 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c @@ -0,0 +1,11 @@ +#define __ieee754_acos __ieee754_acos_fma4 +#define __ieee754_asin __ieee754_asin_fma4 +#define __cos32 __cos32_fma4 +#define __doasin __doasin_fma4 +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 +#define __sin32 __sin32_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c new file mode 100644 index 0000000000..111a5b99bd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c @@ -0,0 +1,26 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_acos_sse2 (double); +extern double __ieee754_asin_sse2 (double); +extern double __ieee754_acos_fma4 (double); +extern double __ieee754_asin_fma4 (double); + +libm_ifunc (__ieee754_acos, + HAS_ARCH_FEATURE (FMA4_Usable) + ? 
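/* libm_ifunc emits a GNU indirect function: this conditional runs once,
 + inside the resolver that ld.so invokes when it first binds the symbol,
 + and whichever implementation it selects becomes the permanent call
 + target.  A hand-rolled equivalent (hypothetical names, sketch only,
 + not the macro's actual expansion) would be
 +
 + extern double acos_fma4 (double), acos_sse2 (double);
 + static double (*resolve_my_acos (void)) (double)
 + {
 +   return HAS_ARCH_FEATURE (FMA4_Usable) ? acos_fma4 : acos_sse2;
 + }
 + double my_acos (double) __attribute__ ((ifunc ("resolve_my_acos")));
 +
 + so the CPU-feature check costs nothing per call.  */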
__ieee754_acos_fma4 + : __ieee754_acos_sse2); +strong_alias (__ieee754_acos, __acos_finite) + +libm_ifunc (__ieee754_asin, + HAS_ARCH_FEATURE (FMA4_Usable) + ? __ieee754_asin_fma4 + : __ieee754_asin_sse2); +strong_alias (__ieee754_asin, __asin_finite) + +#define __ieee754_acos __ieee754_acos_sse2 +#define __ieee754_asin __ieee754_asin_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c new file mode 100644 index 0000000000..3012afac37 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c @@ -0,0 +1,9 @@ +#define __ieee754_atan2 __ieee754_atan2_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c new file mode 100644 index 0000000000..f4e986293e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c @@ -0,0 +1,10 @@ +#define __ieee754_atan2 __ieee754_atan2_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mpatan2 __mpatan2_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c new file mode 100644 index 0000000000..9ca3c02a44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_atan2_sse2 (double, double); +extern double __ieee754_atan2_avx (double, double); +extern double __ieee754_atan2_fma4 (double, double); + +libm_ifunc (__ieee754_atan2, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_atan2_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? 
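/* the -msse2avx build: the same C source assembled with VEX-encoded
 + instructions, preferred on AVX hardware so that calls from AVX code
 + avoid SSE/AVX transition penalties */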
__ieee754_atan2_avx : __ieee754_atan2_sse2)); +strong_alias (__ieee754_atan2, __atan2_finite) + +#define __ieee754_atan2 __ieee754_atan2_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c new file mode 100644 index 0000000000..ee5dd6d2dc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c @@ -0,0 +1,6 @@ +#define __ieee754_exp __ieee754_exp_avx +#define __exp1 __exp1_avx +#define __slowexp __slowexp_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c new file mode 100644 index 0000000000..ae6eb67603 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c @@ -0,0 +1,6 @@ +#define __ieee754_exp __ieee754_exp_fma4 +#define __exp1 __exp1_fma4 +#define __slowexp __slowexp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c new file mode 100644 index 0000000000..b7d7b5ff27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_exp_sse2 (double); +extern double __ieee754_exp_avx (double); +extern double __ieee754_exp_fma4 (double); + +libm_ifunc (__ieee754_exp, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_exp_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? __ieee754_exp_avx : __ieee754_exp_sse2)); +strong_alias (__ieee754_exp, __exp_finite) + +#define __ieee754_exp __ieee754_exp_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c new file mode 100644 index 0000000000..c669019bc2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c @@ -0,0 +1,8 @@ +#define __ieee754_log __ieee754_log_avx +#define __mplog __mplog_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c new file mode 100644 index 0000000000..a2346cc618 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c @@ -0,0 +1,8 @@ +#define __ieee754_log __ieee754_log_fma4 +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c new file mode 100644 index 0000000000..cf9533d6c0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_log_sse2 (double); +extern double __ieee754_log_avx (double); +extern double __ieee754_log_fma4 (double); + +libm_ifunc (__ieee754_log, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_log_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? 
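/* each clone also carries its multiprecision slow-path helpers
 + (__mplog, __add, __dbl_mp, __sub) under renamed symbols (see
 + e_log-avx.c above), so a resolved variant never calls back into
 + differently-encoded code */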
__ieee754_log_avx : __ieee754_log_sse2)); +strong_alias (__ieee754_log, __log_finite) + +#define __ieee754_log __ieee754_log_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c new file mode 100644 index 0000000000..5b3ea8e103 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c @@ -0,0 +1,6 @@ +#define __ieee754_pow __ieee754_pow_fma4 +#define __exp1 __exp1_fma4 +#define __slowpow __slowpow_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c new file mode 100644 index 0000000000..a5c5d89c3e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c @@ -0,0 +1,17 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_pow_sse2 (double, double); +extern double __ieee754_pow_fma4 (double, double); + +libm_ifunc (__ieee754_pow, + HAS_ARCH_FEATURE (FMA4_Usable) + ? __ieee754_pow_fma4 + : __ieee754_pow_sse2); +strong_alias (__ieee754_pow, __pow_finite) + +#define __ieee754_pow __ieee754_pow_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c new file mode 100644 index 0000000000..a00c17c016 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c @@ -0,0 +1,4 @@ +#define __halfulp __halfulp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/halfulp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c new file mode 100644 index 0000000000..366b0b7134 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c @@ -0,0 +1,14 @@ +#define __add __add_avx +#define __mul __mul_avx +#define __sqr __sqr_avx +#define __sub __sub_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx + +#define NO___CPY 1 +#define NO___MP_DBL 1 +#define NO___ACR 1 +#define NO__CONST 1 +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c new file mode 100644 index 0000000000..a4a759407e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c @@ -0,0 +1,14 @@ +#define __add __add_fma4 +#define __mul __mul_fma4 +#define __sqr __sqr_fma4 +#define __sub __sub_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 + +#define NO___CPY 1 +#define NO___MP_DBL 1 +#define NO___ACR 1 +#define NO__CONST 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c new file mode 100644 index 0000000000..fbd3bd49a2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c @@ -0,0 +1,10 @@ +#define __mpatan __mpatan_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define AVOID_MPATAN_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpatan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c new file mode 100644 index 0000000000..e6e44d49b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c @@ -0,0 +1,9 @@ +#define __mpatan2 __mpatan2_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpatan __mpatan_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpatan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c new file mode 100644 index 0000000000..87f29c96c9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c @@ -0,0 +1,9 @@ +#define __mpexp __mpexp_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx +#define __mul __mul_avx +#define AVOID_MPEXP_H 1 +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c new file mode 100644 index 0000000000..07ca6e9ad0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c @@ -0,0 +1,9 @@ +#define __mpexp __mpexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mul __mul_fma4 +#define AVOID_MPEXP_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c new file mode 100644 index 0000000000..fd783d9a67 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c @@ -0,0 +1,8 @@ +#define __mplog __mplog_avx +#define __add __add_avx +#define __mpexp __mpexp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c new file mode 100644 index 0000000000..b4733118d7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c @@ -0,0 +1,8 @@ +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c new file mode 100644 index 0000000000..f8a1ba2d92 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c @@ -0,0 +1,8 @@ +#define __mpsqrt __mpsqrt_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define AVOID_MPSQRT_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpsqrt.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c new file mode 100644 index 0000000000..fb4a9d48ca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c @@ -0,0 +1,7 @@ +#define __mptan __mptan_fma4 +#define __c32 __c32_fma4 +#define __dvd __dvd_fma4 +#define __mpranred __mpranred_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mptan.c> diff --git 
a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c new file mode 100644 index 0000000000..b5cb9c3a75 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c @@ -0,0 +1,8 @@ +#define atan __atan_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c new file mode 100644 index 0000000000..9e83e6cdab --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c @@ -0,0 +1,9 @@ +#define atan __atan_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpatan __mpatan_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c new file mode 100644 index 0000000000..742e95cb96 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c @@ -0,0 +1,15 @@ +#include <init-arch.h> +#include <math.h> + +extern double __atan_sse2 (double); +extern double __atan_avx (double); +extern double __atan_fma4 (double); + +libm_ifunc (atan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __atan_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __atan_avx : __atan_sse2)); + +#define atan __atan_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c new file mode 100644 index 0000000000..6a5ea3ff27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c @@ -0,0 +1,2 @@ +#define __ceil __ceil_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_ceil.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S new file mode 100644 index 0000000000..f8eef43eff --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__ceil) + .type __ceil, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __ceil_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __ceil_c(%rip), %rax +2: ret +END(__ceil) +weak_alias (__ceil, ceil) + + +ENTRY(__ceil_sse41) + roundsd $10, %xmm0, %xmm0 + ret +END(__ceil_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c new file mode 100644 index 0000000000..229a6273b2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c @@ -0,0 +1,2 @@ +#define __ceilf __ceilf_c +#include <sysdeps/ieee754/flt-32/s_ceilf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S new file mode 100644 index 0000000000..076f10f0f0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__ceilf) + .type __ceilf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __ceilf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __ceilf_c(%rip), %rax +2: ret +END(__ceilf) +weak_alias (__ceilf, ceilf) + + +ENTRY(__ceilf_sse41) + roundss $10, %xmm0, %xmm0 + ret +END(__ceilf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c new file mode 100644 index 0000000000..68733b69ef --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c @@ -0,0 +1,3 @@ +#undef __floor +#define __floor __floor_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_floor.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S new file mode 100644 index 0000000000..f519ab24f4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__floor) + .type __floor, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __floor_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __floor_c(%rip), %rax +2: ret +END(__floor) +weak_alias (__floor, floor) + + +ENTRY(__floor_sse41) + roundsd $9, %xmm0, %xmm0 + ret +END(__floor_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c new file mode 100644 index 0000000000..2386362328 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c @@ -0,0 +1,3 @@ +#undef __floorf +#define __floorf __floorf_c +#include <sysdeps/ieee754/flt-32/s_floorf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S new file mode 100644 index 0000000000..8613f73acc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__floorf) + .type __floorf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __floorf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __floorf_c(%rip), %rax +2: ret +END(__floorf) +weak_alias (__floorf, floorf) + + +ENTRY(__floorf_sse41) + roundss $9, %xmm0, %xmm0 + ret +END(__floorf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c new file mode 100644 index 0000000000..3ac4fed660 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c @@ -0,0 +1,50 @@ +/* FMA version of fma. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +extern double __fma_sse2 (double x, double y, double z) attribute_hidden; + + +static double +__fma_fma3 (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + + +static double +__fma_fma4 (double x, double y, double z) +{ + asm ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + + +libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable) + ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) + ? __fma_fma4 : __fma_sse2)); +weak_alias (__fma, fma) + +#define __fma __fma_sse2 + +#include <sysdeps/ieee754/dbl-64/s_fma.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c new file mode 100644 index 0000000000..1ae227c1d4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c @@ -0,0 +1,49 @@ +/* FMA version of fmaf. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +extern float __fmaf_sse2 (float x, float y, float z) attribute_hidden; + + +static float +__fmaf_fma3 (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + + +static float +__fmaf_fma4 (float x, float y, float z) +{ + asm ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + + +libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable) + ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) + ? __fmaf_fma4 : __fmaf_sse2)); +weak_alias (__fmaf, fmaf) + +#define __fmaf __fmaf_sse2 + +#include <sysdeps/ieee754/dbl-64/s_fmaf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c new file mode 100644 index 0000000000..f897a2a6a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c @@ -0,0 +1,3 @@ +#undef __nearbyint +#define __nearbyint __nearbyint_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_nearbyint.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S new file mode 100644 index 0000000000..5a734f6027 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__nearbyint) + .type __nearbyint, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __nearbyint_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __nearbyint_c(%rip), %rax +2: ret +END(__nearbyint) +weak_alias (__nearbyint, nearbyint) + + +ENTRY(__nearbyint_sse41) + roundsd $0xc, %xmm0, %xmm0 + ret +END(__nearbyint_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c new file mode 100644 index 0000000000..aa7768233b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c @@ -0,0 +1,3 @@ +#undef __nearbyintf +#define __nearbyintf __nearbyintf_c +#include <sysdeps/ieee754/flt-32/s_nearbyintf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S new file mode 100644 index 0000000000..ad79fd6021 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__nearbyintf) + .type __nearbyintf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __nearbyintf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __nearbyintf_c(%rip), %rax +2: ret +END(__nearbyintf) +weak_alias (__nearbyintf, nearbyintf) + + +ENTRY(__nearbyintf_sse41) + roundss $0xc, %xmm0, %xmm0 + ret +END(__nearbyintf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c new file mode 100644 index 0000000000..162a630ff9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c @@ -0,0 +1,3 @@ +#undef __rint +#define __rint __rint_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_rint.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S new file mode 100644 index 0000000000..4f628a93a4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__rint) + .type __rint, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __rint_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __rint_c(%rip), %rax +2: ret +END(__rint) +weak_alias (__rint, rint) + + +ENTRY(__rint_sse41) + roundsd $4, %xmm0, %xmm0 + ret +END(__rint_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c new file mode 100644 index 0000000000..8505249f34 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c @@ -0,0 +1,3 @@ +#undef __rintf +#define __rintf __rintf_c +#include <sysdeps/ieee754/flt-32/s_rintf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S new file mode 100644 index 0000000000..dee4ad794c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__rintf) + .type __rintf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __rintf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __rintf_c(%rip), %rax +2: ret +END(__rintf) +weak_alias (__rintf, rintf) + + +ENTRY(__rintf_sse41) + roundss $4, %xmm0, %xmm0 + ret +END(__rintf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c new file mode 100644 index 0000000000..e1c6de0259 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c @@ -0,0 +1,5 @@ +#define __cos __cos_avx +#define __sin __sin_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c new file mode 100644 index 0000000000..4c35739dc9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c @@ -0,0 +1,11 @@ +#define __cos __cos_fma4 +#define __sin __sin_fma4 +#define __docos __docos_fma4 +#define __dubsin __dubsin_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c new file mode 100644 index 0000000000..8ffd3e7125 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c @@ -0,0 +1,26 @@ +#include <init-arch.h> +#include <math.h> +#undef NAN + +extern double __cos_sse2 (double); +extern double __sin_sse2 (double); +extern double __cos_avx (double); +extern double __sin_avx (double); +extern double __cos_fma4 (double); +extern double __sin_fma4 (double); + +libm_ifunc (__cos, (HAS_ARCH_FEATURE (FMA4_Usable) ? __cos_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __cos_avx : __cos_sse2)); +weak_alias (__cos, cos) + +libm_ifunc (__sin, (HAS_ARCH_FEATURE (FMA4_Usable) ? __sin_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? 
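/* __sin and __cos resolve independently, but every candidate is built
 + from the same s_sin.c source, so whichever pair is picked stays
 + internally consistent */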
__sin_avx : __sin_sse2)); +weak_alias (__sin, sin) + +#define __cos __cos_sse2 +#define __sin __sin_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c new file mode 100644 index 0000000000..53de5d3c98 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c @@ -0,0 +1,6 @@ +#define tan __tan_avx +#define __dbl_mp __dbl_mp_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c new file mode 100644 index 0000000000..a805440b46 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c @@ -0,0 +1,8 @@ +#define tan __tan_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpranred __mpranred_fma4 +#define __mptan __mptan_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c new file mode 100644 index 0000000000..25f3bca07e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c @@ -0,0 +1,15 @@ +#include <init-arch.h> +#include <math.h> + +extern double __tan_sse2 (double); +extern double __tan_avx (double); +extern double __tan_fma4 (double); + +libm_ifunc (tan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __tan_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __tan_avx : __tan_sse2)); + +#define tan __tan_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c new file mode 100644 index 0000000000..ebbfa18cca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c @@ -0,0 +1,15 @@ +#define __cos32 __cos32_fma4 +#define __sin32 __sin32_fma4 +#define __c32 __c32_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpranred __mpranred_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/sincos32.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c new file mode 100644 index 0000000000..d01c6d71a4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c @@ -0,0 +1,9 @@ +#define __slowexp __slowexp_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __mpexp __mpexp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c new file mode 100644 index 0000000000..3bcde84233 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c @@ -0,0 +1,9 @@ +#define __slowexp __slowexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git 
a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c new file mode 100644 index 0000000000..69d69823bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c @@ -0,0 +1,11 @@ +#define __slowpow __slowpow_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mplog __mplog_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define __halfulp __halfulp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/slowpow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S new file mode 100644 index 0000000000..b209492442 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cos, vector length is 2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_cos) + .type _ZGVbN2v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_cos_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_cos_sse2(%rip), %rax + ret +END (_ZGVbN2v_cos) +libmvec_hidden_def (_ZGVbN2v_cos) + +#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2 +#include "../svml_d_cos2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S new file mode 100644 index 0000000000..858dc6532f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S @@ -0,0 +1,223 @@ +/* Function cos vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2v_cos_sse4) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups __dHalfPI(%rax), %xmm2 + +/* ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + addpd %xmm3, %xmm2 + movups __dInvPI(%rax), %xmm5 + movups __dAbsMask(%rax), %xmm4 + +/* Get absolute argument value: X' = |X'| */ + andps %xmm2, %xmm4 + +/* Y = X'*InvPi + RS : right shifter add */ + mulpd %xmm5, %xmm2 + +/* Check for large arguments path */ + cmpnlepd __dRangeVal(%rax), %xmm4 + movups __dRShifter(%rax), %xmm6 + addpd %xmm6, %xmm2 + movmskpd %xmm4, %ecx + +/* N = Y - RS : right shifter sub */ + movaps %xmm2, %xmm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + psllq $63, %xmm2 + subpd %xmm6, %xmm1 + +/* N = N - 0.5 */ + subpd __dOneHalf(%rax), %xmm1 + movups __dPI1(%rax), %xmm7 + +/* R = X - N*Pi1 */ + mulpd %xmm1, %xmm7 + movups __dPI2(%rax), %xmm4 + +/* R = R - N*Pi2 */ + mulpd %xmm1, %xmm4 + subpd %xmm7, %xmm0 + movups __dPI3(%rax), %xmm5 + +/* R = R - N*Pi3 */ + mulpd %xmm1, %xmm5 + subpd %xmm4, %xmm0 + +/* R = R - N*Pi4 */ + movups __dPI4(%rax), %xmm6 + mulpd %xmm6, %xmm1 + subpd %xmm5, %xmm0 + subpd %xmm1, %xmm0 + +/* POLYNOMIAL APPROXIMATION: R2 = R*R */ + movaps %xmm0, %xmm4 + mulpd %xmm0, %xmm4 + movups __dC7(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC6(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC5(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC4(%rax), %xmm1 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + mulpd %xmm4, %xmm1 + addpd __dC3(%rax), %xmm1 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + mulpd %xmm4, %xmm1 + addpd __dC2(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC1(%rax), %xmm1 + mulpd %xmm1, %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm4, %xmm0 + +/* RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes */ + xorps %xmm2, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + 
movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_cos_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S new file mode 100644 index 0000000000..ff382e9c6c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cos, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_cos) + .type _ZGVdN4v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_cos_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_cos_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_cos) +libmvec_hidden_def (_ZGVdN4v_cos) + +#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper +#include "../svml_d_cos4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S new file mode 100644 index 0000000000..4b6d09743b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S @@ -0,0 +1,207 @@ +/* Function cos vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
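+
+   Editorial note: this AVX2 kernel implements the same algorithm as
+   the SSE4 one above, but folds each multiply/subtract pair into a
+   fused multiply-add.  Because an FMA keeps the product unrounded,
+   the pi reduction needs only three Cody-Waite terms
+   (__dPI1_FMA..__dPI3_FMA) where the SSE4 path used four
+   (__dPI1..__dPI4), and the polynomial is evaluated with
+   vfmadd213pd chains.
+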
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4v_cos_avx2) + +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovapd %ymm0, %ymm1 + vmovupd __dInvPI(%rax), %ymm4 + vmovupd __dRShifter(%rax), %ymm5 + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %ymm1, %ymm7 + +/* Get absolute argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %ymm7, %ymm2 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %ymm5, %ymm4, %ymm7 + vmovupd __dC7(%rax), %ymm4 + +/* Check for large arguments path */ + vcmpnle_uqpd __dRangeVal(%rax), %ymm2, %ymm3 + +/* N = Y - RS : right shifter sub */ + vsubpd %ymm5, %ymm7, %ymm6 + vmovupd __dPI1_FMA(%rax), %ymm2 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm7, %ymm7 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %ymm6, %ymm0 + vmovmskpd %ymm3, %ecx + +/* R = X - N*Pi1 */ + vmovapd %ymm1, %ymm3 + vfnmadd231pd %ymm0, %ymm2, %ymm3 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm0, %ymm3 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %ymm3, %ymm0 + +/* POLYNOMIAL APPROXIMATION: R2 = R*R */ + vmulpd %ymm0, %ymm0, %ymm5 + vfmadd213pd __dC6(%rax), %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + vmulpd %ymm5, %ymm4, %ymm6 + vfmadd213pd %ymm0, %ymm0, %ymm6 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes */ + vxorpd %ymm7, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm1, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), 
%r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(cos) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(cos) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_cos_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S new file mode 100644 index 0000000000..46d35a25d2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized cos, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_cos) + .type _ZGVeN8v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +1: leaq _ZGVeN8v_cos_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_cos_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_cos) + +#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper +#include "../svml_d_cos8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S new file mode 100644 index 0000000000..e7af83c6d5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S @@ -0,0 +1,463 @@ +/* Function cos vectorized with AVX-512, KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
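+
+   Editorial sketch: in all of these kernels the .LBL_* tail handles
+   lanes whose argument failed the __dRangeVal check.  The vector
+   input and result are spilled to the stack, each flagged lane is
+   recomputed by the scalar routine via JUMPTARGET (cos), and the
+   patched result vector is reloaded.  In C, with mask, lanes, x and
+   res as assumed names, the loop amounts to:
+
+     for (int i = 0; i < lanes; i++)
+       if (mask & (1u << i))
+         res[i] = cos (x[i]);
+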
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_cos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + +/* R = X - N*Pi1 */ + vmovaps %zmm0, %zmm7 + +/* Check for large arguments path */ + movq $-1, %rcx + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %zmm0, %zmm5 + vmovups __dInvPI(%rax), %zmm3 + +/* Get absolute argument value: X' = |X'| */ + vpandq __dAbsMask(%rax), %zmm5, %zmm1 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 + vmovups __dPI1_FMA(%rax), %zmm6 + +/* N = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm5, %zmm4 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm5, %zmm12 + vmovups __dC7(%rax), %zmm8 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %zmm4, %zmm10 + vcmppd $22, __dRangeVal(%rax), %zmm1, %k1 + vpbroadcastq %rcx, %zmm2{%k1}{z} + vfnmadd231pd %zmm10, %zmm6, %zmm7 + vptestmq %zmm2, %zmm2, %k0 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm10, %zmm7 + kmovw %k0, %ecx + movzbl %cl, %ecx + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm7, %zmm10 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm10, %zmm10, %zmm9 + vfmadd213pd __dC6(%rax), %zmm9, %zmm8 + vfmadd213pd __dC5(%rax), %zmm9, %zmm8 + vfmadd213pd __dC4(%rax), %zmm9, %zmm8 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm9, %zmm8 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %zmm9, %zmm8 + vfmadd213pd __dC1(%rax), %zmm9, %zmm8 + vmulpd %zmm9, %zmm8, %zmm11 + vfmadd213pd %zmm10, %zmm10, %zmm11 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes + */ + vpxorq %zmm12, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + 
cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(cos) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(cos) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_cos_knl) + +ENTRY (_ZGVeN8v_cos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + +/* R = X - N*Pi1 */ + vmovaps %zmm0, %zmm8 + +/* Check for large arguments path */ + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %zmm0, %zmm6 + vmovups __dInvPI(%rax), %zmm3 + vmovups __dRShifter(%rax), %zmm4 + vmovups __dPI1_FMA(%rax), %zmm7 + vmovups __dC7(%rax), %zmm9 + +/* Get absolute argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %zmm6, %zmm1 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm4, %zmm3, %zmm6 + vcmppd $18, __dRangeVal(%rax), %zmm1, %k1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm6, %zmm13 + +/* N = Y - RS : right shifter sub */ + vsubpd %zmm4, %zmm6, %zmm5 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %zmm5, %zmm11 + vfnmadd231pd %zmm11, %zmm7, %zmm8 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm11, %zmm8 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm8, %zmm11 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm11, %zmm11, %zmm10 + vfmadd213pd __dC6(%rax), %zmm10, %zmm9 + vfmadd213pd __dC5(%rax), %zmm10, %zmm9 + vfmadd213pd __dC4(%rax), %zmm10, %zmm9 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm10, %zmm9 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %zmm10, %zmm9 + vfmadd213pd __dC1(%rax), %zmm10, %zmm9 + vmulpd %zmm10, %zmm9, %zmm12 + vfmadd213pd %zmm11, %zmm11, %zmm12 + vpandnq %zmm1, 
%zmm1, %zmm2{%k1} + vcmppd $3, %zmm2, %zmm2, %k0 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes + */ + vxorpd %zmm13, %zmm12, %zmm1 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_cos_skx) + + .section .rodata, "a" +.L_2il0floatpacket.16: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.16,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S new file mode 100644 index 0000000000..5a17e11a0f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_exp) + .type _ZGVbN2v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_exp_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_exp_sse2(%rip), %rax + ret +END (_ZGVbN2v_exp) +libmvec_hidden_def (_ZGVbN2v_exp) + +#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2 +#include "../svml_d_exp2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S new file mode 100644 index 0000000000..864dc5ae9f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S @@ -0,0 +1,225 @@ +/* Function exp vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVbN2v_exp_sse4) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial. + + The table lookup is skipped if k = 0. 
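+
+   Editorial sketch of the same scheme in scalar form, with K, T[],
+   INVLN2, SHIFTER, LN2HI, LN2LO and PC0..PC2 as assumed stand-ins
+   for the __svml_dexp_data fields:
+
+     k = x * INVLN2;                        (x*2^K/ln2)
+     n = rint (k);                          (dN)
+     m = k + SHIFTER;                       (integer N in low bits)
+     r = x - n*LN2HI - n*LN2LO;             (Cody-Waite reduction)
+     p = PC0 + r*(PC0 + r*(PC1 + r*PC2));   (~ exp(r))
+     j = bits (m) & ((1 << K) - 1);         (table index)
+     result = T[j] * p, scaled by 2^M via an integer add into the
+       exponent field.
+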
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_dexp_data@GOTPCREL(%rip), %r8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + pshufd $221, %xmm3, %xmm7 + movups __dbInvLn2(%r8), %xmm0 + +/* dK = X*dbInvLn2 */ + mulpd %xmm3, %xmm0 + movq __iAbsMask(%r8), %xmm5 + movq __iDomainRange(%r8), %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + pand %xmm5, %xmm7 + +/* iRangeMask = (iAbsX>iDomainRange) */ + pcmpgtd %xmm6, %xmm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + movmskps %xmm7, %eax + +/* dN = rint(X*2^k/Ln2) */ + xorps %xmm7, %xmm7 + movups __dbLn2hi(%r8), %xmm5 + movups __dbLn2lo(%r8), %xmm6 + roundpd $0, %xmm0, %xmm7 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + mulpd %xmm7, %xmm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + mulpd %xmm6, %xmm7 + movups __dbShifter(%r8), %xmm4 + +/* dM = X*dbInvLn2+dbShifter */ + addpd %xmm0, %xmm4 + movaps %xmm3, %xmm0 + subpd %xmm5, %xmm0 + subpd %xmm7, %xmm0 + movups __dPC2(%r8), %xmm5 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + mulpd %xmm0, %xmm5 + addpd __dPC1(%r8), %xmm5 + mulpd %xmm0, %xmm5 + movups __dPC0(%r8), %xmm6 + addpd %xmm6, %xmm5 + mulpd %xmm5, %xmm0 + movdqu __lIndexMask(%r8), %xmm2 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + movdqa %xmm2, %xmm1 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + pandn %xmm4, %xmm2 + pand %xmm4, %xmm1 + +/* lM = lM<<(52-K), 2^M */ + psllq $42, %xmm2 + +/* table lookup for dT[j] = 2^(j/2^k) */ + movd %xmm1, %edx + pextrw $4, %xmm1, %ecx + addpd %xmm0, %xmm6 + shll $3, %edx + shll $3, %ecx + movq (%r8,%rdx), %xmm0 + andl $3, %eax + movhpd (%r8,%rcx), %xmm0 + +/* 2^(j/2^k) * exp(r) */ + mulpd %xmm6, %xmm0 + +/* multiply by 2^M through integer add */ + paddq %xmm2, %xmm0 + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %cl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %eax, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %edx, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call 
JUMPTARGET(__exp_finite) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(__exp_finite) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_exp_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S new file mode 100644 index 0000000000..b994a794cd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_exp) + .type _ZGVdN4v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_exp_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_exp_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_exp) +libmvec_hidden_def (_ZGVdN4v_exp) + +#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper +#include "../svml_d_exp4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S new file mode 100644 index 0000000000..937b3c09a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S @@ -0,0 +1,212 @@ +/* Function exp vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVdN4v_exp_avx2) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. 
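+
+   Reading aid for the FMA forms used below (AT&T operand order
+   op3, op2, op1): vfmadd132pd computes op1 = op1*op3 + op2,
+   vfmadd213pd computes op1 = op2*op1 + op3, and vfmadd231pd
+   computes op1 = op2*op3 + op1; the vfnmadd variants negate the
+   product.  (Editorial note.)
+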
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm2 + vmovupd __dbInvLn2(%rax), %ymm3 + vmovupd __dbShifter(%rax), %ymm1 + vmovupd __lIndexMask(%rax), %ymm4 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %ymm1, %ymm2, %ymm3 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vextracti128 $1, %ymm2, %xmm5 + vshufps $221, %xmm5, %xmm2, %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + vandps __iAbsMask(%rax), %xmm6, %xmm7 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %ymm1, %ymm3, %ymm6 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %xmm7, %xmm0 + vmovupd __dbLn2hi(%rax), %ymm1 + vmovupd __dPC0(%rax), %ymm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %xmm0, %ecx + vmovupd __dPC2(%rax), %ymm0 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovdqa %ymm2, %ymm5 + vfnmadd231pd %ymm6, %ymm1, %ymm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %ymm5, %ymm6 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vandps %ymm4, %ymm3, %ymm1 + +/* table lookup for dT[j] = 2^(j/2^k) */ + vxorpd %ymm6, %ymm6, %ymm6 + vpcmpeqd %ymm5, %ymm5, %ymm5 + vgatherqpd %ymm5, (%rax,%ymm1,8), %ymm6 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandn %ymm3, %ymm4, %ymm3 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %ymm0, %ymm6, %ymm0 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %ymm3, %ymm4 + +/* multiply by 2^M through integer add */ + vpaddq %ymm4, %ymm0, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm2, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call 
JUMPTARGET(__exp_finite) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__exp_finite) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_exp_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S new file mode 100644 index 0000000000..6189080fcc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_exp) + .type _ZGVeN8v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_exp_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_exp_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_exp) + +#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper +#include "../svml_d_exp8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S new file mode 100644 index 0000000000..97ba72c2a0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S @@ -0,0 +1,456 @@ +/* Function exp vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
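+
+   Editorial note: both entry points below are guarded by
+   HAVE_AVX512DQ_ASM_SUPPORT.  When the assembler cannot encode the
+   required instructions, WRAPPER_IMPL_AVX512 from
+   svml_d_wrapper_impl.h is emitted instead, which (as that wrapper
+   header implements it) processes the 512-bit vector as two 256-bit
+   halves through the AVX2 kernel _ZGVdN4v_exp.
+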
*/ + +#include <sysdep.h> +#include "svml_d_exp_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_exp_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + +/* iAbsX = iAbsX&iAbsMask */ + movl $255, %edx + vpmovqd %zmm1, %ymm2 + kmovw %edx, %k2 + +/* iRangeMask = (iAbsX>iDomainRange) */ + movl $-1, %ecx + +/* table lookup for dT[j] = 2^(j/2^k) */ + vpxord %zmm11, %zmm11, %zmm11 + vmovups __dbInvLn2(%rax), %zmm5 + vmovups __dbLn2hi(%rax), %zmm7 + kxnorw %k3, %k3, %k3 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd __dbShifter(%rax), %zmm0, %zmm5 + vmovups __dPC2(%rax), %zmm12 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd __dbShifter(%rax), %zmm5, %zmm9 + vmovups __lIndexMask(%rax), %zmm4 + vfnmadd231pd %zmm9, %zmm7, %zmm8 + vpandd __iAbsMask(%rax), %zmm2, %zmm2{%k2} + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm4, %zmm5, %zmm10 + vgatherqpd (%rax,%zmm10,8), %zmm11{%k3} + vpcmpgtd __iDomainRange(%rax), %zmm2, %k1{%k2} + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm5, %zmm4, %zmm6 + vpbroadcastd %ecx, %zmm3{%k1}{z} + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm6, %zmm14 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm8, %zmm9 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vptestmd %zmm3, %zmm3, %k0{%k2} + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm9, %zmm12 + kmovw %k0, %ecx + movzbl %cl, %ecx + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm12, %zmm11, %zmm13 + +/* multiply by 2^M through integer add */ + vpaddq %zmm14, %zmm13, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + 
movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_exp_knl) + +ENTRY (_ZGVeN8v_exp_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. 
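+
+   Editorial note: relative to the KNL body above, this SKX variant
+   keeps the range check out of the AVX-512 mask registers: KNL
+   compares into %k1 and extracts the lane mask with
+   vpbroadcastd/vptestmd/kmovw, while this version compares in the
+   256-bit integer domain and uses vmovmskps; the table lookup
+   likewise uses vgatherdpd on dword indices instead of vgatherqpd.
+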
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* table lookup for dT[j] = 2^(j/2^k) */ + kxnorw %k1, %k1, %k1 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + vmovups __dbInvLn2(%rax), %zmm7 + vmovups __dbShifter(%rax), %zmm5 + vmovups __lIndexMask(%rax), %zmm6 + vmovups __dbLn2hi(%rax), %zmm9 + vmovups __dPC0(%rax), %zmm12 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %zmm5, %zmm0, %zmm7 + vpmovqd %zmm1, %ymm2 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %zmm5, %zmm7, %zmm11 + +/* iAbsX = iAbsX&iAbsMask */ + vpand __iAbsMask(%rax), %ymm2, %ymm3 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm10 + vfnmadd231pd %zmm11, %zmm9, %zmm10 + vmovups __dPC2(%rax), %zmm9 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm10, %zmm11 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %ymm3, %ymm4 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %ymm4, %ecx + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm6, %zmm7, %zmm13 + vpmovqd %zmm13, %ymm14 + vpxord %zmm15, %zmm15, %zmm15 + vgatherdpd (%rax,%ymm14,8), %zmm15{%k1} + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm9, %zmm15, %zmm10 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm7, %zmm6, %zmm8 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm8, %zmm1 + +/* multiply by 2^M through integer add */ + vpaddq %zmm1, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), 
%zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8v_exp_skx) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S new file mode 100644 index 0000000000..5097add6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_log) + .type _ZGVbN2v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_log_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_log_sse2(%rip), %rax + ret +END (_ZGVbN2v_log) +libmvec_hidden_def (_ZGVbN2v_log) + +#define _ZGVbN2v_log _ZGVbN2v_log_sse2 +#include "../svml_d_log2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S new file mode 100644 index 0000000000..7d4b3c8850 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S @@ -0,0 +1,229 @@ +/* Function log vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" + + .text +ENTRY (_ZGVbN2v_log_sse4) +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding + HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm6 + movq __svml_dlog_data@GOTPCREL(%rip), %r8 + movaps %xmm6, %xmm3 + movaps %xmm6, %xmm2 + +/* isolate exponent bits */ + movaps %xmm6, %xmm1 + psrlq $20, %xmm1 + movups _ExpMask(%r8), %xmm5 + +/* preserve mantissa, set input exponent to 2^(-10) */ + andps %xmm6, %xmm5 + orps _Two10(%r8), %xmm5 + +/* reciprocal approximation good to at least 11 bits */ + cvtpd2ps %xmm5, %xmm7 + cmpltpd _MinNorm(%r8), %xmm3 + cmpnlepd _MaxNorm(%r8), %xmm2 + movlhps %xmm7, %xmm7 + +/* combine and get argument value range mask */ + orps %xmm2, %xmm3 + rcpps %xmm7, %xmm0 + movmskpd %xmm3, %eax + movups _HalfMask(%r8), %xmm2 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + andps %xmm5, %xmm2 + cvtps2pd %xmm0, %xmm4 + subpd %xmm2, %xmm5 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + roundpd $0, %xmm4, %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + subpd _One(%r8), %xmm2 + addpd %xmm2, %xmm5 + movups _Threshold(%r8), %xmm2 + +/* calculate index for table lookup */ + movaps %xmm4, %xmm3 + cmpltpd %xmm4, %xmm2 + pshufd $221, %xmm1, %xmm7 + psrlq $40, %xmm3 + +/* convert biased exponent to DP format */ + cvtdq2pd %xmm7, %xmm0 + movd %xmm3, %edx + movups _poly_coeff_1(%r8), %xmm4 + +/* polynomial computation */ + mulpd %xmm5, %xmm4 + andps _Bias(%r8), %xmm2 + orps _Bias1(%r8), %xmm2 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + subpd %xmm2, %xmm0 + addpd _poly_coeff_2(%r8), %xmm4 + +/* exponent*log(2.0) */ + mulpd _L2(%r8), %xmm0 + movaps %xmm5, %xmm2 + mulpd %xmm5, %xmm2 + movups _poly_coeff_3(%r8), %xmm7 + mulpd %xmm5, %xmm7 + mulpd %xmm2, %xmm4 + addpd _poly_coeff_4(%r8), %xmm7 + addpd %xmm4, %xmm7 + mulpd %xmm7, %xmm2 + movslq %edx, %rdx + pextrd $2, %xmm3, %ecx + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + addpd %xmm2, %xmm5 + movslq %ecx, %rcx + movsd _LogRcp_lookup(%r8,%rdx), %xmm1 + movhpd _LogRcp_lookup(%r8,%rcx), %xmm1 + addpd %xmm5, %xmm1 + addpd %xmm1, %xmm0 + testl %eax, %eax + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm6, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %cl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %eax, %r13d + movq %r14, 152(%rsp) + 
cfi_offset_rel_rsp (14, 152) + movl %edx, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_log_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S new file mode 100644 index 0000000000..1e9a2f48a1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_log) + .type _ZGVdN4v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_log_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_log_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_log) +libmvec_hidden_def (_ZGVdN4v_log) + +#define _ZGVdN4v_log _ZGVdN4v_log_sse_wrapper +#include "../svml_d_log4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S new file mode 100644 index 0000000000..04ea9e0071 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S @@ -0,0 +1,210 @@ +/* Function log vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" + + .text +ENTRY (_ZGVdN4v_log_avx2) +/* ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding + HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm5 + +/* isolate exponent bits */ + vpsrlq $20, %ymm5, %ymm0 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vandpd _ExpMask(%rax), %ymm5, %ymm6 + vorpd _Two10(%rax), %ymm6, %ymm4 + +/* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm4, %xmm7 + vrcpps %xmm7, %xmm1 + vcmplt_oqpd _MinNorm(%rax), %ymm5, %ymm7 + vcvtps2pd %xmm1, %ymm3 + vcmpnle_uqpd _MaxNorm(%rax), %ymm5, %ymm1 + vextracti128 $1, %ymm0, %xmm2 + vshufps $221, %xmm2, %xmm0, %xmm6 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm3, %ymm2 + +/* convert biased exponent to DP format */ + vcvtdq2pd %xmm6, %ymm0 + +/* combine and get argument value range mask */ + vorpd %ymm1, %ymm7, %ymm3 + vmovupd _One(%rax), %ymm1 + vmovmskpd %ymm3, %ecx + +/* calculate index for table lookup */ + vpsrlq $40, %ymm2, %ymm3 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %ymm1, %ymm2, %ymm4 + vcmpgt_oqpd _Threshold(%rax), %ymm2, %ymm2 + vpcmpeqd %ymm6, %ymm6, %ymm6 + vxorpd %ymm1, %ymm1, %ymm1 + vgatherqpd %ymm6, _LogRcp_lookup(%rax,%ymm3), %ymm1 + +/* exponent*log(2.0) */ + vmovupd _poly_coeff_1(%rax), %ymm6 + vmulpd %ymm4, %ymm4, %ymm3 + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rax), %ymm4, %ymm6 + vandpd _Bias(%rax), %ymm2, %ymm7 + vorpd _Bias1(%rax), %ymm7, %ymm2 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %ymm2, %ymm0, %ymm0 + vmovupd _poly_coeff_3(%rax), %ymm2 + vfmadd213pd _poly_coeff_4(%rax), %ymm4, %ymm2 + vfmadd213pd %ymm2, %ymm3, %ymm6 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %ymm4, %ymm3, %ymm6 + vaddpd %ymm1, %ymm6, %ymm4 + vfmadd132pd _L2(%rax), %ymm4, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm5, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + 
cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_log_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S new file mode 100644 index 0000000000..43f572d36c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_log) + .type _ZGVeN8v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_log_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_log_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_log) + +#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper +#include "../svml_d_log8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S new file mode 100644 index 0000000000..d10d5114c6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S @@ -0,0 +1,468 @@ +/* Function log vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_log_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by + rounding HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rdx + movq $-1, %rax + +/* isolate exponent bits */ + vpsrlq $20, %zmm0, %zmm2 + vpsrlq $32, %zmm2, %zmm3 + vpxord %zmm2, %zmm2, %zmm2 + kxnorw %k3, %k3, %k3 + vmovups _Two10(%rdx), %zmm1 + vmovups _One(%rdx), %zmm9 + vpmovqd %zmm3, %ymm4 + +/* convert biased exponent to DP format */ + vcvtdq2pd %ymm4, %zmm13 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rdx), %zmm0, %zmm1 + vcmppd $17, _MinNorm(%rdx), %zmm0, %k1 + +/* reciprocal approximation good to at least 11 bits */ + vrcp28pd %zmm1, %zmm5 + vpbroadcastq %rax, %zmm6{%k1}{z} + vmovups _poly_coeff_3(%rdx), %zmm15 + vcmppd $22, _MaxNorm(%rdx), %zmm0, %k2 + vmovups _Bias1(%rdx), %zmm14 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm11 + vpbroadcastq %rax, %zmm7{%k2}{z} + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %zmm9, %zmm11, %zmm1 + +/* calculate index for table lookup */ + vpsrlq $40, %zmm11, %zmm10 + vgatherqpd _LogRcp_lookup(%rdx,%zmm10), %zmm2{%k3} + vcmppd $30, _Threshold(%rdx), %zmm11, %k1 + +/* combine and get argument value range mask */ + vporq %zmm7, %zmm6, %zmm8 + +/* exponent*log(2.0) */ + vmovups _poly_coeff_1(%rdx), %zmm11 + vmulpd %zmm1, %zmm1, %zmm10 + vptestmq %zmm8, %zmm8, %k0 + vfmadd213pd _poly_coeff_4(%rdx), %zmm1, %zmm15 + kmovw %k0, %ecx + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rdx), %zmm1, %zmm11 + movzbl %cl, %ecx + vpbroadcastq %rax, %zmm12{%k1}{z} + vfmadd213pd %zmm15, %zmm10, %zmm11 + vpternlogq $248, _Bias(%rdx), %zmm12, %zmm14 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %zmm14, %zmm13, %zmm3 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %zmm1, %zmm10, %zmm11 + vaddpd %zmm2, %zmm11, %zmm1 + vfmadd132pd _L2(%rdx), %zmm1, %zmm3 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm3, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: 
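+/* Special-case path: at least one lane was flagged by the range mask.
+   Live vector and mask state is saved, then each flagged lane is redone
+   with a scalar __log_finite call via the bit-test loop below, which
+   walks the mask two bits per iteration (.LBL_1_12 handles the even
+   bit, .LBL_1_10 the odd one).  A minimal C model of the control flow,
+   where mask stands for %r13d and in/out for the 1152(%rsp) and
+   1216(%rsp) spill areas (the names are ours):
+
+       for (int i = 0; i < 8; i++)
+         if (mask & (1 << i))
+           out[i] = __log_finite (in[i]);
+*/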
+ cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm3, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm3 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__log_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__log_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_log_knl) + +ENTRY (_ZGVeN8v_log_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, + obtained by rounding HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm3 + kxnorw %k3, %k3, %k3 + vmovups _Two10(%rax), %zmm2 + vmovups _Threshold(%rax), %zmm14 + vmovups _One(%rax), %zmm11 + vcmppd $21, _MinNorm(%rax), %zmm3, %k1 + vcmppd $18, _MaxNorm(%rax), %zmm3, %k2 + +/* isolate exponent bits */ + vpsrlq $20, %zmm3, %zmm4 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 + vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 + vpsrlq $32, %zmm4, %zmm6 + +/* 
reciprocal approximation good to at least 11 bits */ + vrcp14pd %zmm2, %zmm5 + +/* exponent*log(2.0) */ + vmovups _poly_coeff_1(%rax), %zmm4 + vpmovqd %zmm6, %ymm7 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm0 + +/* calculate index for table lookup */ + vpsrlq $40, %zmm0, %zmm12 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %zmm11, %zmm0, %zmm2 + vpmovqd %zmm12, %ymm13 + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rax), %zmm2, %zmm4 + vmovaps %zmm1, %zmm8 + vmovaps %zmm1, %zmm9 + vpxord %zmm5, %zmm5, %zmm5 + vgatherdpd _LogRcp_lookup(%rax,%ymm13), %zmm5{%k3} + vmovups _Bias1(%rax), %zmm13 + vpandnq %zmm3, %zmm3, %zmm8{%k1} + vcmppd $21, %zmm0, %zmm14, %k1 + vpandnq %zmm14, %zmm14, %zmm1{%k1} + vmulpd %zmm2, %zmm2, %zmm14 + vpternlogq $248, _Bias(%rax), %zmm1, %zmm13 + vmovups _poly_coeff_3(%rax), %zmm1 + vfmadd213pd _poly_coeff_4(%rax), %zmm2, %zmm1 + vfmadd213pd %zmm1, %zmm14, %zmm4 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %zmm2, %zmm14, %zmm4 + vaddpd %zmm5, %zmm4, %zmm2 + vpandnq %zmm3, %zmm3, %zmm9{%k2} + +/* combine and get argument value range mask */ + vorpd %zmm9, %zmm8, %zmm10 + vcmppd $3, %zmm10, %zmm10, %k0 + kmovw %k0, %ecx + +/* convert biased exponent to DP format */ + vcvtdq2pd %ymm7, %zmm15 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %zmm13, %zmm15, %zmm0 + vfmadd132pd _L2(%rax), %zmm2, %zmm0 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm3, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi 
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call JUMPTARGET(__log_finite)
+
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call JUMPTARGET(__log_finite)
+
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN8v_log_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.12:
+ .long 0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.12,@object
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
new file mode 100644
index 0000000000..adb0872e56
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
@@ -0,0 +1,36 @@
+/* Multiple versions of vectorized pow.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2vv_pow)
+ .type _ZGVbN2vv_pow, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ leaq _ZGVbN2vv_pow_sse4(%rip), %rax
+ HAS_CPU_FEATURE (SSE4_1)
+ jz 2f
+ ret
+2: leaq _ZGVbN2vv_pow_sse2(%rip), %rax
+ ret
+END (_ZGVbN2vv_pow)
+libmvec_hidden_def (_ZGVbN2vv_pow)
+
+#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2
+#include "../svml_d_pow2_core.S"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
new file mode 100644
index 0000000000..ad7c215ff0
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
@@ -0,0 +1,432 @@
+/* Function pow vectorized with SSE4.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.
*/ + +#include <sysdep.h> +#include "svml_d_pow_data.h" + + .text +ENTRY (_ZGVbN2vv_pow_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. 
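+
+   A concrete scalar sketch of step 3 (illustrative only; t, k, N, j,
+   Z, p, res are our names, B1..B5 and THI/TLO are the table values
+   described above; nearbyint and scalbn are from <math.h>):
+
+     double t = PH + PL + PLL;                // ~ y*log2|x|
+     int    k = (int) nearbyint (t * 128.0);  // expK = 7, 2^expK = 128
+     int    N = k >> 7, j = k & 127;          // t = N + j/128 + Z
+     double Z = t - k / 128.0;                // |Z| < 2^(-8)
+     double p = Z * (B1 + Z * (B2 + Z * (B3 + Z * (B4 + Z * B5))));
+     double res = THI + (THI * p + TLO);      // ResHi + ResLo
+     double result = scalbn (res, N);         // ex(Res) + N
+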
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + movq __svml_dpow_data@GOTPCREL(%rip), %rdx + movups %xmm14, 80(%rsp) + movups %xmm9, 176(%rsp) + movaps %xmm1, %xmm9 + pshufd $221, %xmm0, %xmm1 + movq _iIndexMask(%rdx), %xmm14 + movq _iIndexAdd(%rdx), %xmm6 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + pand %xmm1, %xmm14 + paddd %xmm6, %xmm14 + psrld $10, %xmm14 + movups %xmm13, 96(%rsp) + +/* Index for reciprocal table */ + movdqa %xmm14, %xmm13 + pslld $3, %xmm13 + +/* Index for log2 table */ + pslld $4, %xmm14 + movd %xmm13, %eax + movups %xmm10, 160(%rsp) + movups _iMantissaMask(%rdx), %xmm10 + movslq %eax, %rax + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + andps %xmm0, %xmm10 + pextrd $1, %xmm13, %ecx + movslq %ecx, %rcx + movups %xmm0, (%rsp) + movdqa %xmm1, %xmm0 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + movq _i3fe7fe00(%rdx), %xmm6 + psubd %xmm6, %xmm0 + movups _iHighMask(%rdx), %xmm6 + psrad $20, %xmm0 + movups %xmm15, 48(%rsp) + movups %xmm12, 112(%rsp) + orps _dbOne(%rdx), %xmm10 + movsd 11712(%rdx,%rax), %xmm12 + movd %xmm14, %r8d + movq _i2p20_2p19(%rdx), %xmm15 + movhpd 11712(%rdx,%rcx), %xmm12 + paddd %xmm15, %xmm0 + pextrd $1, %xmm14, %r9d + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + movaps %xmm6, %xmm14 + andps %xmm10, %xmm14 + movaps %xmm10, %xmm15 + subpd %xmm14, %xmm15 + +/* r1 = x1*rcp1 */ + mulpd %xmm12, %xmm10 + +/* E = -r1+__fence(x1Hi*rcp1) */ + mulpd %xmm12, %xmm14 + +/* E=E+x1Lo*rcp1 */ + mulpd %xmm15, %xmm12 + subpd %xmm10, %xmm14 + pshufd $80, %xmm0, %xmm0 + movslq %r8d, %r8 + andps _iffffffff00000000(%rdx), %xmm0 + subpd _db2p20_2p19(%rdx), %xmm0 + addpd %xmm12, %xmm14 + movslq %r9d, %r9 + +/* T_Rh_Eh = T_Rh + E */ + movaps %xmm14, %xmm15 + movups %xmm8, 208(%rsp) + movups 19968(%rdx,%r8), %xmm8 + movups %xmm11, 144(%rsp) + movaps %xmm8, %xmm11 + +/* cq = c+r1 */ + movups _LHN(%rdx), %xmm13 + movhpd 19968(%rdx,%r9), %xmm11 + addpd %xmm10, %xmm13 + +/* T = k + L1hi */ + addpd %xmm0, %xmm11 + +/* T_Rh = T + cq */ + movaps %xmm13, %xmm12 + addpd %xmm11, %xmm12 + addpd %xmm12, %xmm15 + +/* Rl = T-T_Rh; -> -Rh */ + subpd %xmm12, %xmm11 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + subpd %xmm15, %xmm12 + +/* Rl=Rl+cq; */ + addpd %xmm13, %xmm11 + +/* cq = cq + E */ + addpd %xmm14, %xmm13 + +/* HLL+=E; -> El */ + addpd %xmm14, %xmm12 + +/* HLL+=Rl */ + addpd %xmm12, %xmm11 + unpckhpd 19968(%rdx,%r9), %xmm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + movaps %xmm15, %xmm14 + +/* HLL+=L1lo; */ + addpd %xmm11, %xmm8 + movups _clv_2(%rdx), %xmm11 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + movaps %xmm6, %xmm12 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + mulpd %xmm13, %xmm11 + addpd _clv_3(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_4(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_5(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_6(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_7(%rdx), %xmm11 + mulpd %xmm11, %xmm13 + addpd %xmm13, %xmm8 + addpd %xmm8, %xmm14 + +/* + 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + andps %xmm9, %xmm6 + +/* yL = y-yH; */ + movaps %xmm9, %xmm11 + subpd %xmm6, %xmm11 + andps %xmm14, %xmm12 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + movaps %xmm14, %xmm10 + +/* HL = T_Rh_Eh_HLLhi-HH; */ + subpd %xmm12, %xmm14 + subpd %xmm15, %xmm10 + movq _HIDELTA(%rdx), %xmm2 + +/* pH = yH*HH; */ + movaps %xmm6, 
%xmm13 + movq _LORANGE(%rdx), %xmm3 + paddd %xmm2, %xmm1 + pcmpgtd %xmm1, %xmm3 + +/* pL=yL*HL+yH*HL; pL+=yL*HH; */ + movaps %xmm11, %xmm1 + mulpd %xmm14, %xmm1 + mulpd %xmm14, %xmm6 + mulpd %xmm12, %xmm13 + mulpd %xmm11, %xmm12 + addpd %xmm6, %xmm1 + +/* HLL = HLL - HLLhi */ + subpd %xmm10, %xmm8 + addpd %xmm12, %xmm1 + +/* pLL = y*HLL */ + mulpd %xmm9, %xmm8 + movups _db2p45_2p44(%rdx), %xmm11 + +/* pHH = pH + *(double*)&db2p45_2p44 */ + movaps %xmm11, %xmm12 + addpd %xmm13, %xmm12 + +/* t=pL+pLL; t+=pHL */ + addpd %xmm8, %xmm1 + movq _ABSMASK(%rdx), %xmm5 + pshufd $221, %xmm9, %xmm4 + pand %xmm5, %xmm4 + movq _INF(%rdx), %xmm7 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm7, %xmm2 + pcmpeqd %xmm7, %xmm4 + pshufd $136, %xmm12, %xmm7 + por %xmm4, %xmm2 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + subpd %xmm11, %xmm12 + pshufd $221, %xmm13, %xmm10 + por %xmm2, %xmm3 + +/* pHL = pH - pHH; */ + subpd %xmm12, %xmm13 + pand %xmm5, %xmm10 + movq _DOMAINRANGE(%rdx), %xmm5 + movdqa %xmm10, %xmm4 + addpd %xmm1, %xmm13 + pcmpgtd %xmm5, %xmm4 + pcmpeqd %xmm5, %xmm10 + por %xmm10, %xmm4 + movq _jIndexMask(%rdx), %xmm6 + por %xmm4, %xmm3 + movmskps %xmm3, %eax + +/* j = Lo(pHH)&0x0000007f */ + pand %xmm7, %xmm6 + movq _iOne(%rdx), %xmm3 + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + pslld $13, %xmm7 + paddd %xmm3, %xmm7 + pslld $4, %xmm6 + movups _cev_1(%rdx), %xmm3 + movaps %xmm13, %xmm4 + mulpd %xmm13, %xmm3 + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + movd %xmm6, %r10d + pshufd $80, %xmm7, %xmm0 + andps _ifff0000000000000(%rdx), %xmm0 + addpd _cev_2(%rdx), %xmm3 + mulpd %xmm13, %xmm3 + addpd _cev_3(%rdx), %xmm3 + mulpd %xmm13, %xmm3 + movslq %r10d, %r10 + andl $3, %eax + pextrd $1, %xmm6, %r11d + movslq %r11d, %r11 + addpd _cev_4(%rdx), %xmm3 + movsd 36416(%rdx,%r10), %xmm2 + movhpd 36416(%rdx,%r11), %xmm2 + mulpd %xmm2, %xmm0 + mulpd %xmm3, %xmm13 + mulpd %xmm0, %xmm4 + addpd _cev_5(%rdx), %xmm13 + mulpd %xmm4, %xmm13 + addpd %xmm13, %xmm0 + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 208(%rsp), %xmm8 + movups 176(%rsp), %xmm9 + movups 160(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 112(%rsp), %xmm12 + movups 96(%rsp), %xmm13 + movups 80(%rsp), %xmm14 + movups 48(%rsp), %xmm15 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups (%rsp), %xmm1 + movups %xmm1, 64(%rsp) + movups %xmm9, 128(%rsp) + movups %xmm0, 192(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %cl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %eax, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %edx, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + movups 192(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 72(%rsp,%r15), %xmm0 + movsd 136(%rsp,%r15), %xmm1 + + call JUMPTARGET(__pow_finite) + + movsd %xmm0, 200(%rsp,%r15) + jmp 
.LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 64(%rsp,%r15), %xmm0 + movsd 128(%rsp,%r15), %xmm1 + + call JUMPTARGET(__pow_finite) + + movsd %xmm0, 192(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2vv_pow_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S new file mode 100644 index 0000000000..eea8af6638 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4vv_pow) + .type _ZGVdN4vv_pow, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4vv_pow_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4vv_pow_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4vv_pow) +libmvec_hidden_def (_ZGVdN4vv_pow) + +#define _ZGVdN4vv_pow _ZGVdN4vv_pow_sse_wrapper +#include "../svml_d_pow4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S new file mode 100644 index 0000000000..3092328909 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S @@ -0,0 +1,387 @@ +/* Function pow vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_pow_data.h" + + .text +ENTRY (_ZGVdN4vv_pow_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. 
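+
+   A scalar C sketch of the cascade just described (illustrative only;
+   the variable names are ours):
+
+     double X2 = Rcp1 * X1;        // Rcp1 ~ 1/X1, from table
+     double X3 = Rcp2 * X2;        // Rcp2 ~ 1/X2, from table
+     double cq = Rcp3C * X3 - C;   // X1*Rcp1*Rcp2*Rcp3C = C*(1+q)
+     double poly = cq * ((1.0 + a1) + cq * (a2 + cq * a3));
+     double log2x = k1 + L1 + L2 + L3C + poly;   // HH + HL + HLL
+
+   with L1 = log2(1/Rcp1), L2 = log2(1/Rcp2), L3C = log2(C/Rcp3C) read
+   from the same tables.
+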
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dpow_data@GOTPCREL(%rip), %rax + vmovups %ymm11, 160(%rsp) + vmovups %ymm8, 224(%rsp) + vmovups %ymm10, 352(%rsp) + vmovups %ymm9, 384(%rsp) + vmovups %ymm13, 288(%rsp) + vmovapd %ymm1, %ymm11 + vxorpd %ymm1, %ymm1, %ymm1 + vextracti128 $1, %ymm0, %xmm5 + vshufps $221, %xmm5, %xmm0, %xmm5 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vandps _iIndexMask(%rax), %xmm5, %xmm3 + vpaddd _iIndexAdd(%rax), %xmm3, %xmm6 + vpsrld $10, %xmm6, %xmm8 + +/* Index for reciprocal table */ + vpslld $3, %xmm8, %xmm9 + +/* Index for log2 table */ + vpslld $4, %xmm8, %xmm6 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vandpd _iMantissaMask(%rax), %ymm0, %ymm4 + vorpd _dbOne(%rax), %ymm4, %ymm13 + vpcmpeqd %ymm4, %ymm4, %ymm4 + vpcmpeqd %ymm8, %ymm8, %ymm8 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %xmm5, %xmm3 + vpaddd _HIDELTA(%rax), %xmm5, %xmm5 + vextracti128 $1, %ymm11, %xmm7 + vshufps $221, %xmm7, %xmm11, %xmm2 + vpand _ABSMASK(%rax), %xmm2, %xmm10 + vpcmpeqd %ymm2, %ymm2, %ymm2 + vgatherdpd %ymm2, 11712(%rax,%xmm9), %ymm1 + vmovups _LORANGE(%rax), %xmm7 + vxorpd %ymm2, %ymm2, %ymm2 + vgatherdpd %ymm4, 19968(%rax,%xmm6), %ymm2 + vxorpd %ymm4, %ymm4, %ymm4 + vgatherdpd %ymm8, 19976(%rax,%xmm6), %ymm4 + vpsrad $20, %xmm3, %xmm6 + vpaddd _i2p20_2p19(%rax), %xmm6, %xmm9 + vpshufd $80, %xmm9, %xmm8 + vpshufd $250, %xmm9, %xmm3 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vandpd _iHighMask(%rax), %ymm13, %ymm9 + vinserti128 $1, %xmm3, %ymm8, %ymm6 + vandpd _iffffffff00000000(%rax), %ymm6, %ymm8 + +/* r1 = x1*rcp1 */ + vmulpd %ymm1, %ymm13, %ymm6 + vsubpd %ymm9, %ymm13, %ymm3 + vsubpd _db2p20_2p19(%rax), %ymm8, %ymm8 + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %ymm6, %ymm13 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %ymm6, %ymm1, 
%ymm9 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %ymm9, %ymm1, %ymm3 + +/* T = k + L1hi */ + vaddpd %ymm2, %ymm8, %ymm1 + +/* T_Rh = T + cq */ + vaddpd %ymm13, %ymm1, %ymm8 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %ymm8, %ymm1, %ymm6 + +/* Rl=Rl+cq */ + vaddpd %ymm6, %ymm13, %ymm1 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %ymm3, %ymm8, %ymm6 + +/* cq = cq + E */ + vaddpd %ymm3, %ymm13, %ymm13 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %ymm6, %ymm8, %ymm9 + +/* HLL+=E; -> El */ + vaddpd %ymm9, %ymm3, %ymm2 + +/* HLL+=Rl */ + vaddpd %ymm1, %ymm2, %ymm8 + +/* HLL+=L1lo */ + vaddpd %ymm4, %ymm8, %ymm4 + vmovupd _clv_2(%rax), %ymm8 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_4(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_5(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_6(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_7(%rax), %ymm13, %ymm8 + vfmadd213pd %ymm4, %ymm13, %ymm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %ymm8, %ymm6, %ymm9 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vandpd _iHighMask(%rax), %ymm9, %ymm2 + +/* + 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000; + */ + vandpd _iHighMask(%rax), %ymm11, %ymm1 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %ymm6, %ymm9, %ymm13 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %ymm2, %ymm9, %ymm4 + +/* pH = yH*HH */ + vmulpd %ymm2, %ymm1, %ymm9 + +/* HLL = HLL - HLLhi */ + vsubpd %ymm13, %ymm8, %ymm6 + +/* yL = y-yH */ + vsubpd %ymm1, %ymm11, %ymm8 + vextracti128 $1, %ymm9, %xmm3 + vshufps $221, %xmm3, %xmm9, %xmm13 + vpand _ABSMASK(%rax), %xmm13, %xmm3 + vpcmpgtd %xmm5, %xmm7, %xmm13 + vpcmpgtd _INF(%rax), %xmm10, %xmm7 + vpcmpeqd _INF(%rax), %xmm10, %xmm10 + vpor %xmm10, %xmm7, %xmm7 + vpor %xmm7, %xmm13, %xmm5 + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %ymm4, %ymm8, %ymm7 + vpcmpgtd _DOMAINRANGE(%rax), %xmm3, %xmm13 + vpcmpeqd _DOMAINRANGE(%rax), %xmm3, %xmm10 + vpor %xmm10, %xmm13, %xmm3 + vpor %xmm3, %xmm5, %xmm13 + vfmadd213pd %ymm7, %ymm4, %ymm1 + +/* pLL = y*HLL; + pHH = pH + *(double*)&db2p45_2p44 + */ + vaddpd _db2p45_2p44(%rax), %ymm9, %ymm7 + vmovmskps %xmm13, %ecx + vfmadd213pd %ymm1, %ymm2, %ymm8 + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %ymm11, %ymm6, %ymm8 + vextracti128 $1, %ymm7, %xmm1 + vshufps $136, %xmm1, %xmm7, %xmm10 + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %xmm10, %xmm2 + vpaddd _iOne(%rax), %xmm2, %xmm13 + vpshufd $80, %xmm13, %xmm4 + vpshufd $250, %xmm13, %xmm1 + +/* j = Lo(pHH)&0x0000007f */ + vandps _jIndexMask(%rax), %xmm10, %xmm3 + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + vpcmpeqd %ymm10, %ymm10, %ymm10 + vpslld $4, %xmm3, %xmm5 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd _db2p45_2p44(%rax), %ymm7, %ymm7 + +/* pHL = pH - pHH */ + vsubpd %ymm7, %ymm9, %ymm9 + vaddpd %ymm9, %ymm8, %ymm6 + vinserti128 $1, %xmm1, %ymm4, %ymm2 + vxorpd %ymm1, %ymm1, %ymm1 + vgatherdpd %ymm10, 36416(%rax,%xmm5), %ymm1 + vandpd _ifff0000000000000(%rax), %ymm2, %ymm13 + vmovupd _cev_1(%rax), %ymm2 + vmulpd %ymm1, %ymm13, %ymm1 + vfmadd213pd _cev_2(%rax), %ymm6, %ymm2 + vmulpd %ymm6, %ymm1, %ymm8 + vfmadd213pd _cev_3(%rax), %ymm6, %ymm2 + vfmadd213pd _cev_4(%rax), %ymm6, %ymm2 + vfmadd213pd _cev_5(%rax), %ymm6, %ymm2 + vfmadd213pd %ymm1, %ymm8, %ymm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 224(%rsp), %ymm8 + vmovups 384(%rsp), %ymm9 + vmovups 352(%rsp), %ymm10 + vmovups 160(%rsp), %ymm11 + vmovups 288(%rsp), %ymm13 + vmovdqa 
%ymm2, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm0, 192(%rsp) + vmovupd %ymm11, 256(%rsp) + vmovupd %ymm2, 320(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm12, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 104(%rsp) + movq %rdi, 96(%rsp) + movq %r12, 136(%rsp) + cfi_offset_rel_rsp (12, 136) + movb %dl, %r12b + movq %r13, 128(%rsp) + cfi_offset_rel_rsp (13, 128) + movl %ecx, %r13d + movq %r14, 120(%rsp) + cfi_offset_rel_rsp (14, 120) + movl %eax, %r14d + movq %r15, 112(%rsp) + cfi_offset_rel_rsp (15, 112) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 64(%rsp), %ymm12 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 320(%rsp), %ymm2 + movq 104(%rsp), %rsi + movq 96(%rsp), %rdi + movq 136(%rsp), %r12 + cfi_restore (%r12) + movq 128(%rsp), %r13 + cfi_restore (%r13) + movq 120(%rsp), %r14 + cfi_restore (%r14) + movq 112(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 200(%rsp,%r15), %xmm0 + vmovsd 264(%rsp,%r15), %xmm1 + vzeroupper + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 328(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 192(%rsp,%r15), %xmm0 + vmovsd 256(%rsp,%r15), %xmm1 + vzeroupper + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 320(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4vv_pow_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S new file mode 100644 index 0000000000..68f12b2848 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8vv_pow) + .type _ZGVeN8vv_pow, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8vv_pow_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8vv_pow_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8vv_pow) + +#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper +#include "../svml_d_pow8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S new file mode 100644 index 0000000000..2190c1f6b4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S @@ -0,0 +1,741 @@ +/* Function pow vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_pow_data.h" +#include "svml_d_wrapper_impl.h" + +/* ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. 
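+
+   A note on how PHH is split off below: pH is rounded to a multiple of
+   1/2^expK with the usual add-and-subtract trick using the constant
+   db2p45_2p44 = 2^45 + 2^44 (a scalar C model; the names are ours):
+
+     double big = 0x1.8p45;            // 2^45 + 2^44, ulp = 2^(-7)
+     double PHH = (PH + big) - big;    // N + j/128, rest rounded away
+     double PHL = PH - PHH;
+
+   The low 32-bit word of PH + big holds the fixed-point bits, giving
+   j = lo & 0x0000007f and _n = (lo & 0xffffff80) >> 7 as in the code.
+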
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + .text +ENTRY (_ZGVeN8vv_pow_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + vpsrlq $32, %zmm0, %zmm13 + vmovaps %zmm1, %zmm12 + movq __svml_dpow_data@GOTPCREL(%rip), %rax + movl $255, %edx + vpmovqd %zmm13, %ymm10 + vpsrlq $32, %zmm12, %zmm14 + kmovw %edx, %k1 + movl $-1, %ecx + vpmovqd %zmm14, %ymm15 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vmovups _dbOne(%rax), %zmm6 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vmovaps %zmm10, %zmm5 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %zmm10, %zmm14{%k1} + vpandd _iIndexMask(%rax), %zmm10, %zmm5{%k1} + vpsrad $20, %zmm14, %zmm14{%k1} + vpxord %zmm9, %zmm9, %zmm9 + vpaddd _HIDELTA(%rax), %zmm10, %zmm3{%k1} + vpaddd _iIndexAdd(%rax), %zmm5, %zmm5{%k1} + vpxord %zmm7, %zmm7, %zmm7 + vpaddd _i2p20_2p19(%rax), %zmm14, %zmm14{%k1} + vpcmpd $1, _LORANGE(%rax), %zmm3, %k2{%k1} + vpsrld $10, %zmm5, %zmm5{%k1} + vpandd _ABSMASK(%rax), %zmm15, %zmm2{%k1} + vpbroadcastd %ecx, %zmm1{%k2}{z} + +/* Index for reciprocal table */ + vpslld $3, %zmm5, %zmm8{%k1} + kxnorw %k2, %k2, %k2 + vgatherdpd 11712(%rax,%ymm8), %zmm9{%k2} + vpmovzxdq %ymm14, %zmm10 + +/* Index for log2 table */ + vpslld $4, %zmm5, %zmm13{%k1} + kxnorw %k2, %k2, %k2 + vpsllq $32, %zmm10, %zmm3 + vpxord %zmm8, %zmm8, %zmm8 + vpcmpd $5, _INF(%rax), %zmm2, %k3{%k1} + vpbroadcastd %ecx, %zmm4{%k3}{z} + vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm6 + kxnorw %k3, %k3, %k3 + vpternlogq $168, _iffffffff00000000(%rax), %zmm10, %zmm3 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vpandq _iHighMask(%rax), %zmm6, %zmm2 + vgatherdpd 19976(%rax,%ymm13), %zmm8{%k2} + vpord %zmm4, %zmm1, %zmm11{%k1} + vsubpd _db2p20_2p19(%rax), %zmm3, %zmm1 + vsubpd %zmm2, %zmm6, %zmm5 + +/* r1 = x1*rcp1 */ + vmulpd %zmm9, %zmm6, %zmm6 + vgatherdpd 19968(%rax,%ymm13), %zmm7{%k3} + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %zmm6, %zmm4 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %zmm6, %zmm9, %zmm2 + +/* T = k + L1hi */ + vaddpd %zmm7, %zmm1, %zmm7 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %zmm2, %zmm9, %zmm5 + +/* T_Rh = T + cq */ + vaddpd %zmm4, %zmm7, %zmm3 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %zmm3, %zmm7, %zmm9 + +/* Rl=Rl+cq */ + vaddpd %zmm9, %zmm4, %zmm6 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %zmm5, %zmm3, %zmm9 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %zmm9, %zmm3, %zmm2 + +/* cq = cq + E; */ + vaddpd %zmm5, %zmm4, %zmm4 + +/* HLL+=E; -> El */ + vaddpd %zmm2, %zmm5, %zmm1 + vmovups _clv_2(%rax), %zmm5 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %zmm4, %zmm5 + +/* HLL+=Rl */ + vaddpd %zmm6, %zmm1, %zmm7 + +/* 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + vpandq _iHighMask(%rax), %zmm12, %zmm6 + +/* yL = y-yH */ + vsubpd %zmm6, %zmm12, %zmm2 + vfmadd213pd _clv_4(%rax), %zmm4, %zmm5 + +/* HLL+=L1lo */ + vaddpd %zmm8, %zmm7, %zmm8 + vfmadd213pd _clv_5(%rax), %zmm4, %zmm5 + vfmadd213pd _clv_6(%rax), %zmm4, %zmm5 + vfmadd213pd _clv_7(%rax), %zmm4, %zmm5 + vfmadd213pd %zmm8, %zmm4, 
%zmm5 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %zmm5, %zmm9, %zmm13 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %zmm9, %zmm13, %zmm10 + +/* HLL = HLL - HLLhi */ + vsubpd %zmm10, %zmm5, %zmm3 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vpandq _iHighMask(%rax), %zmm13, %zmm5 + +/* pH = yH*HH */ + vmulpd %zmm5, %zmm6, %zmm1 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %zmm5, %zmm13, %zmm4 + vpsrlq $32, %zmm1, %zmm14 + +/* pLL = y*HLL; + pHH = pH + *(double*)&db2p45_2p44 + */ + vaddpd _db2p45_2p44(%rax), %zmm1, %zmm10 + vpmovqd %zmm14, %ymm15 + vpandd _ABSMASK(%rax), %zmm15, %zmm14{%k1} + vpcmpd $5, _DOMAINRANGE(%rax), %zmm14, %k3{%k1} + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + vpxord %zmm14, %zmm14, %zmm14 + vpbroadcastd %ecx, %zmm13{%k3}{z} + vpord %zmm13, %zmm11, %zmm11{%k1} + vptestmd %zmm11, %zmm11, %k0{%k1} + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %zmm4, %zmm2, %zmm11 + kmovw %k0, %ecx + vfmadd213pd %zmm11, %zmm4, %zmm6 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd _db2p45_2p44(%rax), %zmm10, %zmm11 + vpmovqd %zmm10, %ymm4 + movzbl %cl, %ecx + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %zmm4, %zmm7{%k1} + +/* j = Lo(pHH)&0x0000007f */ + vpandd _jIndexMask(%rax), %zmm4, %zmm9{%k1} + vfmadd213pd %zmm6, %zmm5, %zmm2 + +/* pHL = pH - pHH */ + vsubpd %zmm11, %zmm1, %zmm1 + vpaddd _iOne(%rax), %zmm7, %zmm7{%k1} + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %zmm12, %zmm3, %zmm2 + vpslld $4, %zmm9, %zmm9{%k1} + kxnorw %k1, %k1, %k1 + vgatherdpd 36416(%rax,%ymm9), %zmm14{%k1} + vpmovzxdq %ymm7, %zmm8 + vaddpd %zmm1, %zmm2, %zmm2 + vmovups _cev_1(%rax), %zmm1 + vpsllq $32, %zmm8, %zmm13 + vpternlogq $168, _ifff0000000000000(%rax), %zmm8, %zmm13 + vfmadd213pd _cev_2(%rax), %zmm2, %zmm1 + vmulpd %zmm14, %zmm13, %zmm15 + vfmadd213pd _cev_3(%rax), %zmm2, %zmm1 + vmulpd %zmm2, %zmm15, %zmm3 + vfmadd213pd _cev_4(%rax), %zmm2, %zmm1 + vfmadd213pd _cev_5(%rax), %zmm2, %zmm1 + vfmadd213pd %zmm15, %zmm3, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm12, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), 
%k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1280(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vmovsd 1224(%rsp,%r15), %xmm1 + call JUMPTARGET(__pow_finite) + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vmovsd 1216(%rsp,%r15), %xmm1 + call JUMPTARGET(__pow_finite) + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_1_7 + +#endif +END (_ZGVeN8vv_pow_knl) + +ENTRY (_ZGVeN8vv_pow_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + vpsrlq $32, %zmm0, %zmm10 + kxnorw %k1, %k1, %k1 + kxnorw %k2, %k2, %k2 + kxnorw %k3, %k3, %k3 + vpmovqd %zmm10, %ymm7 + movq __svml_dpow_data@GOTPCREL(%rip), %rax + vmovaps %zmm1, %zmm6 + vpsrlq $32, %zmm6, %zmm13 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vpand _iIndexMask(%rax), %ymm7, %ymm15 + vpaddd _HIDELTA(%rax), %ymm7, %ymm2 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %ymm7, %ymm7 + vmovdqu _ABSMASK(%rax), %ymm4 + vmovdqu _LORANGE(%rax), %ymm3 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vmovups _dbOne(%rax), %zmm11 + vmovdqu _INF(%rax), %ymm5 + vpaddd _iIndexAdd(%rax), %ymm15, %ymm12 + vpmovqd %zmm13, %ymm14 + vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm11 + vpsrld $10, %ymm12, %ymm10 + vpsrad $20, %ymm7, %ymm13 + +/* Index for reciprocal table */ + vpslld $3, %ymm10, %ymm8 + +/* Index for log2 table */ + vpslld $4, %ymm10, %ymm1 + vpcmpgtd %ymm2, %ymm3, %ymm3 + vpand %ymm4, %ymm14, %ymm2 + vpaddd _i2p20_2p19(%rax), %ymm13, %ymm14 + vpmovzxdq %ymm14, %zmm15 + vpsllq $32, %zmm15, %zmm7 + vpternlogq $168, _iffffffff00000000(%rax), %zmm15, %zmm7 + vsubpd _db2p20_2p19(%rax), %zmm7, %zmm13 + vpxord %zmm9, %zmm9, %zmm9 + vgatherdpd 11712(%rax,%ymm8), %zmm9{%k1} + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + kxnorw %k1, %k1, %k1 + vpxord %zmm12, %zmm12, %zmm12 + vpxord %zmm8, %zmm8, %zmm8 + vgatherdpd 19968(%rax,%ymm1), %zmm12{%k2} + vgatherdpd 19976(%rax,%ymm1), %zmm8{%k3} + vmovups _iHighMask(%rax), %zmm1 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vandpd %zmm1, %zmm11, %zmm10 + vsubpd %zmm10, %zmm11, %zmm15 + +/* r1 = x1*rcp1 */ + vmulpd %zmm9, %zmm11, %zmm11 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %zmm11, %zmm9, %zmm10 + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %zmm11, %zmm14 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %zmm10, %zmm9, %zmm15 + +/* T = k + L1hi */ + vaddpd %zmm12, %zmm13, %zmm9 + +/* T_Rh = T + cq */ + vaddpd %zmm14, %zmm9, %zmm11 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %zmm15, %zmm11, %zmm13 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %zmm11, %zmm9, 
%zmm12 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %zmm13, %zmm11, %zmm9 + +/* Rl=Rl+cq */ + vaddpd %zmm12, %zmm14, %zmm10 + +/* HLL+=E; -> El */ + vaddpd %zmm9, %zmm15, %zmm7 + +/* HLL+=Rl */ + vaddpd %zmm10, %zmm7, %zmm12 + +/* 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + vandpd %zmm1, %zmm6, %zmm7 + +/* HLL+=L1lo */ + vaddpd %zmm8, %zmm12, %zmm12 + +/* cq = cq + E */ + vaddpd %zmm15, %zmm14, %zmm8 + vmovups _clv_2(%rax), %zmm14 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_4(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_5(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_6(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_7(%rax), %zmm8, %zmm14 + vfmadd213pd %zmm12, %zmm8, %zmm14 + +/* yL = y-yH */ + vsubpd %zmm7, %zmm6, %zmm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %zmm14, %zmm13, %zmm15 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vandpd %zmm1, %zmm15, %zmm11 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %zmm13, %zmm15, %zmm13 + +/* pH = yH*HH */ + vmulpd %zmm11, %zmm7, %zmm9 + +/* HLL = HLL - HLLhi */ + vsubpd %zmm13, %zmm14, %zmm12 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %zmm11, %zmm15, %zmm10 + vpsrlq $32, %zmm9, %zmm1 + vmovdqu _DOMAINRANGE(%rax), %ymm13 + vpmovqd %zmm1, %ymm1 + vpand %ymm4, %ymm1, %ymm1 + vpcmpgtd %ymm5, %ymm2, %ymm4 + vpcmpeqd %ymm5, %ymm2, %ymm5 + vpternlogd $254, %ymm5, %ymm4, %ymm3 + vpcmpgtd %ymm13, %ymm1, %ymm2 + vpcmpeqd %ymm13, %ymm1, %ymm4 + vpternlogd $254, %ymm4, %ymm2, %ymm3 + +/* pLL = y*HLL */ + vmovups _db2p45_2p44(%rax), %zmm2 + +/* pHH = pH + *(double*)&db2p45_2p44 */ + vaddpd %zmm2, %zmm9, %zmm1 + vpmovqd %zmm1, %ymm5 + +/* j = Lo(pHH)&0x0000007f */ + vpand _jIndexMask(%rax), %ymm5, %ymm14 + vpslld $4, %ymm14, %ymm15 + vmovmskps %ymm3, %ecx + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %zmm10, %zmm8, %zmm3 + vfmadd213pd %zmm3, %zmm10, %zmm7 + vfmadd213pd %zmm7, %zmm11, %zmm8 + +/* _n = Lo(pHH) + _n = _n & 0xffffff80 + _n = _n >> 7 + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %ymm5, %ymm7 + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %zmm6, %zmm12, %zmm8 + vpaddd _iOne(%rax), %ymm7, %ymm10 + vpmovzxdq %ymm10, %zmm11 + vpsllq $32, %zmm11, %zmm3 + vpternlogq $168, _ifff0000000000000(%rax), %zmm11, %zmm3 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd %zmm2, %zmm1, %zmm11 + vmovups _cev_1(%rax), %zmm2 + +/* pHL = pH - pHH */ + vsubpd %zmm11, %zmm9, %zmm9 + vaddpd %zmm9, %zmm8, %zmm8 + vfmadd213pd _cev_2(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_3(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_4(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_5(%rax), %zmm8, %zmm2 + vpxord %zmm4, %zmm4, %zmm4 + vgatherdpd 36416(%rax,%ymm15), %zmm4{%k1} + vmulpd %zmm4, %zmm3, %zmm1 + vmulpd %zmm8, %zmm1, %zmm12 + vfmadd213pd %zmm1, %zmm12, %zmm2 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm2, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm6, 1216(%rsp) + vmovups %zmm2, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups 
%zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1280(%rsp), %zmm2 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1224(%rsp,%r15), %xmm1 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1216(%rsp,%r15), %xmm1 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8vv_pow_skx) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S new file mode 100644 index 0000000000..e35654be8d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sin. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_sin) + .type _ZGVbN2v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_sin_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_sin_sse2(%rip), %rax + ret +END (_ZGVbN2v_sin) +libmvec_hidden_def (_ZGVbN2v_sin) + +#define _ZGVbN2v_sin _ZGVbN2v_sin_sse2 +#include "../svml_d_sin2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S new file mode 100644 index 0000000000..393ba03b76 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S @@ -0,0 +1,229 @@ +/* Function sin vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2v_sin_sse4) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups __dAbsMask(%rax), %xmm3 +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + movaps %xmm3, %xmm4 + +/* SignX - sign bit of X */ + andnps %xmm5, %xmm3 + movups __dInvPI(%rax), %xmm2 + andps %xmm5, %xmm4 + +/* Y = X'*InvPi + RS : right shifter add */ + mulpd %xmm4, %xmm2 + movups __dRShifter(%rax), %xmm6 + +/* R = X' - N*Pi1 */ + movaps %xmm4, %xmm0 + addpd %xmm6, %xmm2 + cmpnlepd __dRangeVal(%rax), %xmm4 + +/* N = Y - RS : right shifter sub */ + movaps %xmm2, %xmm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + psllq $63, %xmm2 + subpd %xmm6, %xmm1 + movmskpd %xmm4, %ecx + movups __dPI1(%rax), %xmm7 + mulpd %xmm1, %xmm7 + movups __dPI2(%rax), %xmm6 + +/* R = R - N*Pi2 */ + mulpd %xmm1, %xmm6 + subpd %xmm7, %xmm0 + movups __dPI3(%rax), %xmm7 + +/* R = R - N*Pi3 */ + mulpd %xmm1, %xmm7 + subpd %xmm6, %xmm0 + movups __dPI4(%rax), %xmm6 + +/* R = R - N*Pi4 */ + mulpd %xmm6, %xmm1 + subpd %xmm7, %xmm0 + subpd %xmm1, %xmm0 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + movaps %xmm0, %xmm1 + mulpd %xmm0, %xmm1 + +/* R = R^SignRes : update sign of reduced argument */ + xorps %xmm2, %xmm0 + movups __dC7_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC6_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC5_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC4_sin(%rax), %xmm2 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + mulpd %xmm1, %xmm2 + addpd __dC3_sin(%rax), %xmm2 + +/* 
Poly = R2*(C1+R2*(C2+R2*Poly)) */ + mulpd %xmm1, %xmm2 + addpd __dC2_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC1_sin(%rax), %xmm2 + mulpd %xmm2, %xmm1 + +/* Poly = Poly*R + R */ + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm0 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + xorps %xmm3, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_sin_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S new file mode 100644 index 0000000000..f4482d3a11 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sin, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_sin) + .type _ZGVdN4v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_sin_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_sin_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_sin) +libmvec_hidden_def (_ZGVdN4v_sin) + +#define _ZGVdN4v_sin _ZGVdN4v_sin_sse_wrapper +#include "../svml_d_sin4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S new file mode 100644 index 0000000000..b035fa1b15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S @@ -0,0 +1,210 @@ +/* Function sin vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4v_sin_avx2) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm4 + vmovupd __dAbsMask(%rax), %ymm2 + vmovupd __dInvPI(%rax), %ymm6 + vmovupd __dRShifter(%rax), %ymm5 + vmovupd __dPI1_FMA(%rax), %ymm7 +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vandpd %ymm2, %ymm4, %ymm3 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %ymm5, %ymm3, %ymm6 + +/* N = Y - RS : right shifter sub */ + vsubpd %ymm5, %ymm6, %ymm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm6, %ymm5 + +/* R = X' - N*Pi1 */ + vmovapd %ymm3, %ymm0 + vfnmadd231pd %ymm1, %ymm7, %ymm0 + vcmpnle_uqpd __dRangeVal(%rax), %ymm3, %ymm3 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm1, %ymm0 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %ymm0, %ymm1 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %ymm1, %ymm1, %ymm0 + +/* R = R^SignRes : update sign of reduced argument */ + vxorpd %ymm5, %ymm1, %ymm6 + vmovupd __dC7_sin(%rax), %ymm1 + vfmadd213pd __dC6_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC5_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC4_sin(%rax), %ymm0, %ymm1 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %ymm0, %ymm1 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC1_sin(%rax), %ymm0, %ymm1 + +/* SignX - sign bit of X */ + vandnpd %ymm4, %ymm2, %ymm7 + vmulpd %ymm0, %ymm1, %ymm2 + +/* Poly = Poly*R + R */ + 
vfmadd213pd %ymm6, %ymm6, %ymm2 + vmovmskpd %ymm3, %ecx + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vxorpd %ymm7, %ymm2, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm4, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_sin_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S new file mode 100644 index 0000000000..2b15889c71 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sin. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_sin) + .type _ZGVeN8v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_sin_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_sin_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_sin) + +#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper +#include "../svml_d_sin8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S new file mode 100644 index 0000000000..7580e60636 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S @@ -0,0 +1,465 @@ +/* Function sin vectorized with AVX-512, KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_sin_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movq $-1, %rdx + vmovups __dAbsMask(%rax), %zmm6 + vmovups __dInvPI(%rax), %zmm1 + +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vpandq %zmm6, %zmm0, %zmm12 + vmovups __dPI1_FMA(%rax), %zmm2 + vmovups __dC7_sin(%rax), %zmm7 + +/* SignX - sign bit of X */ + vpandnq %zmm0, %zmm6, %zmm11 + +/* R = X' - N*Pi1 */ + vmovaps %zmm12, %zmm3 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1 + vcmppd $22, __dRangeVal(%rax), %zmm12, %k1 + vpbroadcastq %rdx, %zmm13{%k1}{z} + +/* N = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm1, %zmm4 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm1, %zmm5 + vptestmq %zmm13, %zmm13, %k0 + vfnmadd231pd %zmm4, %zmm2, %zmm3 + kmovw %k0, %ecx + movzbl %cl, %ecx + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm4, %zmm4, %zmm8 + +/* R = R^SignRes : update sign of reduced argument */ + vpxorq %zmm5, %zmm4, %zmm9 + vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7 + +/* 
Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7 + vmulpd %zmm8, %zmm7, %zmm10 + +/* Poly = Poly*R + R */ + vfmadd213pd %zmm9, %zmm9, %zmm10 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vpxorq %zmm11, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(sin) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(sin) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_sin_knl) + +ENTRY (_ZGVeN8v_sin_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq 
$1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 + vmovups __dPI1_FMA(%rax), %zmm3 + vmovups __dC7_sin(%rax), %zmm8 + +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vandpd %zmm7, %zmm0, %zmm13 + +/* SignX - sign bit of X */ + vandnpd %zmm0, %zmm7, %zmm12 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm1, %zmm13, %zmm2 + vcmppd $18, __dRangeVal(%rax), %zmm13, %k1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm2, %zmm6 + +/* N = Y - RS : right shifter sub */ + vsubpd %zmm1, %zmm2, %zmm5 + +/* R = X' - N*Pi1 */ + vmovaps %zmm13, %zmm4 + vfnmadd231pd %zmm5, %zmm3, %zmm4 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm5, %zmm5, %zmm9 + +/* R = R^SignRes : update sign of reduced argument */ + vxorpd %zmm6, %zmm5, %zmm10 + vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8 + vmulpd %zmm9, %zmm8, %zmm11 + +/* Poly = Poly*R + R */ + vfmadd213pd %zmm10, %zmm10, %zmm11 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vxorpd %zmm12, %zmm11, %zmm1 + vpandnq %zmm13, %zmm13, %zmm14{%k1} + vcmppd $3, %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), 
%zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_sin_skx) + + .section .rodata, "a" +.L_2il0floatpacket.14: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.14,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S new file mode 100644 index 0000000000..13279e3fb7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2vvv_sincos) + .type _ZGVbN2vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2vvv_sincos_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2vvv_sincos_sse2(%rip), %rax + ret +END (_ZGVbN2vvv_sincos) +libmvec_hidden_def (_ZGVbN2vvv_sincos) + +#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2 +#include "../svml_d_sincos2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S new file mode 100644 index 0000000000..c46109f35d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S @@ -0,0 +1,368 @@ +/* Function sincos vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2vl8l8_sincos_sse4) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups %xmm11, 160(%rsp) + movups %xmm12, 144(%rsp) + movups __dSignMask(%rax), %xmm11 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + movaps %xmm11, %xmm4 + +/* Grab sign bit from argument */ + movaps %xmm11, %xmm7 + movups __dInvPI(%rax), %xmm5 + andnps %xmm0, %xmm4 + +/* SinY = X'*InvPi + RS : right shifter add */ + mulpd %xmm4, %xmm5 + addpd __dRShifter(%rax), %xmm5 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + movaps %xmm5, %xmm12 + andps %xmm0, %xmm7 + +/* SinN = Y - RS : right shifter sub */ + subpd __dRShifter(%rax), %xmm5 + movups %xmm10, 176(%rsp) + psllq $63, %xmm12 + movups __dPI1(%rax), %xmm10 + +/* SinR = X' - SinN*Pi1 */ + movaps %xmm10, %xmm1 + mulpd %xmm5, %xmm1 + movups __dPI2(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi1 */ + movaps %xmm6, %xmm2 + mulpd %xmm5, %xmm2 + movups %xmm13, 112(%rsp) + movaps %xmm4, %xmm13 + subpd %xmm1, %xmm13 + subpd %xmm2, %xmm13 + +/* Sine result sign: SinRSign = SignMask & SinR */ + movaps %xmm11, %xmm2 + +/* CosR = SinX - CosN*Pi1 */ + movaps %xmm4, %xmm1 + movups __dOneHalf(%rax), %xmm3 + andps %xmm13, %xmm2 + +/* Set SinRSign to 0.5 */ + orps %xmm2, %xmm3 + +/* Update CosRSign and CosSignRes signs */ + xorps %xmm11, %xmm2 + +/* CosN = SinN +(-)0.5 */ + addpd %xmm5, %xmm3 + cmpnlepd __dRangeVal(%rax), %xmm4 + mulpd %xmm3, %xmm10 + +/* CosR = CosR - CosN*Pi2 */ + mulpd %xmm3, %xmm6 + subpd %xmm10, %xmm1 + movmskpd %xmm4, %ecx + movups __dPI3(%rax), %xmm10 + xorps %xmm12, %xmm2 + subpd %xmm6, %xmm1 + +/* SinR = SinR - SinN*Pi3 */ + movaps %xmm10, %xmm6 + +/* Final reconstruction. 
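+   Only the sign words accumulated above remain to be applied: the sine
+   result is SinPoly ^ (SinSignRes ^ SignX) and the cosine result is
+   CosPoly ^ (CosRSign ^ SinSignRes), both folded in with XOR.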
+ Combine Sin result's sign */ + xorps %xmm7, %xmm12 + mulpd %xmm5, %xmm6 + +/* CosR = CosR - CosN*Pi3 */ + mulpd %xmm3, %xmm10 + subpd %xmm6, %xmm13 + subpd %xmm10, %xmm1 + movups __dPI4(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi4 */ + mulpd %xmm6, %xmm5 + +/* CosR = CosR - CosN*Pi4 */ + mulpd %xmm6, %xmm3 + subpd %xmm5, %xmm13 + subpd %xmm3, %xmm1 + +/* SinR2 = SinR^2 */ + movaps %xmm13, %xmm6 + +/* CosR2 = CosR^2 */ + movaps %xmm1, %xmm10 + mulpd %xmm13, %xmm6 + mulpd %xmm1, %xmm10 + +/* Polynomial approximation */ + movups __dC7(%rax), %xmm5 + movaps %xmm5, %xmm3 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC6(%rax), %xmm3 + addpd __dC6(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC5(%rax), %xmm3 + addpd __dC5(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC4(%rax), %xmm3 + addpd __dC4(%rax), %xmm5 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + mulpd %xmm10, %xmm5 + addpd __dC3(%rax), %xmm3 + addpd __dC3(%rax), %xmm5 + +/* SinPoly = C2 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C2 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC2(%rax), %xmm3 + addpd __dC2(%rax), %xmm5 + +/* SinPoly = C1 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C1 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC1(%rax), %xmm3 + addpd __dC1(%rax), %xmm5 + +/* SinPoly = SinR2*SinPoly */ + mulpd %xmm3, %xmm6 + +/* CosPoly = CosR2*CosPoly */ + mulpd %xmm5, %xmm10 + +/* SinPoly = SinR*SinPoly */ + mulpd %xmm13, %xmm6 + +/* CosPoly = CosR*CosPoly */ + mulpd %xmm1, %xmm10 + addpd %xmm6, %xmm13 + addpd %xmm10, %xmm1 + +/* Update Sin result's sign */ + xorps %xmm12, %xmm13 + +/* Update Cos result's sign */ + xorps %xmm2, %xmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 176(%rsp), %xmm10 + movaps %xmm13, (%rdi) + movups 160(%rsp), %xmm11 + movups 144(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups %xmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 128(%rsp) + movups %xmm13, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 48(%rsp) + movups %xmm9, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 64(%rsp) + movq %r12, 104(%rsp) + cfi_offset_rel_rsp (12, 104) + movb %dl, %r12b + movq %r13, 96(%rsp) + cfi_offset_rel_rsp (13, 96) + movl %eax, %r13d + movq %r14, 88(%rsp) + cfi_offset_rel_rsp (14, 88) + movl %ecx, %r14d + movq %r15, 80(%rsp) + cfi_offset_rel_rsp (15, 80) + movq %rbx, 72(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 48(%rsp), %xmm8 + movq %rbx, %rdi + movups 32(%rsp), %xmm9 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 64(%rsp), %rsi + movq 104(%rsp), %r12 + cfi_restore (%r12) + movq 96(%rsp), %r13 + cfi_restore (%r13) + movq 88(%rsp), %r14 + cfi_restore (%r14) + movq 80(%rsp), %r15 + cfi_restore (%r15) + movq 72(%rsp), %rbx + movups 192(%rsp), %xmm13 + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 136(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 200(%rsp,%r15) + movsd 136(%rsp,%r15), %xmm0 
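+
+/* %xmm0 is overwritten by each scalar call, so the element was reloaded
+   above; the cos half of this lane now goes through the same scalar
+   fallback as the sin half.  */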
+ + call JUMPTARGET(cos) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 128(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 192(%rsp,%r15) + movsd 128(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 +END (_ZGVbN2vl8l8_sincos_sse4) +libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4) + +/* vvv version implemented with wrapper to vl8l8 variant. */ +ENTRY (_ZGVbN2vvv_sincos_sse4) +#ifndef __ILP32__ + subq $72, %rsp + .cfi_def_cfa_offset 80 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movq 32(%rsp), %rdx + movq 48(%rsp), %rsi + movq 40(%rsp), %r8 + movq 56(%rsp), %r10 + movq (%rsp), %rax + movq 16(%rsp), %rcx + movq 8(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq %r9, (%r10) + addq $72, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movdqa 16(%esp), %xmm1 + movsd 32(%esp), %xmm0 + movq %xmm1, %rax + movdqa (%esp), %xmm2 + movsd %xmm0, (%eax) + movsd 40(%esp), %xmm0 + pextrd $1, %xmm1, %eax + movsd %xmm0, (%eax) + movsd 48(%esp), %xmm0 + movq %xmm2, %rax + movsd %xmm0, (%eax) + movsd 56(%esp), %xmm0 + pextrd $1, %xmm2, %eax + movsd %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif +END (_ZGVbN2vvv_sincos_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S new file mode 100644 index 0000000000..8aacb8e76a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4vvv_sincos) + .type _ZGVdN4vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4vvv_sincos_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4vvv_sincos) +libmvec_hidden_def (_ZGVdN4vvv_sincos) + +#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper +#include "../svml_d_sincos4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S new file mode 100644 index 0000000000..a6318c5ca6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S @@ -0,0 +1,373 @@ +/* Function sincos vectorized with AVX2. 
+ Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4vl8l8_sincos_avx2) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovups %ymm14, 288(%rsp) + vmovups %ymm8, 352(%rsp) + vmovupd __dSignMask(%rax), %ymm6 + vmovupd __dInvPI(%rax), %ymm2 + vmovupd __dPI1_FMA(%rax), %ymm5 + vmovups %ymm9, 224(%rsp) + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %ymm0, %ymm6, %ymm1 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %ymm1, %ymm2 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm2, %ymm4 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %ymm2, %ymm2 + +/* SinR = X' - SinN*Pi1 */ + vmovdqa %ymm1, %ymm14 + vfnmadd231pd %ymm2, %ymm5, %ymm14 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm2, %ymm14 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %ymm14, %ymm6, %ymm7 + +/* Set SinRSign to 0.5 */ + vorpd __dOneHalf(%rax), %ymm7, %ymm3 + +/* CosN = SinN +(-)0.5 */ + vaddpd %ymm3, %ymm2, %ymm3 + +/* CosR = SinX - CosN*Pi1 */ + vmovdqa %ymm1, %ymm8 + vfnmadd231pd %ymm3, %ymm5, %ymm8 + vmovupd __dPI3_FMA(%rax), %ymm5 + vcmpnle_uqpd __dRangeVal(%rax), %ymm1, %ymm1 + +/* CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm3, %ymm8 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %ymm14, %ymm5, %ymm2 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %ymm8, %ymm5, %ymm3 + vmovupd __dC6(%rax), %ymm8 + +/* SinR2 = SinR^2 */ + vmulpd %ymm2, %ymm2, %ymm14 + +/* CosR2 = CosR^2 */ + vmulpd %ymm3, %ymm3, %ymm5 + +/* Grab SignX */ + vandpd %ymm0, %ymm6, %ymm9 + +/* Update CosRSign and CosSignRes signs */ + vxorpd %ymm6, %ymm7, %ymm6 + vxorpd %ymm6, %ymm4, %ymm7 + +/* Update sign SinSignRes */ + vxorpd %ymm9, %ymm4, %ymm6 + +/* Polynomial approximation */ + vmovupd __dC7(%rax), %ymm4 + vmovdqa %ymm8, %ymm9 + vfmadd231pd __dC7(%rax), %ymm14, %ymm9 + vfmadd213pd %ymm8, %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm14, %ymm9 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm14, %ymm9 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + 
vfmadd213pd __dC3(%rax), %ymm14, %ymm9 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %ymm14, %ymm9 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %ymm14, %ymm9 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %ymm14, %ymm9, %ymm8 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %ymm5, %ymm4, %ymm4 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %ymm2, %ymm2, %ymm8 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %ymm3, %ymm3, %ymm4 + vmovmskpd %ymm1, %ecx + +/* Final reconstruction + Update Sin result's sign */ + vxorpd %ymm6, %ymm8, %ymm3 + +/* Update Cos result's sign */ + vxorpd %ymm7, %ymm4, %ymm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 352(%rsp), %ymm8 + vmovups 224(%rsp), %ymm9 + vmovups 288(%rsp), %ymm14 + vmovupd %ymm2, (%rsi) + vmovdqa %ymm3, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm0, 256(%rsp) + vmovupd %ymm3, 320(%rsp) + vmovupd %ymm2, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm10, 128(%rsp) + vmovups %ymm11, 96(%rsp) + vmovups %ymm12, 64(%rsp) + vmovups %ymm13, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 160(%rsp) + movq %r12, 200(%rsp) + cfi_offset_rel_rsp (12, 200) + movb %dl, %r12b + movq %r13, 192(%rsp) + cfi_offset_rel_rsp (13, 192) + movl %eax, %r13d + movq %r14, 184(%rsp) + cfi_offset_rel_rsp (14, 184) + movl %ecx, %r14d + movq %r15, 176(%rsp) + cfi_offset_rel_rsp (15, 176) + movq %rbx, 168(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 128(%rsp), %ymm10 + movq %rbx, %rdi + vmovups 96(%rsp), %ymm11 + vmovups 64(%rsp), %ymm12 + vmovups 32(%rsp), %ymm13 + vmovups (%rsp), %ymm15 + vmovupd 320(%rsp), %ymm3 + vmovupd 384(%rsp), %ymm2 + movq 160(%rsp), %rsi + movq 200(%rsp), %r12 + cfi_restore (%r12) + movq 192(%rsp), %r13 + cfi_restore (%r13) + movq 184(%rsp), %r14 + cfi_restore (%r14) + movq 176(%rsp), %r15 + cfi_restore (%r15) + movq 168(%rsp), %rbx + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 264(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 328(%rsp,%r15) + vmovsd 264(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 256(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 320(%rsp,%r15) + vmovsd 256(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4vl8l8_sincos_avx2) +libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2) + +/* vvv version implemented with wrapper to vl8l8 variant. 
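+   The vvv ABI hands over four sine result pointers in %ymm1 and four
+   cosine result pointers in %ymm2, while the vl8l8 kernel wants two
+   linear output buffers in %rdi/%rsi.  The shim below spills the
+   pointer vectors, runs the kernel on stack buffers and scatters the
+   lanes back out.  A rough C model of the LP64 path (hypothetical
+   names; the x32 branch differs only in pointer size):
+
+     void vvv_sincos (__m256d x, double *sp[4], double *cp[4])
+     {
+       double sbuf[4], cbuf[4];
+       vl8l8_sincos (x, sbuf, cbuf);  // kernel fills linear buffers
+       for (int i = 0; i < 4; i++)
+         {
+           *sp[i] = sbuf[i];          // scatter sin results
+           *cp[i] = cbuf[i];          // scatter cos results
+         }
+     }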
*/ +ENTRY (_ZGVdN4vvv_sincos_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $128, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movq 64(%rsp), %rdx + movq 96(%rsp), %rsi + movq 72(%rsp), %r8 + movq 104(%rsp), %r10 + movq (%rsp), %rax + movq 32(%rsp), %rcx + movq 8(%rsp), %rdi + movq 40(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 80(%rsp), %rax + movq 112(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 88(%rsp), %rdi + movq 120(%rsp), %r9 + movq 16(%rsp), %r11 + movq 48(%rsp), %rdx + movq 24(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $104, %esp + vmovaps %xmm1, -96(%ebp) + vmovaps %xmm2, -112(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movl -96(%ebp), %eax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -92(%ebp), %eax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -88(%ebp), %eax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -84(%ebp), %eax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -112(%ebp), %eax + vmovsd -48(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -108(%ebp), %eax + vmovsd -40(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -104(%ebp), %eax + vmovsd -32(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -100(%ebp), %eax + vmovsd -24(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $104, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +END (_ZGVdN4vvv_sincos_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S new file mode 100644 index 0000000000..3c0abc379e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8vvv_sincos) + .type _ZGVeN8vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8vvv_sincos_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8vvv_sincos_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8vvv_sincos) + +#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper +#include "../svml_d_sincos8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S new file mode 100644 index 0000000000..c9207558c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S @@ -0,0 +1,763 @@ +/* Function sincos vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. 
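+
+   An illustrative scalar sketch of the same scheme (the names here
+   are ours, not the symbols used below; the vector code keeps Pi
+   split into the three parts dPI1/dPI2/dPI3 and applies the (-1)^N
+   factors as sign masks instead of branching):
+
+     n  = rint (x * InvPi)                    (SinN, via right shifter)
+     r  = ((x - n*Pi1) - n*Pi2) - n*Pi3       (reduced argument SinR)
+     r2 = r * r
+     p  = C1 + r2*(C2 + r2*(C3 + r2*(C4 + r2*(C5 + r2*(C6 + r2*C7)))))
+     sin(x) ~ (-1)^n * (r + r*r2*p)
+
+   and likewise for cos(x) with n' = rint ((x + Pi/2) * InvPi).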
*/ + + .text +ENTRY (_ZGVeN8vl8l8_sincos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm4 + movq $-1, %rdx + vmovups __dSignMask(%rax), %zmm12 + vmovups __dInvPI(%rax), %zmm5 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vpandnq %zmm4, %zmm12, %zmm3 + vmovups __dPI1_FMA(%rax), %zmm7 + vmovups __dPI3_FMA(%rax), %zmm9 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm3, %zmm8 + +/* CosR = SinX - CosN*Pi1 */ + vmovaps %zmm3, %zmm10 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 + vmovups __dC6(%rax), %zmm13 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm5, %zmm1 + vmovaps %zmm13, %zmm14 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm5, %zmm2 + vcmppd $22, __dRangeVal(%rax), %zmm3, %k1 + +/* Update CosRSign and CosSignRes signs */ + vmovaps %zmm12, %zmm5 + vfnmadd231pd %zmm1, %zmm7, %zmm8 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vpandq %zmm8, %zmm12, %zmm11 + +/* Set SinRSign to 0.5 */ + vporq __dOneHalf(%rax), %zmm11, %zmm6 + vpternlogq $150, %zmm2, %zmm11, %zmm5 + +/* Update sign SinSignRes */ + vpternlogq $120, %zmm4, %zmm12, %zmm2 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm11 + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm6, %zmm1, %zmm0 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %zmm8, %zmm9, %zmm1 + vfnmadd231pd %zmm0, %zmm7, %zmm10 + +/* SinR2 = SinR^2 */ + vmulpd %zmm1, %zmm1, %zmm15 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10 + vfmadd231pd __dC7(%rax), %zmm15, %zmm14 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %zmm10, %zmm9, %zmm0 + vfmadd213pd __dC5(%rax), %zmm15, %zmm14 + +/* CosR2 = CosR^2 */ + vmulpd %zmm0, %zmm0, %zmm12 + vfmadd213pd __dC4(%rax), %zmm15, %zmm14 + vfmadd213pd %zmm13, %zmm12, %zmm11 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm15, %zmm14 + vfmadd213pd __dC5(%rax), %zmm12, %zmm11 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %zmm15, %zmm14 + vfmadd213pd __dC4(%rax), %zmm12, %zmm11 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm13 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm1, %zmm1, %zmm13 + vpbroadcastq %rdx, %zmm1{%k1}{z} + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %zmm12, %zmm11 + vptestmq %zmm1, %zmm1, %k0 + kmovw %k0, %ecx + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm12, %zmm11, %zmm14 + movzbl %cl, %ecx + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm0, %zmm0, %zmm14 + +/* Final reconstruction. 
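+   The sign words accumulated during range reduction (SinSignRes in
+   %zmm2, CosSignRes in %zmm5) are XORed into the polynomial results.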
+ Update Sin result's sign */ + vpxorq %zmm2, %zmm13, %zmm0 + +/* Update Cos result's sign */ + vpxorq %zmm5, %zmm14, %zmm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm2, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm4, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm2, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movq %rbx, %rdi + kmovw 1048(%rsp), %k4 + movq 1056(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + kmovw 1032(%rsp), %k6 + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm2 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_1_7 + +#endif +END (_ZGVeN8vl8l8_sincos_knl) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl) + +ENTRY (_ZGVeN8vl8l8_sincos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm8 + vmovups __dSignMask(%rax), %zmm4 + vmovups __dInvPI(%rax), %zmm9 + vmovups __dRShifter(%rax), %zmm10 + vmovups 
__dPI1_FMA(%rax), %zmm13 + vmovups __dPI2_FMA(%rax), %zmm14 + vmovups __dOneHalf(%rax), %zmm11 + vmovups __dPI3_FMA(%rax), %zmm2 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %zmm8, %zmm4, %zmm7 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm10, %zmm7, %zmm9 + vcmppd $18, __dRangeVal(%rax), %zmm7, %k1 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm9, %zmm6 + +/* SinN = Y - RS : right shifter sub */ + vsubpd %zmm10, %zmm9, %zmm5 + vmovups __dC5(%rax), %zmm9 + vmovups __dC4(%rax), %zmm10 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm7, %zmm15 + vfnmadd231pd %zmm5, %zmm13, %zmm15 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd %zmm5, %zmm14, %zmm15 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %zmm15, %zmm4, %zmm1 + +/* Set SinRSign to 0.5 */ + vorpd %zmm1, %zmm11, %zmm12 + vmovups __dC3(%rax), %zmm11 + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm12, %zmm5, %zmm3 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %zmm15, %zmm2, %zmm5 + vmovups __dC2(%rax), %zmm12 + +/* SinR2 = SinR^2 */ + vmulpd %zmm5, %zmm5, %zmm15 + +/* CosR = SinX - CosN*Pi1 */ + vmovaps %zmm7, %zmm0 + vfnmadd231pd %zmm3, %zmm13, %zmm0 + vmovups __dC1(%rax), %zmm13 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd %zmm3, %zmm14, %zmm0 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %zmm0, %zmm2, %zmm3 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm0 + +/* Update CosRSign and CosSignRes signs */ + vmovaps %zmm4, %zmm2 + vpternlogq $150, %zmm6, %zmm1, %zmm2 + +/* Update sign SinSignRes */ + vpternlogq $120, %zmm8, %zmm4, %zmm6 + +/* CosR2 = CosR^2 */ + vmulpd %zmm3, %zmm3, %zmm1 + vmovups __dC6(%rax), %zmm4 + vmovaps %zmm0, %zmm14 + vfmadd213pd %zmm4, %zmm1, %zmm0 + vfmadd213pd %zmm4, %zmm15, %zmm14 + vfmadd213pd %zmm9, %zmm1, %zmm0 + vfmadd213pd %zmm9, %zmm15, %zmm14 + vfmadd213pd %zmm10, %zmm1, %zmm0 + vfmadd213pd %zmm10, %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd %zmm11, %zmm1, %zmm0 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd %zmm11, %zmm15, %zmm14 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd %zmm12, %zmm1, %zmm0 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd %zmm12, %zmm15, %zmm14 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd %zmm13, %zmm1, %zmm0 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd %zmm13, %zmm15, %zmm14 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm1, %zmm0, %zmm1 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm4 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm3, %zmm3, %zmm1 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 + vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 + +/* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 + +/* Final reconstruction. 
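+   The cosine sign was merged just above; the sine sign correction
+   (SinSignRes, %zmm6) is applied the same way.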
+ Update Sin result's sign */ + vxorpd %zmm6, %zmm4, %zmm0 + vpandnq %zmm7, %zmm7, %zmm3{%k1} + vcmppd $3, %zmm3, %zmm3, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm8, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_2_6: + btl %r13d, %r14d + jc .LBL_2_13 + +.LBL_2_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + movq %rbx, %rdi + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm1 + movq 1056(%rsp), %rsi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8vl8l8_sincos_skx) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx) + +/* Wrapper between vvv and vl8l8 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl8l8 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + /* Encoding for vmovups %zmm1, 128(%rsp). 
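+   The two stores are emitted as raw EVEX bytes, presumably so that
+   this wrapper still assembles when the assembler lacks AVX-512
+   mnemonics.  Note the one-byte displacements: EVEX disp8 is scaled
+   by the 64-byte vector width, so 0x02 encodes 128 and 0x03 encodes
+   192.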
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x02 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movq 64(%rsp), %r10 + movq 72(%rsp), %rax + movq 80(%rsp), %rcx + movq 88(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movq 96(%rsp), %r9 + movq 104(%rsp), %r11 + movq 112(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $232, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + call HIDDEN_JUMPTARGET(\callee) + vmovdqa -208(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -200(%ebp), %rax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -192(%ebp), %rax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -184(%ebp), %rax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovdqa -240(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -232(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -224(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -216(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $232, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN8vvv_sincos_knl) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl +END (_ZGVeN8vvv_sincos_knl) + +ENTRY (_ZGVeN8vvv_sincos_skx) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx +END (_ZGVeN8vvv_sincos_skx) + + .section .rodata, "a" +.L_2il0floatpacket.15: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.15,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S new file mode 100644 index 0000000000..cd67665972 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized cosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_cosf) + .type _ZGVeN16v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_cosf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_cosf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_cosf) + +#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper +#include "../svml_s_cosf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S new file mode 100644 index 0000000000..611bb5dd2d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S @@ -0,0 +1,460 @@ +/* Function cosf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_cosf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rdx + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 + movl $-1, %eax + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rdx), %zmm0, %zmm2 + vmovups __sRShifter(%rdx), %zmm3 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2 + vmovups __sPI1_FMA(%rdx), %zmm5 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm3, %zmm2, %zmm4 + vmovups __sA9_FMA(%rdx), %zmm9 + +/* Check for large and special arguments */ + vpandd __sAbsMask(%rdx), %zmm0, %zmm1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %zmm2, %zmm8 + vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1 + vpbroadcastd %eax, %zmm12{%k1}{z} + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rdx), %zmm4, %zmm7 + vptestmd %zmm12, %zmm12, %k0 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + kmovw %k0, %ecx + vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6 + vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7 + +/* a) Calculate X^2 = X * X */ + vmulps %zmm7, %zmm7, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vpxord %zmm8, %zmm7, %zmm11 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9 + vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9 + vfmadd213ps __sA3(%rdx), %zmm10, %zmm9 + vmulps %zmm10, %zmm9, %zmm1 + vfmadd213ps %zmm11, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 
1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16v_cosf_knl) + +ENTRY (_ZGVeN16v_cosf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 + vmovups .L_2il0floatpacket.13(%rip), %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rax), %zmm0, %zmm2 + +/* Check for large and special arguments */ + vandps __sAbsMask(%rax), %zmm0, %zmm1 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2 + vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1 + +/* + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %zmm2, %zmm8 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm3, %zmm2, %zmm4 + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rax), %zmm4, %zmm7 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6 + vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7 + +/* a) Calculate X^2 = X * X */ + vmulps %zmm7, %zmm7, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %zmm8, %zmm7, %zmm11 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9 + vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9 + vfmadd213ps __sA3(%rax), %zmm10, %zmm9 + vpandnd %zmm1, %zmm1, %zmm12{%k1} + vmulps %zmm10, %zmm9, %zmm1 + vptestmd %zmm12, %zmm12, %k0 + vfmadd213ps %zmm11, %zmm11, %zmm1 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), 
%xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16v_cosf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S new file mode 100644 index 0000000000..d73d7c7e3f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cosf, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_cosf) + .type _ZGVbN4v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_cosf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_cosf_sse2(%rip), %rax + ret +END (_ZGVbN4v_cosf) +libmvec_hidden_def (_ZGVbN4v_cosf) + +#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2 +#include "../svml_s_cosf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S new file mode 100644 index 0000000000..73797e1a93 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S @@ -0,0 +1,227 @@ +/* Function cosf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVbN4v_cosf_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm4 + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups __sHalfPI(%rax), %xmm1 + movups __sRShifter(%rax), %xmm5 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + addps %xmm4, %xmm1 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + mulps __sInvPI(%rax), %xmm1 + movups __sPI1(%rax), %xmm6 + addps %xmm5, %xmm1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + movaps %xmm1, %xmm2 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + subps %xmm5, %xmm1 + movups __sPI2(%rax), %xmm7 + pslld $31, %xmm2 + movups __sPI3(%rax), %xmm5 + movups __sAbsMask(%rax), %xmm3 + +/* Check for large and special arguments */ + andps %xmm4, %xmm3 + +/* g) Subtract 0.5 from result for octant correction */ + subps __sOneHalf(%rax), %xmm1 + cmpnleps __sRangeReductionVal(%rax), %xmm3 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + */ + mulps %xmm1, %xmm6 + mulps %xmm1, %xmm7 + mulps %xmm1, %xmm5 + subps %xmm6, %xmm0 + movmskps %xmm3, %ecx + movups __sPI4(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm6, %xmm1 + subps %xmm5, %xmm0 + subps %xmm1, %xmm0 + +/* a) Calculate X^2 = X * X */ + movaps %xmm0, %xmm1 + mulps %xmm0, %xmm1 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + xorps %xmm2, %xmm0 + movups __sA9(%rax), %xmm2 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + mulps %xmm1, %xmm2 + addps __sA7(%rax), %xmm2 + mulps %xmm1, %xmm2 + addps __sA5(%rax), %xmm2 + mulps %xmm1, %xmm2 + addps __sA3(%rax), %xmm2 + mulps %xmm2, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm4, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc 
.LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 +END (_ZGVbN4v_cosf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S new file mode 100644 index 0000000000..f7530c138a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cosf, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_cosf) + .type _ZGVdN8v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_cosf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_cosf) +libmvec_hidden_def (_ZGVdN8v_cosf) + +#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper +#include "../svml_s_cosf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S new file mode 100644 index 0000000000..c61add3bb9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S @@ -0,0 +1,215 @@ +/* Function cosf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVdN8v_cosf_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups __sRShifter(%rax), %ymm5 + vmovups __sPI1_FMA(%rax), %ymm7 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rax), %ymm2, %ymm4 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %ymm5, %ymm4, %ymm6 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %ymm4, %ymm0 + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rax), %ymm6, %ymm4 + +/* Check for large and special arguments */ + vandps __sAbsMask(%rax), %ymm2, %ymm3 + vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %ymm2, %ymm3 + vfnmadd231ps %ymm4, %ymm7, %ymm3 + vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3 + vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4 + +/* a) Calculate X^2 = X * X */ + vmulps %ymm4, %ymm4, %ymm5 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %ymm0, %ymm4, %ymm6 + vmovups __sA9_FMA(%rax), %ymm0 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))) + */ + vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0 + vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0 + vfmadd213ps __sA3(%rax), %ymm5, %ymm0 + vmulps %ymm5, %ymm0, %ymm0 + vmovmskps %ymm1, %ecx + vfmadd213ps %ymm6, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + 
cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(cosf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(cosf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVdN8v_cosf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S new file mode 100644 index 0000000000..3998f616aa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_expf) + .type _ZGVeN16v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_expf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_expf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_expf) + +#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper +#include "../svml_s_expf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S new file mode 100644 index 0000000000..e80b2be1a7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S @@ -0,0 +1,447 @@ +/* Function expf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_expf_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_expf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + +/* r = x-n*ln2_hi/2^k */ + vmovaps %zmm0, %zmm6 + +/* compare against threshold */ + movl $-1, %ecx + vmovups __sInvLn2(%rax), %zmm3 + vmovups __sLn2hi(%rax), %zmm5 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps __sShifter(%rax), %zmm0, %zmm3 + vmovups __sPC5(%rax), %zmm9 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps __sShifter(%rax), %zmm3, %zmm7 + +/* remove sign of x by "and" operation */ + vpandd __iAbsMask(%rax), %zmm0, %zmm1 + vpaddd __iBias(%rax), %zmm3, %zmm4 + vpcmpgtd __iDomainRange(%rax), %zmm1, %k1 + +/* compute 2^N with "shift" */ + vpslld $23, %zmm4, %zmm8 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + vpbroadcastd %ecx, %zmm2{%k1}{z} + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7 + +/* set mask for overflow/underflow */ + vptestmd %zmm2, %zmm2, %k0 + kmovw %k0, %ecx + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %zmm7, %zmm9 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %zmm7, %zmm9 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %zmm7, %zmm9 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %zmm7, %zmm9 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %zmm7, %zmm9 + +/* 2^N*exp(r) */ + vmulps %zmm9, %zmm8, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups 
%zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__expf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__expf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 + +#endif +END (_ZGVeN16v_expf_knl) + +ENTRY (_ZGVeN16v_expf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. 
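+
+   The code below implements the k = 0 case (table lookup skipped), in
+   rough scalar form (names are ours):
+
+     n = rint (x * InvLn2)                   (via the shifter trick)
+     r = (x - n*Ln2hi) - n*Ln2lo             (ln2 split into two parts)
+     exp(x) ~ 2^n * (C0 + r*(C1 + r*(C2 + r*(C3 + r*(C4 + r*C5)))))
+
+   with 2^n formed by adding the exponent bias and shifting into the
+   exponent field (vpslld $23), and out-of-range lanes deferred to the
+   scalar __expf_finite fallback.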
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + +/* r = x-n*ln2_hi/2^k */ + vmovaps %zmm0, %zmm7 + +/* compare against threshold */ + vmovups .L_2il0floatpacket.13(%rip), %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 + vmovups __sPC5(%rax), %zmm10 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps %zmm1, %zmm0, %zmm4 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps %zmm1, %zmm4, %zmm8 + vpaddd __iBias(%rax), %zmm4, %zmm5 + vfnmadd231ps %zmm8, %zmm6, %zmm7 + +/* compute 2^N with "shift" */ + vpslld $23, %zmm5, %zmm9 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8 + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %zmm8, %zmm10 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %zmm8, %zmm10 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %zmm8, %zmm10 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %zmm8, %zmm10 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %zmm8, %zmm10 + +/* 2^N*exp(r) */ + vmulps %zmm10, %zmm9, %zmm1 + +/* remove sign of x by "and" operation */ + vpandd __iAbsMask(%rax), %zmm0, %zmm2 + vpcmpd $2, __iDomainRange(%rax), %zmm2, %k1 + vpandnd %zmm2, %zmm2, %zmm3{%k1} + +/* set mask for overflow/underflow */ + vptestmd %zmm3, %zmm3, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 
1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 + +#endif +END (_ZGVeN16v_expf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S new file mode 100644 index 0000000000..8051720ec2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_expf) + .type _ZGVbN4v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_expf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_expf_sse2(%rip), %rax + ret +END (_ZGVbN4v_expf) +libmvec_hidden_def (_ZGVbN4v_expf) + +#define _ZGVbN4v_expf _ZGVbN4v_expf_sse2 +#include "../svml_s_expf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S new file mode 100644 index 0000000000..2bc510bbf7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S @@ -0,0 +1,212 @@ +/* Function expf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_expf_data.h" + + .text +ENTRY (_ZGVbN4v_expf_sse4) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_sexp_data@GOTPCREL(%rip), %rax + movups __sInvLn2(%rax), %xmm0 + +/* m = x*2^k/ln2 + shifter */ + mulps %xmm5, %xmm0 + movups __sShifter(%rax), %xmm6 + movups __sLn2hi(%rax), %xmm4 + addps %xmm6, %xmm0 + +/* n = m - shifter = rint(x*2^k/ln2) */ + movaps %xmm0, %xmm2 + +/* remove sign of x by "and" operation */ + movdqu __iAbsMask(%rax), %xmm7 + subps %xmm6, %xmm2 + +/* r = x-n*ln2_hi/2^k */ + mulps %xmm2, %xmm4 + pand %xmm5, %xmm7 + +/* compare against threshold */ + pcmpgtd __iDomainRange(%rax), %xmm7 + movups __sLn2lo(%rax), %xmm1 + +/* set mask for overflow/underflow */ + movmskps %xmm7, %ecx + movaps %xmm5, %xmm7 + movups __sPC5(%rax), %xmm3 + subps %xmm4, %xmm7 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + mulps %xmm1, %xmm2 + +/* compute 2^N with "shift" */ + movdqu __iBias(%rax), %xmm6 + subps %xmm2, %xmm7 + +/* c5*r+c4 */ + mulps %xmm7, %xmm3 + paddd %xmm6, %xmm0 + pslld $23, %xmm0 + addps __sPC4(%rax), %xmm3 + +/* (c5*r+c4)*r+c3 */ + mulps %xmm7, %xmm3 + addps __sPC3(%rax), %xmm3 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + mulps %xmm7, %xmm3 + addps __sPC2(%rax), %xmm3 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + mulps %xmm7, %xmm3 + addps __sPC1(%rax), %xmm3 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + mulps %xmm3, %xmm7 + addps __sPC0(%rax), %xmm7 + +/* 2^N*exp(r) */ + mulps %xmm7, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 
152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4v_expf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S new file mode 100644 index 0000000000..6ffb1fd784 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_expf) + .type _ZGVdN8v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_expf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_expf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_expf) +libmvec_hidden_def (_ZGVdN8v_expf) + +#define _ZGVdN8v_expf _ZGVdN8v_expf_sse_wrapper +#include "../svml_s_expf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S new file mode 100644 index 0000000000..b4a070ac86 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S @@ -0,0 +1,202 @@ +/* Function expf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_expf_data.h" + + .text +ENTRY(_ZGVdN8v_expf_avx2) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups __sInvLn2(%rax), %ymm7 + vmovups __sShifter(%rax), %ymm4 + vmovups __sLn2hi(%rax), %ymm3 + vmovups __sPC5(%rax), %ymm1 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps %ymm4, %ymm2, %ymm7 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps %ymm4, %ymm7, %ymm0 + vpaddd __iBias(%rax), %ymm7, %ymm4 + +/* remove sign of x by "and" operation */ + vandps __iAbsMask(%rax), %ymm2, %ymm5 + +/* compare against threshold */ + vpcmpgtd __iDomainRange(%rax), %ymm5, %ymm6 + +/* r = x-n*ln2_hi/2^k */ + vmovaps %ymm2, %ymm5 + vfnmadd231ps %ymm0, %ymm3, %ymm5 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %ymm5, %ymm0 + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %ymm0, %ymm1 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %ymm0, %ymm1 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %ymm0, %ymm1 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %ymm0, %ymm1 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %ymm0, %ymm1 + +/* set mask for overflow/underflow */ + vmovmskps %ymm6, %ecx + +/* compute 2^N with "shift" */ + vpslld $23, %ymm4, %ymm6 + +/* 2^N*exp(r) */ + vmulps %ymm1, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 
272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_expf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S new file mode 100644 index 0000000000..8ab03195c6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_logf) + .type _ZGVeN16v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_logf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_logf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_logf) + +#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper +#include "../svml_s_logf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S new file mode 100644 index 0000000000..7ff6fff848 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S @@ -0,0 +1,416 @@ +/* Function logf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_logf_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_logf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + movl $-1, %ecx + +/* reduction: compute r,n */ + vpsubd _iBrkValue(%rax), %zmm0, %zmm2 + vmovups _sPoly_7(%rax), %zmm7 + vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %zmm2, %zmm4 + +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) + */ + vpaddd _iHiDelta(%rax), %zmm0, %zmm1 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd _iBrkValue(%rax), %zmm3, %zmm6 + vpcmpd $1, _iLoRange(%rax), %zmm1, %k1 + vcvtdq2ps {rn-sae}, %zmm4, %zmm1 + +/* reduced argument R */ + vsubps _sOne(%rax), %zmm6, %zmm8 + vpbroadcastd %ecx, %zmm5{%k1}{z} + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %zmm8, %zmm7 + vptestmd %zmm5, %zmm5, %k0 + kmovw %k0, %ecx + vfmadd213ps _sPoly_5(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_4(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_3(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_2(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_1(%rax), %zmm8, %zmm7 + vmulps %zmm8, %zmm7, %zmm9 + +/* polynomial evaluation end */ + vfmadd213ps %zmm8, %zmm8, %zmm9 + +/* + final reconstruction: + add exponent_value*log2 to polynomial result + */ + vfmadd132ps _sLn2(%rax), %zmm9, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 
1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__logf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__logf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16v_logf_knl) + +ENTRY (_ZGVeN16v_logf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + vmovups .L_2il0floatpacket.7(%rip), %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +/* + check for working range, + set special argument mask (denormals/zero/Inf/NaN) + */ + vpaddd _iHiDelta(%rax), %zmm0, %zmm1 + +/* reduction: compute r,n */ + vpsubd %zmm4, %zmm0, %zmm2 + vpcmpd $5, _iLoRange(%rax), %zmm1, %k1 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %zmm2, %zmm5 + vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd %zmm4, %zmm3, %zmm7 + +/* reduced argument R */ + vsubps _sOne(%rax), %zmm7, %zmm9 + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_5(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_4(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_3(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_2(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_1(%rax), %zmm9, %zmm8 + vmulps %zmm9, %zmm8, %zmm10 + +/* polynomial evaluation end */ + vfmadd213ps %zmm9, %zmm9, %zmm10 + vpandnd %zmm1, %zmm1, %zmm6{%k1} + vptestmd %zmm6, %zmm6, %k0 + vcvtdq2ps {rn-sae}, %zmm5, %zmm1 + kmovw %k0, %ecx + +/* + final reconstruction: + add exponent_value*log2 to polynomial result + */ + vfmadd132ps _sLn2(%rax), %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups 
%zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 + +#endif +END (_ZGVeN16v_logf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.7: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.7,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S new file mode 100644 index 0000000000..4e0e36d5bd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_logf) + .type _ZGVbN4v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_logf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_logf_sse2(%rip), %rax + ret +END (_ZGVbN4v_logf) +libmvec_hidden_def (_ZGVbN4v_logf) + +#define _ZGVbN4v_logf _ZGVbN4v_logf_sse2 +#include "../svml_s_logf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S new file mode 100644 index 0000000000..156face181 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S @@ -0,0 +1,194 @@ +/* Function logf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_logf_data.h" + + .text +ENTRY (_ZGVbN4v_logf_sse4) +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. 
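   A scalar C sketch of this reduction may make the break-value trick
   easier to follow.  It is illustrative only: the break value and
   the degree-7 Taylor coefficients below are stand-ins for the tuned
   _iBrkValue/_sPoly_* table entries, and the special-case path
   (denormals/zero/Inf/NaN, routed to __logf_finite) is omitted.

     // Hedged sketch: one integer subtract splits x into an exponent
     // n and a mantissa in [2/3, 4/3), giving |R| < 1/3 as promised.
     #include <stdint.h>

     static float logf_sketch (float x)
     {
       const float Ln2 = 0x1.62e43p-1f;            // log(2)
       const float p[7] = { -0.5f,                 // p1 ~ -1/2
                            0x1.555556p-2f,        // p2 ~  1/3
                            -0.25f,                // p3 ~ -1/4
                            0x1.99999ap-3f,        // p4 ~  1/5
                            -0x1.555556p-3f,       // p5 ~ -1/6
                            0x1.24924ap-3f,        // p6 ~  1/7
                            -0.125f };             // p7 ~ -1/8
       union { float f; uint32_t i; } u = { .f = x };
       uint32_t t = u.i - 0x3f2aaaabu;         // ix - iBrkValue (~2/3)
       int32_t n = (int32_t) t >> 23;          // exponent_x (maybe +1)
       u.i = (t & 0x007fffffu) + 0x3f2aaaabu;  // mantissa into [2/3, 4/3)
       float R = u.f - 1.0f;                   // reduced argument
       float q = p[6];
       for (int k = 5; k >= 0; k--)            // Horner over p1..p7
         q = q * R + p[k];
       q = q * R * R + R;                      // log(1+R) ~ R + R^2*(..)
       return (float) n * Ln2 + q;             // add exponent*log(2)
     }

   The kernel below carries out this dataflow four lanes wide, with
   the working-range check folded into one vector compare whose mask
   gates the scalar fallback loop.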
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + +/* reduction: compute r,n */ + movaps %xmm0, %xmm2 + +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) */ + movq __svml_slog_data@GOTPCREL(%rip), %rax + movdqu _iHiDelta(%rax), %xmm1 + movdqu _iLoRange(%rax), %xmm4 + paddd %xmm0, %xmm1 + movdqu _iBrkValue(%rax), %xmm3 + pcmpgtd %xmm1, %xmm4 + movdqu _iOffExpoMask(%rax), %xmm1 + psubd %xmm3, %xmm2 + pand %xmm2, %xmm1 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + psrad $23, %xmm2 + paddd %xmm3, %xmm1 + movups _sPoly_7(%rax), %xmm5 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + cvtdq2ps %xmm2, %xmm6 + +/* reduced argument R */ + subps _sOne(%rax), %xmm1 + movmskps %xmm4, %ecx + +/* final reconstruction: + add exponent_value*log2 to polynomial result */ + mulps _sLn2(%rax), %xmm6 + +/* polynomial evaluation starts here */ + mulps %xmm1, %xmm5 + addps _sPoly_6(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_5(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_4(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_3(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_2(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_1(%rax), %xmm5 + mulps %xmm1, %xmm5 + +/* polynomial evaluation end */ + mulps %xmm1, %xmm5 + addps %xmm5, %xmm1 + addps %xmm6, %xmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movdqa %xmm1, %xmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4v_logf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S new file mode 100644 index 0000000000..f4b82de3d4 --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_logf) + .type _ZGVdN8v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_logf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_logf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_logf) +libmvec_hidden_def (_ZGVdN8v_logf) + +#define _ZGVdN8v_logf _ZGVdN8v_logf_sse_wrapper +#include "../svml_s_logf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S new file mode 100644 index 0000000000..994af91ffe --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S @@ -0,0 +1,184 @@ +/* Function logf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_logf_data.h" + + .text +ENTRY(_ZGVdN8v_logf_avx2) +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. 
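   The reduction and polynomial here match the SSE4 version earlier
   in this patch; the detail worth calling out is the one-compare
   special-case filter built from the _iHiDelta/_iLoRange pair.  A
   hedged scalar equivalent, with standard float constants assumed
   rather than copied from the data table:

     // Hedged sketch: one unsigned compare classifies every argument
     // the fast path cannot take (zero, denormals, negatives, Inf,
     // NaN) so a single branch covers them all.
     #include <stdbool.h>
     #include <stdint.h>

     static bool logf_needs_special_path (float x)
     {
       union { float f; uint32_t i; } u = { .f = x };
       // Subtracting the smallest normal (0x00800000) makes every
       // bad encoding wrap above the Inf threshold unsigned-wise.
       return u.i - 0x00800000u >= 0x7f800000u - 0x00800000u;
     }

   The vector form biases the input first (vpaddd _iHiDelta) so the
   test becomes one signed vpcmpgtd; vmovmskps then reduces the
   per-lane result to the scalar mask that gates the __logf_finite
   fallback loop.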
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups _iBrkValue(%rax), %ymm6 + vmovups _iLoRange(%rax), %ymm1 +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) */ + vpaddd _iHiDelta(%rax), %ymm2, %ymm7 + +/* reduction: compute r,n */ + vpsubd %ymm6, %ymm2, %ymm4 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %ymm4, %ymm3 + vpand _iOffExpoMask(%rax), %ymm4, %ymm5 + vmovups _sPoly_7(%rax), %ymm4 + vcvtdq2ps %ymm3, %ymm0 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd %ymm6, %ymm5, %ymm3 + +/* reduced argument R */ + vsubps _sOne(%rax), %ymm3, %ymm5 + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_5(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_4(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_3(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_2(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_1(%rax), %ymm5, %ymm4 + vmulps %ymm5, %ymm4, %ymm6 + +/* polynomial evaluation end */ + vfmadd213ps %ymm5, %ymm5, %ymm6 + vpcmpgtd %ymm7, %ymm1, %ymm1 + vmovmskps %ymm1, %ecx + +/* final reconstruction: + add exponent_value*log2 to polynomial result */ + vfmadd132ps _sLn2(%rax), %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_logf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S new file mode 100644 index 0000000000..6d10c7576f --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16vv_powf) + .type _ZGVeN16vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16vv_powf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16vv_powf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16vv_powf) + +#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper +#include "../svml_s_powf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S new file mode 100644 index 0000000000..fc91a092b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S @@ -0,0 +1,653 @@ +/* Function powf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" +#include "svml_s_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + We are using the next identity : pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... 
= + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + .text +ENTRY (_ZGVeN16vv_powf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_spow_data@GOTPCREL(%rip), %rdx + vmovaps %zmm1, %zmm9 + vshuff32x4 $238, %zmm0, %zmm0, %zmm7 + kxnorw %k3, %k3, %k3 + vcvtps2pd %ymm0, %zmm14 + vcvtps2pd %ymm7, %zmm10 + movl $-1, %eax + movq $-1, %rcx + vpandd _ABSMASK(%rdx), %zmm9, %zmm4 + vmovups _ExpMask(%rdx), %zmm6 + +/* exponent bits selection */ + vpsrlq $20, %zmm14, %zmm13 + vshuff32x4 $238, %zmm9, %zmm9, %zmm8 + vpcmpd $5, _INF(%rdx), %zmm4, %k2 + vpsrlq $32, %zmm13, %zmm15 + vcvtps2pd %ymm8, %zmm2 + vmovups _Two10(%rdx), %zmm4 + vpmovqd %zmm15, %ymm12 + vcvtps2pd %ymm9, %zmm1 + vpsubd _NMINNORM(%rdx), %zmm0, %zmm3 + vpbroadcastd %eax, %zmm8{%k2}{z} + vpcmpd $5, _NMAXVAL(%rdx), %zmm3, %k1 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vmovaps %zmm6, %zmm3 + vpternlogq $248, %zmm6, %zmm10, %zmm4 + vpsrlq $20, %zmm10, %zmm10 + vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3 + +/* reciprocal approximation good to at least 11 bits */ + vrcp28pd %zmm4, %zmm11 + vpsrlq $32, %zmm10, %zmm14 + vpbroadcastd %eax, %zmm7{%k1}{z} + kxnorw %k1, %k1, %k1 + vrcp28pd %zmm3, %zmm5 + vpmovqd %zmm14, %ymm6 + vshufi32x4 $68, %zmm6, %zmm12, %zmm13 + vmovups _One(%rdx), %zmm6 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm14 + +/* biased exponent in DP format */ + vshuff32x4 $238, %zmm13, %zmm13, %zmm5 + vrndscalepd $8, %zmm11, %zmm11 + vcmppd $30, _Threshold(%rdx), %zmm14, %k2 + vcvtdq2pd %ymm13, %zmm10 + vcvtdq2pd %ymm5, %zmm15 + +/* table lookup */ + vpsrlq $40, %zmm14, %zmm13 + vpxord %zmm5, %zmm5, %zmm5 + vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3} + vfmsub213pd %zmm6, %zmm14, %zmm3 + vfmsub213pd %zmm6, %zmm11, %zmm4 + vcmppd $30, _Threshold(%rdx), %zmm11, %k3 + vpbroadcastq %rcx, %zmm14{%k2}{z} + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + kxnorw %k2, %k2, %k2 + vpsrlq $40, %zmm11, %zmm12 + vpxord %zmm6, %zmm6, %zmm6 + vpbroadcastq %rcx, %zmm11{%k3}{z} + kxnorw %k3, %k3, %k3 + vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1} + vmovups _Bias1(%rdx), %zmm12 + vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14 + vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12 + 
vsubpd %zmm14, %zmm10, %zmm13 + vsubpd %zmm12, %zmm15, %zmm10 + vmovups _poly_coeff_3(%rdx), %zmm11 + vmovups _poly_coeff_4(%rdx), %zmm15 + vfmadd213pd %zmm15, %zmm4, %zmm11 + vmulpd %zmm4, %zmm4, %zmm12 + vmovaps %zmm15, %zmm14 + vmulpd %zmm3, %zmm3, %zmm15 + vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14 + +/* reconstruction */ + vfmadd213pd %zmm4, %zmm12, %zmm11 + vfmadd213pd %zmm3, %zmm15, %zmm14 + vaddpd %zmm6, %zmm11, %zmm11 + vaddpd %zmm5, %zmm14, %zmm3 + vfmadd231pd _L2(%rdx), %zmm10, %zmm11 + vfmadd132pd _L2(%rdx), %zmm3, %zmm13 + vmulpd %zmm2, %zmm11, %zmm12 + vmulpd %zmm1, %zmm13, %zmm10 + vmulpd __dbInvLn2(%rdx), %zmm12, %zmm6 + +/* hi bits */ + vpsrlq $32, %zmm12, %zmm12 + vmulpd __dbInvLn2(%rdx), %zmm10, %zmm1 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rdx), %zmm6, %zmm4 + vpsrlq $32, %zmm10, %zmm11 + vpmovqd %zmm11, %ymm3 + vsubpd __dbHALF(%rdx), %zmm1, %zmm2 + vaddpd __dbShifter(%rdx), %zmm4, %zmm14 + vpmovqd %zmm12, %ymm4 + vshufi32x4 $68, %zmm4, %zmm3, %zmm5 + vpxord %zmm4, %zmm4, %zmm4 + vaddpd __dbShifter(%rdx), %zmm2, %zmm2 + +/* iAbsX = iAbsX&iAbsMask; */ + vpandd __iAbsMask(%rdx), %zmm5, %zmm11 + vpxord %zmm5, %zmm5, %zmm5 + vsubpd __dbShifter(%rdx), %zmm14, %zmm13 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rdx), %zmm11, %k1 + vsubpd __dbShifter(%rdx), %zmm2, %zmm15 + vpbroadcastd %eax, %zmm10{%k1}{z} + vpternlogd $254, %zmm8, %zmm7, %zmm10 + +/* [0..1) */ + vsubpd %zmm15, %zmm1, %zmm1 + +/* low K bits */ + vpandq __lbLOWKBITS(%rdx), %zmm14, %zmm11 + vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3} + vsubpd %zmm13, %zmm6, %zmm7 + vptestmd %zmm10, %zmm10, %k0 + vpandq __lbLOWKBITS(%rdx), %zmm2, %zmm10 + vmulpd __dbC1(%rdx), %zmm1, %zmm1 + vmulpd __dbC1(%rdx), %zmm7, %zmm3 + vpsrlq $11, %zmm2, %zmm8 + vpsrlq $11, %zmm14, %zmm2 + +/* NB : including +/- sign for the exponent!! 
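   That is: after the __dbShifter add, the integer part N of
   y*log2|x| sits, in two's complement, in the bits just above the
   table-index bits, so shifting left by 52 drops the index and
   lands N (sign included) in the double's exponent field; the
   vpaddq below then scales the table value by 2^N with no multiply.
   A hedged scalar sketch of just this step; the 11-bit index width
   mirrors the vpsrlq $11 above and is an assumption about the
   table layout, not a quote of it:

     // Hedged sketch: multiply thi by 2^N by adding N, shifted into
     // the exponent field, to thi's bit pattern.  Valid while the
     // result stays normal; masked lanes take the fallback path.
     #include <stdint.h>

     static double scale_by_2N_sketch (uint64_t m, double thi)
     {
       // m = bits of (P + dbShifter); the low 11 bits hold the
       // table index j, the bits above hold N in two's complement.
       uint64_t expbits = (m >> 11) << 52;  // N into exponent field
       union { double f; uint64_t i; } u = { .f = thi };
       u.i += expbits;                      // thi * 2^N, sign kept
       return u.f;
     }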
*/ + vpsllq $52, %zmm8, %zmm8 + kmovw %k0, %ecx + vpsllq $52, %zmm2, %zmm6 + vfmadd213pd %zmm5, %zmm3, %zmm5 + vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2} + vfmadd213pd %zmm4, %zmm1, %zmm4 + vpaddq %zmm6, %zmm5, %zmm10 + vcvtpd2ps %zmm10, %ymm12 + vpaddq %zmm8, %zmm4, %zmm7 + vcvtpd2ps %zmm7, %ymm11 + vshuff32x4 $68, %zmm12, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm9, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1280(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vmovss 1220(%rsp,%r15,8), %xmm1 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vmovss 1216(%rsp,%r15,8), %xmm1 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16vv_powf_knl) + +ENTRY (_ZGVeN16vv_powf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_spow_data@GOTPCREL(%rip), %rax + vextractf32x8 $1, %zmm1, %ymm14 + vextractf32x8 $1, %zmm0, %ymm15 + vpsubd _NMINNORM(%rax), %zmm0, %zmm9 + vmovups %zmm26, 1280(%rsp) + vmovups _ExpMask(%rax), %zmm6 + vpcmpd $1, _NMAXVAL(%rax), %zmm9, %k1 
+ vcvtps2pd %ymm0, %zmm5 + vcvtps2pd %ymm1, %zmm12 + kxnorw %k3, %k3, %k3 + +/* exponent bits selection */ + vpsrlq $20, %zmm5, %zmm3 + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 + vmovups .L_2il0floatpacket.23(%rip), %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 + vpandnd %zmm9, %zmm9, %zmm26{%k1} + vmovups _Two10(%rax), %zmm9 + kxnorw %k1, %k1, %k1 + vcvtps2pd %ymm15, %zmm4 + vmovaps %zmm14, %zmm15 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, %zmm6, %zmm4, %zmm9 + vpsrlq $20, %zmm4, %zmm4 + +/* reciprocal approximation good to at least 11 bits */ + vrcp14pd %zmm9, %zmm10 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm10, %zmm3 + vmovups _One(%rax), %zmm10 + vfmsub213pd %zmm10, %zmm3, %zmm9 + vpandnd %zmm8, %zmm8, %zmm15{%k2} + vmovaps %zmm6, %zmm8 + vpternlogq $234, _Two10(%rax), %zmm5, %zmm8 + vpsrlq $32, %zmm4, %zmm5 + vrcp14pd %zmm8, %zmm7 + vpmovqd %zmm5, %ymm6 + vrndscalepd $8, %zmm7, %zmm2 + vfmsub213pd %zmm10, %zmm2, %zmm8 + +/* table lookup */ + vpsrlq $40, %zmm2, %zmm10 + vinserti32x8 $1, %ymm6, %zmm11, %zmm4 + vpsrlq $40, %zmm3, %zmm11 + +/* biased exponent in DP format */ + vextracti32x8 $1, %zmm4, %ymm7 + vcvtdq2pd %ymm4, %zmm6 + vpmovqd %zmm10, %ymm4 + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} + vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} + vmovups _Threshold(%rax), %zmm5 + vcmppd $21, %zmm2, %zmm5, %k2 + vcmppd $21, %zmm3, %zmm5, %k3 + vmovups _Bias1(%rax), %zmm3 + vmovaps %zmm4, %zmm2 + vpandnq %zmm5, %zmm5, %zmm2{%k2} + vpternlogq $236, _Bias(%rax), %zmm3, %zmm2 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + kxnorw %k2, %k2, %k2 + vpandnq %zmm5, %zmm5, %zmm4{%k3} + vpternlogq $248, _Bias(%rax), %zmm4, %zmm3 + vsubpd %zmm2, %zmm6, %zmm4 + vmovups _poly_coeff_3(%rax), %zmm6 + vmovups _poly_coeff_4(%rax), %zmm2 + vsubpd %zmm3, %zmm7, %zmm5 + vmulpd %zmm8, %zmm8, %zmm7 + vfmadd213pd %zmm2, %zmm9, %zmm6 + kxnorw %k3, %k3, %k3 + vmovaps %zmm2, %zmm3 + vmulpd %zmm9, %zmm9, %zmm2 + vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3 + +/* reconstruction */ + vfmadd213pd %zmm9, %zmm2, %zmm6 + vfmadd213pd %zmm8, %zmm7, %zmm3 + vaddpd %zmm11, %zmm6, %zmm8 + vaddpd %zmm10, %zmm3, %zmm9 + vfmadd231pd _L2(%rax), %zmm5, %zmm8 + vfmadd132pd _L2(%rax), %zmm9, %zmm4 + vmulpd %zmm13, %zmm8, %zmm13 + vmulpd %zmm12, %zmm4, %zmm3 + vmulpd __dbInvLn2(%rax), %zmm13, %zmm10 + vmulpd __dbInvLn2(%rax), %zmm3, %zmm8 + +/* hi bits */ + vpsrlq $32, %zmm3, %zmm4 + vpsrlq $32, %zmm13, %zmm13 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rax), %zmm8, %zmm12 + vpmovqd %zmm4, %ymm5 + vpmovqd %zmm13, %ymm2 + vsubpd __dbHALF(%rax), %zmm10, %zmm9 + vaddpd __dbShifter(%rax), %zmm12, %zmm7 + vaddpd __dbShifter(%rax), %zmm9, %zmm9 + vsubpd __dbShifter(%rax), %zmm7, %zmm11 + vsubpd __dbShifter(%rax), %zmm9, %zmm12 + vinserti32x8 $1, %ymm2, %zmm5, %zmm3 + +/* iAbsX = iAbsX&iAbsMask */ + vpandd __iAbsMask(%rax), %zmm3, %zmm4 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpd $2, __iDomainRange(%rax), %zmm4, %k1 + vpandnd %zmm4, %zmm4, %zmm14{%k1} + vpternlogd $254, %zmm15, %zmm26, %zmm14 + +/* [0..1) */ + vsubpd %zmm11, %zmm8, %zmm15 + vsubpd %zmm12, %zmm10, %zmm26 + vptestmd %zmm14, %zmm14, %k0 + vpsrlq $11, %zmm7, %zmm8 + vpsrlq $11, 
%zmm9, %zmm10 + vmulpd __dbC1(%rax), %zmm26, %zmm26 + vmulpd __dbC1(%rax), %zmm15, %zmm15 + +/* NB : including +/- sign for the exponent!! */ + vpsllq $52, %zmm10, %zmm13 + vpsllq $52, %zmm8, %zmm12 + kmovw %k0, %ecx + +/* low K bits */ + vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14 + vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6 + vpmovqd %zmm14, %ymm7 + vpmovqd %zmm6, %ymm9 + vpxord %zmm2, %zmm2, %zmm2 + vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3} + vfmadd213pd %zmm2, %zmm26, %zmm2 + vpaddq %zmm13, %zmm2, %zmm2 + vcvtpd2ps %zmm2, %ymm4 + vpxord %zmm11, %zmm11, %zmm11 + vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2} + vfmadd213pd %zmm11, %zmm15, %zmm11 + vpaddq %zmm12, %zmm11, %zmm3 + vcvtpd2ps %zmm3, %ymm5 + vinsertf32x8 $1, %ymm4, %zmm5, %zmm2 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups 1280(%rsp), %zmm26 + vmovaps %zmm2, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1088(%rsp) + vmovups %zmm1, 1152(%rsp) + vmovups %zmm2, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 984(%rsp) + kmovw %k5, 976(%rsp) + kmovw %k6, 968(%rsp) + kmovw %k7, 960(%rsp) + vmovups %zmm16, 896(%rsp) + vmovups %zmm17, 832(%rsp) + vmovups %zmm18, 768(%rsp) + vmovups %zmm19, 704(%rsp) + vmovups %zmm20, 640(%rsp) + vmovups %zmm21, 576(%rsp) + vmovups %zmm22, 512(%rsp) + vmovups %zmm23, 448(%rsp) + vmovups %zmm24, 384(%rsp) + vmovups %zmm25, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1000(%rsp) + movq %rdi, 992(%rsp) + movq %r12, 1032(%rsp) + cfi_offset_rel_rsp (12, 1032) + movb %dl, %r12b + movq %r13, 1024(%rsp) + cfi_offset_rel_rsp (13, 1024) + movl %ecx, %r13d + movq %r14, 1016(%rsp) + cfi_offset_rel_rsp (14, 1016) + movl %eax, %r14d + movq %r15, 1008(%rsp) + cfi_offset_rel_rsp (15, 1008) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 984(%rsp), %k4 + kmovw 976(%rsp), %k5 + kmovw 968(%rsp), %k6 + kmovw 960(%rsp), %k7 + vmovups 896(%rsp), %zmm16 + vmovups 832(%rsp), %zmm17 + vmovups 768(%rsp), %zmm18 + vmovups 704(%rsp), %zmm19 + vmovups 640(%rsp), %zmm20 + vmovups 576(%rsp), %zmm21 + vmovups 512(%rsp), %zmm22 + vmovups 448(%rsp), %zmm23 + vmovups 384(%rsp), %zmm24 + vmovups 320(%rsp), %zmm25 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm2 + movq 1000(%rsp), %rsi + movq 992(%rsp), %rdi + movq 1032(%rsp), %r12 + cfi_restore (%r12) + movq 1024(%rsp), %r13 + cfi_restore (%r13) + movq 1016(%rsp), %r14 + cfi_restore (%r14) + movq 1008(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm1 + vzeroupper + vmovss 1092(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm1 + vzeroupper + vmovss 1088(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16vv_powf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.23: + .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.23,@object +.L_2il0floatpacket.24: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.24,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S new file mode 100644 index 0000000000..785b549882 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4vv_powf) + .type _ZGVbN4vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4vv_powf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4vv_powf_sse2(%rip), %rax + ret +END (_ZGVbN4vv_powf) +libmvec_hidden_def (_ZGVbN4vv_powf) + +#define _ZGVbN4vv_powf _ZGVbN4vv_powf_sse2 +#include "../svml_s_powf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S new file mode 100644 index 0000000000..8b1b4e74bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S @@ -0,0 +1,374 @@ +/* Function powf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" + + .text +ENTRY (_ZGVbN4vv_powf_sse4) +/* + ALGORITHM DESCRIPTION: + + We are using the next identity: pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. 
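+
+      A rough scalar sketch of this decomposition (an illustration only:
+      log2_sketch is a hypothetical helper, and few-bit reciprocals
+      computed on the fly stand in for the tabulated Rcp values and
+      their logarithms):
+
+        #include <math.h>
+        // Model of the log2|x| decomposition above (not the vector code).
+        static double
+        log2_sketch (double x)
+        {
+          int k1;
+          double X1 = 2.0 * frexp (fabs (x), &k1);  // |x| = 2^(k1-1)*X1
+          k1 -= 1;                                  // now 1 <= X1 < 2
+          const double ln2 = 0.6931471805599453;
+          double C = 1.0 / ln2;
+          double Rcp1 = ldexp (rint (ldexp (1.0 / X1, 5)), -5);
+          double X2 = Rcp1 * X1;
+          double Rcp2 = ldexp (rint (ldexp (1.0 / X2, 10)), -10);
+          double X3 = Rcp2 * X2;
+          double Rcp3C = ldexp (rint (ldexp (C / X3, 20)), -20);
+          double q = X1 * Rcp1 * Rcp2 * Rcp3C / C - 1.0;  // very small
+          return k1 - log2 (Rcp1) - log2 (Rcp2) - log2 (Rcp3C / C)
+                 + (q - 0.5 * q * q) / ln2;               // log2(1+q)
+        }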
+ + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + movaps %xmm0, %xmm3 + movhlps %xmm0, %xmm3 + movaps %xmm1, %xmm5 + movups %xmm8, 112(%rsp) + movaps %xmm5, %xmm2 + cvtps2pd %xmm3, %xmm8 + cvtps2pd %xmm5, %xmm7 + movups %xmm9, 96(%rsp) + movaps %xmm0, %xmm4 + cvtps2pd %xmm0, %xmm9 + movq __svml_spow_data@GOTPCREL(%rip), %rdx + movups %xmm10, 176(%rsp) + movups %xmm13, 48(%rsp) + movups _ExpMask(%rdx), %xmm6 + +/* preserve mantissa, set input exponent to 2^(-10) */ + movaps %xmm6, %xmm10 + andps %xmm8, %xmm6 + andps %xmm9, %xmm10 + +/* exponent bits selection */ + psrlq $20, %xmm9 + orps _Two10(%rdx), %xmm6 + psrlq $20, %xmm8 + orps _Two10(%rdx), %xmm10 + +/* reciprocal approximation good to at least 11 bits */ + cvtpd2ps %xmm6, %xmm13 + cvtpd2ps %xmm10, %xmm1 + movlhps %xmm13, %xmm13 + movhlps %xmm5, %xmm2 + movlhps %xmm1, %xmm1 + movups %xmm12, 208(%rsp) + rcpps %xmm13, %xmm12 + movups %xmm11, 80(%rsp) + cvtps2pd %xmm2, %xmm11 + rcpps %xmm1, %xmm2 + movups %xmm14, 144(%rsp) + cvtps2pd %xmm12, %xmm14 + movups %xmm15, 160(%rsp) + cvtps2pd %xmm2, %xmm15 + shufps $221, %xmm8, %xmm9 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + roundpd $0, %xmm14, %xmm14 + +/* biased exponent in DP format */ + pshufd $238, %xmm9, %xmm8 + roundpd $0, %xmm15, %xmm15 + cvtdq2pd %xmm8, %xmm1 + mulpd %xmm15, %xmm10 + mulpd %xmm14, %xmm6 + cvtdq2pd %xmm9, %xmm2 + subpd _One(%rdx), %xmm10 + subpd _One(%rdx), %xmm6 + +/* table lookup */ + movaps %xmm14, %xmm8 + movaps %xmm15, %xmm9 + psrlq $40, %xmm8 + psrlq $40, %xmm9 + movd %xmm8, %r8d + movd %xmm9, %eax + psubd _NMINNORM(%rdx), %xmm4 + movdqu _ABSMASK(%rdx), %xmm3 + pextrd $2, %xmm8, %r9d + pand %xmm5, %xmm3 + movups _Threshold(%rdx), %xmm8 + pextrd $2, %xmm9, %ecx + movaps %xmm8, %xmm9 + cmpltpd %xmm15, %xmm9 + cmpltpd %xmm14, %xmm8 + andps _Bias(%rdx), %xmm9 + movaps %xmm10, %xmm14 + andps 
_Bias(%rdx), %xmm8 + movaps %xmm6, %xmm15 + orps _Bias1(%rdx), %xmm9 + orps _Bias1(%rdx), %xmm8 + subpd %xmm9, %xmm2 + subpd %xmm8, %xmm1 + mulpd %xmm10, %xmm14 + mulpd %xmm6, %xmm15 + mulpd _L2(%rdx), %xmm2 + mulpd _L2(%rdx), %xmm1 + movups _poly_coeff_3(%rdx), %xmm9 + movaps %xmm9, %xmm8 + mulpd %xmm10, %xmm8 + mulpd %xmm6, %xmm9 + addpd _poly_coeff_4(%rdx), %xmm8 + addpd _poly_coeff_4(%rdx), %xmm9 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm9 + +/* reconstruction */ + addpd %xmm8, %xmm10 + addpd %xmm9, %xmm6 + movslq %eax, %rax + movslq %r8d, %r8 + movslq %ecx, %rcx + movslq %r9d, %r9 + movsd _Log2Rcp_lookup(%rdx,%rax), %xmm13 + movsd _Log2Rcp_lookup(%rdx,%r8), %xmm12 + movhpd _Log2Rcp_lookup(%rdx,%rcx), %xmm13 + movhpd _Log2Rcp_lookup(%rdx,%r9), %xmm12 + addpd %xmm10, %xmm13 + addpd %xmm6, %xmm12 + addpd %xmm13, %xmm2 + addpd %xmm12, %xmm1 + mulpd %xmm7, %xmm2 + mulpd %xmm11, %xmm1 + movups __dbInvLn2(%rdx), %xmm11 + movdqa %xmm4, %xmm12 + movaps %xmm11, %xmm10 + mulpd %xmm2, %xmm10 + mulpd %xmm1, %xmm11 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + movaps %xmm10, %xmm8 + movaps %xmm11, %xmm9 + subpd __dbHALF(%rdx), %xmm8 + subpd __dbHALF(%rdx), %xmm9 + addpd __dbShifter(%rdx), %xmm8 + addpd __dbShifter(%rdx), %xmm9 + movaps %xmm8, %xmm6 + movaps %xmm9, %xmm7 + subpd __dbShifter(%rdx), %xmm6 + subpd __dbShifter(%rdx), %xmm7 + +/* [0..1) */ + subpd %xmm6, %xmm10 + subpd %xmm7, %xmm11 + mulpd __dbC1(%rdx), %xmm10 + mulpd __dbC1(%rdx), %xmm11 + +/* hi bits */ + shufps $221, %xmm1, %xmm2 + movdqu _NMAXVAL(%rdx), %xmm1 + pcmpgtd %xmm1, %xmm12 + pcmpeqd %xmm1, %xmm4 + por %xmm4, %xmm12 + movdqa %xmm3, %xmm1 + movdqu _INF(%rdx), %xmm4 + pcmpgtd %xmm4, %xmm1 + pcmpeqd %xmm4, %xmm3 + +/* iAbsX = iAbsX&iAbsMask */ + pand __iAbsMask(%rdx), %xmm2 + por %xmm3, %xmm1 + +/* iRangeMask = (iAbsX>iDomainRange) */ + pcmpgtd __iDomainRange(%rdx), %xmm2 + por %xmm1, %xmm12 + movups __lbLOWKBITS(%rdx), %xmm3 + por %xmm2, %xmm12 + +/* low K bits */ + movaps %xmm3, %xmm2 + andps %xmm9, %xmm3 + andps %xmm8, %xmm2 + psrlq $11, %xmm8 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + movd %xmm2, %r10d + psrlq $11, %xmm9 + movd %xmm3, %ecx + +/* NB : including +/- sign for the exponent!! 
*/ + psllq $52, %xmm8 + psllq $52, %xmm9 + pextrw $4, %xmm2, %r11d + pextrw $4, %xmm3, %r8d + movmskps %xmm12, %eax + shll $3, %r10d + shll $3, %ecx + shll $3, %r11d + shll $3, %r8d + movq 13952(%rdx,%r10), %xmm6 + movq 13952(%rdx,%rcx), %xmm7 + movhpd 13952(%rdx,%r11), %xmm6 + movhpd 13952(%rdx,%r8), %xmm7 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + addpd %xmm10, %xmm6 + addpd %xmm11, %xmm7 + paddq %xmm8, %xmm6 + paddq %xmm9, %xmm7 + cvtpd2ps %xmm6, %xmm1 + cvtpd2ps %xmm7, %xmm4 + movlhps %xmm4, %xmm1 + testl %eax, %eax + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 112(%rsp), %xmm8 + movaps %xmm1, %xmm0 + movups 96(%rsp), %xmm9 + movups 176(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 208(%rsp), %xmm12 + movups 48(%rsp), %xmm13 + movups 144(%rsp), %xmm14 + movups 160(%rsp), %xmm15 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 64(%rsp) + movups %xmm5, 128(%rsp) + movups %xmm1, 192(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %cl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %eax, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %edx, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + movups 192(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 68(%rsp,%r15,8), %xmm0 + movss 132(%rsp,%r15,8), %xmm1 + + call JUMPTARGET(__powf_finite) + + movss %xmm0, 196(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 64(%rsp,%r15,8), %xmm0 + movss 128(%rsp,%r15,8), %xmm1 + + call JUMPTARGET(__powf_finite) + + movss %xmm0, 192(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4vv_powf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S new file mode 100644 index 0000000000..1f6a07315e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8vv_powf) + .type _ZGVdN8vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8vv_powf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8vv_powf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8vv_powf) +libmvec_hidden_def (_ZGVdN8vv_powf) + +#define _ZGVdN8vv_powf _ZGVdN8vv_powf_sse_wrapper +#include "../svml_s_powf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S new file mode 100644 index 0000000000..683932f410 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S @@ -0,0 +1,357 @@ +/* Function powf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" + + .text +ENTRY(_ZGVdN8vv_powf_avx2) +/* + ALGORITHM DESCRIPTION: + + We are using the next identity : pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). 
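+
+   A scalar sketch of this reconstruction (an illustration only:
+   exp2_sketch is a hypothetical helper, exp2() stands in for the
+   THI+TLO table, and a short Taylor polynomial replaces the tuned
+   B1..B5 coefficients):
+
+     #include <math.h>
+     // Model of step 3 with expK = 7 (so 2^expK = 128).
+     static double
+     exp2_sketch (double t)            // t ~= y * log2(x)
+     {
+       double sh = t * 128.0;
+       double nj = rint (sh);          // nj = N*128 + j
+       double Z = (sh - nj) / 128.0;   // |Z| <= 2^(-8)
+       int N = (int) floor (nj / 128.0);
+       int j = (int) nj - N * 128;     // 0 <= j <= 127
+       const double ln2 = 0.6931471805599453;
+       double p = 1.0 + Z * ln2 * (1.0 + 0.5 * Z * ln2);  // ~= 2^Z
+       return ldexp (exp2 (j / 128.0) * p, N);  // 2^N * T * (1 + poly)
+     }
+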
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + lea __VPACK_ODD_ind.6357.0.1(%rip), %rcx + vmovups %ymm14, 320(%rsp) + +/* hi bits */ + lea __VPACK_ODD_ind.6358.0.1(%rip), %rax + vmovups %ymm12, 256(%rsp) + vmovups %ymm9, 96(%rsp) + vmovups %ymm13, 224(%rsp) + vmovups %ymm15, 352(%rsp) + vmovups %ymm11, 384(%rsp) + vmovups %ymm10, 288(%rsp) + vmovups (%rcx), %ymm10 + vmovups %ymm8, 160(%rsp) + vmovdqa %ymm1, %ymm9 + movq __svml_spow_data@GOTPCREL(%rip), %rdx + vextractf128 $1, %ymm0, %xmm7 + vcvtps2pd %xmm0, %ymm14 + vcvtps2pd %xmm7, %ymm12 + vpsubd _NMINNORM(%rdx), %ymm0, %ymm7 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vandpd _ExpMask(%rdx), %ymm14, %ymm3 + vandpd _ExpMask(%rdx), %ymm12, %ymm13 + +/* exponent bits selection */ + vpsrlq $20, %ymm12, %ymm12 + vpsrlq $20, %ymm14, %ymm14 + vextractf128 $1, %ymm9, %xmm2 + vcvtps2pd %xmm9, %ymm1 + vpand _ABSMASK(%rdx), %ymm9, %ymm8 + vcvtps2pd %xmm2, %ymm6 + vorpd _Two10(%rdx), %ymm3, %ymm2 + vorpd _Two10(%rdx), %ymm13, %ymm3 + +/* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm2, %xmm5 + vcvtpd2ps %ymm3, %xmm15 + vrcpps %xmm5, %xmm4 + vrcpps %xmm15, %xmm11 + vcvtps2pd %xmm4, %ymm13 + vcvtps2pd %xmm11, %ymm4 + vpermps %ymm12, %ymm10, %ymm11 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm13, %ymm12 + vpermps %ymm14, %ymm10, %ymm5 + vroundpd $0, %ymm4, %ymm14 + vmovupd _One(%rdx), %ymm4 + +/* table lookup */ + vpsrlq $40, %ymm12, %ymm10 + vfmsub213pd %ymm4, %ymm12, %ymm2 + vfmsub213pd %ymm4, %ymm14, %ymm3 + vcmpgt_oqpd _Threshold(%rdx), %ymm12, %ymm12 + vxorpd %ymm4, %ymm4, %ymm4 + vandpd _Bias(%rdx), %ymm12, %ymm12 + +/* biased exponent in DP format */ + vcvtdq2pd %xmm11, %ymm13 + vpcmpeqd %ymm11, %ymm11, %ymm11 + vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm4 + vpsrlq $40, %ymm14, %ymm10 + vcmpgt_oqpd _Threshold(%rdx), %ymm14, %ymm14 + vpcmpeqd %ymm11, %ymm11, %ymm11 + vandpd _Bias(%rdx), %ymm14, %ymm14 + vcvtdq2pd %xmm5, %ymm15 + vxorpd %ymm5, %ymm5, %ymm5 + vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm5 + vorpd _Bias1(%rdx), %ymm12, %ymm11 + vorpd _Bias1(%rdx), %ymm14, %ymm10 + vsubpd %ymm11, %ymm15, %ymm11 + vsubpd %ymm10, %ymm13, %ymm14 + vmovupd _poly_coeff_4(%rdx), %ymm15 + vmovupd _poly_coeff_3(%rdx), %ymm13 + vmulpd %ymm3, %ymm3, %ymm10 + vfmadd213pd %ymm15, %ymm3, %ymm13 + vmovdqa %ymm15, %ymm12 + vfmadd231pd _poly_coeff_3(%rdx), %ymm2, %ymm12 + vmulpd %ymm2, %ymm2, %ymm15 + +/* reconstruction */ + vfmadd213pd %ymm3, %ymm10, %ymm13 + vfmadd213pd %ymm2, %ymm15, %ymm12 + vaddpd %ymm5, %ymm13, %ymm13 + vaddpd %ymm4, %ymm12, %ymm2 + vfmadd231pd _L2(%rdx), %ymm14, %ymm13 + vfmadd132pd _L2(%rdx), %ymm2, %ymm11 + vmulpd %ymm6, %ymm13, %ymm2 + vmulpd %ymm1, %ymm11, %ymm10 + vmulpd __dbInvLn2(%rdx), %ymm2, %ymm6 + vmulpd __dbInvLn2(%rdx), %ymm10, %ymm15 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rdx), %ymm6, %ymm3 + vsubpd __dbHALF(%rdx), %ymm15, %ymm1 + vaddpd __dbShifter(%rdx), %ymm3, %ymm13 + vaddpd __dbShifter(%rdx), %ymm1, %ymm14 + vsubpd __dbShifter(%rdx), %ymm13, %ymm12 + vmovups (%rax), %ymm1 + vsubpd __dbShifter(%rdx), %ymm14, %ymm11 + +/* [0..1) */ + vsubpd %ymm12, %ymm6, %ymm6 + vpermps 
%ymm10, %ymm1, %ymm3 + vpermps %ymm2, %ymm1, %ymm10 + vpcmpgtd _NMAXVAL(%rdx), %ymm7, %ymm4 + vpcmpgtd _INF(%rdx), %ymm8, %ymm1 + vpcmpeqd _NMAXVAL(%rdx), %ymm7, %ymm7 + vpcmpeqd _INF(%rdx), %ymm8, %ymm8 + vpor %ymm7, %ymm4, %ymm2 + vpor %ymm8, %ymm1, %ymm1 + vsubpd %ymm11, %ymm15, %ymm7 + vinsertf128 $1, %xmm10, %ymm3, %ymm10 + vpor %ymm1, %ymm2, %ymm3 + +/* iAbsX = iAbsX&iAbsMask */ + vandps __iAbsMask(%rdx), %ymm10, %ymm10 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rdx), %ymm10, %ymm4 + vpor %ymm4, %ymm3, %ymm5 + vmulpd __dbC1(%rdx), %ymm7, %ymm4 + vmovmskps %ymm5, %ecx + vmulpd __dbC1(%rdx), %ymm6, %ymm5 + +/* low K bits */ + vandps __lbLOWKBITS(%rdx), %ymm14, %ymm6 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + vxorpd %ymm7, %ymm7, %ymm7 + vpcmpeqd %ymm1, %ymm1, %ymm1 + vandps __lbLOWKBITS(%rdx), %ymm13, %ymm2 + vxorpd %ymm10, %ymm10, %ymm10 + vpcmpeqd %ymm3, %ymm3, %ymm3 + vgatherqpd %ymm1, 13952(%rdx,%ymm6,8), %ymm7 + vgatherqpd %ymm3, 13952(%rdx,%ymm2,8), %ymm10 + vpsrlq $11, %ymm14, %ymm14 + vpsrlq $11, %ymm13, %ymm13 + vfmadd213pd %ymm7, %ymm4, %ymm7 + vfmadd213pd %ymm10, %ymm5, %ymm10 + +/* NB : including +/- sign for the exponent!! */ + vpsllq $52, %ymm14, %ymm8 + vpsllq $52, %ymm13, %ymm11 + vpaddq %ymm8, %ymm7, %ymm12 + vpaddq %ymm11, %ymm10, %ymm1 + vcvtpd2ps %ymm12, %xmm15 + vcvtpd2ps %ymm1, %xmm2 + vinsertf128 $1, %xmm2, %ymm15, %ymm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 160(%rsp), %ymm8 + vmovups 96(%rsp), %ymm9 + vmovups 288(%rsp), %ymm10 + vmovups 384(%rsp), %ymm11 + vmovups 256(%rsp), %ymm12 + vmovups 224(%rsp), %ymm13 + vmovups 320(%rsp), %ymm14 + vmovups 352(%rsp), %ymm15 + vmovdqa %ymm1, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm0, 64(%rsp) + vmovups %ymm9, 128(%rsp) + vmovups %ymm1, 192(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %dl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %ecx, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %eax, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + vmovups 192(%rsp), %ymm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 68(%rsp,%r15,8), %xmm0 + vmovss 132(%rsp,%r15,8), %xmm1 + vzeroupper + + call JUMPTARGET(__powf_finite) + + vmovss %xmm0, 196(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 64(%rsp,%r15,8), %xmm0 + vmovss 128(%rsp,%r15,8), %xmm1 + vzeroupper + + call JUMPTARGET(__powf_finite) + + vmovss %xmm0, 192(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8vv_powf_avx2) + + .section .rodata, "a" +__VPACK_ODD_ind.6357.0.1: + .long 1 + .long 3 + .long 5 + .long 7 + .long 0 + .long 0 + .long 0 + .long 0 + .space 32, 0x00 +__VPACK_ODD_ind.6358.0.1: + .long 1 + .long 3 + .long 5 + .long 7 + .long 0 + .long 0 + .long 0 + .long 0 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S new file mode 100644 index 0000000000..0545460952 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16vvv_sincosf) + .type _ZGVeN16vvv_sincosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16vvv_sincosf) + +#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper +#include "../svml_s_sincosf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S new file mode 100644 index 0000000000..f73ab7de7c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S @@ -0,0 +1,806 @@ +/* Function sincosf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/4; +Pi/4] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer S for destination sign setting. + SS = ((S-S&1)&2)<<30; For sin part + SC = ((S+S&1)&2)<<30; For cos part + f) Change destination sign if source sign is negative + using XOR operation. 
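+
+         A scalar sketch of the bookkeeping in steps c)-f) (an
+         illustration only: signs_sketch is a hypothetical helper and
+         lrintf() stands in for the "Right Shifter" addition trick):
+
+           #include <math.h>
+           #include <stdint.h>
+           static void
+           signs_sketch (float x, uint32_t *ss, uint32_t *sc)
+           {
+             const float inv_pio2 = 0.63661977f;        // 2/Pi
+             long S = lrintf (fabsf (x) * inv_pio2);    // octant number
+             *ss = ((uint32_t) ((S - (S & 1)) & 2)) << 30;  // sin sign
+             *sc = ((uint32_t) ((S + (S & 1)) & 2)) << 30;  // cos sign
+             if (x < 0.0f)       // f) flip the sin sign for negative x
+               *ss ^= 0x80000000u;
+           }
+
+         Steps g) and h) below then deliver the reduced argument X.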
+ g) Subtract "Right Shifter" (0x4B000000) value + h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); + c) Swap RS & RC if if first bit of obtained value after + Right Shifting is set to 1. Using And, Andnot & Or operations. + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R1 = XOR( RS, SS ); + R2 = XOR( RC, SC ). */ + + .text +ENTRY (_ZGVeN16vl4l4_sincosf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm2 + movl $-1, %edx + vmovups __sAbsMask(%rax), %zmm0 + vmovups __sInvPI(%rax), %zmm3 + +/* Absolute argument computation */ + vpandd %zmm0, %zmm2, %zmm1 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sSignMask(%rax), %zmm9 + vpandnd %zmm2, %zmm0, %zmm0 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 */ + vmovaps %zmm1, %zmm6 + vmovaps %zmm1, %zmm8 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value */ + vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3 + vmovups __sPI3_FMA(%rax), %zmm7 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + vsubps __sRShifter(%rax), %zmm3, %zmm12 + +/* e) Treat obtained value as integer S for destination sign setting */ + vpslld $31, %zmm3, %zmm13 + vmovups __sA7_FMA(%rax), %zmm14 + vfnmadd231ps %zmm12, %zmm5, %zmm6 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + vmovaps %zmm14, %zmm15 + vmovups __sA9_FMA(%rax), %zmm3 + vcmpps $22, __sRangeReductionVal(%rax), %zmm1, %k1 + vpbroadcastd %edx, %zmm1{%k1}{z} + vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6 + vptestmd %zmm1, %zmm1, %k0 + vpandd %zmm6, %zmm9, %zmm11 + kmovw %k0, %ecx + vpxord __sOneHalf(%rax), %zmm11, %zmm4 + +/* Result sign calculations */ + vpternlogd $150, %zmm13, %zmm9, %zmm11 + +/* Add correction term 0.5 for cos() part */ + vaddps %zmm4, %zmm12, %zmm10 + vfnmadd213ps %zmm6, %zmm7, %zmm12 + vfnmadd231ps %zmm10, %zmm5, %zmm8 + vpxord %zmm13, %zmm12, %zmm13 + vmulps %zmm13, %zmm13, %zmm12 + vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8 + vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15 + vfnmadd213ps %zmm8, %zmm7, %zmm10 + vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15 + vpxord %zmm11, %zmm10, %zmm5 + vmulps %zmm5, %zmm5, %zmm4 + vfmadd213ps __sA3(%rax), %zmm12, %zmm15 + vfmadd213ps %zmm14, %zmm4, %zmm3 + vmulps %zmm12, %zmm15, %zmm14 + vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3 + vfmadd213ps %zmm13, %zmm13, %zmm14 + vfmadd213ps __sA3(%rax), %zmm4, %zmm3 + vpxord %zmm0, %zmm14, %zmm0 + vmulps %zmm4, %zmm3, %zmm3 + vfmadd213ps %zmm5, %zmm5, %zmm3 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm3, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + 
cfi_restore_state + vmovups %zmm2, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm3, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movq %rbx, %rdi + kmovw 1048(%rsp), %k4 + movq 1056(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + kmovw 1032(%rsp), %k6 + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm3 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16vl4l4_sincosf_knl) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl) + +ENTRY (_ZGVeN16vl4l4_sincosf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm4 + vmovups __sAbsMask(%rax), %zmm3 + vmovups __sInvPI(%rax), %zmm5 + vmovups __sRShifter(%rax), %zmm6 + vmovups __sPI1_FMA(%rax), %zmm9 + vmovups __sPI2_FMA(%rax), %zmm10 + vmovups __sSignMask(%rax), %zmm14 + vmovups __sOneHalf(%rax), %zmm7 + vmovups __sPI3_FMA(%rax), %zmm12 + +/* Absolute argument computation */ + vandps %zmm3, %zmm4, %zmm2 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value */ + vfmadd213ps %zmm6, %zmm2, %zmm5 + vcmpps $18, __sRangeReductionVal(%rax), 
%zmm2, %k1 + +/* e) Treat obtained value as integer S for destination sign setting */ + vpslld $31, %zmm5, %zmm0 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm6, %zmm5, %zmm5 + vmovups __sA3(%rax), %zmm6 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 */ + vmovaps %zmm2, %zmm11 + vfnmadd231ps %zmm5, %zmm9, %zmm11 + vfnmadd231ps %zmm5, %zmm10, %zmm11 + vandps %zmm11, %zmm14, %zmm1 + vxorps %zmm1, %zmm7, %zmm8 + +/* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 + vmovups .L_2il0floatpacket.13(%rip), %zmm14 + +/* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 + vfnmadd213ps %zmm11, %zmm12, %zmm5 + vandnps %zmm4, %zmm3, %zmm11 + vmovups __sA7_FMA(%rax), %zmm3 + vmovaps %zmm2, %zmm13 + vfnmadd231ps %zmm15, %zmm9, %zmm13 + vxorps %zmm0, %zmm5, %zmm9 + vmovups __sA5_FMA(%rax), %zmm0 + vfnmadd231ps %zmm15, %zmm10, %zmm13 + vmulps %zmm9, %zmm9, %zmm8 + vfnmadd213ps %zmm13, %zmm12, %zmm15 + vmovups __sA9_FMA(%rax), %zmm12 + vxorps %zmm1, %zmm15, %zmm1 + vmulps %zmm1, %zmm1, %zmm13 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + vmovaps %zmm12, %zmm7 + vfmadd213ps %zmm3, %zmm8, %zmm7 + vfmadd213ps %zmm3, %zmm13, %zmm12 + vfmadd213ps %zmm0, %zmm8, %zmm7 + vfmadd213ps %zmm0, %zmm13, %zmm12 + vfmadd213ps %zmm6, %zmm8, %zmm7 + vfmadd213ps %zmm6, %zmm13, %zmm12 + vmulps %zmm8, %zmm7, %zmm10 + vmulps %zmm13, %zmm12, %zmm3 + vfmadd213ps %zmm9, %zmm9, %zmm10 + vfmadd213ps %zmm1, %zmm1, %zmm3 + vxorps %zmm11, %zmm10, %zmm0 + vpandnd %zmm2, %zmm2, %zmm14{%k1} + vptestmd %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm3, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm4, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm3, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_2_6: + btl %r13d, %r14d + jc .LBL_2_13 + +.LBL_2_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + movq %rbx, %rdi + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + 
vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm3 + movq 1056(%rsp), %rsi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_13: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16vl4l4_sincosf_skx) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx) + +/* Wrapper between vvv and vl4l4 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl4l4 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $384, %rsp + /* Encoding for vmovups %zmm1, 128(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x02 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + /* Encoding for vmovups %zmm3, 256(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x5f + .byte 0x04 + /* Encoding for vmovups %zmm4, 320(%rdi). 
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x67 + .byte 0x05 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $296, %esp + /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x8d + .byte 0x10 + .byte 0xff + .byte 0xff + .byte 0xff + /* Encoding for vmovdqa64 %zmm2, -304(%ebp). 
*/ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x95 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $296, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN16vvv_sincosf_knl) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl +END (_ZGVeN16vvv_sincosf_knl) + +ENTRY (_ZGVeN16vvv_sincosf_skx) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx +END (_ZGVeN16vvv_sincosf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S new file mode 100644 index 0000000000..a249be33d1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincosf. 
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+    .text
+ENTRY (_ZGVbN4vvv_sincosf)
+    .type _ZGVbN4vvv_sincosf, @gnu_indirect_function
+    LOAD_RTLD_GLOBAL_RO_RDX
+    leaq _ZGVbN4vvv_sincosf_sse4(%rip), %rax
+    HAS_CPU_FEATURE (SSE4_1)
+    jz 2f
+    ret
+2:  leaq _ZGVbN4vvv_sincosf_sse2(%rip), %rax
+    ret
+END (_ZGVbN4vvv_sincosf)
+libmvec_hidden_def (_ZGVbN4vvv_sincosf)
+
+#define _ZGVbN4vvv_sincosf _ZGVbN4vvv_sincosf_sse2
+#include "../svml_s_sincosf4_core.S"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
new file mode 100644
index 0000000000..74a6ac1157
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
@@ -0,0 +1,346 @@
+/* Function sincosf vectorized with SSE4.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+    .text
+ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
+/*
+   ALGORITHM DESCRIPTION:
+
+   1) Range reduction to [-Pi/4; +Pi/4] interval
+      a) Grab sign from source argument and save it.
+      b) Remove sign using AND operation
+      c) Getting octant Y by 2/Pi multiplication
+      d) Add "Right Shifter" value
+      e) Treat obtained value as integer S for destination sign setting.
+         SS = ((S-S&1)&2)<<30; For sin part
+         SC = ((S+S&1)&2)<<30; For cos part
+      f) Change destination sign if source sign is negative
+         using XOR operation.
+      g) Subtract "Right Shifter" (0x4B000000) value
+      h) Subtract Y*(PI/2) from X argument, where PI/2 is divided
+         into 4 parts:
+         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+   2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+      a) Calculate X^2 = X * X
+      b) Calculate 2 polynomials for sin and cos:
+         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
+      c) Swap RS & RC if the first bit of the obtained value after
+         Right Shifting is set to 1, using And, Andnot & Or operations.
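+
+      A scalar sketch of the step-2 polynomial pair (an illustration
+      only: poly_sketch is a hypothetical helper and plain Taylor
+      coefficients stand in for the tuned minimax A0..A3 and B0..B4):
+
+        // Model of step 2 for |X| <= Pi/4.
+        static void
+        poly_sketch (float X, float *rs, float *rc)
+        {
+          float x2 = X * X;
+          *rs = X * (1.0f + x2 * (-1.0f / 6 + x2 * (1.0f / 120
+                     + x2 * (-1.0f / 5040))));            // ~= sin (X)
+          *rc = 1.0f + x2 * (-0.5f + x2 * (1.0f / 24 + x2 * (-1.0f / 720
+                       + x2 * (1.0f / 40320))));          // ~= cos (X)
+        }
+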
+ 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R1 = XOR( RS, SS ); + R2 = XOR( RC, SC ). */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups %xmm12, 176(%rsp) + movups %xmm9, 160(%rsp) + movups __sAbsMask(%rax), %xmm12 + +/* Absolute argument computation */ + movaps %xmm12, %xmm5 + andnps %xmm0, %xmm12 + movups __sInvPI(%rax), %xmm7 + andps %xmm0, %xmm5 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value. */ + mulps %xmm5, %xmm7 + movups %xmm10, 144(%rsp) + movups __sPI1(%rax), %xmm10 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3. */ + movaps %xmm10, %xmm1 + addps __sRShifter(%rax), %xmm7 + +/* e) Treat obtained value as integer S for destination sign setting */ + movaps %xmm7, %xmm9 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + subps __sRShifter(%rax), %xmm7 + mulps %xmm7, %xmm1 + pslld $31, %xmm9 + movups __sPI2(%rax), %xmm6 + movups %xmm13, 112(%rsp) + movaps %xmm5, %xmm13 + movaps %xmm6, %xmm2 + subps %xmm1, %xmm13 + mulps %xmm7, %xmm2 + movups __sSignMask(%rax), %xmm3 + movaps %xmm5, %xmm1 + movups __sOneHalf(%rax), %xmm4 + subps %xmm2, %xmm13 + cmpnleps __sRangeReductionVal(%rax), %xmm5 + movaps %xmm3, %xmm2 + andps %xmm13, %xmm2 + xorps %xmm2, %xmm4 + +/* Result sign calculations */ + xorps %xmm2, %xmm3 + xorps %xmm9, %xmm3 + +/* Add correction term 0.5 for cos() part */ + addps %xmm7, %xmm4 + movmskps %xmm5, %ecx + mulps %xmm4, %xmm10 + mulps %xmm4, %xmm6 + subps %xmm10, %xmm1 + movups __sPI3(%rax), %xmm10 + subps %xmm6, %xmm1 + movaps %xmm10, %xmm6 + mulps %xmm7, %xmm6 + mulps %xmm4, %xmm10 + subps %xmm6, %xmm13 + subps %xmm10, %xmm1 + movups __sPI4(%rax), %xmm6 + mulps %xmm6, %xmm7 + mulps %xmm6, %xmm4 + subps %xmm7, %xmm13 + subps %xmm4, %xmm1 + xorps %xmm9, %xmm13 + xorps %xmm3, %xmm1 + movaps %xmm13, %xmm4 + movaps %xmm1, %xmm2 + mulps %xmm13, %xmm4 + mulps %xmm1, %xmm2 + movups __sA9(%rax), %xmm7 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + movaps %xmm7, %xmm3 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA7(%rax), %xmm3 + addps __sA7(%rax), %xmm7 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA5(%rax), %xmm3 + addps __sA5(%rax), %xmm7 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA3(%rax), %xmm3 + addps __sA3(%rax), %xmm7 + mulps %xmm3, %xmm4 + mulps %xmm7, %xmm2 + mulps %xmm13, %xmm4 + mulps %xmm1, %xmm2 + addps %xmm4, %xmm13 + addps %xmm2, %xmm1 + xorps %xmm12, %xmm13 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 160(%rsp), %xmm9 + movaps %xmm13, (%rdi) + movups 144(%rsp), %xmm10 + movups 176(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups %xmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 128(%rsp) + movups %xmm13, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 48(%rsp) + movups %xmm11, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 64(%rsp) + movq %r12, 104(%rsp) + cfi_offset_rel_rsp (12, 104) + movb 
%dl, %r12b + movq %r13, 96(%rsp) + cfi_offset_rel_rsp (13, 96) + movl %eax, %r13d + movq %r14, 88(%rsp) + cfi_offset_rel_rsp (14, 88) + movl %ecx, %r14d + movq %r15, 80(%rsp) + cfi_offset_rel_rsp (15, 80) + movq %rbx, 72(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 48(%rsp), %xmm8 + movq %rbx, %rdi + movups 32(%rsp), %xmm11 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 64(%rsp), %rsi + movq 104(%rsp), %r12 + cfi_restore (%r12) + movq 96(%rsp), %r13 + cfi_restore (%r13) + movq 88(%rsp), %r14 + cfi_restore (%r14) + movq 80(%rsp), %r15 + cfi_restore (%r15) + movq 72(%rsp), %rbx + movups 192(%rsp), %xmm13 + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 132(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 196(%rsp,%r15,8) + movss 132(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + movss 128(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 192(%rsp,%r15,8) + movss 128(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4vl4l4_sincosf_sse4) +libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVbN4vvv_sincosf_sse4) +#ifndef __ILP32__ + subq $104, %rsp + .cfi_def_cfa_offset 112 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + movdqu %xmm3, 48(%rsi) + movdqu %xmm4, 64(%rsi) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movq 32(%rsp), %rdx + movq 40(%rsp), %rsi + movq 48(%rsp), %r8 + movq 56(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 64(%rsp), %rax + movq 72(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 80(%rsp), %rdi + movq 88(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movl %r8d, (%r9) + addq $104, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movl 16(%esp), %eax + movss 32(%esp), %xmm0 + movss %xmm0, (%eax) + movl 20(%esp), %eax + movss 36(%esp), %xmm0 + movss %xmm0, (%eax) + movl 24(%esp), %eax + movss 40(%esp), %xmm0 + movss %xmm0, (%eax) + movl 28(%esp), %eax + movss 44(%esp), %xmm0 + movss %xmm0, (%eax) + movl (%esp), %eax + movss 48(%esp), %xmm0 + movss %xmm0, (%eax) + movl 4(%esp), %eax + movss 52(%esp), %xmm0 + movss %xmm0, (%eax) + movl 8(%esp), %eax + movss 56(%esp), %xmm0 + movss %xmm0, (%eax) + movl 12(%esp), %eax + movss 60(%esp), %xmm0 + movss %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif +END (_ZGVbN4vvv_sincosf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S new file mode 100644 index 0000000000..320fd861a5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+    .text
+ENTRY (_ZGVdN8vvv_sincosf)
+    .type _ZGVdN8vvv_sincosf, @gnu_indirect_function
+    LOAD_RTLD_GLOBAL_RO_RDX
+    leaq _ZGVdN8vvv_sincosf_avx2(%rip), %rax
+    HAS_ARCH_FEATURE (AVX2_Usable)
+    jz 2f
+    ret
+2:  leaq _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax
+    ret
+END (_ZGVdN8vvv_sincosf)
+libmvec_hidden_def (_ZGVdN8vvv_sincosf)
+
+#define _ZGVdN8vvv_sincosf _ZGVdN8vvv_sincosf_sse_wrapper
+#include "../svml_s_sincosf8_core.S"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
new file mode 100644
index 0000000000..9e4e2c71c5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
@@ -0,0 +1,389 @@
+/* Function sincosf vectorized with AVX2.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+    .text
+ENTRY (_ZGVdN8vl4l4_sincosf_avx2)
+/*
+   ALGORITHM DESCRIPTION:
+
+   1) Range reduction to [-Pi/4; +Pi/4] interval
+      a) Grab sign from source argument and save it.
+      b) Remove sign using AND operation
+      c) Getting octant Y by 2/Pi multiplication
+      d) Add "Right Shifter" value
+      e) Treat obtained value as integer S for destination sign setting.
+         SS = ((S-S&1)&2)<<30; For sin part
+         SC = ((S+S&1)&2)<<30; For cos part
+      f) Change destination sign if source sign is negative
+         using XOR operation.
+      g) Subtract "Right Shifter" (0x4B000000) value
+      h) Subtract Y*(PI/2) from X argument, where PI/2 is divided
+         into 4 parts:
+         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+   2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+      a) Calculate X^2 = X * X
+      b) Calculate 2 polynomials for sin and cos:
+         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
+      c) Swap RS & RC if the first bit of the obtained value after
+         Right Shifting is set to 1, using And, Andnot & Or operations.
+   3) Destination sign setting
+      a) Set shifted destination sign using XOR operation:
+         R1 = XOR( RS, SS );
+         R2 = XOR( RC, SC ).
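+
+   A scalar sketch of the 2c) swap and this sign selection (an
+   illustration only: bits_sel and finish_sketch are hypothetical
+   helpers operating on the float bit patterns):
+
+     #include <stdint.h>
+     #include <string.h>
+     static uint32_t
+     bits_sel (uint32_t mask, uint32_t a, uint32_t b)
+     {
+       return (mask & a) | (~mask & b);     // And, Andnot & Or select
+     }
+     static void
+     finish_sketch (long S, uint32_t rs, uint32_t rc,
+                    uint32_t ss, uint32_t sc, float *sinp, float *cosp)
+     {
+       uint32_t swap = (S & 1) ? 0xffffffffu : 0u;  // first bit of S
+       uint32_t r1 = bits_sel (swap, rc, rs) ^ ss;  // R1 = XOR( RS, SS )
+       uint32_t r2 = bits_sel (swap, rs, rc) ^ sc;  // R2 = XOR( RC, SC )
+       memcpy (sinp, &r1, 4);
+       memcpy (cosp, &r2, 4);
+     }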
*/
+
+	pushq	%rbp
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (%rbp, 0)
+	movq	%rsp, %rbp
+	cfi_def_cfa_register (%rbp)
+	andq	$-64, %rsp
+	subq	$448, %rsp
+	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
+	vmovdqa	%ymm0, %ymm5
+	vmovups	%ymm13, 352(%rsp)
+	vmovups	__sAbsMask(%rax), %ymm2
+	vmovups	__sInvPI(%rax), %ymm1
+	vmovups	__sPI1_FMA(%rax), %ymm13
+	vmovups	%ymm15, 288(%rsp)
+
+/* Absolute argument computation */
+	vandps	%ymm2, %ymm5, %ymm4
+
+/* c) Getting octant Y by 2/Pi multiplication
+   d) Add "Right Shifter" value */
+	vfmadd213ps __sRShifter(%rax), %ymm4, %ymm1
+
+/* e) Treat obtained value as integer S for destination sign setting */
+	vpslld	$31, %ymm1, %ymm0
+
+/* g) Subtract "Right Shifter" (0x4B000000) value */
+	vsubps	__sRShifter(%rax), %ymm1, %ymm1
+
+/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided into 3 parts:
+      X = X - Y*PI1 - Y*PI2 - Y*PI3 */
+	vmovdqa	%ymm4, %ymm7
+	vfnmadd231ps %ymm1, %ymm13, %ymm7
+	vfnmadd231ps __sPI2_FMA(%rax), %ymm1, %ymm7
+	vandps	__sSignMask(%rax), %ymm7, %ymm15
+	vxorps	__sOneHalf(%rax), %ymm15, %ymm6
+
+/* Add correction term 0.5 for cos() part */
+	vaddps	%ymm6, %ymm1, %ymm6
+	vmovdqa	%ymm4, %ymm3
+	vfnmadd231ps %ymm6, %ymm13, %ymm3
+	vmovups	__sPI3_FMA(%rax), %ymm13
+	vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4
+	vfnmadd231ps __sPI2_FMA(%rax), %ymm6, %ymm3
+	vfnmadd213ps %ymm7, %ymm13, %ymm1
+	vfnmadd213ps %ymm3, %ymm13, %ymm6
+
+/* Result sign calculations */
+	vxorps	__sSignMask(%rax), %ymm15, %ymm3
+	vxorps	%ymm0, %ymm3, %ymm7
+	vxorps	%ymm7, %ymm6, %ymm3
+	vxorps	%ymm0, %ymm1, %ymm15
+	vandnps	%ymm5, %ymm2, %ymm6
+	vmovups	__sA7_FMA(%rax), %ymm2
+	vmulps	%ymm15, %ymm15, %ymm13
+	vmovups	__sA9_FMA(%rax), %ymm7
+	vmulps	%ymm3, %ymm3, %ymm1
+
+/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+      a) Calculate X^2 = X * X
+      b) Calculate 2 polynomials for sin and cos:
+         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
+	vmovdqa	%ymm2, %ymm0
+	vfmadd231ps __sA9_FMA(%rax), %ymm13, %ymm0
+	vfmadd213ps %ymm2, %ymm1, %ymm7
+	vfmadd213ps __sA5_FMA(%rax), %ymm13, %ymm0
+	vfmadd213ps __sA5_FMA(%rax), %ymm1, %ymm7
+	vfmadd213ps __sA3(%rax), %ymm13, %ymm0
+	vfmadd213ps __sA3(%rax), %ymm1, %ymm7
+	vmulps	%ymm13, %ymm0, %ymm13
+	vmulps	%ymm1, %ymm7, %ymm1
+	vfmadd213ps %ymm15, %ymm15, %ymm13
+	vfmadd213ps %ymm3, %ymm3, %ymm1
+	vmovmskps %ymm4, %ecx
+	vxorps	%ymm6, %ymm13, %ymm0
+	testl	%ecx, %ecx
+	jne	.LBL_1_3
+
+.LBL_1_2:
+	cfi_remember_state
+	vmovups	352(%rsp), %ymm13
+	vmovups	288(%rsp), %ymm15
+	vmovups	%ymm0, (%rdi)
+	vmovups	%ymm1, (%rsi)
+	movq	%rbp, %rsp
+	cfi_def_cfa_register (%rsp)
+	popq	%rbp
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (%rbp)
+	ret
+
+.LBL_1_3:
+	cfi_restore_state
+	vmovups	%ymm5, 256(%rsp)
+	vmovups	%ymm0, 320(%rsp)
+	vmovups	%ymm1, 384(%rsp)
+	je	.LBL_1_2
+
+	xorb	%dl, %dl
+	xorl	%eax, %eax
+	vmovups	%ymm8, 160(%rsp)
+	vmovups	%ymm9, 128(%rsp)
+	vmovups	%ymm10, 96(%rsp)
+	vmovups	%ymm11, 64(%rsp)
+	vmovups	%ymm12, 32(%rsp)
+	vmovups	%ymm14, (%rsp)
+	movq	%rsi, 192(%rsp)
+	movq	%r12, 232(%rsp)
+	cfi_offset_rel_rsp (12, 232)
+	movb	%dl, %r12b
+	movq	%r13, 224(%rsp)
+	cfi_offset_rel_rsp (13, 224)
+	movl	%eax, %r13d
+	movq	%r14, 216(%rsp)
+	cfi_offset_rel_rsp (14, 216)
+	movl	%ecx, %r14d
+	movq	%r15, 208(%rsp)
+	cfi_offset_rel_rsp (15, 208)
+	movq	%rbx, 200(%rsp)
+	movq	%rdi, %rbx
+	cfi_remember_state
+
+.LBL_1_6:
+	btl	%r13d, %r14d
+	jc	.LBL_1_13
+
+.LBL_1_7:
+	lea	1(%r13), %esi
+	btl	%esi, %r14d
+	jc	.LBL_1_10
+
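The .LBL_1_6/.LBL_1_7 loop that starts here (its handlers follow below) scans the mask produced by vmovmskps and reroutes any special-value lane to the scalar routines. A C model of that fallback, with the lane count and names being assumptions of the sketch:

    #include <math.h>
    #include <stdint.h>

    /* Sketch of the scalar tail: every lane whose bit survived the range
       check is recomputed with scalar sinf/cosf, as the code below does
       via JUMPTARGET(sinf)/JUMPTARGET(cosf).  */
    static void
    sincosf8_fallback (uint32_t mask, const float x[8], float s[8], float c[8])
    {
      for (int lane = 0; lane < 8; lane++)
        if (mask & (1u << lane))
          {
            s[lane] = sinf (x[lane]);
            c[lane] = cosf (x[lane]);
          }
    }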
+.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 160(%rsp), %ymm8 + movq %rbx, %rdi + vmovups 128(%rsp), %ymm9 + vmovups 96(%rsp), %ymm10 + vmovups 64(%rsp), %ymm11 + vmovups 32(%rsp), %ymm12 + vmovups (%rsp), %ymm14 + vmovups 320(%rsp), %ymm0 + vmovups 384(%rsp), %ymm1 + movq 192(%rsp), %rsi + movq 232(%rsp), %r12 + cfi_restore (%r12) + movq 224(%rsp), %r13 + cfi_restore (%r13) + movq 216(%rsp), %r14 + cfi_restore (%r14) + movq 208(%rsp), %r15 + cfi_restore (%r15) + movq 200(%rsp), %rbx + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 260(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 324(%rsp,%r15,8) + vmovss 260(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + vmovss 256(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 320(%rsp,%r15,8) + vmovss 256(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVdN8vl4l4_sincosf_avx2) +libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVdN8vvv_sincosf_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $192, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $136, %esp + vmovdqa %ymm1, -112(%ebp) + vmovdqa %ymm2, -144(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + vmovdqa -112(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -76(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -104(%ebp), %rax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -68(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -96(%ebp), %rax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -60(%ebp), %xmm0 + shrq $32, %rax + 
vmovss %xmm0, (%eax) + movq -88(%ebp), %rax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -52(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + vmovdqa -144(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -48(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -44(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -136(%ebp), %rax + vmovss -40(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -36(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -128(%ebp), %rax + vmovss -32(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -28(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -120(%ebp), %rax + vmovss -24(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -20(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + addl $136, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +END (_ZGVdN8vvv_sincosf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S new file mode 100644 index 0000000000..2c18dbce53 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sinf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_sinf) + .type _ZGVeN16v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_sinf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_sinf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_sinf) + +#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper +#include "../svml_s_sinf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S new file mode 100644 index 0000000000..8670673a29 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S @@ -0,0 +1,479 @@ +/* Function sinf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVeN16v_sinf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* Check for large and special values */ + movl $-1, %edx + vmovups __sAbsMask(%rax), %zmm4 + vmovups __sInvPI(%rax), %zmm1 + +/* b) Remove sign using AND operation */ + vpandd %zmm4, %zmm0, %zmm12 + vmovups __sPI1_FMA(%rax), %zmm2 + vmovups __sA9(%rax), %zmm7 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + vpandnd %zmm0, %zmm4, %zmm11 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovaps %zmm12, %zmm3 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1 + vcmpps $22, __sRangeReductionVal(%rax), %zmm12, %k1 + vpbroadcastd %edx, %zmm13{%k1}{z} + +/* g) Subtract "Right Shifter" value */ + vsubps __sRShifter(%rax), %zmm1, %zmm5 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %zmm1, %zmm6 + vptestmd %zmm13, %zmm13, %k0 + vfnmadd231ps %zmm5, %zmm2, %zmm3 + kmovw %k0, %ecx + vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3 + vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
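The "Right Shifter" addition in steps c)-d) is the usual trick of adding a large power of two so that the wanted integer lands in the low mantissa bits of the float. A scalar C sketch; the constant follows the 0x4B000000 (0x1p23) cited in the sincosf header earlier, and the exact value of __sRShifter is an assumption here:

    #include <stdint.h>
    #include <string.h>

    /* Sketch of steps c)-e): y = |x|*(1/Pi) + RShifter leaves the nearest
       integer in the low mantissa bits of y; bit 0, shifted to bit 31
       (vpslld $31 in the code), becomes the destination sign word.  */
    static inline int32_t
    octant_bits (float ax, float invpi)
    {
      const float rshifter = 0x1p23f;   /* assumed: bits 0x4B000000 */
      float y = ax * invpi + rshifter;
      int32_t s;
      memcpy (&s, &y, sizeof s);        /* reinterpret the float bits */
      return s;                         /* low bits hold the octant */
    }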
+ */ + vmulps %zmm5, %zmm5, %zmm8 + vpxord %zmm6, %zmm5, %zmm9 + vfmadd213ps __sA7(%rax), %zmm8, %zmm7 + vfmadd213ps __sA5(%rax), %zmm8, %zmm7 + vfmadd213ps __sA3(%rax), %zmm8, %zmm7 + vmulps %zmm8, %zmm7, %zmm10 + vfmadd213ps %zmm9, %zmm9, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vpxord %zmm11, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(sinf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(sinf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END(_ZGVeN16v_sinf_knl) + +ENTRY (_ZGVeN16v_sinf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. 
+ g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* Check for large and special values */ + vmovups .L_2il0floatpacket.11(%rip), %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 + vmovups __sPI1_FMA(%rax), %zmm3 + vmovups __sA9(%rax), %zmm8 + +/* b) Remove sign using AND operation */ + vandps %zmm5, %zmm0, %zmm13 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + vandnps %zmm0, %zmm5, %zmm12 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps %zmm2, %zmm13, %zmm1 + vcmpps $18, __sRangeReductionVal(%rax), %zmm13, %k1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %zmm1, %zmm7 + +/* g) Subtract "Right Shifter" value */ + vsubps %zmm2, %zmm1, %zmm6 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovaps %zmm13, %zmm4 + vfnmadd231ps %zmm6, %zmm3, %zmm4 + vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4 + vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
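The polynomial block that follows evaluates this approximation in Horner form, one vfmadd213ps per coefficient. A scalar sketch with symbolic coefficients standing in for __sA3..__sA9:

    /* Sketch of the sin polynomial: R = t + t*y*(A3 + y*(A5 + y*(A7 + y*A9)))
       with y = t*t; each "* y +" step matches one vfmadd213ps below.  */
    static inline float
    sin_poly (float t, float a3, float a5, float a7, float a9)
    {
      float y = t * t;              /* vmulps */
      float p = a9;
      p = p * y + a7;               /* vfmadd213ps __sA7(%rax) */
      p = p * y + a5;               /* vfmadd213ps __sA5(%rax) */
      p = p * y + a3;               /* vfmadd213ps __sA3(%rax) */
      return t + t * (y * p);       /* vmulps, then the final vfmadd213ps */
    }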
+ */ + vmulps %zmm6, %zmm6, %zmm9 + vxorps %zmm7, %zmm6, %zmm10 + vfmadd213ps __sA7(%rax), %zmm9, %zmm8 + vfmadd213ps __sA5(%rax), %zmm9, %zmm8 + vfmadd213ps __sA3(%rax), %zmm9, %zmm8 + vmulps %zmm9, %zmm8, %zmm11 + vfmadd213ps %zmm10, %zmm10, %zmm11 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %zmm12, %zmm11, %zmm1 + vpandnd %zmm13, %zmm13, %zmm14{%k1} + vptestmd %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16v_sinf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.11: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.11,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S new file mode 100644 index 0000000000..3556473899 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sinf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_sinf) + .type _ZGVbN4v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_sinf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_sinf_sse2(%rip), %rax + ret +END (_ZGVbN4v_sinf) +libmvec_hidden_def (_ZGVbN4v_sinf) + +#define _ZGVbN4v_sinf _ZGVbN4v_sinf_sse2 +#include "../svml_s_sinf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S new file mode 100644 index 0000000000..c690150964 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S @@ -0,0 +1,224 @@ +/* Function sinf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY(_ZGVbN4v_sinf_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
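Step h) of this description subtracts Y*Pi in four pieces, Cody-Waite style, so each partial product stays close to exact and the reduced argument keeps its accuracy; the SSE4 body below does exactly that with __sPI1..__sPI4. A scalar sketch with symbolic constants:

    /* Sketch of step h): t = |x| - y*PI1 - y*PI2 - y*PI3 - y*PI4, where
       PI1..PI4 sum to Pi, largest piece first.  */
    static inline float
    reduce_cody_waite (float ax, float y,
                       float pi1, float pi2, float pi3, float pi4)
    {
      float t = ax;
      t -= y * pi1;
      t -= y * pi2;
      t -= y * pi3;
      t -= y * pi4;
      return t;
    }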
+ 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups __sAbsMask(%rax), %xmm2 + +/* b) Remove sign using AND operation */ + movaps %xmm2, %xmm4 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + andnps %xmm5, %xmm2 + movups __sInvPI(%rax), %xmm1 + andps %xmm5, %xmm4 + +/* c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value */ + mulps %xmm4, %xmm1 + +/* h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4 */ + movaps %xmm4, %xmm0 + +/* Check for large and special values */ + cmpnleps __sRangeReductionVal(%rax), %xmm4 + movups __sRShifter(%rax), %xmm6 + movups __sPI1(%rax), %xmm7 + addps %xmm6, %xmm1 + movmskps %xmm4, %ecx + +/* e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position */ + movaps %xmm1, %xmm3 + +/* g) Subtract "Right Shifter" value */ + subps %xmm6, %xmm1 + mulps %xmm1, %xmm7 + pslld $31, %xmm3 + movups __sPI2(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm1, %xmm6 + movups __sPI3(%rax), %xmm7 + subps %xmm6, %xmm0 + mulps %xmm1, %xmm7 + movups __sPI4(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm6, %xmm1 + subps %xmm1, %xmm0 + +/* 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... */ + movaps %xmm0, %xmm1 + mulps %xmm0, %xmm1 + xorps %xmm3, %xmm0 + movups __sA9(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA7(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA5(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA3(%rax), %xmm3 + mulps %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm0 + +/* 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); */ + xorps %xmm2, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore 
(%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVbN4v_sinf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S new file mode 100644 index 0000000000..674e88bd55 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sinf, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_sinf) + .type _ZGVdN8v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +1: leaq _ZGVdN8v_sinf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_sinf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_sinf) +libmvec_hidden_def (_ZGVdN8v_sinf) + +#define _ZGVdN8v_sinf _ZGVdN8v_sinf_sse_wrapper +#include "../svml_s_sinf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S new file mode 100644 index 0000000000..d34870fa3a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S @@ -0,0 +1,219 @@ +/* Function sinf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY(_ZGVdN8v_sinf_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm5 + vmovups __sAbsMask(%rax), %ymm3 + vmovups __sInvPI(%rax), %ymm7 + vmovups __sRShifter(%rax), %ymm0 + vmovups __sPI1_FMA(%rax), %ymm1 + +/* b) Remove sign using AND operation */ + vandps %ymm3, %ymm5, %ymm4 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps %ymm0, %ymm4, %ymm7 + +/* g) Subtract "Right Shifter" value */ + vsubps %ymm0, %ymm7, %ymm2 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %ymm7, %ymm6 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovdqa %ymm4, %ymm0 + vfnmadd231ps %ymm2, %ymm1, %ymm0 + +/* Check for large and special values */ + vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4 + vfnmadd231ps __sPI2_FMA(%rax), %ymm2, %ymm0 + vfnmadd132ps __sPI3_FMA(%rax), %ymm0, %ymm2 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + */ + vmulps %ymm2, %ymm2, %ymm1 + +/* + f) Change destination sign if source sign is negative + using XOR operation. 
+ */ + vandnps %ymm5, %ymm3, %ymm0 + vxorps %ymm6, %ymm2, %ymm3 + vmovups __sA9(%rax), %ymm2 + vfmadd213ps __sA7(%rax), %ymm1, %ymm2 + vfmadd213ps __sA5(%rax), %ymm1, %ymm2 + vfmadd213ps __sA3(%rax), %ymm1, %ymm2 + vmulps %ymm1, %ymm2, %ymm6 + vfmadd213ps %ymm3, %ymm3, %ymm6 + vmovmskps %ymm4, %ecx + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %ymm0, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm5, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_sinf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/printf_fphex.c b/REORG.TODO/sysdeps/x86_64/fpu/printf_fphex.c new file mode 100644 index 0000000000..fd68eaeebf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/printf_fphex.c @@ -0,0 +1,93 @@ +/* Print floating point number in hexadecimal notation according to ISO C99. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#ifndef LONG_DOUBLE_DENORM_BIAS
+# define LONG_DOUBLE_DENORM_BIAS (IEEE854_LONG_DOUBLE_BIAS - 1)
+#endif
+
+#define PRINT_FPHEX_LONG_DOUBLE \
+do { \
+      /* The "strange" 80 bit format on ix86 and m68k has an explicit \
+	 leading digit in the 64 bit mantissa.  */ \
+      unsigned long long int num; \
+      union ieee854_long_double u; \
+      u.d = fpnum.ldbl; \
+ \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32 \
+	     | u.ieee.mantissa1); \
+ \
+      zero_mantissa = num == 0; \
+ \
+      if (sizeof (unsigned long int) > 6) \
+	{ \
+	  numstr = _itoa_word (num, numbuf + sizeof numbuf, 16, \
+			       info->spec == 'A'); \
+	  wnumstr = _itowa_word (num, \
+				 wnumbuf + sizeof (wnumbuf) / sizeof (wchar_t), \
+				 16, info->spec == 'A'); \
+	} \
+      else \
+	{ \
+	  numstr = _itoa (num, numbuf + sizeof numbuf, 16, info->spec == 'A'); \
+	  wnumstr = _itowa (num, \
+			    wnumbuf + sizeof (wnumbuf) / sizeof (wchar_t), \
+			    16, info->spec == 'A'); \
+	} \
+ \
+      /* Fill with zeroes.  */ \
+      while (numstr > numbuf + (sizeof numbuf - 64 / 4)) \
+	{ \
+	  *--numstr = '0'; \
+	  *--wnumstr = L'0'; \
+	} \
+ \
+      /* We use a full nibble for the leading digit.  */ \
+      leading = *numstr++; \
+      wnumstr++; \
+ \
+      /* We have 3 bits from the mantissa in the leading nibble. \
+	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */ \
+      exponent = u.ieee.exponent; \
+ \
+      if (exponent == 0) \
+	{ \
+	  if (zero_mantissa) \
+	    expnegative = 0; \
+	  else \
+	    { \
+	      /* This is a denormalized number.  */ \
+	      expnegative = 1; \
+	      /* This is a hook for the m68k long double format, where the \
+		 exponent bias is the same for normalized and denormalized \
+		 numbers.  */ \
+	      exponent = LONG_DOUBLE_DENORM_BIAS + 3; \
+	    } \
+	} \
+      else if (exponent >= IEEE854_LONG_DOUBLE_BIAS + 3) \
+	{ \
+	  expnegative = 0; \
+	  exponent -= IEEE854_LONG_DOUBLE_BIAS + 3; \
+	} \
+      else \
+	{ \
+	  expnegative = 1; \
+	  exponent = -(exponent - (IEEE854_LONG_DOUBLE_BIAS + 3)); \
+	} \
+} while (0)
+
+#include <stdio-common/printf_fphex.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_atanl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_atanl.c
new file mode 100644
index 0000000000..fd4a455b55
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_atanl.c
@@ -0,0 +1 @@
+#include "sysdeps/i386/fpu/s_atanl.c"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_ceill.S b/REORG.TODO/sysdeps/x86_64/fpu/s_ceill.S
new file mode 100644
index 0000000000..9d8b79dbee
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_ceill.S
@@ -0,0 +1,36 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Changes for x86-64 by Andreas Jaeger <aj@suse.de>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+
+ENTRY(__ceill)
+	fldt	8(%rsp)
+
+	fnstenv	-28(%rsp)		/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	-28(%rsp),%edx
+	andl	$0xfbff,%edx
+	movl	%edx,-32(%rsp)
+	fldcw	-32(%rsp)		/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.
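__ceill forces the x87 rounding-control field to round-toward-+Inf, rounds with frndint, then puts the saved environment back; the instructions just below re-raise the "invalid" flag that restoring the environment would otherwise drop. A portable C sketch of the same idea (fesetround does not clobber exception flags, so the sketch needs no such fixup):

    #include <fenv.h>
    #include <math.h>

    /* Portable analog of the control-word dance above: round to an
       integer under a forced rounding mode, then restore the caller's.  */
    long double
    ceill_sketch (long double x)
    {
      int saved = fegetround ();
      fesetround (FE_UPWARD);          /* the movl/orl/andl/fldcw sequence */
      long double r = rintl (x);       /* plays the role of frndint */
      fesetround (saved);
      return r;
    }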
*/
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, -24(%rsp)
+
+	fldenv	-28(%rsp)		/* restore original environment */
+
+	ret
+END (__ceill)
+weak_alias (__ceill, ceill)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_copysign.S b/REORG.TODO/sysdeps/x86_64/fpu/s_copysign.S
new file mode 100644
index 0000000000..8939dffd99
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_copysign.S
@@ -0,0 +1,50 @@
+/* copy sign, double version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.align ALIGNARG(4)
+	.type signmask,@object
+signmask:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	.byte 0, 0, 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(signmask)
+	.type othermask,@object
+othermask:
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f
+	.byte 0, 0, 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(othermask)
+
+#ifdef PIC
+#define MO(op) op##(%rip)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__copysign)
+	andpd MO(othermask),%xmm0
+	andpd MO(signmask),%xmm1
+	orpd %xmm1,%xmm0
+	ret
+END (__copysign)
+
+weak_alias (__copysign, copysign)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_copysignf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignf.S
new file mode 100644
index 0000000000..213c2d3c2c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignf.S
@@ -0,0 +1,45 @@
+/* copy sign, float version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.
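The three instructions in __copysign above are pure bit surgery: othermask keeps everything but the sign bit of x, signmask isolates the sign bit of y, and the OR merges them. The same operation in portable C:

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the andpd/andpd/orpd sequence for doubles.  */
    static inline double
    copysign_sketch (double x, double y)
    {
      uint64_t bx, by;
      memcpy (&bx, &x, sizeof bx);
      memcpy (&by, &y, sizeof by);
      bx = (bx & UINT64_C (0x7fffffffffffffff))     /* othermask */
         | (by & UINT64_C (0x8000000000000000));    /* signmask */
      memcpy (&x, &bx, sizeof bx);
      return x;
    }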
*/ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type mask,@object +mask: + .byte 0xff, 0xff, 0xff, 0x7f + ASM_SIZE_DIRECTIVE(mask) + +#ifdef PIC +#define MO(op) op##(%rip) +#else +#define MO(op) op +#endif + + .text +ENTRY(__copysignf) + movss MO(mask),%xmm3 + andps %xmm3,%xmm0 + andnps %xmm1,%xmm3 + orps %xmm3,%xmm0 + retq +END (__copysignf) + +weak_alias (__copysignf, copysignf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_copysignl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignl.S new file mode 100644 index 0000000000..2ffd612d65 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignl.S @@ -0,0 +1,22 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com>. + * Adopted for x86-64 by Andreas Jaeger <aj@suse.de>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +ENTRY(__copysignl) + movl 32(%rsp),%edx + movl 16(%rsp),%eax + andl $0x8000,%edx + andl $0x7fff,%eax + orl %edx,%eax + movl %eax,16(%rsp) + fldt 8(%rsp) + ret +END (__copysignl) +weak_alias (__copysignl, copysignl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_cosf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_cosf.S new file mode 100644 index 0000000000..e9fdc7e56e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_cosf.S @@ -0,0 +1,533 @@ +/* Optimized cosf function. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define __need_Emath +#include <bits/errno.h> + +/* Short algorithm description: + * + * 1) if |x| == 0: return 1.0-|x|. + * 2) if |x| < 2^-27: return 1.0-|x|. + * 3) if |x| < 2^-5 : return 1.0+x^2*DP_COS2_0+x^5*DP_COS2_1. + * 4) if |x| < Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). + * 5) if |x| < 9*Pi/4: + * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3, + * t=|x|-j*Pi/4. + * 5.2) Reconstruction: + * s = (-1.0)^((n>>2)&1) + * if(n&2 != 0) { + * using cos(t) polynomial for |t|<Pi/4, result is + * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). + * } else { + * using sin(t) polynomial for |t|<Pi/4, result is + * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). + * } + * 6) if |x| < 2^23, large args: + * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, + * t=|x|-j*Pi/4. + * 6.2) Reconstruction same as (5.2). + * 7) if |x| >= 2^23, very large args: + * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, + * t=|x|-j*Pi/4. + * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x. + * + * Special cases: + * cos(+-0) = 1 not raising inexact, + * cos(subnormal) raises inexact, + * cos(min_normalized) raises inexact, + * cos(normalized) raises inexact, + * cos(Inf) = NaN, raises invalid, sets errno to EDOM, + * cos(NaN) = NaN. 
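Steps 5.1)-5.2) above reduce |x| to an octant count k and a remainder t, then pick the sin or cos polynomial and the sign from n = k+3. A runnable C model of that reconstruction, with the library sin/cos standing in for the |t|<Pi/4 polynomials (double intermediates, as in the assembly; M_PI_4 from math.h):

    #include <math.h>

    /* Model of steps 5.1)-5.2) for Pi/4 <= |x| < 9*Pi/4.  */
    static float
    cosf_reconstruct (float x)
    {
      double ax = fabs ((double) x);
      int k = (int) (ax / M_PI_4);                  /* trunc(|x|/(Pi/4)) */
      int j = (k + 1) & 0x0e;
      int n = k + 3;
      double t = ax - j * M_PI_4;                   /* |t| <= Pi/4 */
      double sign = ((n >> 2) & 1) ? -1.0 : 1.0;    /* (-1.0)^((n>>2)&1) */
      return (float) (sign * ((n & 2) ? cos (t) : sin (t)));
    }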
+ */ + + .text +ENTRY(__cosf) + /* Input: single precision x in %xmm0 */ + + movd %xmm0, %eax /* Bits of x */ + movaps %xmm0, %xmm7 /* Copy of x */ + cvtss2sd %xmm0, %xmm0 /* DP x */ + movss L(SP_ABS_MASK)(%rip), %xmm3 + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + andps %xmm7, %xmm3 /* SP |x| */ + andpd L(DP_ABS_MASK)(%rip), %xmm0 /* DP |x| */ + movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + lea L(PIO4J)(%rip), %rsi + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + addl $2, %eax /* n */ + subsd (%rsi,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t */ + testl $2, %eax /* n&2 != 0? */ + jz L(sin_poly) + +/*L(cos_poly):*/ + /* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) + */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_C4)(%rip), %xmm4 /* C4 */ + mulsd %xmm0, %xmm4 /* z*C4 */ + movsd L(DP_C3)(%rip), %xmm3 /* C3 */ + mulsd %xmm0, %xmm3 /* z*C3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_C2)(%rip), %xmm4 /* C2+z*C4 */ + mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ + addsd L(DP_C1)(%rip), %xmm3 /* C1+z*C3 */ + mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ + addsd L(DP_C0)(%rip), %xmm4 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ + + addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd L(DP_ONES)(%rip), %xmm3 + + mulsd (%rsi,%rax,8), %xmm3 /* DP result */ + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(sin_poly): + /* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) + */ + + movaps %xmm0, %xmm4 /* t */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_S4)(%rip), %xmm2 /* S4 */ + mulsd %xmm0, %xmm2 /* z*S4 */ + movsd L(DP_S3)(%rip), %xmm3 /* S3 */ + mulsd %xmm0, %xmm3 /* z*S3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_S2)(%rip), %xmm2 /* S2+z*S4 */ + mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ + addsd L(DP_S1)(%rip), %xmm3 /* S1+z*S3 */ + mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ + addsd L(DP_S0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ + mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ + mulsd (%rsi,%rax,8), %xmm4 + /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm2, %xmm3 + /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + mulsd %xmm4, %xmm3 + /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm4, %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23? 
*/
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movsd	L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	addl	$2, %eax		/* n */
+	mulsd	%xmm3, %xmm4		/* j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align 4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	lea	L(_FPI)(%rip), %rsi
+	movsd	L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-16(%rsi,%rax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-8(%rsi,%rax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	(%rsi,%rax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	8(%rsi,%rax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19? */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	L(DP_2POW52)(%rip), %xmm6
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+L(DP_2POW52)(%rip), %xmm4
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5? */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax		/* k-- */
+	addsd	8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	lea	L(DP_ZERONE)(%rip), %rsi
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	(%rsi,%rdx,8), %xmm3	/* t = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$3, %eax		/* n=k+3 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	L(DP_PIO4)(%rip), %xmm0	/* t *= PIO4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align 4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?
*/ + jl L(arg_less_2pn5) + + /* Here if 2^-5<=|x|<Pi/4 */ + mulsd %xmm0, %xmm0 /* y=x^2 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=x^4 */ + movsd L(DP_C4)(%rip), %xmm3 /* C4 */ + mulsd %xmm0, %xmm3 /* z*C4 */ + movsd L(DP_C3)(%rip), %xmm5 /* C3 */ + mulsd %xmm0, %xmm5 /* z*C3 */ + addsd L(DP_C2)(%rip), %xmm3 /* C2+z*C4 */ + mulsd %xmm0, %xmm3 /* z*(C2+z*C4) */ + addsd L(DP_C1)(%rip), %xmm5 /* C1+z*C3 */ + mulsd %xmm0, %xmm5 /* z*(C1+z*C3) */ + addsd L(DP_C0)(%rip), %xmm3 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm3 /* y*(C0+z*(C2+z*C4)) */ + /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd %xmm5, %xmm3 + /* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd L(DP_ONES)(%rip), %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(arg_less_2pn5): + /* Here if |x|<2^-5 */ + cmpl $0x32000000, %eax /* |x|<2^-27? */ + jl L(arg_less_2pn27) + + /* Here if 2^-27<=|x|<2^-5 */ + mulsd %xmm0, %xmm0 /* DP x^2 */ + movsd L(DP_COS2_1)(%rip), %xmm3 /* DP DP_COS2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_1 */ + addsd L(DP_COS2_0)(%rip), %xmm3 /* DP DP_COS2_0+x^2*DP_COS2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */ + /* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */ + addsd L(DP_ONES)(%rip), %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(arg_less_2pn27): + /* Here if |x|<2^-27 */ + andps L(SP_ABS_MASK)(%rip),%xmm7 /* |x| */ + movss L(SP_ONE)(%rip), %xmm0 /* 1.0 */ + subss %xmm7, %xmm0 /* result is 1.0-|x| */ + ret + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(skip_errno_setting) /* in case of x is NaN */ + + /* Align stack to 16 bytes. */ + subq $8, %rsp + cfi_adjust_cfa_offset (8) + /* Here if x is Inf. Set errno to EDOM. */ + call JUMPTARGET(__errno_location) + addq $8, %rsp + cfi_adjust_cfa_offset (-8) + + movl $EDOM, (%rax) + + .p2align 4 +L(skip_errno_setting): + /* Here if |x| is Inf or NAN. Continued. */ + movaps %xmm7, %xmm0 /* load x */ + subss %xmm0, %xmm0 /* Result is NaN */ + ret +END(__cosf) + + .section .rodata, "a" + .p2align 3 +L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ + .long 0x00000000,0x00000000 + .long 0x54442d18,0x3fe921fb + .long 0x54442d18,0x3ff921fb + .long 0x7f3321d2,0x4002d97c + .long 0x54442d18,0x400921fb + .long 0x2955385e,0x400f6a7a + .long 0x7f3321d2,0x4012d97c + .long 0xe9bba775,0x4015fdbb + .long 0x54442d18,0x401921fb + .long 0xbeccb2bb,0x401c463a + .long 0x2955385e,0x401f6a7a + .type L(PIO4J), @object + ASM_SIZE_DIRECTIVE(L(PIO4J)) + + .p2align 3 +L(_FPI): /* 4/Pi broken into sum of positive DP values */ + .long 0x00000000,0x00000000 + .long 0x6c000000,0x3ff45f30 + .long 0x2a000000,0x3e3c9c88 + .long 0xa8000000,0x3c54fe13 + .long 0xd0000000,0x3aaf47d4 + .long 0x6c000000,0x38fbb81b + .long 0xe0000000,0x3714acc9 + .long 0x7c000000,0x3560e410 + .long 0x56000000,0x33bca2c7 + .long 0xac000000,0x31fbd778 + .long 0xe0000000,0x300b7246 + .long 0xe8000000,0x2e5d2126 + .long 0x48000000,0x2c970032 + .long 0xe8000000,0x2ad77504 + .long 0xe0000000,0x290921cf + .long 0xb0000000,0x274deb1c + .long 0xe0000000,0x25829a73 + .long 0xbe000000,0x23fd1046 + .long 0x10000000,0x2224baed + .long 0x8e000000,0x20709d33 + .long 0x80000000,0x1e535a2f + .long 0x64000000,0x1cef904e + .long 0x30000000,0x1b0d6398 + .long 0x24000000,0x1964ce7d + .long 0x16000000,0x17b908bf + .type L(_FPI), @object + ASM_SIZE_DIRECTIVE(L(_FPI)) + +/* Coefficients of polynomial + for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5. 
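In C, that |x|<2^-5 path is a single Horner step over the two coefficients defined just below; the computation runs in double and is rounded back to float at the end, as in the assembly. A sketch with the coefficients passed in for DP_COS2_0/DP_COS2_1:

    /* Sketch of the |x|<2^-5 path: cos(x) ~= 1 + x^2*c0 + x^4*c1.  */
    static inline float
    cos_small (float x, double c0, double c1)
    {
      double x2 = (double) x * x;
      return (float) (1.0 + x2 * (c0 + x2 * c1));
    }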
*/
+	.p2align 3
+L(DP_COS2_0):
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_COS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
+
+	.p2align 3
+L(DP_COS2_1):
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_COS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+   for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+	.p2align 3
+L(DP_S3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.type L(DP_S3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+	.p2align 3
+L(DP_S1):
+	.long	0x10c2688b,0x3f811111
+	.type L(DP_S1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+	.p2align 3
+L(DP_S4):
+	.long	0x1674b58a,0xbe5a947e
+	.type L(DP_S4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+	.p2align 3
+L(DP_S2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.type L(DP_S2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+	.p2align 3
+L(DP_S0):
+	.long	0x55551cd9,0xbfc55555
+	.type L(DP_S0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+/* Coefficients of polynomial
+   for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+	.p2align 3
+L(DP_C3):
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_C3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+	.p2align 3
+L(DP_C1):
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_C1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+	.p2align 3
+L(DP_C4):
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_C4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+	.p2align 3
+L(DP_C2):
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_C2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+	.p2align 3
+L(DP_C0):
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_C0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983	/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000	/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias(__cosf, cosf)
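
The octant logic of __cosf above, restated as a rough C model: reduce with k=trunc(|x|/(Pi/4)), j=(k+1)&~1, n=k+3, t=|x|-j*Pi/4, then reconstruct cos(x) = (-1)^((n>>2)&1) * (n&2 ? cos(t) : sin(t)). This is only a sketch: cosf_model is an illustrative name, libm sin()/cos() stand in for the S0..S4/C0..C4 polynomial kernels, and the plain j*Pi/4 subtraction stands in for the split PIO4HI/PIO4LO and _FPI reductions, so it is faithful only for moderate, finite arguments.

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI_4
    # define M_PI_4 0.78539816339744830962
    #endif

    static float
    cosf_model (float x)
    {
      double ax = fabs ((double) x);
      unsigned int k = (unsigned int) (ax / M_PI_4); /* trunc(|x|/(Pi/4)) */
      unsigned int j = (k + 1) & ~1u;                /* even multiple of Pi/4 */
      unsigned int n = k + 3;                        /* octant selector */
      double t = ax - j * M_PI_4;                    /* reduced arg, |t| <= ~Pi/4 */
      double r = (n & 2) ? cos (t) : sin (t);        /* polynomial stand-ins */
      return (float) (((n >> 2) & 1) ? -r : r);      /* sign from octant */
    }

    int
    main (void)
    {
      for (float x = 0.5f; x < 20.0f; x += 2.7f)
        printf ("%g: model=%g libm=%g\n", x, cosf_model (x), cosf (x));
      return 0;
    }

diff --git 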
a/REORG.TODO/sysdeps/x86_64/fpu/s_expm1l.S b/REORG.TODO/sysdeps/x86_64/fpu/s_expm1l.S new file mode 100644 index 0000000000..7fbd99b0db --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_expm1l.S @@ -0,0 +1,2 @@ +#define USE_AS_EXPM1L +#include <e_expl.S> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fabs.c b/REORG.TODO/sysdeps/x86_64/fpu/s_fabs.c new file mode 100644 index 0000000000..f5d3ee87e9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fabs.c @@ -0,0 +1,26 @@ +/* Absolute value of floating point number. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> + +double +__fabs (double x) +{ + return __builtin_fabs (x); +} +weak_alias (__fabs, fabs) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fabsf.c b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsf.c new file mode 100644 index 0000000000..9956cce757 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsf.c @@ -0,0 +1,26 @@ +/* Absolute value of floating point number. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> + +float +__fabsf (float x) +{ + return __builtin_fabsf (x); +} +weak_alias (__fabsf, fabsf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fabsl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsl.S new file mode 100644 index 0000000000..1aef8318d9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsl.S @@ -0,0 +1,27 @@ +/* Absolute value of floating point number. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabsl)
+	fldt	8(%rsp)
+	fabs
+	ret
+END(__fabsl)
+weak_alias (__fabsl, fabsl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_finitel.S b/REORG.TODO/sysdeps/x86_64/fpu/s_finitel.S
new file mode 100644
index 0000000000..9e49796901
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_finitel.S
@@ -0,0 +1,16 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitel)
+	movl	16(%rsp),%eax	/* sign and exponent word of the long double */
+	orl	$0xffff8000, %eax /* set every bit above the 15 exponent bits */
+	incl	%eax		/* wraps to 0 only if the exponent is 0x7fff */
+	shrl	$31, %eax	/* 1 for finite, 0 for Inf/NaN */
+	ret
+END (__finitel)
+weak_alias (__finitel, finitel)
+hidden_def (__finitel)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_floorl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_floorl.S
new file mode 100644
index 0000000000..535fdd8571
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_floorl.S
@@ -0,0 +1,35 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Changes for x86-64 by Andreas Jaeger <aj@suse.de>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__floorl)
+	fldt	8(%rsp)
+
+	fnstenv	-28(%rsp)		/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	-28(%rsp),%edx
+	andl	$0xf7ff,%edx
+	movl	%edx,-32(%rsp)
+	fldcw	-32(%rsp)		/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.  */
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, -24(%rsp)
+
+	fldenv	-28(%rsp)		/* restore original environment */
+
+	ret
+END (__floorl)
+weak_alias (__floorl, floorl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmax.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmax.S
new file mode 100644
index 0000000000..f93c9f9371
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmax.S
@@ -0,0 +1,52 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmax)
+	ucomisd	%xmm0, %xmm1
+	jp	1f		// jump if unordered
+	maxsd	%xmm1, %xmm0
+	jmp	2f
+
+1:	ucomisd	%xmm1, %xmm1	// Is xmm1 a NaN?
+	jp	3f
+	// xmm0 is a NaN; xmm1 is not.  Test if xmm0 is signaling.
+	movsd	%xmm0, -8(%rsp)
+	testb	$0x8, -2(%rsp)
+	jz	4f
+	movsd	%xmm1, %xmm0	// otherwise return xmm1
+	ret
+
+3:	// xmm1 is a NaN; xmm0 may or may not be.
+	ucomisd	%xmm0, %xmm0
+	jp	4f
+	// xmm1 is a NaN; xmm0 is not.  Test if xmm1 is signaling.
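+	// (The quiet-NaN flag of a double is mantissa bit 51; the byte at
+	// -2(%rsp) holds bits 48-55 of the value stored just below, so the
+	// 0x8 mask selects exactly that bit.)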
+ movsd %xmm1, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addsd %xmm1, %xmm0 + +2: ret +END(__fmax) +weak_alias (__fmax, fmax) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxf.S new file mode 100644 index 0000000000..82989feb4b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxf.S @@ -0,0 +1,52 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmaxf) + ucomiss %xmm0, %xmm1 + jp 1f // jump if unordered + maxss %xmm1, %xmm0 + jmp 2f + +1: ucomiss %xmm1, %xmm1 // Is xmm1 a NaN? + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movss %xmm0, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + movss %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomiss %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movss %xmm1, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addss %xmm1, %xmm0 + +2: ret +END(__fmaxf) +weak_alias (__fmaxf, fmaxf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxl.S new file mode 100644 index 0000000000..2d3321fce4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxl.S @@ -0,0 +1,58 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmaxl) + fldt 8(%rsp) // x + fldt 24(%rsp) // x : y + + fucomi %st(1), %st + jp 2f + fcmovb %st(1), %st + + fstp %st(1) + + ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 15(%rsp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. 
+ fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 31(%rsp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret +END(__fmaxl) +weak_alias (__fmaxl, fmaxl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmin.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmin.S new file mode 100644 index 0000000000..718bf489df --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmin.S @@ -0,0 +1,52 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmin) + ucomisd %xmm0, %xmm1 + jp 1f // jump if unordered + minsd %xmm1, %xmm0 + jmp 2f + +1: ucomisd %xmm1, %xmm1 // Is xmm1 a NaN? + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movsd %xmm0, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + movsd %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomisd %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movsd %xmm1, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addsd %xmm1, %xmm0 + +2: ret +END(__fmin) +weak_alias (__fmin, fmin) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fminf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fminf.S new file mode 100644 index 0000000000..8e8c9360ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fminf.S @@ -0,0 +1,52 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminf) + ucomiss %xmm0, %xmm1 + jp 1f // jump if unordered + minss %xmm1, %xmm0 + jmp 2f + +1: ucomiss %xmm1, %xmm1 // Is xmm1 a NaN? + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. 
+ movss %xmm0, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + movss %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomiss %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movss %xmm1, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addss %xmm1, %xmm0 + +2: ret +END(__fminf) +weak_alias (__fminf, fminf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fminl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fminl.S new file mode 100644 index 0000000000..33eed7b30b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fminl.S @@ -0,0 +1,58 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminl) + fldt 8(%rsp) // x + fldt 24(%rsp) // x : y + + fucomi %st(1), %st + jp 2f + fcmovnb %st(1), %st + + fstp %st(1) + + ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 15(%rsp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. + fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 31(%rsp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret +END(__fminl) +weak_alias (__fminl, fminl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fpclassifyl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_fpclassifyl.c new file mode 100644 index 0000000000..856854b0f5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fpclassifyl.c @@ -0,0 +1,2 @@ +#include <sysdeps/i386/fpu/s_fpclassifyl.c> + diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_isinfl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_isinfl.c new file mode 100644 index 0000000000..ca818b5e90 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_isinfl.c @@ -0,0 +1 @@ +#include <sysdeps/i386/fpu/s_isinfl.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_isnanl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_isnanl.c new file mode 100644 index 0000000000..06e69c3aeb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_isnanl.c @@ -0,0 +1 @@ +#include <sysdeps/i386/fpu/s_isnanl.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_llrint.S b/REORG.TODO/sysdeps/x86_64/fpu/s_llrint.S new file mode 100644 index 0000000000..af7bbce585 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_llrint.S @@ -0,0 +1,32 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.d>, 2002. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__llrint) + cvtsd2si %xmm0,%rax + ret +END(__llrint) +weak_alias (__llrint, llrint) +#ifndef __ILP32__ +strong_alias (__llrint, __lrint) +weak_alias (__llrint, lrint) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_llrintf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintf.S new file mode 100644 index 0000000000..9edb78bf1d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintf.S @@ -0,0 +1,32 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.d>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__llrintf) + cvtss2si %xmm0,%rax + ret +END(__llrintf) +weak_alias (__llrintf, llrintf) +#ifndef __ILP32__ +strong_alias (__llrintf, __lrintf) +weak_alias (__llrintf, lrintf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_llrintl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintl.S new file mode 100644 index 0000000000..e5bbf0106e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintl.S @@ -0,0 +1,34 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + .text +ENTRY(__llrintl) + fldt 8(%rsp) + fistpll -8(%rsp) + fwait + movq -8(%rsp),%rax + ret +END(__llrintl) +weak_alias (__llrintl, llrintl) +#ifndef __ILP32__ +strong_alias (__llrintl, __lrintl) +weak_alias (__llrintl, lrintl) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_log1pl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_log1pl.S new file mode 100644 index 0000000000..947e5e4552 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_log1pl.S @@ -0,0 +1,74 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $") + + .section .rodata + + .align ALIGNARG(4) + /* The fyl2xp1 can only be used for values in + -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2 + 0.29 is a safe value. + */ +limit: .tfloat 0.29 + /* Please note: we use a double value here. Since 1.0 has + an exact representation this does not effect the accuracy + but it helps to optimize the code. */ +one: .double 1.0 + +/* + * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29, + * otherwise fyl2x with the needed extra computation. + */ +#ifdef PIC +#define MO(op) op##(%rip) +#else +#define MO(op) op +#endif + + .text +ENTRY(__log1pl) + fldln2 + + fldt 8(%rsp) + + fxam + fnstsw + fld %st + testb $1, %ah + jnz 3f // in case x is NaN or ±Inf +4: + fabs + fldt MO(limit) + fcompp + fnstsw + andb $1,%ah + jz 2f + + movzwl 8+8(%rsp), %eax + xorb $0x80, %ah + cmpl $0xc040, %eax + jae 5f + + faddl MO(one) +5: fyl2x + ret + +2: fyl2xp1 + ret + +3: testb $4, %ah + jnz 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret + +END (__log1pl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_logbl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_logbl.c new file mode 100644 index 0000000000..4791ba64e8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_logbl.c @@ -0,0 +1 @@ +#include <sysdeps/i386/fpu/s_logbl.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_lrint.S b/REORG.TODO/sysdeps/x86_64/fpu/s_lrint.S new file mode 100644 index 0000000000..dfc31359a0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_lrint.S @@ -0,0 +1 @@ +/* Not needed, see s_llrint.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_lrintf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintf.S new file mode 100644 index 0000000000..fcdc4dca9a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintf.S @@ -0,0 +1 @@ +/* Not needed, see s_llrintf.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_lrintl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintl.S new file mode 100644 index 0000000000..ef9c45d00d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintl.S @@ -0,0 +1 @@ +/* Not needed, see s_llrintl.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nearbyintl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_nearbyintl.S new file mode 100644 index 0000000000..31b21a5037 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nearbyintl.S @@ -0,0 +1,19 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ +/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. 
*/
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintl)
+	fldt	8(%rsp)
+	fnstenv	-28(%rsp)	/* save FPU environment */
+	frndint			/* round per current rounding mode */
+	fnstsw			/* status word -> %ax */
+	andl	$0x1, %eax	/* keep only the "invalid" flag (sNaN input) */
+	orl	%eax, -24(%rsp)	/* merge it into the saved status word */
+	fldenv	-28(%rsp)	/* reload env: drops the spurious "inexact" */
+	ret
+END (__nearbyintl)
+weak_alias (__nearbyintl, nearbyintl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nextafterl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_nextafterl.c
new file mode 100644
index 0000000000..f59f16848f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nextafterl.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_nextafterl.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nexttoward.c b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttoward.c
new file mode 100644
index 0000000000..aee2bb5895
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttoward.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_nexttoward.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nexttowardf.c b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttowardf.c
new file mode 100644
index 0000000000..55e95f6916
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttowardf.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_nexttowardf.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_rintl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_rintl.c
new file mode 100644
index 0000000000..1cad42e921
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_rintl.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_rintl.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_scalbnl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_scalbnl.S
new file mode 100644
index 0000000000..6c7683c32b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_scalbnl.S
@@ -0,0 +1,17 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Changes for x86-64 by Andreas Jaeger <aj@suse.de>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__scalbnl)
+	movl	%edi,-4(%rsp)	/* spill the integer argument n */
+	fildl	-4(%rsp)	/* st(0) = (long double) n */
+	fldt	8(%rsp)		/* st(0) = x, st(1) = n */
+	fscale			/* st(0) = x * 2^n */
+	fstp	%st(1)		/* pop n, leaving the result */
+	ret
+END (__scalbnl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_signbit.S b/REORG.TODO/sysdeps/x86_64/fpu/s_signbit.S
new file mode 100644
index 0000000000..a24757cd48
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_signbit.S
@@ -0,0 +1,26 @@
+/* Return nonzero value if number is negative.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2009.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY(__signbit)
+	pmovmskb %xmm0, %eax	/* %eax = MSB of each byte of %xmm0 */
+	andl	$0x80, %eax	/* bit 7 = sign bit (bit 63) of the double */
+	ret
+END(__signbit)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_signbitf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_signbitf.S
new file mode 100644
index 0000000000..7739424bf6
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_signbitf.S
@@ -0,0 +1,26 @@
+/* Return nonzero value if number is negative.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2009.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY(__signbitf)
+	pmovmskb %xmm0, %eax	/* %eax = MSB of each byte of %xmm0 */
+	andl	$0x8, %eax	/* bit 3 = sign bit (bit 31) of the float */
+	ret
+END(__signbitf)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_significandl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_significandl.c
new file mode 100644
index 0000000000..a4ad986164
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_significandl.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_significandl.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_sincosf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_sincosf.S
new file mode 100644
index 0000000000..e6ed81ed91
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_sincosf.S
@@ -0,0 +1,564 @@
+/* Optimized sincosf function.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x|==0:    sin(x)=x,
+ *                   cos(x)=1.
+ *  2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed,
+ *                   cos(x)=1-|x|.
+ *  3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1,
+ *                   cos(x)=1+1*x^2*DP_COS2_0+x^4*DP_COS2_1
+ *  4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+ *                   cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          sign_sin = sign(x) * (-1.0)^((n>>2)&1)
+ *          sign_cos = (-1.0)^(((n+2)>>2)&1)
+ *          poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t
+ *          poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*1.0+1.0
+ *          if(n&2 != 0) {
+ *              using cos(t) and sin(t) polynomials for |t|<Pi/4, results are
+ *              cos(x) = poly_sin * sign_cos
+ *              sin(x) = poly_cos * sign_sin
+ *          } else {
+ *              sin(x) = poly_sin * sign_sin
+ *              cos(x) = poly_cos * sign_cos
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x. + * + * Special cases: + * sin/cos(+-0) = +-0/1 not raising inexact/underflow, + * sin/cos(subnormal) raises inexact/underflow, + * sin/cos(min_normalized) raises inexact/underflow, + * sin/cos(normalized) raises inexact, + * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM, + * sin/cos(NaN) = NaN. + */ + +# define ARG_SIN_PTR %rdi +# define ARG_COS_PTR %rsi + + .text +ENTRY(__sincosf) + /* Input: %xmm0 contains single precision argument x */ + /* %rdi points to sin result */ + /* %rsi points to cos result */ + + movd %xmm0, %eax /* Bits of x */ + movaps %xmm0, %xmm7 /* Copy of x */ + cvtss2sd %xmm0, %xmm0 /* DP x */ + movss L(SP_ABS_MASK)(%rip), %xmm3 + movl %eax, %r8d /* Copy of x bits */ + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4 ? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + andps %xmm7, %xmm3 /* SP |x| */ + andpd L(DP_ABS_MASK)(%rip),%xmm0 /* DP |x| */ + movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4 ? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + movl %r8d, %ecx /* Load x */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + lea L(PIO4J)(%rip), %r9 + shrl $29, %ecx /* (sign of x) << 2 */ + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + subsd (%r9,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */ + + movaps %xmm0, %xmm4 /* t */ + movhpd L(DP_ONES)(%rip), %xmm4 /* 1|t */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + movl $2, %edx + unpcklpd %xmm0, %xmm0 /* y|y */ + addl %eax, %edx /* k+2 */ + movaps %xmm0, %xmm1 /* y|y */ + mulpd %xmm0, %xmm0 /* z=t^4|z=t^4 */ + + movaps L(DP_SC4)(%rip), %xmm2 /* S4 */ + mulpd %xmm0, %xmm2 /* z*S4 */ + movaps L(DP_SC3)(%rip), %xmm3 /* S3 */ + mulpd %xmm0, %xmm3 /* z*S3 */ + xorl %eax, %ecx /* (sign_x ^ (k>>2))<<2 */ + addpd L(DP_SC2)(%rip), %xmm2 /* S2+z*S4 */ + mulpd %xmm0, %xmm2 /* z*(S2+z*S4) */ + shrl $2, %edx /* (k+2)>>2 */ + addpd L(DP_SC1)(%rip), %xmm3 /* S1+z*S3 */ + mulpd %xmm0, %xmm3 /* z*(S1+z*S3) */ + shrl $2, %ecx /* sign_x ^ k>>2 */ + addpd L(DP_SC0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ + andl $1, %edx /* sign_cos = ((k+2)>>2)&1 */ + mulpd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + andl $1, %ecx /* sign_sin = sign_x ^ ((k>>2)&1) */ + addpd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + lea L(DP_ONES)(%rip), %r9 + mulpd %xmm4, %xmm3 /*t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/ + testl $2, %eax /* n&2 != 0 ? 
*/ + addpd %xmm4, %xmm3 /*t+t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/ + jnz L(sin_result_sin_poly) + +/*L(sin_result_cos_poly):*/ + /* + * Here if + * cos(x) = poly_sin * sign_cos + * sin(x) = poly_cos * sign_sin + */ + movsd (%r9,%rcx,8), %xmm4 /* 0|sign_sin */ + movhpd (%r9,%rdx,8), %xmm4 /* sign_cos|sign_sin */ + mulpd %xmm4, %xmm3 /* result_cos|result_sin */ + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movss %xmm0, (ARG_SIN_PTR) /* store sin(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */ + movss %xmm0, (ARG_COS_PTR) /* store cos(x) */ + ret + + .p2align 4 +L(sin_result_sin_poly): + /* + * Here if + * sin(x) = poly_sin * sign_sin + * cos(x) = poly_cos * sign_cos + */ + movsd (%r9,%rdx,8), %xmm4 /* 0|sign_cos */ + movhpd (%r9,%rcx,8), %xmm4 /* sign_sin|sign_cos */ + mulpd %xmm4, %xmm3 /* result_sin|result_cos */ + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movss %xmm0, (ARG_COS_PTR) /* store cos(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move sin(x) to xmm0[0] */ + movss %xmm0, (ARG_SIN_PTR) /* store sin(x) */ + ret + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN ? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23 ? */ + jae L(very_large_args) + + /* Here if 9*Pi/4<=|x|<2^23 */ + movsd L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */ + mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */ + cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */ + addl $1, %eax /* k+1 */ + movl %eax, %edx + andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */ + cvtsi2sdl %edx, %xmm4 /* DP j */ + movl %r8d, %ecx /* Load x */ + movsd L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */ + shrl $29, %ecx /* (sign of x) << 2 */ + mulsd %xmm4, %xmm2 /* -j*PIO4HI */ + movsd L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */ + addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */ + mulsd %xmm3, %xmm4 /* j*PIO4LO */ + addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */ + jmp L(reconstruction) + + .p2align 4 +L(very_large_args): + /* Here if finite |x|>=2^23 */ + + /* bitpos = (ix>>23) - BIAS_32 + 59; */ + shrl $23, %eax /* eb = biased exponent of x */ + subl $68, %eax /* bitpos=eb-0x7f+59, where 0x7f */ + /*is exponent bias */ + movl $28, %ecx /* %cl=28 */ + movl %eax, %edx /* bitpos copy */ + + /* j = bitpos/28; */ + div %cl /* j in register %al=%ax/%cl */ + movapd %xmm0, %xmm3 /* |x| */ + andl $0xff, %eax /* clear unneeded remainder from %ah*/ + + imull $28, %eax, %ecx /* j*28 */ + lea L(_FPI)(%rip), %r9 + movsd L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */ + movapd %xmm0, %xmm5 /* |x| */ + mulsd -16(%r9,%rax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */ + movapd %xmm0, %xmm1 /* |x| */ + mulsd -8(%r9,%rax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */ + mulsd (%r9,%rax,8), %xmm0 /* tmp0 = FPI[j]*|x| */ + addl $19, %ecx /* j*28+19 */ + mulsd 8(%r9,%rax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */ + cmpl %ecx, %edx /* bitpos>=j*28+19 ? */ + jl L(very_large_skip1) + + /* Here if bitpos>=j*28+19 */ + andpd %xmm3, %xmm4 /* HI(tmp3) */ + subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */ +L(very_large_skip1): + + movsd L(DP_2POW52)(%rip), %xmm6 + movapd %xmm5, %xmm2 /* tmp2 copy */ + addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */ + movl $1, %edx + addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */ + movsd 8+L(DP_2POW52)(%rip), %xmm4 + movd %xmm6, %eax /* k = I64_LO(tmp6); */ + addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */ + movl %r8d, %ecx /* Load x */ + comisd %xmm5, %xmm4 /* tmp4 > tmp5 ? 
*/
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax	/* k-- */
+	addsd	8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx	/* k&1 */
+	lea	L(DP_ZERONE)(%rip), %r9
+	subsd	%xmm4, %xmm3	/* tmp3 -= tmp4 */
+	addsd	(%r9,%rdx,8), %xmm3	/* t = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3	/* t += tmp2 */
+	shrl	$29, %ecx	/* (sign of x) << 2 */
+	addsd	%xmm3, %xmm0	/* t += tmp0 */
+	addl	$1, %eax	/* n=k+1 */
+	addsd	%xmm1, %xmm0	/* t += tmp1 */
+	mulsd	L(DP_PIO4)(%rip), %xmm0	/* t *= PIO4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align 4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5 ? */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	movaps	%xmm0, %xmm3	/* DP x */
+	movhpd	L(DP_ONES)(%rip), %xmm3	/* DP 1|x */
+	mulsd	%xmm0, %xmm0	/* DP y=x^2 */
+	unpcklpd %xmm0, %xmm0	/* DP y|y */
+	movaps	%xmm0, %xmm1	/* y|y */
+	mulpd	%xmm0, %xmm0	/* z=x^4|z=x^4 */
+
+	movapd	L(DP_SC4)(%rip), %xmm4	/* S4 */
+	mulpd	%xmm0, %xmm4	/* z*S4 */
+	movapd	L(DP_SC3)(%rip), %xmm5	/* S3 */
+	mulpd	%xmm0, %xmm5	/* z*S3 */
+	addpd	L(DP_SC2)(%rip), %xmm4	/* S2+z*S4 */
+	mulpd	%xmm0, %xmm4	/* z*(S2+z*S4) */
+	addpd	L(DP_SC1)(%rip), %xmm5	/* S1+z*S3 */
+	mulpd	%xmm0, %xmm5	/* z*(S1+z*S3) */
+	addpd	L(DP_SC0)(%rip), %xmm4	/* S0+z*(S2+z*S4) */
+	mulpd	%xmm1, %xmm4	/* y*(S0+z*(S2+z*S4)) */
+	mulpd	%xmm3, %xmm5	/* x*z*(S1+z*S3) */
+	mulpd	%xmm3, %xmm4	/* x*y*(S0+z*(S2+z*S4)) */
+	addpd	%xmm5, %xmm4	/*x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	addpd	%xmm4, %xmm3	/*x+x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/
+	cvtpd2ps %xmm3, %xmm0	/* SP results */
+	movss	%xmm0, (ARG_SIN_PTR)	/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (ARG_COS_PTR)	/* store cos(x) */
+	ret
+
+	.p2align 4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5 */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27 ? */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5 */
+	movaps	%xmm0, %xmm1	/* DP x */
+	movhpd	L(DP_ONES)(%rip), %xmm1	/* DP 1|x */
+	mulsd	%xmm0, %xmm0	/* DP x^2 */
+	unpcklpd %xmm0, %xmm0	/* DP x^2|x^2 */
+
+	movaps	L(DP_SINCOS2_1)(%rip), %xmm3	/* DP DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3	/* DP x^2*DP_SIN2_1 */
+	addpd	L(DP_SINCOS2_0)(%rip), %xmm3	/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3	/* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+	mulpd	%xmm1, %xmm3	/* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	addpd	%xmm1, %xmm3	/* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	cvtpd2ps %xmm3, %xmm0	/* SP results */
+	movss	%xmm0, (ARG_SIN_PTR)	/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (ARG_COS_PTR)	/* store cos(x) */
+	ret
+
+	.p2align 4
+L(arg_less_2pn27):
+	cmpl	$0, %eax	/* x=0 ?
*/ + je L(arg_zero) /* in case x=0 return sin(+-0)==+-0 */ + /* Here if |x|<2^-27 */ + /* + * Special cases here: + * sin(subnormal) raises inexact/underflow + * sin(min_normalized) raises inexact/underflow + * sin(normalized) raises inexact + * cos(here)=1-|x| (raising inexact) + */ + movaps %xmm0, %xmm3 /* DP x */ + mulsd L(DP_SMALL)(%rip), %xmm0/* DP x*DP_SMALL */ + subsd %xmm0, %xmm3 /* DP sin result is x-x*DP_SMALL */ + andps L(SP_ABS_MASK)(%rip), %xmm7/* SP |x| */ + cvtsd2ss %xmm3, %xmm0 /* sin(x) */ + movss L(SP_ONE)(%rip), %xmm1 /* SP 1.0 */ + movss %xmm0, (ARG_SIN_PTR) /* sin(x) store */ + subss %xmm7, %xmm1 /* cos(x) */ + movss %xmm1, (ARG_COS_PTR) /* cos(x) store */ + ret + + .p2align 4 +L(arg_zero): + movss L(SP_ONE)(%rip), %xmm0 /* 1.0 */ + movss %xmm7, (ARG_SIN_PTR) /* sin(+-0)==x */ + movss %xmm0, (ARG_COS_PTR) /* cos(+-0)==1 */ + ret + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(skip_errno_setting) /* in case of x is NaN */ + + /* Align stack to 16 bytes. */ + subq $8, %rsp + cfi_adjust_cfa_offset (8) + /* Here if x is Inf. Set errno to EDOM. */ + call JUMPTARGET(__errno_location) + addq $8, %rsp + cfi_adjust_cfa_offset (-8) + + movl $EDOM, (%rax) + + .p2align 4 +L(skip_errno_setting): + /* Here if |x| is Inf or NAN. Continued. */ + subss %xmm7, %xmm7 /* x-x, result is NaN */ + movss %xmm7, (ARG_SIN_PTR) + movss %xmm7, (ARG_COS_PTR) + ret +END(__sincosf) + + .section .rodata, "a" + .p2align 3 +L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ + .long 0x00000000,0x00000000 + .long 0x54442d18,0x3fe921fb + .long 0x54442d18,0x3ff921fb + .long 0x7f3321d2,0x4002d97c + .long 0x54442d18,0x400921fb + .long 0x2955385e,0x400f6a7a + .long 0x7f3321d2,0x4012d97c + .long 0xe9bba775,0x4015fdbb + .long 0x54442d18,0x401921fb + .long 0xbeccb2bb,0x401c463a + .long 0x2955385e,0x401f6a7a + .type L(PIO4J), @object + ASM_SIZE_DIRECTIVE(L(PIO4J)) + + .p2align 3 +L(_FPI): /* 4/Pi broken into sum of positive DP values */ + .long 0x00000000,0x00000000 + .long 0x6c000000,0x3ff45f30 + .long 0x2a000000,0x3e3c9c88 + .long 0xa8000000,0x3c54fe13 + .long 0xd0000000,0x3aaf47d4 + .long 0x6c000000,0x38fbb81b + .long 0xe0000000,0x3714acc9 + .long 0x7c000000,0x3560e410 + .long 0x56000000,0x33bca2c7 + .long 0xac000000,0x31fbd778 + .long 0xe0000000,0x300b7246 + .long 0xe8000000,0x2e5d2126 + .long 0x48000000,0x2c970032 + .long 0xe8000000,0x2ad77504 + .long 0xe0000000,0x290921cf + .long 0xb0000000,0x274deb1c + .long 0xe0000000,0x25829a73 + .long 0xbe000000,0x23fd1046 + .long 0x10000000,0x2224baed + .long 0x8e000000,0x20709d33 + .long 0x80000000,0x1e535a2f + .long 0x64000000,0x1cef904e + .long 0x30000000,0x1b0d6398 + .long 0x24000000,0x1964ce7d + .long 0x16000000,0x17b908bf + .type L(_FPI), @object + ASM_SIZE_DIRECTIVE(L(_FPI)) + +/* Coefficients of polynomials for */ +/* sin(x)~=x+x*x^2*(DP_SIN2_0+x^2*DP_SIN2_1) in low DP part, */ +/* cos(x)~=1+1*x^2*(DP_COS2_0+x^2*DP_COS2_1) in high DP part, */ +/* for |x|<2^-5. 
*/
+	.p2align 4
+L(DP_SINCOS2_0):
+	.long	0x5543d49d,0xbfc55555
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_SINCOS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_0))
+
+	.p2align 4
+L(DP_SINCOS2_1):
+	.long	0x75cec8c5,0x3f8110f4
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_SINCOS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_1))
+
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomials for */
+/* sin(t)~=t+t*t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))) in low DP part, */
+/* cos(t)~=1+1*t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))) in high DP part, */
+/* for |t|<Pi/4. */
+	.p2align 4
+L(DP_SC4):
+	.long	0x1674b58a,0xbe5a947e
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_SC4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC4))
+
+	.p2align 4
+L(DP_SC3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_SC3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC3))
+
+	.p2align 4
+L(DP_SC2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_SC2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC2))
+
+	.p2align 4
+L(DP_SC1):
+	.long	0x10c2688b,0x3f811111
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_SC1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC1))
+
+	.p2align 4
+L(DP_SC0):
+	.long	0x55551cd9,0xbfc55555
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_SC0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC0))
+
+	.p2align 3
+L(DP_SMALL):
+	.long	0x00000000,0x3cd00000	/* 2^(-50) */
+	.type L(DP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983	/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000	/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias(__sincosf, sincosf)
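
The sincosf kernel above adds one trick on top of the sinf/cosf scheme: the sin and cos polynomials share the same Horner skeleton, so the code evaluates the pair (t, 1.0) against the packed tables DP_SC0..DP_SC4 and obtains both polynomials from a single SSE2 pass, then signs and swaps the two lanes per octant. A rough C model of that bookkeeping follows; sincosf_model is an illustrative name, libm sin()/cos() stand in for the packed polynomial pair, and the plain j*Pi/4 subtraction replaces the split PIO4HI/PIO4LO and _FPI reductions, so the sketch is faithful only for moderate, finite arguments.

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI_4
    # define M_PI_4 0.78539816339744830962
    #endif

    static void
    sincosf_model (float x, float *sp, float *cp)
    {
      double ax = fabs ((double) x);
      double sgn = x < 0.0f ? -1.0 : 1.0;
      unsigned int k = (unsigned int) (ax / M_PI_4); /* trunc(|x|/(Pi/4)) */
      unsigned int n = k + 1;
      double t = ax - ((k + 1) & ~1u) * M_PI_4;      /* |t| <= ~Pi/4 */
      double ps = sin (t), pc = cos (t);             /* the packed pair */
      double ssin = ((n >> 2) & 1) ? -sgn : sgn;     /* sign_sin */
      double scos = (((n + 2) >> 2) & 1) ? -1.0 : 1.0; /* sign_cos */
      if (n & 2)
        { /* odd octant pair: the two lanes swap roles */
          *sp = (float) (ssin * pc);
          *cp = (float) (scos * ps);
        }
      else
        {
          *sp = (float) (ssin * ps);
          *cp = (float) (scos * pc);
        }
    }

    int
    main (void)
    {
      float s, c;
      for (float x = -8.0f; x < 8.0f; x += 1.9f)
        {
          sincosf_model (x, &s, &c);
          printf ("%g: sin=%g (%g)  cos=%g (%g)\n", x, s, sinf (x), c, cosf (x));
        }
      return 0;
    }

diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_sinf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_sinf.S
new file mode 100644
index 0000000000..0aa5d43d8c
--- /dev/null
+++ 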
b/REORG.TODO/sysdeps/x86_64/fpu/s_sinf.S @@ -0,0 +1,559 @@ +/* Optimized sinf function. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define __need_Emath +#include <bits/errno.h> + +/* Short algorithm description: + * + * 1) if |x| == 0: return x. + * 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed. + * 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1. + * 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). + * 5) if |x| < 9*Pi/4: + * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, + * t=|x|-j*Pi/4. + * 5.2) Reconstruction: + * s = sign(x) * (-1.0)^((n>>2)&1) + * if(n&2 != 0) { + * using cos(t) polynomial for |t|<Pi/4, result is + * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). + * } else { + * using sin(t) polynomial for |t|<Pi/4, result is + * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). + * } + * 6) if |x| < 2^23, large args: + * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, + * t=|x|-j*Pi/4. + * 6.2) Reconstruction same as (5.2). + * 7) if |x| >= 2^23, very large args: + * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, + * t=|x|-j*Pi/4. + * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x. + * + * Special cases: + * sin(+-0) = +-0 not raising inexact/underflow, + * sin(subnormal) raises inexact/underflow, + * sin(min_normalized) raises inexact/underflow, + * sin(normalized) raises inexact, + * sin(Inf) = NaN, raises invalid, sets errno to EDOM, + * sin(NaN) = NaN. + */ + + .text +ENTRY(__sinf) + /* Input: single precision x in %xmm0 */ + + movd %xmm0, %eax /* Bits of x */ + movaps %xmm0, %xmm7 /* Copy of x */ + cvtss2sd %xmm0, %xmm0 /* DP x */ + movss L(SP_ABS_MASK)(%rip), %xmm3 + movl %eax, %edi /* Copy of x bits */ + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + andps %xmm7, %xmm3 /* SP |x| */ + andpd L(DP_ABS_MASK)(%rip),%xmm0 /* DP |x| */ + movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + movl %edi, %ecx /* Load x */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + lea L(PIO4J)(%rip), %rsi + shrl $31, %ecx /* sign of x */ + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + subsd (%rsi,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */ + testl $2, %eax /* n&2 != 0? 
*/ + jz L(sin_poly) + +/*L(cos_poly):*/ + /* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) + */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_C4)(%rip), %xmm4 /* C4 */ + mulsd %xmm0, %xmm4 /* z*C4 */ + xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ + movsd L(DP_C3)(%rip), %xmm3 /* C3 */ + mulsd %xmm0, %xmm3 /* z*C3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_C2)(%rip), %xmm4 /* C2+z*C4 */ + mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ + addsd L(DP_C1)(%rip), %xmm3 /* C1+z*C3 */ + mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ + addsd L(DP_C0)(%rip), %xmm4 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ + + /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd %xmm4, %xmm3 + /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd L(DP_ONES)(%rip), %xmm3 + + mulsd (%rsi,%rcx,8), %xmm3 /* DP result */ + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(sin_poly): + /* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) + */ + + movaps %xmm0, %xmm4 /* t */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_S4)(%rip), %xmm2 /* S4 */ + mulsd %xmm0, %xmm2 /* z*S4 */ + movsd L(DP_S3)(%rip), %xmm3 /* S3 */ + mulsd %xmm0, %xmm3 /* z*S3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_S2)(%rip), %xmm2 /* S2+z*S4 */ + mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ + addsd L(DP_S1)(%rip), %xmm3 /* S1+z*S3 */ + mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ + addsd L(DP_S0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ + mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ + mulsd (%rsi,%rcx,8), %xmm4 + /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm2, %xmm3 + /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + mulsd %xmm4, %xmm3 + /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm4, %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23? 
*/
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1	/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax	/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax	/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4	/* DP j */
+	movl	%edi, %ecx	/* Load x */
+	movsd	L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+	shrl	$31, %ecx	/* sign bit of x */
+	mulsd	%xmm4, %xmm2	/* -j*PIO4HI */
+	movsd	L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0	/* |x| - j*PIO4HI */
+	mulsd	%xmm3, %xmm4	/* j*PIO4LO */
+	addsd	%xmm4, %xmm0	/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align 4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax	/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax
+	movl	$28, %ecx	/* %cl=28 */
+	movl	%eax, %edx	/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl		/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3	/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx	/* j*28 */
+	lea	L(_FPI)(%rip), %rsi
+	movsd	L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */
+	movapd	%xmm0, %xmm5	/* |x| */
+	mulsd	-16(%rsi,%rax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1	/* |x| */
+	mulsd	-8(%rsi,%rax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	(%rsi,%rax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx	/* j*28+19 */
+	mulsd	8(%rsi,%rax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx	/* bitpos>=j*28+19? */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4	/* HI(tmp3) */
+	subsd	%xmm4, %xmm3	/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	L(DP_2POW52)(%rip), %xmm6
+	movapd	%xmm5, %xmm2	/* tmp2 copy */
+	addsd	%xmm3, %xmm5	/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6	/* tmp6 = tmp5 + 2^52 */
+	movsd	8+L(DP_2POW52)(%rip), %xmm4
+	movd	%xmm6, %eax	/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4	/* tmp4 = tmp6 - 2^52 */
+	movl	%edi, %ecx	/* Load x */
+	comisd	%xmm5, %xmm4	/* tmp4 > tmp5? */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax	/* k-- */
+	addsd	8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx	/* k&1 */
+	lea	L(DP_ZERONE)(%rip), %rsi
+	subsd	%xmm4, %xmm3	/* tmp3 -= tmp4 */
+	addsd	(%rsi,%rdx,8), %xmm3	/* t = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3	/* t += tmp2 */
+	shrl	$31, %ecx	/* sign of x */
+	addsd	%xmm3, %xmm0	/* t += tmp0 */
+	addl	$1, %eax	/* n=k+1 */
+	addsd	%xmm1, %xmm0	/* t += tmp1 */
+	mulsd	L(DP_PIO4)(%rip), %xmm0	/* t *= PIO4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align 4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ movaps %xmm0, %xmm3 /* x */
+ mulsd %xmm0, %xmm0 /* y=x^2 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=x^4 */
+ movsd L(DP_S4)(%rip), %xmm4 /* S4 */
+ mulsd %xmm0, %xmm4 /* z*S4 */
+ movsd L(DP_S3)(%rip), %xmm5 /* S3 */
+ mulsd %xmm0, %xmm5 /* z*S3 */
+ addsd L(DP_S2)(%rip), %xmm4 /* S2+z*S4 */
+ mulsd %xmm0, %xmm4 /* z*(S2+z*S4) */
+ addsd L(DP_S1)(%rip), %xmm5 /* S1+z*S3 */
+ mulsd %xmm0, %xmm5 /* z*(S1+z*S3) */
+ addsd L(DP_S0)(%rip), %xmm4 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */
+ mulsd %xmm3, %xmm5 /* x*z*(S1+z*S3) */
+ mulsd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */
+ /* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm5, %xmm4
+ /* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm4, %xmm3
+ cvtsd2ss %xmm3, %xmm0 /* SP result */
+ ret
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ movaps %xmm0, %xmm1 /* DP x */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ movsd L(DP_SIN2_1)(%rip), %xmm3 /* DP DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */
+ addsd L(DP_SIN2_0)(%rip), %xmm3 /* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+ mulsd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ addsd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ cvtsd2ss %xmm3, %xmm0 /* SP result */
+ ret
+
+ .p2align 4
+L(arg_less_2pn27):
+ cmpl $0, %eax /* x=0? */
+ je L(arg_zero) /* if x = 0, return sin(+-0) == +-0 */
+ /* Here if |x|<2^-27 */
+ /*
+ * Special cases here:
+ * sin(subnormal) raises inexact/underflow
+ * sin(min_normalized) raises inexact/underflow
+ * sin(normalized) raises inexact
+ */
+ movaps %xmm0, %xmm3 /* Copy of DP x */
+ mulsd L(DP_SMALL)(%rip), %xmm0 /* x*DP_SMALL */
+ subsd %xmm0, %xmm3 /* Result is x-x*DP_SMALL */
+ cvtsd2ss %xmm3, %xmm0 /* Result converted to SP */
+ ret
+
+ .p2align 4
+L(arg_zero):
+ movaps %xmm7, %xmm0 /* SP x */
+ ret
+
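+/* Editorial note: a C model of the |x| < 2^-27 path above (a sketch,
+   not the implementation).  sin(x) rounds to x at this magnitude; the
+   scaled subtraction exists only to raise the IEEE inexact (and, for
+   subnormal results, underflow) exceptions that returning x directly
+   would miss.  DP_SMALL is 2^-50:
+
+     static float sinf_tiny (double x) // 0 < |x| < 2^-27
+     {
+       return (float) (x - x * 0x1p-50); // rounds back to x
+     }
+*/
+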
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NaN */
+ jne L(skip_errno_setting) /* if x is NaN */
+
+ /* Align stack to 16 bytes. */
+ subq $8, %rsp
+ cfi_adjust_cfa_offset (8)
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ addq $8, %rsp
+ cfi_adjust_cfa_offset (-8)
+
+ movl $EDOM, (%rax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NaN. Continued. */
+ movaps %xmm7, %xmm0 /* load x */
+ subss %xmm0, %xmm0 /* Result is NaN */
+ ret
+END(__sinf)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5. */
+ .p2align 3
+L(DP_SIN2_0):
+ .long 0x5543d49d,0xbfc55555
+ .type L(DP_SIN2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_0))
+
+ .p2align 3
+L(DP_SIN2_1):
+ .long 0x75cec8c5,0x3f8110f4
+ .type L(DP_SIN2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ZERONE), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_S3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .type L(DP_S3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+ .p2align 3
+L(DP_S1):
+ .long 0x10c2688b,0x3f811111
+ .type L(DP_S1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+ .p2align 3
+L(DP_S4):
+ .long 0x1674b58a,0xbe5a947e
+ .type L(DP_S4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+ .p2align 3
+L(DP_S2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .type L(DP_S2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+ .p2align 3
+L(DP_S0):
+ .long 0x55551cd9,0xbfc55555
+ .type L(DP_S0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+ .p2align 3
+L(DP_SMALL):
+ .long 0x00000000,0x3cd00000 /* 2^(-50) */
+ .type L(DP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
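+/* Editorial note: L(_FPI) above stores 4/Pi split into chunks of at
+   most 28 significant bits each, so every product FPI[j]*|x| in the
+   |x| >= 2^23 path is exact in double precision.  A sketch of how
+   that path selects its window into 4/Pi (illustration only; ix is
+   the single-precision bit pattern of |x|):
+
+     int bitpos = (ix >> 23) - 127 + 59; // the subl $68 above
+     int j = bitpos / 28;                // chunk index into FPI[]
+
+   The code then forms FPI[j-2]*|x| ... FPI[j+1]*|x|, strips the
+   integral part, and scales the fractional remainder by Pi/4.  */
+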
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_C3):
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_C3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+ .p2align 3
+L(DP_C1):
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_C1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+ .p2align 3
+L(DP_C4):
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_C4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+ .p2align 3
+L(DP_C2):
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_C2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+ .p2align 3
+L(DP_C0):
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_C0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+ .long 0x54000000,0xbfe921fb /* High part of -Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+ .long 0x11A62633,0xbe010b46 /* Low part of -Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 32 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK),@object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+ .p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+ .long 0x7fffffff,0x7fffffff
+ .long 0x7fffffff,0x7fffffff
+ .type L(SP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+weak_alias(__sinf, sinf)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_truncl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_truncl.S
new file mode 100644
index 0000000000..b6ca0bae7b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_truncl.S
@@ -0,0 +1,36 @@
+/* Truncate long double value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. 
*/
+
+#include <machine/asm.h>
+
+ENTRY(__truncl)
+ fldt 8(%rsp) /* Load the long double argument.  */
+ fnstenv -28(%rsp) /* Save the FPU environment; the control word
+ is at -28(%rsp), the status word at -24(%rsp).  */
+ movl $0xc00, %edx /* RC=0b11: round toward zero (truncate).  */
+ orl -28(%rsp), %edx
+ movl %edx, -32(%rsp)
+ fldcw -32(%rsp) /* Install the truncating rounding mode.  */
+ frndint /* Round to an integral value.  */
+ fnstsw /* Fetch the status word...  */
+ andl $0x1, %eax /* ...keep only the invalid-operation flag
+ (raised for signaling NaN input)...  */
+ orl %eax, -24(%rsp) /* ...and merge it into the saved status, so
+ restoring the environment below keeps it while
+ discarding frndint's spurious inexact.  */
+ fldenv -28(%rsp)
+ ret
+END(__truncl)
+weak_alias (__truncl, truncl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos2_core.S
new file mode 100644
index 0000000000..db4fd3f62f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos2_core.S
@@ -0,0 +1,29 @@
+/* Function cos vectorized with SSE2.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_cos) +WRAPPER_IMPL_AVX _ZGVbN2v_cos +END (_ZGVcN4v_cos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos8_core.S new file mode 100644 index 0000000000..5432bc701e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos8_core.S @@ -0,0 +1,25 @@ +/* Function cos vectorized with AVX-512, wrapper to AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_cos) +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +END (_ZGVeN8v_cos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp2_core.S new file mode 100644 index 0000000000..92b328331d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp2_core.S @@ -0,0 +1,29 @@ +/* Function exp vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_exp) +WRAPPER_IMPL_SSE2 __exp_finite +END (_ZGVbN2v_exp) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_exp) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core.S new file mode 100644 index 0000000000..e062263d7a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core.S @@ -0,0 +1,29 @@ +/* Function exp vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_exp) +WRAPPER_IMPL_AVX _ZGVbN2v_exp +END (_ZGVdN4v_exp) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_exp) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S new file mode 100644 index 0000000000..21ae29d330 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S @@ -0,0 +1,25 @@ +/* Function exp vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_exp) +WRAPPER_IMPL_AVX _ZGVbN2v_exp +END (_ZGVcN4v_exp) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp8_core.S new file mode 100644 index 0000000000..28bfa98dde --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp8_core.S @@ -0,0 +1,25 @@ +/* Function exp vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_exp) +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +END (_ZGVeN8v_exp) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.S new file mode 100644 index 0000000000..521537e3f6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.S @@ -0,0 +1,1088 @@ +/* Data for vector function exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_exp_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function exp. + * The table may contain polynomial, reduction, lookup + * coefficients and other constants obtained through different + * methods of research and experimental work. */ + .globl __svml_dexp_data +__svml_dexp_data: + +/* Lookup table of 2^(j/2^K): */ +.if .-__svml_dexp_data != __dbT +.err +.endif + .quad 0x3ff0000000000000 + .quad 0x3ff002c605e2e8cf + .quad 0x3ff0058c86da1c0a + .quad 0x3ff0085382faef83 + .quad 0x3ff00b1afa5abcbf + .quad 0x3ff00de2ed0ee0f5 + .quad 0x3ff010ab5b2cbd11 + .quad 0x3ff0137444c9b5b5 + .quad 0x3ff0163da9fb3335 + .quad 0x3ff019078ad6a19f + .quad 0x3ff01bd1e77170b4 + .quad 0x3ff01e9cbfe113ef + .quad 0x3ff02168143b0281 + .quad 0x3ff02433e494b755 + .quad 0x3ff027003103b10e + .quad 0x3ff029ccf99d720a + .quad 0x3ff02c9a3e778061 + .quad 0x3ff02f67ffa765e6 + .quad 0x3ff032363d42b027 + .quad 0x3ff03504f75ef071 + .quad 0x3ff037d42e11bbcc + .quad 0x3ff03aa3e170aafe + .quad 0x3ff03d7411915a8a + .quad 0x3ff04044be896ab6 + .quad 0x3ff04315e86e7f85 + .quad 0x3ff045e78f5640b9 + .quad 0x3ff048b9b35659d8 + .quad 0x3ff04b8c54847a28 + .quad 0x3ff04e5f72f654b1 + .quad 0x3ff051330ec1a03f + .quad 0x3ff0540727fc1762 + .quad 0x3ff056dbbebb786b + .quad 0x3ff059b0d3158574 + .quad 0x3ff05c866520045b + .quad 0x3ff05f5c74f0bec2 + .quad 0x3ff06233029d8216 + .quad 0x3ff0650a0e3c1f89 + .quad 0x3ff067e197e26c14 + .quad 0x3ff06ab99fa6407c + .quad 0x3ff06d92259d794d + .quad 0x3ff0706b29ddf6de + .quad 0x3ff07344ac7d9d51 + .quad 0x3ff0761ead925493 + .quad 0x3ff078f92d32085d + .quad 0x3ff07bd42b72a836 + .quad 0x3ff07eafa86a2771 + .quad 0x3ff0818ba42e7d30 + .quad 0x3ff084681ed5a462 + .quad 0x3ff0874518759bc8 + .quad 0x3ff08a22912465f2 + .quad 0x3ff08d0088f8093f + .quad 0x3ff08fdf00068fe2 + .quad 0x3ff092bdf66607e0 + .quad 0x3ff0959d6c2c830d + .quad 0x3ff0987d61701716 + .quad 0x3ff09b5dd646dd77 + .quad 0x3ff09e3ecac6f383 + .quad 0x3ff0a1203f067a63 + .quad 0x3ff0a402331b9715 + .quad 0x3ff0a6e4a71c726e + .quad 0x3ff0a9c79b1f3919 + .quad 0x3ff0acab0f3a1b9c + .quad 0x3ff0af8f03834e52 + .quad 0x3ff0b27378110974 + .quad 0x3ff0b5586cf9890f + .quad 0x3ff0b83de2530d11 + .quad 0x3ff0bb23d833d93f + .quad 0x3ff0be0a4eb2353b + .quad 0x3ff0c0f145e46c85 + .quad 0x3ff0c3d8bde0ce7a + .quad 0x3ff0c6c0b6bdae53 + 
.quad 0x3ff0c9a93091632a + .quad 0x3ff0cc922b7247f7 + .quad 0x3ff0cf7ba776bb94 + .quad 0x3ff0d265a4b520ba + .quad 0x3ff0d5502343de02 + .quad 0x3ff0d83b23395dec + .quad 0x3ff0db26a4ac0ed5 + .quad 0x3ff0de12a7b26300 + .quad 0x3ff0e0ff2c62d096 + .quad 0x3ff0e3ec32d3d1a2 + .quad 0x3ff0e6d9bb1be415 + .quad 0x3ff0e9c7c55189c6 + .quad 0x3ff0ecb6518b4874 + .quad 0x3ff0efa55fdfa9c5 + .quad 0x3ff0f294f0653b45 + .quad 0x3ff0f58503328e6d + .quad 0x3ff0f875985e389b + .quad 0x3ff0fb66affed31b + .quad 0x3ff0fe584a2afb21 + .quad 0x3ff1014a66f951ce + .quad 0x3ff1043d06807c2f + .quad 0x3ff1073028d7233e + .quad 0x3ff10a23ce13f3e2 + .quad 0x3ff10d17f64d9ef1 + .quad 0x3ff1100ca19ad92f + .quad 0x3ff11301d0125b51 + .quad 0x3ff115f781cae1fa + .quad 0x3ff118edb6db2dc1 + .quad 0x3ff11be46f5a032c + .quad 0x3ff11edbab5e2ab6 + .quad 0x3ff121d36afe70c9 + .quad 0x3ff124cbae51a5c8 + .quad 0x3ff127c4756e9e05 + .quad 0x3ff12abdc06c31cc + .quad 0x3ff12db78f613d5b + .quad 0x3ff130b1e264a0e9 + .quad 0x3ff133acb98d40a2 + .quad 0x3ff136a814f204ab + .quad 0x3ff139a3f4a9d922 + .quad 0x3ff13ca058cbae1e + .quad 0x3ff13f9d416e77af + .quad 0x3ff1429aaea92de0 + .quad 0x3ff14598a092ccb7 + .quad 0x3ff1489717425438 + .quad 0x3ff14b9612cec861 + .quad 0x3ff14e95934f312e + .quad 0x3ff1519598da9a9a + .quad 0x3ff154962388149e + .quad 0x3ff15797336eb333 + .quad 0x3ff15a98c8a58e51 + .quad 0x3ff15d9ae343c1f2 + .quad 0x3ff1609d83606e12 + .quad 0x3ff163a0a912b6ac + .quad 0x3ff166a45471c3c2 + .quad 0x3ff169a88594c157 + .quad 0x3ff16cad3c92df73 + .quad 0x3ff16fb279835224 + .quad 0x3ff172b83c7d517b + .quad 0x3ff175be85981992 + .quad 0x3ff178c554eaea89 + .quad 0x3ff17bccaa8d0888 + .quad 0x3ff17ed48695bbc0 + .quad 0x3ff181dce91c506a + .quad 0x3ff184e5d23816c9 + .quad 0x3ff187ef4200632b + .quad 0x3ff18af9388c8dea + .quad 0x3ff18e03b5f3f36b + .quad 0x3ff1910eba4df41f + .quad 0x3ff1941a45b1f487 + .quad 0x3ff1972658375d2f + .quad 0x3ff19a32f1f59ab4 + .quad 0x3ff19d4013041dc2 + .quad 0x3ff1a04dbb7a5b13 + .quad 0x3ff1a35beb6fcb75 + .quad 0x3ff1a66aa2fbebc7 + .quad 0x3ff1a979e2363cf8 + .quad 0x3ff1ac89a936440d + .quad 0x3ff1af99f8138a1c + .quad 0x3ff1b2aacee59c53 + .quad 0x3ff1b5bc2dc40bf0 + .quad 0x3ff1b8ce14c66e4c + .quad 0x3ff1bbe084045cd4 + .quad 0x3ff1bef37b95750b + .quad 0x3ff1c206fb91588f + .quad 0x3ff1c51b040fad15 + .quad 0x3ff1c82f95281c6b + .quad 0x3ff1cb44aef2547a + .quad 0x3ff1ce5a51860746 + .quad 0x3ff1d1707cfaeaed + .quad 0x3ff1d4873168b9aa + .quad 0x3ff1d79e6ee731d7 + .quad 0x3ff1dab6358e15e8 + .quad 0x3ff1ddce85752c71 + .quad 0x3ff1e0e75eb44027 + .quad 0x3ff1e400c1631fdb + .quad 0x3ff1e71aad999e82 + .quad 0x3ff1ea35236f9330 + .quad 0x3ff1ed5022fcd91d + .quad 0x3ff1f06bac594fa0 + .quad 0x3ff1f387bf9cda38 + .quad 0x3ff1f6a45cdf6085 + .quad 0x3ff1f9c18438ce4d + .quad 0x3ff1fcdf35c1137a + .quad 0x3ff1fffd7190241e + .quad 0x3ff2031c37bdf872 + .quad 0x3ff2063b88628cd6 + .quad 0x3ff2095b6395e1d2 + .quad 0x3ff20c7bc96ffc18 + .quad 0x3ff20f9cba08e483 + .quad 0x3ff212be3578a819 + .quad 0x3ff215e03bd7580c + .quad 0x3ff21902cd3d09b9 + .quad 0x3ff21c25e9c1d6aa + .quad 0x3ff21f49917ddc96 + .quad 0x3ff2226dc4893d64 + .quad 0x3ff2259282fc1f27 + .quad 0x3ff228b7cceeac25 + .quad 0x3ff22bdda27912d1 + .quad 0x3ff22f0403b385d2 + .quad 0x3ff2322af0b63bff + .quad 0x3ff2355269997062 + .quad 0x3ff2387a6e756238 + .quad 0x3ff23ba2ff6254f4 + .quad 0x3ff23ecc1c78903a + .quad 0x3ff241f5c5d05fe6 + .quad 0x3ff2451ffb82140a + .quad 0x3ff2484abda600ef + .quad 0x3ff24b760c547f15 + .quad 0x3ff24ea1e7a5eb35 + .quad 0x3ff251ce4fb2a63f + .quad 0x3ff254fb44931561 + .quad 
0x3ff25828c65fa1ff + .quad 0x3ff25b56d530b9bc + .quad 0x3ff25e85711ece75 + .quad 0x3ff261b49a425645 + .quad 0x3ff264e450b3cb82 + .quad 0x3ff26814948bacc3 + .quad 0x3ff26b4565e27cdd + .quad 0x3ff26e76c4d0c2e5 + .quad 0x3ff271a8b16f0a30 + .quad 0x3ff274db2bd5e254 + .quad 0x3ff2780e341ddf29 + .quad 0x3ff27b41ca5f98cb + .quad 0x3ff27e75eeb3ab98 + .quad 0x3ff281aaa132b832 + .quad 0x3ff284dfe1f56381 + .quad 0x3ff28815b11456b1 + .quad 0x3ff28b4c0ea83f36 + .quad 0x3ff28e82fac9ceca + .quad 0x3ff291ba7591bb70 + .quad 0x3ff294f27f18bf72 + .quad 0x3ff2982b17779965 + .quad 0x3ff29b643ec70c27 + .quad 0x3ff29e9df51fdee1 + .quad 0x3ff2a1d83a9add08 + .quad 0x3ff2a5130f50d65c + .quad 0x3ff2a84e735a9eec + .quad 0x3ff2ab8a66d10f13 + .quad 0x3ff2aec6e9cd037b + .quad 0x3ff2b203fc675d1f + .quad 0x3ff2b5419eb90148 + .quad 0x3ff2b87fd0dad990 + .quad 0x3ff2bbbe92e5d3e3 + .quad 0x3ff2befde4f2e280 + .quad 0x3ff2c23dc71afbf7 + .quad 0x3ff2c57e39771b2f + .quad 0x3ff2c8bf3c203f5f + .quad 0x3ff2cc00cf2f6c18 + .quad 0x3ff2cf42f2bda93d + .quad 0x3ff2d285a6e4030b + .quad 0x3ff2d5c8ebbb8a15 + .quad 0x3ff2d90cc15d5346 + .quad 0x3ff2dc5127e277e3 + .quad 0x3ff2df961f641589 + .quad 0x3ff2e2dba7fb4e33 + .quad 0x3ff2e621c1c14833 + .quad 0x3ff2e9686ccf2e3b + .quad 0x3ff2ecafa93e2f56 + .quad 0x3ff2eff777277ef0 + .quad 0x3ff2f33fd6a454d2 + .quad 0x3ff2f688c7cded23 + .quad 0x3ff2f9d24abd886b + .quad 0x3ff2fd1c5f8c6b93 + .quad 0x3ff300670653dfe4 + .quad 0x3ff303b23f2d330b + .quad 0x3ff306fe0a31b715 + .quad 0x3ff30a4a677ac276 + .quad 0x3ff30d975721b004 + .quad 0x3ff310e4d93fdefb + .quad 0x3ff31432edeeb2fd + .quad 0x3ff3178195479413 + .quad 0x3ff31ad0cf63eeac + .quad 0x3ff31e209c5d33a0 + .quad 0x3ff32170fc4cd831 + .quad 0x3ff324c1ef4c560a + .quad 0x3ff3281375752b40 + .quad 0x3ff32b658ee0da54 + .quad 0x3ff32eb83ba8ea32 + .quad 0x3ff3320b7be6e633 + .quad 0x3ff3355f4fb45e20 + .quad 0x3ff338b3b72ae62d + .quad 0x3ff33c08b26416ff + .quad 0x3ff33f5e41798daa + .quad 0x3ff342b46484ebb4 + .quad 0x3ff3460b1b9fd712 + .quad 0x3ff3496266e3fa2d + .quad 0x3ff34cba466b03e1 + .quad 0x3ff35012ba4ea77d + .quad 0x3ff3536bc2a89cc4 + .quad 0x3ff356c55f929ff1 + .quad 0x3ff35a1f912671b1 + .quad 0x3ff35d7a577dd72b + .quad 0x3ff360d5b2b299fc + .quad 0x3ff36431a2de883b + .quad 0x3ff3678e281b7475 + .quad 0x3ff36aeb428335b4 + .quad 0x3ff36e48f22fa77c + .quad 0x3ff371a7373aa9cb + .quad 0x3ff3750611be211c + .quad 0x3ff3786581d3f669 + .quad 0x3ff37bc587961726 + .quad 0x3ff37f26231e754a + .quad 0x3ff3828754870746 + .quad 0x3ff385e91be9c811 + .quad 0x3ff3894b7960b71f + .quad 0x3ff38cae6d05d866 + .quad 0x3ff39011f6f3345f + .quad 0x3ff393761742d808 + .quad 0x3ff396dace0ed4e1 + .quad 0x3ff39a401b7140ef + .quad 0x3ff39da5ff8436bc + .quad 0x3ff3a10c7a61d55b + .quad 0x3ff3a4738c244064 + .quad 0x3ff3a7db34e59ff7 + .quad 0x3ff3ab4374c020bd + .quad 0x3ff3aeac4bcdf3ea + .quad 0x3ff3b215ba294f39 + .quad 0x3ff3b57fbfec6cf4 + .quad 0x3ff3b8ea5d318bef + .quad 0x3ff3bc559212ef89 + .quad 0x3ff3bfc15eaadfb1 + .quad 0x3ff3c32dc313a8e5 + .quad 0x3ff3c69abf679c2e + .quad 0x3ff3ca0853c10f28 + .quad 0x3ff3cd76803a5c00 + .quad 0x3ff3d0e544ede173 + .quad 0x3ff3d454a1f602d0 + .quad 0x3ff3d7c4976d27fa + .quad 0x3ff3db35256dbd67 + .quad 0x3ff3dea64c123422 + .quad 0x3ff3e2180b7501cc + .quad 0x3ff3e58a63b0a09b + .quad 0x3ff3e8fd54df8f5c + .quad 0x3ff3ec70df1c5175 + .quad 0x3ff3efe502816ee3 + .quad 0x3ff3f359bf29743f + .quad 0x3ff3f6cf152ef2b8 + .quad 0x3ff3fa4504ac801c + .quad 0x3ff3fdbb8dbcb6d2 + .quad 0x3ff40132b07a35df + .quad 0x3ff404aa6cffa0e5 + .quad 0x3ff40822c367a024 + .quad 
0x3ff40b9bb3cce07c + .quad 0x3ff40f153e4a136a + .quad 0x3ff4128f62f9ef0e + .quad 0x3ff4160a21f72e2a + .quad 0x3ff419857b5c901f + .quad 0x3ff41d016f44d8f5 + .quad 0x3ff4207dfdcad153 + .quad 0x3ff423fb2709468a + .quad 0x3ff42778eb1b0a8b + .quad 0x3ff42af74a1af3f1 + .quad 0x3ff42e764423ddfd + .quad 0x3ff431f5d950a897 + .quad 0x3ff4357609bc3850 + .quad 0x3ff438f6d5817663 + .quad 0x3ff43c783cbb50b4 + .quad 0x3ff43ffa3f84b9d4 + .quad 0x3ff4437cddf8a8fe + .quad 0x3ff4470018321a1a + .quad 0x3ff44a83ee4c0dbd + .quad 0x3ff44e086061892d + .quad 0x3ff4518d6e8d965b + .quad 0x3ff4551318eb43ec + .quad 0x3ff458995f95a532 + .quad 0x3ff45c2042a7d232 + .quad 0x3ff45fa7c23ce7a4 + .quad 0x3ff4632fde7006f4 + .quad 0x3ff466b8975c563e + .quad 0x3ff46a41ed1d0057 + .quad 0x3ff46dcbdfcd34c8 + .quad 0x3ff471566f8827d0 + .quad 0x3ff474e19c691265 + .quad 0x3ff4786d668b3237 + .quad 0x3ff47bf9ce09c9ab + .quad 0x3ff47f86d3001fe5 + .quad 0x3ff48314758980bf + .quad 0x3ff486a2b5c13cd0 + .quad 0x3ff48a3193c2a96c + .quad 0x3ff48dc10fa920a1 + .quad 0x3ff491512990013f + .quad 0x3ff494e1e192aed2 + .quad 0x3ff4987337cc91a5 + .quad 0x3ff49c052c5916c4 + .quad 0x3ff49f97bf53affd + .quad 0x3ff4a32af0d7d3de + .quad 0x3ff4a6bec100fdba + .quad 0x3ff4aa532feaada6 + .quad 0x3ff4ade83db0687a + .quad 0x3ff4b17dea6db7d7 + .quad 0x3ff4b514363e2a20 + .quad 0x3ff4b8ab213d5283 + .quad 0x3ff4bc42ab86c8f1 + .quad 0x3ff4bfdad5362a27 + .quad 0x3ff4c3739e6717aa + .quad 0x3ff4c70d073537ca + .quad 0x3ff4caa70fbc35a1 + .quad 0x3ff4ce41b817c114 + .quad 0x3ff4d1dd00638ed8 + .quad 0x3ff4d578e8bb586b + .quad 0x3ff4d915713adc1e + .quad 0x3ff4dcb299fddd0d + .quad 0x3ff4e05063202327 + .quad 0x3ff4e3eeccbd7b2a + .quad 0x3ff4e78dd6f1b6a6 + .quad 0x3ff4eb2d81d8abff + .quad 0x3ff4eecdcd8e3669 + .quad 0x3ff4f26eba2e35f0 + .quad 0x3ff4f61047d48f73 + .quad 0x3ff4f9b2769d2ca7 + .quad 0x3ff4fd5546a3fc17 + .quad 0x3ff500f8b804f127 + .quad 0x3ff5049ccadc0412 + .quad 0x3ff508417f4531ee + .quad 0x3ff50be6d55c7ca9 + .quad 0x3ff50f8ccd3deb0d + .quad 0x3ff51333670588bf + .quad 0x3ff516daa2cf6642 + .quad 0x3ff51a8280b798f4 + .quad 0x3ff51e2b00da3b14 + .quad 0x3ff521d423536bbe + .quad 0x3ff5257de83f4eef + .quad 0x3ff529284fba0d84 + .quad 0x3ff52cd359dfd53d + .quad 0x3ff5307f06ccd8ba + .quad 0x3ff5342b569d4f82 + .quad 0x3ff537d8496d75fc + .quad 0x3ff53b85df598d78 + .quad 0x3ff53f34187ddc28 + .quad 0x3ff542e2f4f6ad27 + .quad 0x3ff5469274e05078 + .quad 0x3ff54a4298571b06 + .quad 0x3ff54df35f7766a3 + .quad 0x3ff551a4ca5d920f + .quad 0x3ff55556d92600f1 + .quad 0x3ff559098bed1bdf + .quad 0x3ff55cbce2cf505b + .quad 0x3ff56070dde910d2 + .quad 0x3ff564257d56d4a2 + .quad 0x3ff567dac1351819 + .quad 0x3ff56b90a9a05c72 + .quad 0x3ff56f4736b527da + .quad 0x3ff572fe68900573 + .quad 0x3ff576b63f4d854c + .quad 0x3ff57a6ebb0a3c6d + .quad 0x3ff57e27dbe2c4cf + .quad 0x3ff581e1a1f3bd60 + .quad 0x3ff5859c0d59ca07 + .quad 0x3ff589571e31939f + .quad 0x3ff58d12d497c7fd + .quad 0x3ff590cf30a919ed + .quad 0x3ff5948c32824135 + .quad 0x3ff59849da3ffa96 + .quad 0x3ff59c0827ff07cc + .quad 0x3ff59fc71bdc2f8e + .quad 0x3ff5a386b5f43d92 + .quad 0x3ff5a746f664028b + .quad 0x3ff5ab07dd485429 + .quad 0x3ff5aec96abe0d1f + .quad 0x3ff5b28b9ee20d1e + .quad 0x3ff5b64e79d138d8 + .quad 0x3ff5ba11fba87a03 + .quad 0x3ff5bdd62484bf56 + .quad 0x3ff5c19af482fc8f + .quad 0x3ff5c5606bc02a6d + .quad 0x3ff5c9268a5946b7 + .quad 0x3ff5cced506b543a + .quad 0x3ff5d0b4be135acc + .quad 0x3ff5d47cd36e6747 + .quad 0x3ff5d84590998b93 + .quad 0x3ff5dc0ef5b1de9e + .quad 0x3ff5dfd902d47c65 + .quad 0x3ff5e3a3b81e85ec + .quad 
0x3ff5e76f15ad2148 + .quad 0x3ff5eb3b1b9d799a + .quad 0x3ff5ef07ca0cbf0f + .quad 0x3ff5f2d5211826e8 + .quad 0x3ff5f6a320dceb71 + .quad 0x3ff5fa71c9784c0b + .quad 0x3ff5fe411b078d26 + .quad 0x3ff6021115a7f849 + .quad 0x3ff605e1b976dc09 + .quad 0x3ff609b306918c13 + .quad 0x3ff60d84fd15612a + .quad 0x3ff611579d1fb925 + .quad 0x3ff6152ae6cdf6f4 + .quad 0x3ff618feda3d829f + .quad 0x3ff61cd3778bc944 + .quad 0x3ff620a8bed63d1f + .quad 0x3ff6247eb03a5585 + .quad 0x3ff628554bd58ee5 + .quad 0x3ff62c2c91c56acd + .quad 0x3ff6300482276fe8 + .quad 0x3ff633dd1d1929fd + .quad 0x3ff637b662b829f5 + .quad 0x3ff63b90532205d8 + .quad 0x3ff63f6aee7458cd + .quad 0x3ff6434634ccc320 + .quad 0x3ff647222648ea3d + .quad 0x3ff64afec30678b7 + .quad 0x3ff64edc0b231e41 + .quad 0x3ff652b9febc8fb7 + .quad 0x3ff656989df08719 + .quad 0x3ff65a77e8dcc390 + .quad 0x3ff65e57df9f096b + .quad 0x3ff6623882552225 + .quad 0x3ff66619d11cdc5f + .quad 0x3ff669fbcc140be7 + .quad 0x3ff66dde735889b8 + .quad 0x3ff671c1c70833f6 + .quad 0x3ff675a5c740edf5 + .quad 0x3ff6798a7420a036 + .quad 0x3ff67d6fcdc5386a + .quad 0x3ff68155d44ca973 + .quad 0x3ff6853c87d4eb62 + .quad 0x3ff68923e87bfb7a + .quad 0x3ff68d0bf65fdc34 + .quad 0x3ff690f4b19e9538 + .quad 0x3ff694de1a563367 + .quad 0x3ff698c830a4c8d4 + .quad 0x3ff69cb2f4a86cca + .quad 0x3ff6a09e667f3bcd + .quad 0x3ff6a48a86475795 + .quad 0x3ff6a877541ee718 + .quad 0x3ff6ac64d0241683 + .quad 0x3ff6b052fa75173e + .quad 0x3ff6b441d3301fee + .quad 0x3ff6b8315a736c75 + .quad 0x3ff6bc21905d3df0 + .quad 0x3ff6c012750bdabf + .quad 0x3ff6c404089d8e7d + .quad 0x3ff6c7f64b30aa09 + .quad 0x3ff6cbe93ce38381 + .quad 0x3ff6cfdcddd47645 + .quad 0x3ff6d3d12e21e2fb + .quad 0x3ff6d7c62dea2f8a + .quad 0x3ff6dbbbdd4bc720 + .quad 0x3ff6dfb23c651a2f + .quad 0x3ff6e3a94b549e71 + .quad 0x3ff6e7a10a38cee8 + .quad 0x3ff6eb9979302bdd + .quad 0x3ff6ef9298593ae5 + .quad 0x3ff6f38c67d286dd + .quad 0x3ff6f786e7ba9fef + .quad 0x3ff6fb8218301b90 + .quad 0x3ff6ff7df9519484 + .quad 0x3ff7037a8b3daadb + .quad 0x3ff70777ce1303f6 + .quad 0x3ff70b75c1f04a84 + .quad 0x3ff70f7466f42e87 + .quad 0x3ff71373bd3d6551 + .quad 0x3ff71773c4eaa988 + .quad 0x3ff71b747e1abb24 + .quad 0x3ff71f75e8ec5f74 + .quad 0x3ff72378057e611a + .quad 0x3ff7277ad3ef9011 + .quad 0x3ff72b7e545ec1a8 + .quad 0x3ff72f8286ead08a + .quad 0x3ff733876bb29cb8 + .quad 0x3ff7378d02d50b8f + .quad 0x3ff73b934c7107c7 + .quad 0x3ff73f9a48a58174 + .quad 0x3ff743a1f7916e05 + .quad 0x3ff747aa5953c849 + .quad 0x3ff74bb36e0b906d + .quad 0x3ff74fbd35d7cbfd + .quad 0x3ff753c7b0d785e8 + .quad 0x3ff757d2df29ce7c + .quad 0x3ff75bdec0edbb6b + .quad 0x3ff75feb564267c9 + .quad 0x3ff763f89f46f40f + .quad 0x3ff768069c1a861d + .quad 0x3ff76c154cdc4937 + .quad 0x3ff77024b1ab6e09 + .quad 0x3ff77434caa72aa7 + .quad 0x3ff7784597eeba8f + .quad 0x3ff77c5719a15ea6 + .quad 0x3ff780694fde5d3f + .quad 0x3ff7847c3ac50219 + .quad 0x3ff7888fda749e5d + .quad 0x3ff78ca42f0c88a5 + .quad 0x3ff790b938ac1cf6 + .quad 0x3ff794cef772bcc9 + .quad 0x3ff798e56b7fcf03 + .quad 0x3ff79cfc94f2bfff + .quad 0x3ff7a11473eb0187 + .quad 0x3ff7a52d08880ad9 + .quad 0x3ff7a94652e958aa + .quad 0x3ff7ad60532e6d20 + .quad 0x3ff7b17b0976cfdb + .quad 0x3ff7b59675e20def + .quad 0x3ff7b9b2988fb9ec + .quad 0x3ff7bdcf719f6bd7 + .quad 0x3ff7c1ed0130c132 + .quad 0x3ff7c60b47635cf9 + .quad 0x3ff7ca2a4456e7a3 + .quad 0x3ff7ce49f82b0f24 + .quad 0x3ff7d26a62ff86f0 + .quad 0x3ff7d68b84f407f8 + .quad 0x3ff7daad5e2850ac + .quad 0x3ff7decfeebc24fe + .quad 0x3ff7e2f336cf4e62 + .quad 0x3ff7e71736819bcd + .quad 0x3ff7eb3bedf2e1b9 + .quad 
0x3ff7ef615d42fa24 + .quad 0x3ff7f3878491c491 + .quad 0x3ff7f7ae63ff260a + .quad 0x3ff7fbd5fbab091f + .quad 0x3ff7fffe4bb55dec + .quad 0x3ff80427543e1a12 + .quad 0x3ff80851156538be + .quad 0x3ff80c7b8f4abaa9 + .quad 0x3ff810a6c20ea617 + .quad 0x3ff814d2add106d9 + .quad 0x3ff818ff52b1ee50 + .quad 0x3ff81d2cb0d1736a + .quad 0x3ff8215ac84fb2a6 + .quad 0x3ff82589994cce13 + .quad 0x3ff829b923e8ed53 + .quad 0x3ff82de968443d9a + .quad 0x3ff8321a667ef1b2 + .quad 0x3ff8364c1eb941f7 + .quad 0x3ff83a7e91136c5d + .quad 0x3ff83eb1bdadb46d + .quad 0x3ff842e5a4a8634a + .quad 0x3ff8471a4623c7ad + .quad 0x3ff84b4fa24035ea + .quad 0x3ff84f85b91e07f1 + .quad 0x3ff853bc8add9d4c + .quad 0x3ff857f4179f5b21 + .quad 0x3ff85c2c5f83ac35 + .quad 0x3ff8606562ab00ec + .quad 0x3ff8649f2135cf48 + .quad 0x3ff868d99b4492ed + .quad 0x3ff86d14d0f7cd1d + .quad 0x3ff87150c27004c2 + .quad 0x3ff8758d6fcdc666 + .quad 0x3ff879cad931a436 + .quad 0x3ff87e08febc3608 + .quad 0x3ff88247e08e1957 + .quad 0x3ff886877ec7f144 + .quad 0x3ff88ac7d98a6699 + .quad 0x3ff88f08f0f627cb + .quad 0x3ff8934ac52be8f7 + .quad 0x3ff8978d564c63e7 + .quad 0x3ff89bd0a478580f + .quad 0x3ff8a014afd08a94 + .quad 0x3ff8a4597875c644 + .quad 0x3ff8a89efe88dba1 + .quad 0x3ff8ace5422aa0db + .quad 0x3ff8b12c437bf1d4 + .quad 0x3ff8b574029db01e + .quad 0x3ff8b9bc7fb0c302 + .quad 0x3ff8be05bad61778 + .quad 0x3ff8c24fb42ea033 + .quad 0x3ff8c69a6bdb5598 + .quad 0x3ff8cae5e1fd35c4 + .quad 0x3ff8cf3216b5448c + .quad 0x3ff8d37f0a248b7f + .quad 0x3ff8d7ccbc6c19e6 + .quad 0x3ff8dc1b2dad04c4 + .quad 0x3ff8e06a5e0866d9 + .quad 0x3ff8e4ba4d9f60a1 + .quad 0x3ff8e90afc931857 + .quad 0x3ff8ed5c6b04b9f6 + .quad 0x3ff8f1ae99157736 + .quad 0x3ff8f60186e68793 + .quad 0x3ff8fa553499284b + .quad 0x3ff8fea9a24e9c5c + .quad 0x3ff902fed0282c8a + .quad 0x3ff90754be472760 + .quad 0x3ff90bab6ccce12c + .quad 0x3ff91002dbdab403 + .quad 0x3ff9145b0b91ffc6 + .quad 0x3ff918b3fc142a19 + .quad 0x3ff91d0dad829e70 + .quad 0x3ff921681ffece05 + .quad 0x3ff925c353aa2fe2 + .quad 0x3ff92a1f48a640dc + .quad 0x3ff92e7bff148396 + .quad 0x3ff932d977168083 + .quad 0x3ff93737b0cdc5e5 + .quad 0x3ff93b96ac5be7d1 + .quad 0x3ff93ff669e2802b + .quad 0x3ff94456e9832ead + .quad 0x3ff948b82b5f98e5 + .quad 0x3ff94d1a2f996a33 + .quad 0x3ff9517cf65253d1 + .quad 0x3ff955e07fac0ccd + .quad 0x3ff95a44cbc8520f + .quad 0x3ff95ea9dac8e658 + .quad 0x3ff9630faccf9243 + .quad 0x3ff9677641fe2446 + .quad 0x3ff96bdd9a7670b3 + .quad 0x3ff97045b65a51ba + .quad 0x3ff974ae95cba768 + .quad 0x3ff9791838ec57ab + .quad 0x3ff97d829fde4e50 + .quad 0x3ff981edcac37d05 + .quad 0x3ff98659b9bddb5b + .quad 0x3ff98ac66cef66c8 + .quad 0x3ff98f33e47a22a2 + .quad 0x3ff993a220801829 + .quad 0x3ff9981121235681 + .quad 0x3ff99c80e685f2b5 + .quad 0x3ff9a0f170ca07ba + .quad 0x3ff9a562c011b66d + .quad 0x3ff9a9d4d47f2598 + .quad 0x3ff9ae47ae3481ed + .quad 0x3ff9b2bb4d53fe0d + .quad 0x3ff9b72fb1ffd285 + .quad 0x3ff9bba4dc5a3dd3 + .quad 0x3ff9c01acc858463 + .quad 0x3ff9c49182a3f090 + .quad 0x3ff9c908fed7d2aa + .quad 0x3ff9cd81414380f2 + .quad 0x3ff9d1fa4a09579d + .quad 0x3ff9d674194bb8d5 + .quad 0x3ff9daeeaf2d0cb8 + .quad 0x3ff9df6a0bcfc15e + .quad 0x3ff9e3e62f564ad5 + .quad 0x3ff9e86319e32323 + .quad 0x3ff9ece0cb98ca4b + .quad 0x3ff9f15f4499c647 + .quad 0x3ff9f5de8508a311 + .quad 0x3ff9fa5e8d07f29e + .quad 0x3ff9fedf5cba4ce0 + .quad 0x3ffa0360f4424fcb + .quad 0x3ffa07e353c29f50 + .quad 0x3ffa0c667b5de565 + .quad 0x3ffa10ea6b36d1fe + .quad 0x3ffa156f23701b15 + .quad 0x3ffa19f4a42c7ca9 + .quad 0x3ffa1e7aed8eb8bb + .quad 0x3ffa2301ffb99757 + .quad 
0x3ffa2789dacfe68c + .quad 0x3ffa2c127ef47a74 + .quad 0x3ffa309bec4a2d33 + .quad 0x3ffa352622f3def6 + .quad 0x3ffa39b1231475f7 + .quad 0x3ffa3e3ceccede7c + .quad 0x3ffa42c980460ad8 + .quad 0x3ffa4756dd9cf36e + .quad 0x3ffa4be504f696b1 + .quad 0x3ffa5073f675f924 + .quad 0x3ffa5503b23e255d + .quad 0x3ffa599438722c03 + .quad 0x3ffa5e25893523d4 + .quad 0x3ffa62b7a4aa29a1 + .quad 0x3ffa674a8af46052 + .quad 0x3ffa6bde3c36f0e6 + .quad 0x3ffa7072b8950a73 + .quad 0x3ffa75080031e22b + .quad 0x3ffa799e1330b358 + .quad 0x3ffa7e34f1b4bf62 + .quad 0x3ffa82cc9be14dca + .quad 0x3ffa876511d9ac32 + .quad 0x3ffa8bfe53c12e59 + .quad 0x3ffa909861bb2e1d + .quad 0x3ffa95333beb0b7e + .quad 0x3ffa99cee2742c9d + .quad 0x3ffa9e6b5579fdbf + .quad 0x3ffaa308951ff14d + .quad 0x3ffaa7a6a1897fd2 + .quad 0x3ffaac457ada2803 + .quad 0x3ffab0e521356eba + .quad 0x3ffab58594bedefa + .quad 0x3ffaba26d59a09ee + .quad 0x3ffabec8e3ea86ee + .quad 0x3ffac36bbfd3f37a + .quad 0x3ffac80f6979f340 + .quad 0x3ffaccb3e100301e + .quad 0x3ffad159268a5a1c + .quad 0x3ffad5ff3a3c2774 + .quad 0x3ffadaa61c395493 + .quad 0x3ffadf4dcca5a413 + .quad 0x3ffae3f64ba4dec6 + .quad 0x3ffae89f995ad3ad + .quad 0x3ffaed49b5eb5803 + .quad 0x3ffaf1f4a17a4735 + .quad 0x3ffaf6a05c2b82e9 + .quad 0x3ffafb4ce622f2ff + .quad 0x3ffafffa3f84858c + .quad 0x3ffb04a868742ee4 + .quad 0x3ffb09576115e994 + .quad 0x3ffb0e07298db666 + .quad 0x3ffb12b7c1ff9c61 + .quad 0x3ffb17692a8fa8cd + .quad 0x3ffb1c1b6361ef31 + .quad 0x3ffb20ce6c9a8952 + .quad 0x3ffb2582465d973c + .quad 0x3ffb2a36f0cf3f3a + .quad 0x3ffb2eec6c13addd + .quad 0x3ffb33a2b84f15fb + .quad 0x3ffb3859d5a5b0b1 + .quad 0x3ffb3d11c43bbd62 + .quad 0x3ffb41ca843581ba + .quad 0x3ffb468415b749b1 + .quad 0x3ffb4b3e78e56786 + .quad 0x3ffb4ff9ade433c6 + .quad 0x3ffb54b5b4d80d4a + .quad 0x3ffb59728de5593a + .quad 0x3ffb5e303930830c + .quad 0x3ffb62eeb6ddfc87 + .quad 0x3ffb67ae07123dc3 + .quad 0x3ffb6c6e29f1c52a + .quad 0x3ffb712f1fa1177b + .quad 0x3ffb75f0e844bfc6 + .quad 0x3ffb7ab384014f76 + .quad 0x3ffb7f76f2fb5e47 + .quad 0x3ffb843b35578a51 + .quad 0x3ffb89004b3a7804 + .quad 0x3ffb8dc634c8d228 + .quad 0x3ffb928cf22749e4 + .quad 0x3ffb9754837a96b7 + .quad 0x3ffb9c1ce8e77680 + .quad 0x3ffba0e62292ad7d + .quad 0x3ffba5b030a1064a + .quad 0x3ffbaa7b133751e3 + .quad 0x3ffbaf46ca7a67a7 + .quad 0x3ffbb413568f255a + .quad 0x3ffbb8e0b79a6f1f + .quad 0x3ffbbdaeedc12f82 + .quad 0x3ffbc27df9285775 + .quad 0x3ffbc74dd9f4de4f + .quad 0x3ffbcc1e904bc1d2 + .quad 0x3ffbd0f01c520628 + .quad 0x3ffbd5c27e2cb5e5 + .quad 0x3ffbda95b600e20b + .quad 0x3ffbdf69c3f3a207 + .quad 0x3ffbe43ea82a13b5 + .quad 0x3ffbe91462c95b60 + .quad 0x3ffbedeaf3f6a3c2 + .quad 0x3ffbf2c25bd71e09 + .quad 0x3ffbf79a9a9001d2 + .quad 0x3ffbfc73b0468d30 + .quad 0x3ffc014d9d2004aa + .quad 0x3ffc06286141b33d + .quad 0x3ffc0b03fcd0ea5c + .quad 0x3ffc0fe06ff301f4 + .quad 0x3ffc14bdbacd586a + .quad 0x3ffc199bdd85529c + .quad 0x3ffc1e7ad8405be6 + .quad 0x3ffc235aab23e61e + .quad 0x3ffc283b56556999 + .quad 0x3ffc2d1cd9fa652c + .quad 0x3ffc31ff36385e29 + .quad 0x3ffc36e26b34e065 + .quad 0x3ffc3bc679157e38 + .quad 0x3ffc40ab5fffd07a + .quad 0x3ffc45912019768c + .quad 0x3ffc4a77b9881650 + .quad 0x3ffc4f5f2c715c31 + .quad 0x3ffc544778fafb22 + .quad 0x3ffc59309f4aac9f + .quad 0x3ffc5e1a9f8630ad + .quad 0x3ffc630579d34ddd + .quad 0x3ffc67f12e57d14b + .quad 0x3ffc6cddbd398ea4 + .quad 0x3ffc71cb269e601f + .quad 0x3ffc76b96aac2686 + .quad 0x3ffc7ba88988c933 + .quad 0x3ffc8098835a3611 + .quad 0x3ffc8589584661a1 + .quad 0x3ffc8a7b087346f4 + .quad 0x3ffc8f6d9406e7b5 + .quad 
0x3ffc9460fb274c22 + .quad 0x3ffc99553dfa8313 + .quad 0x3ffc9e4a5ca6a1f8 + .quad 0x3ffca3405751c4db + .quad 0x3ffca8372e220e61 + .quad 0x3ffcad2ee13da7cb + .quad 0x3ffcb22770cac0f9 + .quad 0x3ffcb720dcef9069 + .quad 0x3ffcbc1b25d25337 + .quad 0x3ffcc1164b994d23 + .quad 0x3ffcc6124e6ac88b + .quad 0x3ffccb0f2e6d1675 + .quad 0x3ffcd00cebc68e87 + .quad 0x3ffcd50b869d8f0f + .quad 0x3ffcda0aff187d02 + .quad 0x3ffcdf0b555dc3fa + .quad 0x3ffce40c8993d63d + .quad 0x3ffce90e9be12cb9 + .quad 0x3ffcee118c6c4709 + .quad 0x3ffcf3155b5bab74 + .quad 0x3ffcf81a08d5e6ec + .quad 0x3ffcfd1f95018d17 + .quad 0x3ffd022600053845 + .quad 0x3ffd072d4a07897c + .quad 0x3ffd0c35732f2870 + .quad 0x3ffd113e7ba2c38c + .quad 0x3ffd164863890fee + .quad 0x3ffd1b532b08c968 + .quad 0x3ffd205ed248b287 + .quad 0x3ffd256b596f948c + .quad 0x3ffd2a78c0a43f72 + .quad 0x3ffd2f87080d89f2 + .quad 0x3ffd34962fd2517a + .quad 0x3ffd39a638197a3c + .quad 0x3ffd3eb72109ef21 + .quad 0x3ffd43c8eacaa1d6 + .quad 0x3ffd48db95828ac7 + .quad 0x3ffd4def2158a91f + .quad 0x3ffd53038e7402ce + .quad 0x3ffd5818dcfba487 + .quad 0x3ffd5d2f0d16a1c3 + .quad 0x3ffd62461eec14be + .quad 0x3ffd675e12a31e7f + .quad 0x3ffd6c76e862e6d3 + .quad 0x3ffd7190a0529c51 + .quad 0x3ffd76ab3a99745b + .quad 0x3ffd7bc6b75eab1f + .quad 0x3ffd80e316c98398 + .quad 0x3ffd86005901478f + .quad 0x3ffd8b1e7e2d479d + .quad 0x3ffd903d8674db2b + .quad 0x3ffd955d71ff6075 + .quad 0x3ffd9a7e40f43c89 + .quad 0x3ffd9f9ff37adb4a + .quad 0x3ffda4c289baaf6e + .quad 0x3ffda9e603db3285 + .quad 0x3ffdaf0a6203e4f5 + .quad 0x3ffdb42fa45c4dfd + .quad 0x3ffdb955cb0bfbb6 + .quad 0x3ffdbe7cd63a8315 + .quad 0x3ffdc3a4c60f7fea + .quad 0x3ffdc8cd9ab294e4 + .quad 0x3ffdcdf7544b6b92 + .quad 0x3ffdd321f301b460 + .quad 0x3ffdd84d76fd269e + .quad 0x3ffddd79e065807d + .quad 0x3ffde2a72f628712 + .quad 0x3ffde7d5641c0658 + .quad 0x3ffded047eb9d12d + .quad 0x3ffdf2347f63c159 + .quad 0x3ffdf7656641b78c + .quad 0x3ffdfc97337b9b5f + .quad 0x3ffe01c9e7395b56 + .quad 0x3ffe06fd81a2ece1 + .quad 0x3ffe0c3202e04c5d + .quad 0x3ffe11676b197d17 + .quad 0x3ffe169dba768949 + .quad 0x3ffe1bd4f11f8220 + .quad 0x3ffe210d0f3c7fba + .quad 0x3ffe264614f5a129 + .quad 0x3ffe2b8002730c71 + .quad 0x3ffe30bad7dcee90 + .quad 0x3ffe35f6955b7b78 + .quad 0x3ffe3b333b16ee12 + .quad 0x3ffe4070c9378842 + .quad 0x3ffe45af3fe592e8 + .quad 0x3ffe4aee9f495ddc + .quad 0x3ffe502ee78b3ff6 + .quad 0x3ffe557018d3970b + .quad 0x3ffe5ab2334ac7ee + .quad 0x3ffe5ff537193e75 + .quad 0x3ffe653924676d76 + .quad 0x3ffe6a7dfb5dceca + .quad 0x3ffe6fc3bc24e350 + .quad 0x3ffe750a66e532eb + .quad 0x3ffe7a51fbc74c83 + .quad 0x3ffe7f9a7af3c60b + .quad 0x3ffe84e3e4933c7e + .quad 0x3ffe8a2e38ce53df + .quad 0x3ffe8f7977cdb740 + .quad 0x3ffe94c5a1ba18bd + .quad 0x3ffe9a12b6bc3181 + .quad 0x3ffe9f60b6fcc1c7 + .quad 0x3ffea4afa2a490da + .quad 0x3ffea9ff79dc6d14 + .quad 0x3ffeaf503ccd2be5 + .quad 0x3ffeb4a1eb9fa9d1 + .quad 0x3ffeb9f4867cca6e + .quad 0x3ffebf480d8d786d + .quad 0x3ffec49c80faa594 + .quad 0x3ffec9f1e0ed4ac2 + .quad 0x3ffecf482d8e67f1 + .quad 0x3ffed49f67070435 + .quad 0x3ffed9f78d802dc2 + .quad 0x3ffedf50a122f9e6 + .quad 0x3ffee4aaa2188510 + .quad 0x3ffeea059089f2d0 + .quad 0x3ffeef616ca06dd6 + .quad 0x3ffef4be368527f6 + .quad 0x3ffefa1bee615a27 + .quad 0x3ffeff7a945e4487 + .quad 0x3fff04da28a52e59 + .quad 0x3fff0a3aab5f6609 + .quad 0x3fff0f9c1cb6412a + .quad 0x3fff14fe7cd31c7b + .quad 0x3fff1a61cbdf5be7 + .quad 0x3fff1fc60a046a84 + .quad 0x3fff252b376bba97 + .quad 0x3fff2a91543ec595 + .quad 0x3fff2ff860a70c22 + .quad 0x3fff35605cce1613 + .quad 
0x3fff3ac948dd7274
+ .quad 0x3fff403324feb781
+ .quad 0x3fff459df15b82ac
+ .quad 0x3fff4b09ae1d78a1
+ .quad 0x3fff50765b6e4540
+ .quad 0x3fff55e3f9779ba5
+ .quad 0x3fff5b5288633625
+ .quad 0x3fff60c2085ad652
+ .quad 0x3fff6632798844f8
+ .quad 0x3fff6ba3dc155226
+ .quad 0x3fff7116302bd526
+ .quad 0x3fff768975f5ac86
+ .quad 0x3fff7bfdad9cbe14
+ .quad 0x3fff8172d74af6e1
+ .quad 0x3fff86e8f32a4b45
+ .quad 0x3fff8c600164b6dc
+ .quad 0x3fff91d802243c89
+ .quad 0x3fff9750f592e677
+ .quad 0x3fff9ccadbdac61d
+ .quad 0x3fffa245b525f439
+ .quad 0x3fffa7c1819e90d8
+ .quad 0x3fffad3e416ec354
+ .quad 0x3fffb2bbf4c0ba54
+ .quad 0x3fffb83a9bbeabd1
+ .quad 0x3fffbdba3692d514
+ .quad 0x3fffc33ac5677ab8
+ .quad 0x3fffc8bc4866e8ad
+ .quad 0x3fffce3ebfbb7237
+ .quad 0x3fffd3c22b8f71f1
+ .quad 0x3fffd9468c0d49cc
+ .quad 0x3fffdecbe15f6314
+ .quad 0x3fffe4522bb02e6e
+ .quad 0x3fffe9d96b2a23d9
+ .quad 0x3fffef619ff7c2b3
+ .quad 0x3ffff4eaca4391b6
+ .quad 0x3ffffa74ea381efc
+
+/* Range reduction coefficients:
+ * log(2) inverted = 2^k/ln2 */
+double_vector __dbInvLn2 0x40971547652b82fe
+
+/* right-shifter value = 1.5*2^52 (= 3*2^51) */
+double_vector __dbShifter 0x4338000000000000
+
+/* log(2) high part = ln2/2^k(52-k-9 hibits) */
+double_vector __dbLn2hi 0x3f462e42fec00000
+
+/* log(2) low part = ln2/2^k(52-k-9..104-k-9 lobits) */
+double_vector __dbLn2lo 0x3d5d1cf79abc9e3b
+
+/* Polynomial coefficients (k=10, deg=3): */
+double_vector __dPC0 0x3ff0000000000000
+double_vector __dPC1 0x3fe0000001ebfbe0
+double_vector __dPC2 0x3fc5555555555556
+
+/* Other constants:
+ * index mask = 2^k-1 */
+double_vector __lIndexMask 0x00000000000003ff
+
+/* absolute value mask (SP) */
+float_vector __iAbsMask 0x7fffffff
+
+/* domain range (SP) (>= 0x4086232B) */
+float_vector __iDomainRange 0x4086232a
+ .type __svml_dexp_data,@object
+ .size __svml_dexp_data,.-__svml_dexp_data
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.h
new file mode 100644
index 0000000000..70e7660739
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.h
@@ -0,0 +1,52 @@
+/* Offsets for data table for function exp.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. 
*/ + +#ifndef D_EXP_DATA_H +#define D_EXP_DATA_H + +#define __dbT 0 +#define __dbInvLn2 8192 +#define __dbShifter 8256 +#define __dbLn2hi 8320 +#define __dbLn2lo 8384 +#define __dPC0 8448 +#define __dPC1 8512 +#define __dPC2 8576 +#define __lIndexMask 8640 +#define __iAbsMask 8704 +#define __iDomainRange 8768 + +.macro double_vector offset value +.if .-__svml_dexp_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +.macro float_vector offset value +.if .-__svml_dexp_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log2_core.S new file mode 100644 index 0000000000..4e2d9b9640 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log2_core.S @@ -0,0 +1,29 @@ +/* Function log vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_log) +WRAPPER_IMPL_SSE2 __log_finite +END (_ZGVbN2v_log) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_log) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core.S new file mode 100644 index 0000000000..2db872682d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core.S @@ -0,0 +1,29 @@ +/* Function log vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_log) +WRAPPER_IMPL_AVX _ZGVbN2v_log +END (_ZGVdN4v_log) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_log) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S new file mode 100644 index 0000000000..72cb77a1b7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S @@ -0,0 +1,25 @@ +/* Function log vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_log) +WRAPPER_IMPL_AVX _ZGVbN2v_log +END (_ZGVcN4v_log) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log8_core.S new file mode 100644 index 0000000000..d4c4850fdc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log8_core.S @@ -0,0 +1,25 @@ +/* Function log vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_log) +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +END (_ZGVeN8v_log) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.S new file mode 100644 index 0000000000..b17874100c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.S @@ -0,0 +1,1662 @@ +/* Data for function log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_log_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function log. + The table may contain polynomial, reduction, lookup coefficients + and other constants obtained through different methods + of research and experimental work. 
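+
+ Editorial note: a rough C model of the table-driven scheme such data
+ supports (an illustrative sketch only, not this exact kernel; rcp9()
+ and idx9() are hypothetical helpers): with x = 2^e * m, 1 <= m < 2,
+ and r = rcp9(m) a 9-bit-accurate reciprocal of m,
+
+ double q = m * r - 1.0; // |q| is small because m*r ~= 1
+ double poly = q - 0.5 * q * q + q * q * q / 3.0; // log(1+q)
+ double result = e * M_LN2 + tbl_hi[idx9(m)] + tbl_lo[idx9(m)] + poly;
+
+ since log(x) = e*log(2) + log(m) and log(m) = -log(r) + log(m*r),
+ with -log(r) read from the high+low pairs below.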
*/ + .globl __svml_dlog_data +__svml_dlog_data: + +/* Lookup table in high+low parts and 9-bit index for + -log(mRcp), where mRcp is mantissa of 1/x 9-bit accurate reciprocal: */ +.if .-__svml_dlog_data != _Log_HA_table +.err +.endif + .quad 0xc086232bdd7a8300 + .quad 0xbe1ce91eef3fb100 + .quad 0xc086232fdc7ad828 + .quad 0xbe1cefcffda73b6a + .quad 0xc0862333d97d2ba0 + .quad 0xbe1cef406748f1ff + .quad 0xc0862337d48378e0 + .quad 0xbe1cef2a9429925a + .quad 0xc086233bcd8fb878 + .quad 0xbe1cf138d17ebecb + .quad 0xc086233fc4a3e018 + .quad 0xbe1ceff2dbbbb29e + .quad 0xc0862343b9c1e270 + .quad 0xbe1cf1a42aae437b + .quad 0xc0862347acebaf68 + .quad 0xbe1cef3b152048af + .quad 0xc086234b9e2333f0 + .quad 0xbe1cef20e127805e + .quad 0xc086234f8d6a5a30 + .quad 0xbe1cf00ad6052cf4 + .quad 0xc08623537ac30980 + .quad 0xbe1cefc4642ee597 + .quad 0xc0862357662f2660 + .quad 0xbe1cf1f277d36e16 + .quad 0xc086235b4fb092a0 + .quad 0xbe1ceed009e8d8e6 + .quad 0xc086235f37492d28 + .quad 0xbe1cf1e4038cb362 + .quad 0xc08623631cfad250 + .quad 0xbe1cf0b0873b8557 + .quad 0xc086236700c75b98 + .quad 0xbe1cf15bb3227c0b + .quad 0xc086236ae2b09fe0 + .quad 0xbe1cf151ef8ca9ed + .quad 0xc086236ec2b87358 + .quad 0xbe1cefe1dc2cd2ed + .quad 0xc0862372a0e0a780 + .quad 0xbe1cf0d1eec5454f + .quad 0xc08623767d2b0b48 + .quad 0xbe1ceeefd570bbce + .quad 0xc086237a57996af0 + .quad 0xbe1cee99ae91b3a7 + .quad 0xc086237e302d9028 + .quad 0xbe1cf0412830fbd1 + .quad 0xc086238206e94218 + .quad 0xbe1ceee898588610 + .quad 0xc0862385dbce4548 + .quad 0xbe1cee9a1fbcaaea + .quad 0xc0862389aede5bc0 + .quad 0xbe1ceed8e7cc1ad6 + .quad 0xc086238d801b4500 + .quad 0xbe1cf10c8d059da6 + .quad 0xc08623914f86be18 + .quad 0xbe1ceee6c63a8165 + .quad 0xc08623951d228180 + .quad 0xbe1cf0c3592d2ff1 + .quad 0xc0862398e8f04758 + .quad 0xbe1cf0026cc4cb1b + .quad 0xc086239cb2f1c538 + .quad 0xbe1cf15d48d8e670 + .quad 0xc08623a07b28ae60 + .quad 0xbe1cef359363787c + .quad 0xc08623a44196b390 + .quad 0xbe1cefdf1ab2e82c + .quad 0xc08623a8063d8338 + .quad 0xbe1cefe43c02aa84 + .quad 0xc08623abc91ec960 + .quad 0xbe1cf044f5ae35b7 + .quad 0xc08623af8a3c2fb8 + .quad 0xbe1cf0b0b4001e1b + .quad 0xc08623b349975d98 + .quad 0xbe1cf1bae76dfbcf + .quad 0xc08623b70731f810 + .quad 0xbe1cef0a72e13a62 + .quad 0xc08623bac30da1c8 + .quad 0xbe1cf184007d2b6b + .quad 0xc08623be7d2bfb40 + .quad 0xbe1cf16f4b239e98 + .quad 0xc08623c2358ea2a0 + .quad 0xbe1cf0976acada87 + .quad 0xc08623c5ec3733d0 + .quad 0xbe1cf066318a16ff + .quad 0xc08623c9a1274880 + .quad 0xbe1ceffaa7148798 + .quad 0xc08623cd54607820 + .quad 0xbe1cf23ab02e9b6e + .quad 0xc08623d105e45800 + .quad 0xbe1cefdfef7d4fde + .quad 0xc08623d4b5b47b20 + .quad 0xbe1cf17fece44f2b + .quad 0xc08623d863d27270 + .quad 0xbe1cf18f907d0d7c + .quad 0xc08623dc103fccb0 + .quad 0xbe1cee61fe072c98 + .quad 0xc08623dfbafe1668 + .quad 0xbe1cf022dd891e2f + .quad 0xc08623e3640eda20 + .quad 0xbe1ceecc1daf4358 + .quad 0xc08623e70b73a028 + .quad 0xbe1cf0173c4fa380 + .quad 0xc08623eab12deec8 + .quad 0xbe1cf16a2150c2f4 + .quad 0xc08623ee553f4a30 + .quad 0xbe1cf1bf980b1f4b + .quad 0xc08623f1f7a93480 + .quad 0xbe1cef8b731663c2 + .quad 0xc08623f5986d2dc0 + .quad 0xbe1cee9a664d7ef4 + .quad 0xc08623f9378cb3f0 + .quad 0xbe1cf1eda2af6400 + .quad 0xc08623fcd5094320 + .quad 0xbe1cf1923f9d68d7 + .quad 0xc086240070e45548 + .quad 0xbe1cf0747cd3e03a + .quad 0xc08624040b1f6260 + .quad 0xbe1cf22ee855bd6d + .quad 0xc0862407a3bbe078 + .quad 0xbe1cf0d57360c00b + .quad 0xc086240b3abb4398 + .quad 0xbe1ceebc815cd575 + .quad 0xc086240ed01efdd0 + .quad 0xbe1cf03bfb970951 + .quad 0xc086241263e87f50 
+ .quad 0xbe1cf16e74768529 + .quad 0xc0862415f6193658 + .quad 0xbe1cefec64b8becb + .quad 0xc086241986b28f30 + .quad 0xbe1cf0838d210baa + .quad 0xc086241d15b5f448 + .quad 0xbe1cf0ea86e75b11 + .quad 0xc0862420a324ce28 + .quad 0xbe1cf1708d11d805 + .quad 0xc08624242f008380 + .quad 0xbe1ceea988c5a417 + .quad 0xc0862427b94a7910 + .quad 0xbe1cef166a7bbca5 + .quad 0xc086242b420411d0 + .quad 0xbe1cf0c9d9e86a38 + .quad 0xc086242ec92eaee8 + .quad 0xbe1cef0946455411 + .quad 0xc08624324ecbaf98 + .quad 0xbe1cefea60907739 + .quad 0xc0862435d2dc7160 + .quad 0xbe1cf1ed0934ce42 + .quad 0xc086243955624ff8 + .quad 0xbe1cf191ba746c7d + .quad 0xc086243cd65ea548 + .quad 0xbe1ceeec78cf2a7e + .quad 0xc086244055d2c968 + .quad 0xbe1cef345284c119 + .quad 0xc0862443d3c012b8 + .quad 0xbe1cf24f77355219 + .quad 0xc08624475027d5e8 + .quad 0xbe1cf05bf087e114 + .quad 0xc086244acb0b65d0 + .quad 0xbe1cef3504a32189 + .quad 0xc086244e446c1398 + .quad 0xbe1ceff54b2a406f + .quad 0xc0862451bc4b2eb8 + .quad 0xbe1cf0757d54ed4f + .quad 0xc086245532aa04f0 + .quad 0xbe1cf0c8099fdfd5 + .quad 0xc0862458a789e250 + .quad 0xbe1cf0b173796a31 + .quad 0xc086245c1aec1138 + .quad 0xbe1cf11d8734540d + .quad 0xc086245f8cd1da60 + .quad 0xbe1cf1916a723ceb + .quad 0xc0862462fd3c84d8 + .quad 0xbe1cf19a911e1da7 + .quad 0xc08624666c2d5608 + .quad 0xbe1cf23a9ef72e4f + .quad 0xc0862469d9a591c0 + .quad 0xbe1cef503d947663 + .quad 0xc086246d45a67a18 + .quad 0xbe1cf0fceeb1a0b2 + .quad 0xc0862470b0314fa8 + .quad 0xbe1cf107e27e4fbc + .quad 0xc086247419475160 + .quad 0xbe1cf03dd9922331 + .quad 0xc086247780e9bc98 + .quad 0xbe1cefce1a10e129 + .quad 0xc086247ae719cd18 + .quad 0xbe1ceea47f73c4f6 + .quad 0xc086247e4bd8bd10 + .quad 0xbe1ceec0ac56d100 + .quad 0xc0862481af27c528 + .quad 0xbe1cee8a6593278a + .quad 0xc086248511081c70 + .quad 0xbe1cf2231dd9dec7 + .quad 0xc0862488717af888 + .quad 0xbe1cf0b4b8ed7da8 + .quad 0xc086248bd0818d68 + .quad 0xbe1cf1bd8d835002 + .quad 0xc086248f2e1d0d98 + .quad 0xbe1cf259acc107f4 + .quad 0xc08624928a4eaa20 + .quad 0xbe1cee897636b00c + .quad 0xc0862495e5179270 + .quad 0xbe1cee757f20c326 + .quad 0xc08624993e78f490 + .quad 0xbe1cefafd3aa54a4 + .quad 0xc086249c9673fd10 + .quad 0xbe1cee7298d38b97 + .quad 0xc086249fed09d6f8 + .quad 0xbe1ceedc158d4ceb + .quad 0xc08624a3423babe0 + .quad 0xbe1cf2282987cb2e + .quad 0xc08624a6960aa400 + .quad 0xbe1cefe7381ecc4b + .quad 0xc08624a9e877e600 + .quad 0xbe1cef328dbbce80 + .quad 0xc08624ad39849728 + .quad 0xbe1cefde45f3cc71 + .quad 0xc08624b08931db58 + .quad 0xbe1cefa8b89433b9 + .quad 0xc08624b3d780d500 + .quad 0xbe1cef6773c0b139 + .quad 0xc08624b72472a528 + .quad 0xbe1cf031c931c11f + .quad 0xc08624ba70086b78 + .quad 0xbe1cf088f49275e7 + .quad 0xc08624bdba434630 + .quad 0xbe1cf17de0eaa86d + .quad 0xc08624c103245238 + .quad 0xbe1cefd492f1ba75 + .quad 0xc08624c44aacab08 + .quad 0xbe1cf1253e154466 + .quad 0xc08624c790dd6ad0 + .quad 0xbe1cf0fb09ee6d55 + .quad 0xc08624cad5b7aa58 + .quad 0xbe1cf1f08dd048fe + .quad 0xc08624ce193c8120 + .quad 0xbe1ceeca0809697f + .quad 0xc08624d15b6d0538 + .quad 0xbe1cef8d5662d968 + .quad 0xc08624d49c4a4b78 + .quad 0xbe1cee97b556ed78 + .quad 0xc08624d7dbd56750 + .quad 0xbe1cf1b14b6acb75 + .quad 0xc08624db1a0f6b00 + .quad 0xbe1cef1e860623f2 + .quad 0xc08624de56f96758 + .quad 0xbe1ceeaf4d156f3d + .quad 0xc08624e192946bf0 + .quad 0xbe1ceecc12b400ed + .quad 0xc08624e4cce18710 + .quad 0xbe1cf180c40c794f + .quad 0xc08624e805e1c5c8 + .quad 0xbe1cf185a08f7f65 + .quad 0xc08624eb3d9633d8 + .quad 0xbe1cef45fc924078 + .quad 0xc08624ee73ffdbb0 + .quad 0xbe1cf1e4f457f32a + .quad 
0xc08624f1a91fc6a0 + .quad 0xbe1cf040147b8a5a + .quad 0xc08624f4dcf6fc98 + .quad 0xbe1cf1effca0dfb2 + .quad 0xc08624f80f868468 + .quad 0xbe1cf0470146e5bc + .quad 0xc08624fb40cf6390 + .quad 0xbe1cef4dd186e501 + .quad 0xc08624fe70d29e60 + .quad 0xbe1ceebe257f66c7 + .quad 0xc08625019f9137f0 + .quad 0xbe1ceefb7a1c395c + .quad 0xc0862504cd0c3220 + .quad 0xbe1cf209dedfed8c + .quad 0xc0862507f9448db0 + .quad 0xbe1cf082da464994 + .quad 0xc086250b243b4a18 + .quad 0xbe1cee88694a73cf + .quad 0xc086250e4df165a0 + .quad 0xbe1cf0b61e8f0531 + .quad 0xc08625117667dd78 + .quad 0xbe1cf1106599c962 + .quad 0xc08625149d9fad98 + .quad 0xbe1ceff1ee88af1f + .quad 0xc0862517c399d0c8 + .quad 0xbe1cf0f746994ef6 + .quad 0xc086251ae85740b8 + .quad 0xbe1cefe8a1d077e4 + .quad 0xc086251e0bd8f5e0 + .quad 0xbe1cf1a1da036092 + .quad 0xc08625212e1fe7a8 + .quad 0xbe1cf0f8a7786fcd + .quad 0xc08625244f2d0c48 + .quad 0xbe1cefa1174a07a7 + .quad 0xc08625276f0158d8 + .quad 0xbe1cef1043aa5b25 + .quad 0xc086252a8d9dc150 + .quad 0xbe1cf15d521c169d + .quad 0xc086252dab033898 + .quad 0xbe1cf220bba8861f + .quad 0xc0862530c732b078 + .quad 0xbe1cef51e310eae2 + .quad 0xc0862533e22d1988 + .quad 0xbe1cf222fcedd8ae + .quad 0xc0862536fbf36370 + .quad 0xbe1cefdb4da4bda8 + .quad 0xc086253a14867ca0 + .quad 0xbe1ceeafc1112171 + .quad 0xc086253d2be75280 + .quad 0xbe1cee99dfb4b408 + .quad 0xc08625404216d160 + .quad 0xbe1cf22d2536f06b + .quad 0xc08625435715e498 + .quad 0xbe1cef6abbf2e268 + .quad 0xc08625466ae57648 + .quad 0xbe1cf093a14789f5 + .quad 0xc08625497d866fa0 + .quad 0xbe1cf0f93655603c + .quad 0xc086254c8ef9b8b8 + .quad 0xbe1cf1cc40c9aafc + .quad 0xc086254f9f4038a8 + .quad 0xbe1ceeea5f4e9157 + .quad 0xc0862552ae5ad568 + .quad 0xbe1cefa9f52d4997 + .quad 0xc0862555bc4a7400 + .quad 0xbe1cefa490a638ff + .quad 0xc0862558c90ff868 + .quad 0xbe1cef7fcf797d6f + .quad 0xc086255bd4ac4590 + .quad 0xbe1cf1b4c51113c9 + .quad 0xc086255edf203d78 + .quad 0xbe1cef55e5b4a55d + .quad 0xc0862561e86cc100 + .quad 0xbe1cf0d37a25f9dc + .quad 0xc0862564f092b028 + .quad 0xbe1ceebe9efc19d9 + .quad 0xc0862567f792e9d8 + .quad 0xbe1cee8ad30a57b5 + .quad 0xc086256afd6e4c08 + .quad 0xbe1cef4e1817b90b + .quad 0xc086256e0225b3b8 + .quad 0xbe1cee7fa9229996 + .quad 0xc086257105b9fce0 + .quad 0xbe1cf0b54963d945 + .quad 0xc0862574082c0298 + .quad 0xbe1cee5f2f3c7995 + .quad 0xc0862577097c9ee0 + .quad 0xbe1cf0828e303a2c + .quad 0xc086257a09acaae0 + .quad 0xbe1cf172c3078947 + .quad 0xc086257d08bcfec0 + .quad 0xbe1cf189252afa22 + .quad 0xc086258006ae71b8 + .quad 0xbe1cefdb80426923 + .quad 0xc08625830381da08 + .quad 0xbe1ceef1391a0372 + .quad 0xc0862585ff380d00 + .quad 0xbe1cf17720c78d13 + .quad 0xc0862588f9d1df18 + .quad 0xbe1ceef1f9027d83 + .quad 0xc086258bf35023b8 + .quad 0xbe1cf06fac99dec9 + .quad 0xc086258eebb3ad78 + .quad 0xbe1cf1373eeb45c0 + .quad 0xc0862591e2fd4e00 + .quad 0xbe1cef777536bb81 + .quad 0xc0862594d92dd600 + .quad 0xbe1cf0f43ca40766 + .quad 0xc0862597ce461558 + .quad 0xbe1cefb2cfc6766b + .quad 0xc086259ac246daf0 + .quad 0xbe1ceea49e64ffa2 + .quad 0xc086259db530f4c8 + .quad 0xbe1cf250fa457dec + .quad 0xc08625a0a7053018 + .quad 0xbe1cf17d8bb2a44e + .quad 0xc08625a397c45918 + .quad 0xbe1cf1d5906d54b7 + .quad 0xc08625a6876f3b30 + .quad 0xbe1cf08fe7b31780 + .quad 0xc08625a97606a0e0 + .quad 0xbe1cef13edfc9d11 + .quad 0xc08625ac638b53c8 + .quad 0xbe1cef9d2b107219 + .quad 0xc08625af4ffe1cb0 + .quad 0xbe1cf1ddd4ff6160 + .quad 0xc08625b23b5fc390 + .quad 0xbe1cefa02a996495 + .quad 0xc08625b525b10f68 + .quad 0xbe1cf166a7e37ee5 + .quad 0xc08625b80ef2c680 + .quad 
0xbe1cef0b171068a5 + .quad 0xc08625baf725ae28 + .quad 0xbe1cf05c80779283 + .quad 0xc08625bdde4a8af0 + .quad 0xbe1cf1bbfbffb889 + .quad 0xc08625c0c4622090 + .quad 0xbe1cf0b8666c0124 + .quad 0xc08625c3a96d31e0 + .quad 0xbe1cf0a8fcf47a86 + .quad 0xc08625c68d6c80f0 + .quad 0xbe1cef46e18cb092 + .quad 0xc08625c97060cef0 + .quad 0xbe1cf1458a350efb + .quad 0xc08625cc524adc58 + .quad 0xbe1ceeea1dadce12 + .quad 0xc08625cf332b68b0 + .quad 0xbe1cf0a1bfdc44c7 + .quad 0xc08625d2130332d0 + .quad 0xbe1cef96d02da73e + .quad 0xc08625d4f1d2f8a8 + .quad 0xbe1cf2451c3c7701 + .quad 0xc08625d7cf9b7778 + .quad 0xbe1cf10d08f83812 + .quad 0xc08625daac5d6ba0 + .quad 0xbe1ceec5b4895c5e + .quad 0xc08625dd881990b0 + .quad 0xbe1cf14e1325c5e4 + .quad 0xc08625e062d0a188 + .quad 0xbe1cf21d0904be12 + .quad 0xc08625e33c835838 + .quad 0xbe1ceed0839bcf21 + .quad 0xc08625e615326df0 + .quad 0xbe1cf1bb944889d2 + .quad 0xc08625e8ecde9b48 + .quad 0xbe1cee738e85eece + .quad 0xc08625ebc38897e0 + .quad 0xbe1cf25c2bc6ef12 + .quad 0xc08625ee99311ac8 + .quad 0xbe1cf132b70a41ad + .quad 0xc08625f16dd8da28 + .quad 0xbe1cf1984236a6e3 + .quad 0xc08625f441808b78 + .quad 0xbe1cf19ae74998f9 + .quad 0xc08625f71428e370 + .quad 0xbe1cef3e175d61a1 + .quad 0xc08625f9e5d295f8 + .quad 0xbe1cf101f9868fd9 + .quad 0xc08625fcb67e5658 + .quad 0xbe1cee69db83dcd2 + .quad 0xc08625ff862cd6f8 + .quad 0xbe1cf081b636af51 + .quad 0xc086260254dec9a8 + .quad 0xbe1cee62c7d59b3e + .quad 0xc08626052294df58 + .quad 0xbe1cf1b745c57716 + .quad 0xc0862607ef4fc868 + .quad 0xbe1cef3d2800ea23 + .quad 0xc086260abb103458 + .quad 0xbe1cef480ff1acd2 + .quad 0xc086260d85d6d200 + .quad 0xbe1cf2424c9a17ef + .quad 0xc08626104fa44f90 + .quad 0xbe1cf12cfde90fd5 + .quad 0xc086261318795a68 + .quad 0xbe1cf21f590dd5b6 + .quad 0xc0862615e0569f48 + .quad 0xbe1cf0c50f9cd28a + .quad 0xc0862618a73cca30 + .quad 0xbe1ceedbdb520545 + .quad 0xc086261b6d2c8668 + .quad 0xbe1cf0b030396011 + .quad 0xc086261e32267e98 + .quad 0xbe1cf19917010e96 + .quad 0xc0862620f62b5cb0 + .quad 0xbe1cf07331355985 + .quad 0xc0862623b93bc9e8 + .quad 0xbe1cf01ae921a1c3 + .quad 0xc08626267b586ed0 + .quad 0xbe1cefe5cf0dbf0c + .quad 0xc08626293c81f348 + .quad 0xbe1cf01b258aeb50 + .quad 0xc086262bfcb8fe88 + .quad 0xbe1cee6b9e7f4c68 + .quad 0xc086262ebbfe3710 + .quad 0xbe1cee684a9b21c9 + .quad 0xc08626317a5242b8 + .quad 0xbe1cf1f8bcde9a8b + .quad 0xc086263437b5c6c0 + .quad 0xbe1cf1d063d36238 + .quad 0xc0862636f42967a8 + .quad 0xbe1cf1e31a19075e + .quad 0xc0862639afadc950 + .quad 0xbe1cf1d8efdf7e7d + .quad 0xc086263c6a438ef0 + .quad 0xbe1cf1812ee72dba + .quad 0xc086263f23eb5b18 + .quad 0xbe1cf1449a9a2279 + .quad 0xc0862641dca5cfb8 + .quad 0xbe1cee96edce5085 + .quad 0xc086264494738e08 + .quad 0xbe1cf06797bd03b2 + .quad 0xc08626474b5536b8 + .quad 0xbe1cef91b9b7ffc1 + .quad 0xc086264a014b69c0 + .quad 0xbe1cef4b6721278f + .quad 0xc086264cb656c678 + .quad 0xbe1cf1942925eb4a + .quad 0xc086264f6a77eba8 + .quad 0xbe1cefa2c7bc2e39 + .quad 0xc08626521daf7758 + .quad 0xbe1cf252595aceb3 + .quad 0xc0862654cffe0718 + .quad 0xbe1cee8e9ae47ec2 + .quad 0xc0862657816437a8 + .quad 0xbe1cf1bf913828fa + .quad 0xc086265a31e2a558 + .quad 0xbe1cf23475d6b366 + .quad 0xc086265ce179ebc8 + .quad 0xbe1cef8df00a922b + .quad 0xc086265f902aa5f0 + .quad 0xbe1cef279bfa43e0 + .quad 0xc08626623df56e38 + .quad 0xbe1cf080e10b8365 + .quad 0xc0862664eadade70 + .quad 0xbe1cf1a518f9b544 + .quad 0xc086266796db8fd0 + .quad 0xbe1cef9308fed9e9 + .quad 0xc086266a41f81ae8 + .quad 0xbe1ceea3ae6b19c9 + .quad 0xc086266cec3117b8 + .quad 0xbe1ceef06003d4c2 + .quad 
0xc086266f95871da8 + .quad 0xbe1cf0b8457ffb0c + .quad 0xc08626723dfac390 + .quad 0xbe1cf0c526745ad6 + .quad 0xc0862674e58c9fa8 + .quad 0xbe1cf0cf91ff7b5d + .quad 0xc08626778c3d4798 + .quad 0xbe1cefe260819380 + .quad 0xc086267a320d5070 + .quad 0xbe1ceebd90aa27a3 + .quad 0xc086267cd6fd4ea8 + .quad 0xbe1cf0388121dffa + .quad 0xc086267f7b0dd630 + .quad 0xbe1cf1a3881435f1 + .quad 0xc08626821e3f7a68 + .quad 0xbe1cef28e9d9ac52 + .quad 0xc0862684c092ce08 + .quad 0xbe1cf02d300062dd + .quad 0xc086268762086350 + .quad 0xbe1cefaee1edfa35 + .quad 0xc086268a02a0cbe0 + .quad 0xbe1cf0a5a052e936 + .quad 0xc086268ca25c98d8 + .quad 0xbe1cee60a4a497ed + .quad 0xc086268f413c5ab0 + .quad 0xbe1cf0e4a5d0cf49 + .quad 0xc0862691df40a170 + .quad 0xbe1cf149235a4e6e + .quad 0xc08626947c69fc80 + .quad 0xbe1cf215180b9fcc + .quad 0xc086269718b8fac8 + .quad 0xbe1cef9b156a9840 + .quad 0xc0862699b42e2a90 + .quad 0xbe1cf054c91441be + .quad 0xc086269c4eca19a8 + .quad 0xbe1cf13ded26512c + .quad 0xc086269ee88d5550 + .quad 0xbe1cf22ea4d8ac06 + .quad 0xc08626a181786a40 + .quad 0xbe1cf2354666ee2e + .quad 0xc08626a4198be4a8 + .quad 0xbe1cefef936752b3 + .quad 0xc08626a6b0c85020 + .quad 0xbe1cf1e360a9db68 + .quad 0xc08626a9472e37d8 + .quad 0xbe1ceed6aeb812c5 + .quad 0xc08626abdcbe2650 + .quad 0xbe1cf227340b4986 + .quad 0xc08626ae7178a5b0 + .quad 0xbe1cf0215a0cbe0d + .quad 0xc08626b1055e3f70 + .quad 0xbe1cf256adf0ae26 + .quad 0xc08626b3986f7ca8 + .quad 0xbe1ceff3c67aed06 + .quad 0xc08626b62aace5c8 + .quad 0xbe1cf2159fb93652 + .quad 0xc08626b8bc1702e0 + .quad 0xbe1cf01e6dbd1c7f + .quad 0xc08626bb4cae5b60 + .quad 0xbe1cf009e75d1c0c + .quad 0xc08626bddc737648 + .quad 0xbe1ceec10a020e73 + .quad 0xc08626c06b66da08 + .quad 0xbe1cf06d5783eee7 + .quad 0xc08626c2f9890ca0 + .quad 0xbe1cf0cb8f169ffe + .quad 0xc08626c586da9388 + .quad 0xbe1cef7de2452430 + .quad 0xc08626c8135bf3b0 + .quad 0xbe1cf05da6f783ae + .quad 0xc08626ca9f0db198 + .quad 0xbe1cefcc877d681d + .quad 0xc08626cd29f05138 + .quad 0xbe1cef0531954ab3 + .quad 0xc08626cfb4045608 + .quad 0xbe1cf06b8565ea3d + .quad 0xc08626d23d4a4310 + .quad 0xbe1cefdc455d9d7e + .quad 0xc08626d4c5c29ad0 + .quad 0xbe1ceefc47e8fa64 + .quad 0xc08626d74d6ddf48 + .quad 0xbe1cf1872bf033f2 + .quad 0xc08626d9d44c9210 + .quad 0xbe1cf19d91087f9d + .quad 0xc08626dc5a5f3438 + .quad 0xbe1cf012d444c6ab + .quad 0xc08626dedfa64650 + .quad 0xbe1cf0ba528ee153 + .quad 0xc08626e164224880 + .quad 0xbe1ceeb431709788 + .quad 0xc08626e3e7d3ba60 + .quad 0xbe1cf0b9af31a6a5 + .quad 0xc08626e66abb1b28 + .quad 0xbe1cf168fb2e135b + .quad 0xc08626e8ecd8e990 + .quad 0xbe1cef9097461c93 + .quad 0xc08626eb6e2da3d0 + .quad 0xbe1cee7a434735d8 + .quad 0xc08626edeeb9c7a8 + .quad 0xbe1cf235732b86f2 + .quad 0xc08626f06e7dd280 + .quad 0xbe1cefe1510b89e6 + .quad 0xc08626f2ed7a4120 + .quad 0xbe1cf1f64b9b80ef + .quad 0xc08626f56baf9000 + .quad 0xbe1cf08f320ca339 + .quad 0xc08626f7e91e3b08 + .quad 0xbe1cf1b1de2808a1 + .quad 0xc08626fa65c6bdc0 + .quad 0xbe1cf1976d778b28 + .quad 0xc08626fce1a99338 + .quad 0xbe1ceef40a4f076f + .quad 0xc08626ff5cc73600 + .quad 0xbe1cef3e45869ce3 + .quad 0xc0862701d7202048 + .quad 0xbe1ceef601b4c9d6 + .quad 0xc086270450b4cbc0 + .quad 0xbe1cf1eaf0b57fd6 + .quad 0xc0862706c985b1c0 + .quad 0xbe1cef82a44990f3 + .quad 0xc086270941934b10 + .quad 0xbe1ceefe32981f2c + .quad 0xc086270bb8de1018 + .quad 0xbe1cefbf6f5a0445 + .quad 0xc086270e2f6678d0 + .quad 0xbe1cf18dba75792c + .quad 0xc0862710a52cfcc8 + .quad 0xbe1cf0da64ce995f + .quad 0xc08627131a321318 + .quad 0xbe1cef04ac0fb802 + .quad 0xc08627158e763268 + .quad 
0xbe1cee9d4e2ad9bd + .quad 0xc086271801f9d0f8 + .quad 0xbe1cefa9b55407b5 + .quad 0xc086271a74bd64a0 + .quad 0xbe1cefe6bd329570 + .quad 0xc086271ce6c162c8 + .quad 0xbe1cef0b1205dc85 + .quad 0xc086271f58064068 + .quad 0xbe1cef092a785e3f + .quad 0xc0862721c88c7210 + .quad 0xbe1cf050dcdaac30 + .quad 0xc086272438546be8 + .quad 0xbe1cf210907ded8b + .quad 0xc0862726a75ea1b8 + .quad 0xbe1cee760be44f99 + .quad 0xc086272915ab86c0 + .quad 0xbe1ceeeee07c2bcc + .quad 0xc086272b833b8df0 + .quad 0xbe1cf06874992df5 + .quad 0xc086272df00f29d0 + .quad 0xbe1cef8fac5d4899 + .quad 0xc08627305c26cc70 + .quad 0xbe1cf1103241cc99 + .quad 0xc0862732c782e788 + .quad 0xbe1cf1d35fef83fe + .quad 0xc08627353223ec68 + .quad 0xbe1cef3ec8133e1d + .quad 0xc08627379c0a4be8 + .quad 0xbe1cef7261daccd8 + .quad 0xc086273a05367688 + .quad 0xbe1cf18656c50806 + .quad 0xc086273c6da8dc68 + .quad 0xbe1cf1c8736e049a + .quad 0xc086273ed561ed38 + .quad 0xbe1cf1f93bff4911 + .quad 0xc08627413c621848 + .quad 0xbe1cf188a4ea680c + .quad 0xc0862743a2a9cc80 + .quad 0xbe1cf1d270930c80 + .quad 0xc086274608397868 + .quad 0xbe1cf25a328c28e2 + .quad 0xc08627486d118a28 + .quad 0xbe1cf106f90aa3b8 + .quad 0xc086274ad1326f80 + .quad 0xbe1cee5e9d2e885a + .quad 0xc086274d349c95c0 + .quad 0xbe1cf1c0bac27228 + .quad 0xc086274f975069f8 + .quad 0xbe1cf1a1500f9b1c + .quad 0xc0862751f94e58c0 + .quad 0xbe1cefc30663ac44 + .quad 0xc08627545a96ce48 + .quad 0xbe1cf17123e427a2 + .quad 0xc0862756bb2a3678 + .quad 0xbe1cefb92749fea4 + .quad 0xc08627591b08fcc0 + .quad 0xbe1cefa40e1ea74a + .quad 0xc086275b7a338c40 + .quad 0xbe1cee6f4612c3e9 + .quad 0xc086275dd8aa4fa8 + .quad 0xbe1cf1c54a053627 + .quad 0xc0862760366db168 + .quad 0xbe1ceff5eb503d9e + .quad 0xc0862762937e1b70 + .quad 0xbe1cf02e47f10cee + .quad 0xc0862764efdbf768 + .quad 0xbe1ceeb06e1d0dad + .quad 0xc08627674b87ae88 + .quad 0xbe1cf10aadd6dba5 + .quad 0xc0862769a681a9c0 + .quad 0xbe1cf24e9913d30f + .quad 0xc086276c00ca51a0 + .quad 0xbe1cef47b301e312 + .quad 0xc086276e5a620e48 + .quad 0xbe1ceeb1cefc2e85 + .quad 0xc0862770b3494788 + .quad 0xbe1cf16f1fbbe011 + .quad 0xc08627730b8064e8 + .quad 0xbe1ceebdf75174c7 + .quad 0xc08627756307cd70 + .quad 0xbe1cf06e3871a0da + .quad 0xc0862777b9dfe7f0 + .quad 0xbe1cef16799fd554 + .quad 0xc086277a10091ac0 + .quad 0xbe1cf248dabf5377 + .quad 0xc086277c6583cc00 + .quad 0xbe1cf0c78d92a2cd + .quad 0xc086277eba506158 + .quad 0xbe1cf0b911b029f0 + .quad 0xc08627810e6f4028 + .quad 0xbe1cefdc24719766 + .quad 0xc086278361e0cd70 + .quad 0xbe1cefbb6562b7e7 + .quad 0xc0862785b4a56dd8 + .quad 0xbe1cf1e0afb349ec + .quad 0xc086278806bd85c0 + .quad 0xbe1cf008292e52fc + .quad 0xc086278a58297918 + .quad 0xbe1cf053073872bf + .quad 0xc086278ca8e9ab88 + .quad 0xbe1cf17a0a55a947 + .quad 0xc086278ef8fe8068 + .quad 0xbe1ceeffb0b60234 + .quad 0xc086279148685aa0 + .quad 0xbe1cf162204794a8 + .quad 0xc086279397279ce0 + .quad 0xbe1cf24cc8cb48ac + .quad 0xc0862795e53ca978 + .quad 0xbe1cf0c9be68d5c3 + .quad 0xc086279832a7e258 + .quad 0xbe1cf172cd3d7388 + .quad 0xc086279a7f69a930 + .quad 0xbe1ceea2465fbce5 + .quad 0xc086279ccb825f40 + .quad 0xbe1cf0a386d2500f + .quad 0xc086279f16f26590 + .quad 0xbe1cf1e338ddc18a + .quad 0xc08627a161ba1cd0 + .quad 0xbe1cef1f5049867f + .quad 0xc08627a3abd9e548 + .quad 0xbe1cef96c1ea8b1f + .quad 0xc08627a5f5521f00 + .quad 0xbe1cf138f6fd3c26 + .quad 0xc08627a83e2329b0 + .quad 0xbe1cf0d4fcbfdf3a + .quad 0xc08627aa864d64b0 + .quad 0xbe1cf24870c12c81 + .quad 0xc08627accdd12f18 + .quad 0xbe1cf0ae2a56348d + .quad 0xc08627af14aee7a0 + .quad 0xbe1cee8ca1a9b893 + .quad 
0xc08627b15ae6eca8 + .quad 0xbe1cf20414d637b0 + .quad 0xc08627b3a0799c60 + .quad 0xbe1cf0fc6b7b12d8 + .quad 0xc08627b5e5675488 + .quad 0xbe1cf152d93c4a00 + .quad 0xc08627b829b072a0 + .quad 0xbe1cf1073f9b77c2 + .quad 0xc08627ba6d5553d8 + .quad 0xbe1cee694f97d5a4 + .quad 0xc08627bcb0565500 + .quad 0xbe1cf0456b8239d7 + .quad 0xc08627bef2b3d2b0 + .quad 0xbe1cf211497127e3 + .quad 0xc08627c1346e2930 + .quad 0xbe1cf01856c0384d + .quad 0xc08627c37585b468 + .quad 0xbe1cefa7dd05479e + .quad 0xc08627c5b5fad000 + .quad 0xbe1cef3ae8e50b93 + .quad 0xc08627c7f5cdd750 + .quad 0xbe1ceea5f32fdd3a + .quad 0xc08627ca34ff2560 + .quad 0xbe1cef424caeb8d9 + .quad 0xc08627cc738f14f0 + .quad 0xbe1cf0194d07a81f + .quad 0xc08627ceb17e0070 + .quad 0xbe1cf20f452000c1 + .quad 0xc08627d0eecc4210 + .quad 0xbe1cf00e356218e4 + .quad 0xc08627d32b7a33a0 + .quad 0xbe1cef30484b4bcb + .quad 0xc08627d567882eb0 + .quad 0xbe1ceeea11a6641b + .quad 0xc08627d7a2f68c80 + .quad 0xbe1cf13492d5bd7b + .quad 0xc08627d9ddc5a618 + .quad 0xbe1ceeb7048fad96 + .quad 0xc08627dc17f5d418 + .quad 0xbe1ceef0666f0477 + .quad 0xc08627de51876ee8 + .quad 0xbe1cf060d4b8b5c2 + .quad 0xc08627e08a7acea8 + .quad 0xbe1cf0b2a4b6ff8c + .quad 0xc08627e2c2d04b28 + .quad 0xbe1cf0e34809a875 + .quad 0xc08627e4fa883bf0 + .quad 0xbe1cf16bf74a3522 + .quad 0xc08627e731a2f848 + .quad 0xbe1cee6a24623d57 + .quad 0xc08627e96820d718 + .quad 0xbe1cefc7b4f1528e + .quad 0xc08627eb9e022f18 + .quad 0xbe1cf163051f3548 + .quad 0xc08627edd34756b8 + .quad 0xbe1cef36b3366305 + .quad 0xc08627f007f0a408 + .quad 0xbe1cf18134625550 + .quad 0xc08627f23bfe6cf0 + .quad 0xbe1cf0ec32ec1a11 + .quad 0xc08627f46f710700 + .quad 0xbe1ceeb3b64f3edc + .quad 0xc08627f6a248c778 + .quad 0xbe1cf0cd15805bc8 + .quad 0xc08627f8d4860368 + .quad 0xbe1cf20db3bddebe + .quad 0xc08627fb06290f90 + .quad 0xbe1cf25188430e25 + .quad 0xc08627fd37324070 + .quad 0xbe1ceea1713490f9 + .quad 0xc08627ff67a1ea28 + .quad 0xbe1cf159521d234c + .quad 0xc0862801977860b8 + .quad 0xbe1cf24dfe50783b + .quad 0xc0862803c6b5f7d0 + .quad 0xbe1ceef2ef89a60b + .quad 0xc0862805f55b02c8 + .quad 0xbe1cee7fc919d62c + .quad 0xc08628082367d4c0 + .quad 0xbe1cf215a7fb513a + .quad 0xc086280a50dcc0a8 + .quad 0xbe1cf0e4401c5ed4 + .quad 0xc086280c7dba1910 + .quad 0xbe1cf04ec734d256 + .quad 0xc086280eaa003050 + .quad 0xbe1cf010ad787fea + .quad 0xc0862810d5af5880 + .quad 0xbe1cee622478393d + .quad 0xc086281300c7e368 + .quad 0xbe1cf01c7482564f + .quad 0xc08628152b4a22a0 + .quad 0xbe1cf0de20d33536 + .quad 0xc086281755366778 + .quad 0xbe1cef2edae5837d + .quad 0xc08628197e8d02f0 + .quad 0xbe1cf0a345318cc9 + .quad 0xc086281ba74e45d8 + .quad 0xbe1cf20085aa34b8 + .quad 0xc086281dcf7a80c0 + .quad 0xbe1cef5fa845ad83 + .quad 0xc086281ff71203e0 + .quad 0xbe1cf050d1df69c4 + .quad 0xc08628221e151f48 + .quad 0xbe1ceffe43c035b9 + .quad 0xc0862824448422b8 + .quad 0xbe1cf14f3018d3c2 + .quad 0xc08628266a5f5dc0 + .quad 0xbe1cef0a5fbae83d + .quad 0xc08628288fa71f98 + .quad 0xbe1ceff8a95b72a1 + .quad 0xc086282ab45bb750 + .quad 0xbe1cef073aa9849b + .quad 0xc086282cd87d73a8 + .quad 0xbe1cef69b3835c02 + .quad 0xc086282efc0ca328 + .quad 0xbe1cf0bc139379a9 + .quad 0xc08628311f099420 + .quad 0xbe1cef247a9ec596 + .quad 0xc086283341749490 + .quad 0xbe1cef74bbcc488a + .quad 0xc0862835634df248 + .quad 0xbe1cef4bc42e7b8e + .quad 0xc08628378495fad0 + .quad 0xbe1cf136d4d5a810 + .quad 0xc0862839a54cfb80 + .quad 0xbe1cf0d290b24dd8 + .quad 0xc086283bc5734168 + .quad 0xbe1ceeebde8e0065 + .quad 0xc086283de5091950 + .quad 0xbe1cf1a09f60aa1e + .quad 0xc0862840040ecfe0 + .quad 
0xbe1cf0803947a234 + .quad 0xc08628422284b168 + .quad 0xbe1cf0abf7638127 + .quad 0xc0862844406b0a08 + .quad 0xbe1cf0f73ee12058 + .quad 0xc08628465dc225a0 + .quad 0xbe1cf2079971b26c + .quad 0xc08628487a8a4fe0 + .quad 0xbe1cee74957564b1 + .quad 0xc086284a96c3d420 + .quad 0xbe1ceee77c1b7d43 + .quad 0xc086284cb26efd90 + .quad 0xbe1cf23addba6e09 + .quad 0xc086284ecd8c1730 + .quad 0xbe1cf199f4a1da60 + .quad 0xc0862850e81b6bb0 + .quad 0xbe1cf09fdea81393 + .quad 0xc0862853021d4588 + .quad 0xbe1cf176adb417f7 + .quad 0xc08628551b91ef00 + .quad 0xbe1cf0f64f84a8da + .quad 0xc08628573479b220 + .quad 0xbe1ceec34cf49523 + .quad 0xc08628594cd4d8a8 + .quad 0xbe1cf16d60fbe0bb + .quad 0xc086285b64a3ac40 + .quad 0xbe1cee8de7acfc7b + .quad 0xc086285d7be67630 + .quad 0xbe1ceee6256cce8d + .quad 0xc086285f929d7fa0 + .quad 0xbe1cee7d66a3d8a5 + .quad 0xc0862861a8c91170 + .quad 0xbe1cf0bef8265792 + .quad 0xc0862863be697458 + .quad 0xbe1cf097f890c6f8 + .quad 0xc0862865d37ef0c8 + .quad 0xbe1cf09502d5c3fc + .quad 0xc0862867e809cf00 + .quad 0xbe1ceeffb239dac7 + .quad 0xc0862869fc0a56f8 + .quad 0xbe1cf1fbfff95c98 + .quad 0xc086286c0f80d090 + .quad 0xbe1cefa57ad3eef7 + .quad 0xc086286e226d8348 + .quad 0xbe1cf22c58b9183d + .quad 0xc086287034d0b690 + .quad 0xbe1ceff262d0a248 + .quad 0xc086287246aab180 + .quad 0xbe1cefa7bc194186 + .quad 0xc086287457fbbb08 + .quad 0xbe1cf06782d784d9 + .quad 0xc086287668c419e0 + .quad 0xbe1cf1d44d0eaa07 + .quad 0xc086287879041490 + .quad 0xbe1cf034803c8a48 + .quad 0xc086287a88bbf158 + .quad 0xbe1cf08e84916b6f + .quad 0xc086287c97ebf650 + .quad 0xbe1cf0c4d3dc1bc7 + .quad 0xc086287ea6946958 + .quad 0xbe1cefb1e4625943 + .quad 0xc0862880b4b59010 + .quad 0xbe1cf143efdd1fd0 + .quad 0xc0862882c24faff8 + .quad 0xbe1cee9896d016da + .quad 0xc0862884cf630e38 + .quad 0xbe1cf2186072f2cc + .quad 0xc0862886dbefeff0 + .quad 0xbe1cef9217633d34 + .quad 0xc0862888e7f699e0 + .quad 0xbe1cf05603549486 + .quad 0xc086288af37750b0 + .quad 0xbe1cef50fff513d3 + .quad 0xc086288cfe7258c0 + .quad 0xbe1cf127713b32d0 + .quad 0xc086288f08e7f650 + .quad 0xbe1cf05015520f3d + .quad 0xc086289112d86d58 + .quad 0xbe1cf12eb458b26f + .quad 0xc08628931c4401a8 + .quad 0xbe1cf22eae2887ed + .quad 0xc0862895252af6e0 + .quad 0xbe1cefdd6656dd2d + .quad 0xc08628972d8d9058 + .quad 0xbe1cf1048ea4e646 + .quad 0xc0862899356c1150 + .quad 0xbe1ceec4501167e9 + .quad 0xc086289b3cc6bcb8 + .quad 0xbe1cf0ad52becc3f + .quad 0xc086289d439dd568 + .quad 0xbe1cf0daa4e00e35 + .quad 0xc086289f49f19df8 + .quad 0xbe1cf00b80de8d6a + .quad 0xc08628a14fc258c8 + .quad 0xbe1cf1bcf2ea8464 + .quad 0xc08628a355104818 + .quad 0xbe1cf0435e2782b0 + .quad 0xc08628a559dbade0 + .quad 0xbe1cf0e3e1a5f56c + .quad 0xc08628a75e24cbf8 + .quad 0xbe1cefed9d5a721d + .quad 0xc08628a961ebe3f8 + .quad 0xbe1cf0d2d74321e2 + .quad 0xc08628ab65313750 + .quad 0xbe1cf24200eb55e9 + .quad 0xc08628ad67f50740 + .quad 0xbe1cf23e9d7cf979 + .quad 0xc08628af6a3794d0 + .quad 0xbe1cf23a088f421c + .quad 0xc08628b16bf920e0 + .quad 0xbe1cef2c1de1ab32 + .quad 0xc08628b36d39ec08 + .quad 0xbe1cf1abc231f7b2 + .quad 0xc08628b56dfa36d0 + .quad 0xbe1cf2074d5ba303 + .quad 0xc08628b76e3a4180 + .quad 0xbe1cf05cd5eed880 + .rept 48 + .byte 0 + .endr + +/* Lookup table with 9-bit index for + -log(mRcp), where mRcp is mantissa of 1/x 9-bit accurate reciprocal: + */ +.if .-__svml_dlog_data != _Log_LA_table +.err +.endif + .quad 0x8000000000000000 + .quad 0xbf5ff802a9ab10e6 + .quad 0xbf6ff00aa2b10bc0 + .quad 0xbf77ee11ebd82e94 + .quad 0xbf7fe02a6b106789 + .quad 0xbf83e7295d25a7d9 + .quad 0xbf87dc475f810a77 + .quad 
0xbf8bcf712c74384c + .quad 0xbf8fc0a8b0fc03e4 + .quad 0xbf91d7f7eb9eebe7 + .quad 0xbf93cea44346a575 + .quad 0xbf95c45a51b8d389 + .quad 0xbf97b91b07d5b11b + .quad 0xbf99ace7551cc514 + .quad 0xbf9b9fc027af9198 + .quad 0xbf9d91a66c543cc4 + .quad 0xbf9f829b0e783300 + .quad 0xbfa0b94f7c196176 + .quad 0xbfa1b0d98923d980 + .quad 0xbfa2a7ec2214e873 + .quad 0xbfa39e87b9febd60 + .quad 0xbfa494acc34d911c + .quad 0xbfa58a5bafc8e4d5 + .quad 0xbfa67f94f094bd98 + .quad 0xbfa77458f632dcfc + .quad 0xbfa868a83083f6cf + .quad 0xbfa95c830ec8e3eb + .quad 0xbfaa4fe9ffa3d235 + .quad 0xbfab42dd711971bf + .quad 0xbfac355dd0921f2d + .quad 0xbfad276b8adb0b52 + .quad 0xbfae19070c276016 + .quad 0xbfaf0a30c01162a6 + .quad 0xbfaffae9119b9303 + .quad 0xbfb075983598e471 + .quad 0xbfb0ed839b5526fe + .quad 0xbfb16536eea37ae1 + .quad 0xbfb1dcb263db1944 + .quad 0xbfb253f62f0a1417 + .quad 0xbfb2cb0283f5de1f + .quad 0xbfb341d7961bd1d1 + .quad 0xbfb3b87598b1b6ee + .quad 0xbfb42edcbea646f0 + .quad 0xbfb4a50d3aa1b040 + .quad 0xbfb51b073f06183f + .quad 0xbfb590cafdf01c28 + .quad 0xbfb60658a93750c4 + .quad 0xbfb67bb0726ec0fc + .quad 0xbfb6f0d28ae56b4c + .quad 0xbfb765bf23a6be13 + .quad 0xbfb7da766d7b12cd + .quad 0xbfb84ef898e8282a + .quad 0xbfb8c345d6319b21 + .quad 0xbfb9375e55595ede + .quad 0xbfb9ab42462033ad + .quad 0xbfba1ef1d8061cd4 + .quad 0xbfba926d3a4ad563 + .quad 0xbfbb05b49bee43fe + .quad 0xbfbb78c82bb0eda1 + .quad 0xbfbbeba818146765 + .quad 0xbfbc5e548f5bc743 + .quad 0xbfbcd0cdbf8c13e1 + .quad 0xbfbd4313d66cb35d + .quad 0xbfbdb5270187d927 + .quad 0xbfbe27076e2af2e6 + .quad 0xbfbe98b549671467 + .quad 0xbfbf0a30c01162a6 + .quad 0xbfbf7b79fec37ddf + .quad 0xbfbfec9131dbeabb + .quad 0xbfc02ebb42bf3d4b + .quad 0xbfc0671512ca596e + .quad 0xbfc09f561ee719c3 + .quad 0xbfc0d77e7cd08e59 + .quad 0xbfc10f8e422539b1 + .quad 0xbfc14785846742ac + .quad 0xbfc17f6458fca611 + .quad 0xbfc1b72ad52f67a0 + .quad 0xbfc1eed90e2dc2c3 + .quad 0xbfc2266f190a5acb + .quad 0xbfc25ded0abc6ad2 + .quad 0xbfc29552f81ff523 + .quad 0xbfc2cca0f5f5f251 + .quad 0xbfc303d718e47fd3 + .quad 0xbfc33af575770e4f + .quad 0xbfc371fc201e8f74 + .quad 0xbfc3a8eb2d31a376 + .quad 0xbfc3dfc2b0ecc62a + .quad 0xbfc41682bf727bc0 + .quad 0xbfc44d2b6ccb7d1e + .quad 0xbfc483bccce6e3dd + .quad 0xbfc4ba36f39a55e5 + .quad 0xbfc4f099f4a230b2 + .quad 0xbfc526e5e3a1b438 + .quad 0xbfc55d1ad4232d6f + .quad 0xbfc59338d9982086 + .quad 0xbfc5c940075972b9 + .quad 0xbfc5ff3070a793d4 + .quad 0xbfc6350a28aaa758 + .quad 0xbfc66acd4272ad51 + .quad 0xbfc6a079d0f7aad2 + .quad 0xbfc6d60fe719d21d + .quad 0xbfc70b8f97a1aa75 + .quad 0xbfc740f8f54037a5 + .quad 0xbfc7764c128f2127 + .quad 0xbfc7ab890210d909 + .quad 0xbfc7e0afd630c274 + .quad 0xbfc815c0a14357eb + .quad 0xbfc84abb75865139 + .quad 0xbfc87fa06520c911 + .quad 0xbfc8b46f8223625b + .quad 0xbfc8e928de886d41 + .quad 0xbfc91dcc8c340bde + .quad 0xbfc9525a9cf456b4 + .quad 0xbfc986d3228180ca + .quad 0xbfc9bb362e7dfb83 + .quad 0xbfc9ef83d2769a34 + .quad 0xbfca23bc1fe2b563 + .quad 0xbfca57df28244dcd + .quad 0xbfca8becfc882f19 + .quad 0xbfcabfe5ae46124c + .quad 0xbfcaf3c94e80bff3 + .quad 0xbfcb2797ee46320c + .quad 0xbfcb5b519e8fb5a4 + .quad 0xbfcb8ef670420c3b + .quad 0xbfcbc286742d8cd6 + .quad 0xbfcbf601bb0e44e2 + .quad 0xbfcc2968558c18c1 + .quad 0xbfcc5cba543ae425 + .quad 0xbfcc8ff7c79a9a22 + .quad 0xbfccc320c0176502 + .quad 0xbfccf6354e09c5dc + .quad 0xbfcd293581b6b3e7 + .quad 0xbfcd5c216b4fbb91 + .quad 0xbfcd8ef91af31d5e + .quad 0xbfcdc1bca0abec7d + .quad 0xbfcdf46c0c722d2f + .quad 0xbfce27076e2af2e6 + .quad 0xbfce598ed5a87e2f + .quad 
0xbfce8c0252aa5a60 + .quad 0xbfcebe61f4dd7b0b + .quad 0xbfcef0adcbdc5936 + .quad 0xbfcf22e5e72f105d + .quad 0xbfcf550a564b7b37 + .quad 0xbfcf871b28955045 + .quad 0xbfcfb9186d5e3e2b + .quad 0xbfcfeb0233e607cc + .quad 0xbfd00e6c45ad501d + .quad 0xbfd0274dc16c232f + .quad 0xbfd0402594b4d041 + .quad 0xbfd058f3c703ebc6 + .quad 0xbfd071b85fcd590d + .quad 0xbfd08a73667c57af + .quad 0xbfd0a324e27390e3 + .quad 0xbfd0bbccdb0d24bd + .quad 0xbfd0d46b579ab74b + .quad 0xbfd0ed005f657da4 + .quad 0xbfd1058bf9ae4ad5 + .quad 0xbfd11e0e2dad9cb7 + .quad 0xbfd136870293a8b0 + .quad 0xbfd14ef67f88685a + .quad 0xbfd1675cababa60e + .quad 0xbfd17fb98e15095d + .quad 0xbfd1980d2dd4236f + .quad 0xbfd1b05791f07b49 + .quad 0xbfd1c898c16999fb + .quad 0xbfd1e0d0c33716be + .quad 0xbfd1f8ff9e48a2f3 + .quad 0xbfd211255986160c + .quad 0xbfd22941fbcf7966 + .quad 0xbfd241558bfd1404 + .quad 0xbfd2596010df763a + .quad 0xbfd27161913f853d + .quad 0xbfd2895a13de86a3 + .quad 0xbfd2a1499f762bc9 + .quad 0xbfd2b9303ab89d25 + .quad 0xbfd2d10dec508583 + .quad 0xbfd2e8e2bae11d31 + .quad 0xbfd300aead06350c + .quad 0xbfd31871c9544185 + .quad 0xbfd3302c16586588 + .quad 0xbfd347dd9a987d55 + .quad 0xbfd35f865c93293e + .quad 0xbfd3772662bfd85b + .quad 0xbfd38ebdb38ed321 + .quad 0xbfd3a64c556945ea + .quad 0xbfd3bdd24eb14b6a + .quad 0xbfd3d54fa5c1f710 + .quad 0xbfd3ecc460ef5f50 + .quad 0xbfd404308686a7e4 + .quad 0xbfd41b941cce0bee + .quad 0xbfd432ef2a04e814 + .quad 0xbfd44a41b463c47c + .quad 0xbfd4618bc21c5ec2 + .quad 0xbfd478cd5959b3d9 + .quad 0xbfd49006804009d1 + .quad 0xbfd4a7373cecf997 + .quad 0xbfd4be5f957778a1 + .quad 0xbfd4d57f8fefe27f + .quad 0xbfd4ec973260026a + .quad 0xbfd503a682cb1cb3 + .quad 0xbfd51aad872df82d + .quad 0xbfd531ac457ee77e + .quad 0xbfd548a2c3add263 + .quad 0xbfd55f9107a43ee2 + .quad 0xbfd5767717455a6c + .quad 0xbfd58d54f86e02f2 + .quad 0xbfd5a42ab0f4cfe2 + .quad 0xbfd5baf846aa1b19 + .quad 0xbfd5d1bdbf5809ca + .quad 0xbfd5e87b20c2954a + .quad 0xbfd5ff3070a793d4 + .quad 0xbfd615ddb4bec13c + .quad 0xbfd62c82f2b9c795 + .quad 0x3fd61965cdb02c1f + .quad 0x3fd602d08af091ec + .quad 0x3fd5ec433d5c35ae + .quad 0x3fd5d5bddf595f30 + .quad 0x3fd5bf406b543db2 + .quad 0x3fd5a8cadbbedfa1 + .quad 0x3fd5925d2b112a59 + .quad 0x3fd57bf753c8d1fb + .quad 0x3fd565995069514c + .quad 0x3fd54f431b7be1a9 + .quad 0x3fd538f4af8f72fe + .quad 0x3fd522ae0738a3d8 + .quad 0x3fd50c6f1d11b97c + .quad 0x3fd4f637ebba9810 + .quad 0x3fd4e0086dd8baca + .quad 0x3fd4c9e09e172c3c + .quad 0x3fd4b3c077267e9a + .quad 0x3fd49da7f3bcc41f + .quad 0x3fd487970e958770 + .quad 0x3fd4718dc271c41b + .quad 0x3fd45b8c0a17df13 + .quad 0x3fd44591e0539f49 + .quad 0x3fd42f9f3ff62642 + .quad 0x3fd419b423d5e8c7 + .quad 0x3fd403d086cea79c + .quad 0x3fd3edf463c1683e + .quad 0x3fd3d81fb5946dba + .quad 0x3fd3c25277333184 + .quad 0x3fd3ac8ca38e5c5f + .quad 0x3fd396ce359bbf54 + .quad 0x3fd3811728564cb2 + .quad 0x3fd36b6776be1117 + .quad 0x3fd355bf1bd82c8b + .quad 0x3fd3401e12aecba1 + .quad 0x3fd32a84565120a8 + .quad 0x3fd314f1e1d35ce4 + .quad 0x3fd2ff66b04ea9d4 + .quad 0x3fd2e9e2bce12286 + .quad 0x3fd2d46602adccee + .quad 0x3fd2bef07cdc9354 + .quad 0x3fd2a982269a3dbf + .quad 0x3fd2941afb186b7c + .quad 0x3fd27ebaf58d8c9d + .quad 0x3fd269621134db92 + .quad 0x3fd25410494e56c7 + .quad 0x3fd23ec5991eba49 + .quad 0x3fd22981fbef797b + .quad 0x3fd214456d0eb8d4 + .quad 0x3fd1ff0fe7cf47a7 + .quad 0x3fd1e9e1678899f4 + .quad 0x3fd1d4b9e796c245 + .quad 0x3fd1bf99635a6b95 + .quad 0x3fd1aa7fd638d33f + .quad 0x3fd1956d3b9bc2fa + .quad 0x3fd180618ef18adf + .quad 0x3fd16b5ccbacfb73 + .quad 
0x3fd1565eed455fc3 + .quad 0x3fd14167ef367783 + .quad 0x3fd12c77cd00713b + .quad 0x3fd1178e8227e47c + .quad 0x3fd102ac0a35cc1c + .quad 0x3fd0edd060b78081 + .quad 0x3fd0d8fb813eb1ef + .quad 0x3fd0c42d676162e3 + .quad 0x3fd0af660eb9e279 + .quad 0x3fd09aa572e6c6d4 + .quad 0x3fd085eb8f8ae797 + .quad 0x3fd07138604d5862 + .quad 0x3fd05c8be0d9635a + .quad 0x3fd047e60cde83b8 + .quad 0x3fd03346e0106062 + .quad 0x3fd01eae5626c691 + .quad 0x3fd00a1c6adda473 + .quad 0x3fcfeb2233ea07cd + .quad 0x3fcfc218be620a5e + .quad 0x3fcf991c6cb3b379 + .quad 0x3fcf702d36777df0 + .quad 0x3fcf474b134df229 + .quad 0x3fcf1e75fadf9bde + .quad 0x3fcef5ade4dcffe6 + .quad 0x3fceccf2c8fe920a + .quad 0x3fcea4449f04aaf5 + .quad 0x3fce7ba35eb77e2a + .quad 0x3fce530effe71012 + .quad 0x3fce2a877a6b2c12 + .quad 0x3fce020cc6235ab5 + .quad 0x3fcdd99edaf6d7e9 + .quad 0x3fcdb13db0d48940 + .quad 0x3fcd88e93fb2f450 + .quad 0x3fcd60a17f903515 + .quad 0x3fcd38666871f465 + .quad 0x3fcd1037f2655e7b + .quad 0x3fcce816157f1988 + .quad 0x3fccc000c9db3c52 + .quad 0x3fcc97f8079d44ec + .quad 0x3fcc6ffbc6f00f71 + .quad 0x3fcc480c0005ccd1 + .quad 0x3fcc2028ab17f9b4 + .quad 0x3fcbf851c067555f + .quad 0x3fcbd087383bd8ad + .quad 0x3fcba8c90ae4ad19 + .quad 0x3fcb811730b823d2 + .quad 0x3fcb5971a213acdb + .quad 0x3fcb31d8575bce3d + .quad 0x3fcb0a4b48fc1b46 + .quad 0x3fcae2ca6f672bd4 + .quad 0x3fcabb55c31693ad + .quad 0x3fca93ed3c8ad9e3 + .quad 0x3fca6c90d44b704e + .quad 0x3fca454082e6ab05 + .quad 0x3fca1dfc40f1b7f1 + .quad 0x3fc9f6c407089664 + .quad 0x3fc9cf97cdce0ec3 + .quad 0x3fc9a8778debaa38 + .quad 0x3fc981634011aa75 + .quad 0x3fc95a5adcf7017f + .quad 0x3fc9335e5d594989 + .quad 0x3fc90c6db9fcbcd9 + .quad 0x3fc8e588ebac2dbf + .quad 0x3fc8beafeb38fe8c + .quad 0x3fc897e2b17b19a5 + .quad 0x3fc871213750e994 + .quad 0x3fc84a6b759f512f + .quad 0x3fc823c16551a3c2 + .quad 0x3fc7fd22ff599d4f + .quad 0x3fc7d6903caf5ad0 + .quad 0x3fc7b0091651528c + .quad 0x3fc7898d85444c73 + .quad 0x3fc7631d82935a86 + .quad 0x3fc73cb9074fd14d + .quad 0x3fc716600c914054 + .quad 0x3fc6f0128b756abc + .quad 0x3fc6c9d07d203fc7 + .quad 0x3fc6a399dabbd383 + .quad 0x3fc67d6e9d785771 + .quad 0x3fc6574ebe8c133a + .quad 0x3fc6313a37335d76 + .quad 0x3fc60b3100b09476 + .quad 0x3fc5e533144c1719 + .quad 0x3fc5bf406b543db2 + .quad 0x3fc59958ff1d52f1 + .quad 0x3fc5737cc9018cdd + .quad 0x3fc54dabc26105d2 + .quad 0x3fc527e5e4a1b58d + .quad 0x3fc5022b292f6a45 + .quad 0x3fc4dc7b897bc1c8 + .quad 0x3fc4b6d6fefe22a4 + .quad 0x3fc4913d8333b561 + .quad 0x3fc46baf0f9f5db7 + .quad 0x3fc4462b9dc9b3dc + .quad 0x3fc420b32740fdd4 + .quad 0x3fc3fb45a59928cc + .quad 0x3fc3d5e3126bc27f + .quad 0x3fc3b08b6757f2a9 + .quad 0x3fc38b3e9e027479 + .quad 0x3fc365fcb0159016 + .quad 0x3fc340c59741142e + .quad 0x3fc31b994d3a4f85 + .quad 0x3fc2f677cbbc0a96 + .quad 0x3fc2d1610c86813a + .quad 0x3fc2ac55095f5c59 + .quad 0x3fc28753bc11aba5 + .quad 0x3fc2625d1e6ddf57 + .quad 0x3fc23d712a49c202 + .quad 0x3fc2188fd9807263 + .quad 0x3fc1f3b925f25d41 + .quad 0x3fc1ceed09853752 + .quad 0x3fc1aa2b7e23f72a + .quad 0x3fc185747dbecf34 + .quad 0x3fc160c8024b27b1 + .quad 0x3fc13c2605c398c3 + .quad 0x3fc1178e8227e47c + .quad 0x3fc0f301717cf0fb + .quad 0x3fc0ce7ecdccc28d + .quad 0x3fc0aa06912675d5 + .quad 0x3fc08598b59e3a07 + .quad 0x3fc06135354d4b18 + .quad 0x3fc03cdc0a51ec0d + .quad 0x3fc0188d2ecf6140 + .quad 0x3fbfe89139dbd566 + .quad 0x3fbfa01c9db57ce2 + .quad 0x3fbf57bc7d9005db + .quad 0x3fbf0f70cdd992e3 + .quad 0x3fbec739830a1120 + .quad 0x3fbe7f1691a32d3e + .quad 0x3fbe3707ee30487b + .quad 0x3fbdef0d8d466db9 + .quad 
0x3fbda727638446a2 + .quad 0x3fbd5f55659210e2 + .quad 0x3fbd179788219364 + .quad 0x3fbccfedbfee13a8 + .quad 0x3fbc885801bc4b23 + .quad 0x3fbc40d6425a5cb1 + .quad 0x3fbbf968769fca11 + .quad 0x3fbbb20e936d6974 + .quad 0x3fbb6ac88dad5b1c + .quad 0x3fbb23965a52ff00 + .quad 0x3fbadc77ee5aea8c + .quad 0x3fba956d3ecade63 + .quad 0x3fba4e7640b1bc38 + .quad 0x3fba0792e9277cac + .quad 0x3fb9c0c32d4d2548 + .quad 0x3fb97a07024cbe74 + .quad 0x3fb9335e5d594989 + .quad 0x3fb8ecc933aeb6e8 + .quad 0x3fb8a6477a91dc29 + .quad 0x3fb85fd927506a48 + .quad 0x3fb8197e2f40e3f0 + .quad 0x3fb7d33687c293c9 + .quad 0x3fb78d02263d82d3 + .quad 0x3fb746e100226ed9 + .quad 0x3fb700d30aeac0e1 + .quad 0x3fb6bad83c1883b6 + .quad 0x3fb674f089365a7a + .quad 0x3fb62f1be7d77743 + .quad 0x3fb5e95a4d9791cb + .quad 0x3fb5a3abb01ade25 + .quad 0x3fb55e10050e0384 + .quad 0x3fb518874226130a + .quad 0x3fb4d3115d207eac + .quad 0x3fb48dae4bc31018 + .quad 0x3fb4485e03dbdfad + .quad 0x3fb403207b414b7f + .quad 0x3fb3bdf5a7d1ee64 + .quad 0x3fb378dd7f749714 + .quad 0x3fb333d7f8183f4b + .quad 0x3fb2eee507b40301 + .quad 0x3fb2aa04a44717a5 + .quad 0x3fb26536c3d8c369 + .quad 0x3fb2207b5c78549e + .quad 0x3fb1dbd2643d190b + .quad 0x3fb1973bd1465567 + .quad 0x3fb152b799bb3cc9 + .quad 0x3fb10e45b3cae831 + .quad 0x3fb0c9e615ac4e17 + .quad 0x3fb08598b59e3a07 + .quad 0x3fb0415d89e74444 + .quad 0x3faffa6911ab9301 + .quad 0x3faf723b517fc523 + .quad 0x3faeea31c006b87c + .quad 0x3fae624c4a0b5e1b + .quad 0x3fadda8adc67ee4e + .quad 0x3fad52ed6405d86f + .quad 0x3faccb73cdddb2cc + .quad 0x3fac441e06f72a9e + .quad 0x3fabbcebfc68f420 + .quad 0x3fab35dd9b58baad + .quad 0x3faaaef2d0fb10fc + .quad 0x3faa282b8a936171 + .quad 0x3fa9a187b573de7c + .quad 0x3fa91b073efd7314 + .quad 0x3fa894aa149fb343 + .quad 0x3fa80e7023d8ccc4 + .quad 0x3fa788595a3577ba + .quad 0x3fa70265a550e777 + .quad 0x3fa67c94f2d4bb58 + .quad 0x3fa5f6e73078efb8 + .quad 0x3fa5715c4c03ceef + .quad 0x3fa4ebf43349e26f + .quad 0x3fa466aed42de3ea + .quad 0x3fa3e18c1ca0ae92 + .quad 0x3fa35c8bfaa1306b + .quad 0x3fa2d7ae5c3c5bae + .quad 0x3fa252f32f8d183f + .quad 0x3fa1ce5a62bc353a + .quad 0x3fa149e3e4005a8d + .quad 0x3fa0c58fa19dfaaa + .quad 0x3fa0415d89e74444 + .quad 0x3f9f7a9b16782856 + .quad 0x3f9e72bf2813ce51 + .quad 0x3f9d6b2725979802 + .quad 0x3f9c63d2ec14aaf2 + .quad 0x3f9b5cc258b718e6 + .quad 0x3f9a55f548c5c43f + .quad 0x3f994f6b99a24475 + .quad 0x3f98492528c8cabf + .quad 0x3f974321d3d006d3 + .quad 0x3f963d6178690bd6 + .quad 0x3f9537e3f45f3565 + .quad 0x3f9432a925980cc1 + .quad 0x3f932db0ea132e22 + .quad 0x3f9228fb1fea2e28 + .quad 0x3f912487a5507f70 + .quad 0x3f90205658935847 + .quad 0x3f8e38ce3033310c + .quad 0x3f8c317384c75f06 + .quad 0x3f8a2a9c6c170462 + .quad 0x3f882448a388a2aa + .quad 0x3f861e77e8b53fc6 + .quad 0x3f841929f96832f0 + .quad 0x3f82145e939ef1e9 + .quad 0x3f8010157588de71 + .quad 0x3f7c189cbb0e27fb + .quad 0x3f78121214586b54 + .quad 0x3f740c8a747878e2 + .quad 0x3f70080559588b35 + .quad 0x3f680904828985c0 + .quad 0x3f60040155d5889e + .quad 0x3f50020055655889 + .quad 0x0000000000000000 + .rept 56 + .byte 0 + .endr + +/* Polynomial coefficients: */ +double_vector _poly_coeff_1 0x3fc9999cacdb4d0a +double_vector _poly_coeff_2 0xbfd0000148058ee1 +double_vector _poly_coeff_3 0x3fd55555555543c5 +double_vector _poly_coeff_4 0xbfdffffffffff81f + +/* Exponent mask */ +double_vector _ExpMask 0x000fffffffffffff + +/* 2^10 */ +double_vector _Two10 0x3f50000000000000 + +/* Minimum normal number */ +double_vector _MinNorm 0x0010000000000000 + +/* Maximum normal number */ +double_vector _MaxNorm 
0x7fefffffffffffff + +/* Half of mantissa mask */ +double_vector _HalfMask 0xfffffffffc000000 + +/* 1.0 */ +double_vector _One 0x3ff0000000000000 + +/* log(2) high part */ +double_vector _L2H 0x3fe62e42fefa0000 + +/* log(2) low part */ +double_vector _L2L 0x3d7cf79abc9e0000 + +/* Work range threshold = 724 */ +double_vector _Threshold 0x4086a00000000000 + +/* Bias */ +double_vector _Bias 0x408ff80000000000 + +/* Bias (-1 bit) */ +double_vector _Bias1 0x408ff00000000000 + +/* log(2) */ +double_vector _L2 0x3fe62e42fefa39ef + +/* General purpose constants: + DP infinities, +/- */ +.if .-__svml_dlog_data != _dInfs +.err +.endif + .quad 0x7ff0000000000000 + .quad 0xfff0000000000000 + .rept 48 + .byte 0 + .endr + +/* DP 1.0, +/- */ +.if .-__svml_dlog_data != _dOnes +.err +.endif + .quad 0x3ff0000000000000 + .quad 0xbff0000000000000 + .rept 48 + .byte 0 + .endr + +/* DP 0.0, +/- */ +.if .-__svml_dlog_data != _dZeros +.err +.endif + .quad 0x0000000000000000 + .quad 0x8000000000000000 + .rept 48 + .byte 0 + .endr + .type __svml_dlog_data,@object + .size __svml_dlog_data,.-__svml_dlog_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.h new file mode 100644 index 0000000000..84d65db95d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.h @@ -0,0 +1,54 @@ +/* Offsets for data table for function log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef D_LOG_DATA_H +#define D_LOG_DATA_H + +#define _LogRcp_lookup -4218816 +#define _Log_HA_table 0 +#define _Log_LA_table 8256 +#define _poly_coeff_1 12416 +#define _poly_coeff_2 12480 +#define _poly_coeff_3 12544 +#define _poly_coeff_4 12608 +#define _ExpMask 12672 +#define _Two10 12736 +#define _MinNorm 12800 +#define _MaxNorm 12864 +#define _HalfMask 12928 +#define _One 12992 +#define _L2H 13056 +#define _L2L 13120 +#define _Threshold 13184 +#define _Bias 13248 +#define _Bias1 13312 +#define _L2 13376 +#define _dInfs 13440 +#define _dOnes 13504 +#define _dZeros 13568 + +.macro double_vector offset value +.if .-__svml_dlog_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow2_core.S new file mode 100644 index 0000000000..ccdb592135 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow2_core.S @@ -0,0 +1,29 @@ +/* Function pow vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2vv_pow) +WRAPPER_IMPL_SSE2_ff __pow_finite +END (_ZGVbN2vv_pow) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2vv_pow) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core.S new file mode 100644 index 0000000000..30ae0f5a2f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core.S @@ -0,0 +1,29 @@ +/* Function pow vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4vv_pow) +WRAPPER_IMPL_AVX_ff _ZGVbN2vv_pow +END (_ZGVdN4vv_pow) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4vv_pow) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S new file mode 100644 index 0000000000..bcea225c4d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S @@ -0,0 +1,25 @@ +/* Function pow vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4vv_pow) +WRAPPER_IMPL_AVX_ff _ZGVbN2vv_pow +END (_ZGVcN4vv_pow) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow8_core.S new file mode 100644 index 0000000000..06b3a81124 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow8_core.S @@ -0,0 +1,25 @@ +/* Function pow vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8vv_pow) +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +END (_ZGVeN8vv_pow) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.S new file mode 100644 index 0000000000..2f05f7becb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.S @@ -0,0 +1,4863 @@ +/* Data for function pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_pow_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function pow. + The table holds polynomial, reduction, and lookup coefficients, + together with other constants obtained through analytical + derivation and experimental tuning.
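+ + Broadly (a sketch of the standard decomposition, not of the exact + kernels): pow is computed as pow(x, y) = 2^(y * log2(x)), so the data + below serves two subproblems, an extra-precise log2(x) and a fast 2^t. + The _hsw_log2_table that opens the object stores log2 lookup values as + (high, low) pairs; the HSW tag suggests it feeds the Haswell (FMA/AVX2) + code path selected at run time.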
*/ + + .globl __svml_dpow_data +__svml_dpow_data: + +/* Lookup log(2) table (for HSW): */ +.if .-__svml_dpow_data != _hsw_log2_table +.err +.endif + .quad 0xc08ff00000000000 + .quad 0x0000000000000000 + .quad 0xc08ff005c3e0ffc2 + .quad 0xbd33ab2631d4676d + .quad 0xc08ff00b84e236bc + .quad 0xbd4563ba56cde925 + .quad 0xc08ff01143068126 + .quad 0x3d11790209e88471 + .quad 0xc08ff016fe50b6ee + .quad 0xbd408517f8e37b00 + .quad 0xc08ff01cb6c3abd0 + .quad 0xbd44558b51cada94 + .quad 0xc08ff0226c622f52 + .quad 0xbd3ec312ed069b24 + .quad 0xc08ff0281f2f0cd0 + .quad 0xbd374a4cb0be9e8a + .quad 0xc08ff02dcf2d0b86 + .quad 0x3d26eb3ac8ec0ef7 + .quad 0xc08ff0337c5eee92 + .quad 0xbd45984a60ff3d2f + .quad 0xc08ff03926c7750a + .quad 0xbd0f0cccdd01ee2f + .quad 0xc08ff03ece6959f0 + .quad 0xbd3a5671e1bd4ae8 + .quad 0xc08ff0447347544c + .quad 0xbd3a0976c0a2827d + .quad 0xc08ff04a1564172a + .quad 0x3d1e14ebaf30c95e + .quad 0xc08ff04fb4c251a0 + .quad 0xbd46898809d2dc10 + .quad 0xc08ff0555164aee2 + .quad 0xbd4355e6ecb8e0f1 + .quad 0xc08ff05aeb4dd63c + .quad 0x3cf3c6764fc87b4a + .quad 0xc08ff06082806b1c + .quad 0xbd4532c412ba94db + .quad 0xc08ff06616ff0d24 + .quad 0xbd4465182838ed44 + .quad 0xc08ff06ba8cc5824 + .quad 0xbd47dc6d46384b31 + .quad 0xc08ff07137eae42a + .quad 0xbd35af7a7c7c34f3 + .quad 0xc08ff076c45d4584 + .quad 0x3d18a0e14f76d994 + .quad 0xc08ff07c4e260cc8 + .quad 0xbd44e7e87341aeee + .quad 0xc08ff081d547c6e4 + .quad 0xbd153121e9af5428 + .quad 0xc08ff08759c4fd14 + .quad 0xbd3f9ab3cf74baba + .quad 0xc08ff08cdba034fa + .quad 0xbd3f09941811b2ee + .quad 0xc08ff0925adbf09a + .quad 0xbd3a3c89a2cf3516 + .quad 0xc08ff097d77aae66 + .quad 0x3d291b415eeb24ed + .quad 0xc08ff09d517ee940 + .quad 0x3d2c7a4ff65ddbc9 + .quad 0xc08ff0a2c8eb1886 + .quad 0xbd385a047f97bb3e + .quad 0xc08ff0a83dc1b01a + .quad 0x3d1124ac34b21259 + .quad 0xc08ff0adb005205e + .quad 0xbd34f286d207e2c8 + .quad 0xc08ff0b31fb7d648 + .quad 0xbd33167ccc538261 + .quad 0xc08ff0b88cdc3b5e + .quad 0xbd4542fe4ce30d63 + .quad 0xc08ff0bdf774b5c4 + .quad 0xbd41409e20d7191b + .quad 0xc08ff0c35f83a83c + .quad 0xbd40638b5ff73edf + .quad 0xc08ff0c8c50b7232 + .quad 0x3d294aa31b9b6d65 + .quad 0xc08ff0ce280e6fba + .quad 0xbd38723279ebfab6 + .quad 0xc08ff0d3888ef9a4 + .quad 0xbd124fad116078ef + .quad 0xc08ff0d8e68f6572 + .quad 0xbd437350d69ea580 + .quad 0xc08ff0de4212056c + .quad 0xbd45dd31d962d373 + .quad 0xc08ff0e39b19289e + .quad 0x3d058b34834a501e + .quad 0xc08ff0e8f1a71adc + .quad 0xbd06d26859c7991e + .quad 0xc08ff0ee45be24d0 + .quad 0xbd3ddb7886f88587 + .quad 0xc08ff0f397608bfc + .quad 0xbd42d90e5edaecee + .quad 0xc08ff0f8e69092be + .quad 0xbd40c5eacb577b4a + .quad 0xc08ff0fe33507858 + .quad 0xbce49209a68c72a1 + .quad 0xc08ff1037da278f2 + .quad 0xbd30e0f9c896007d + .quad 0xc08ff108c588cda8 + .quad 0x3d2871a7610e40bd + .quad 0xc08ff10e0b05ac84 + .quad 0xbd31da156756faad + .quad 0xc08ff1134e1b4890 + .quad 0xbd28b7fcd690403e + .quad 0xc08ff1188ecbd1d0 + .quad 0xbd46be4a29c44115 + .quad 0xc08ff11dcd197552 + .quad 0xbd36f6bd48a860f0 + .quad 0xc08ff12309065d28 + .quad 0xbd47913e788c5887 + .quad 0xc08ff1284294b07a + .quad 0xbd28fe35da2ab291 + .quad 0xc08ff12d79c6937e + .quad 0xbd3fb9b1aaf54bcc + .quad 0xc08ff132ae9e278a + .quad 0xbd3c343ea3e580eb + .quad 0xc08ff137e11d8b10 + .quad 0xbd3f1140264356b8 + .quad 0xc08ff13d1146d9a8 + .quad 0xbd34c7e0166e1f56 + .quad 0xc08ff1423f1c2c12 + .quad 0xbd3d449e80431d92 + .quad 0xc08ff1476a9f983e + .quad 0xbd474d3138e94164 + .quad 0xc08ff14c93d33152 + .quad 0x3d2370693afbcdb1 + .quad 0xc08ff151bab907a6 + .quad 0x3d1badba7fbb3d20 + 
.quad 0xc08ff156df5328d6 + .quad 0x3d2cea9347cb6655 + .quad 0xc08ff15c01a39fbc + .quad 0xbd46879fa00b120a + .quad 0xc08ff16121ac7480 + .quad 0xbd43cf0ff16ff990 + .quad 0xc08ff1663f6fac90 + .quad 0xbd43167ccc538261 + .quad 0xc08ff16b5aef4aae + .quad 0xbd2f7081b8e33aad + .quad 0xc08ff170742d4ef0 + .quad 0xbd13f94e00e7d6bc + .quad 0xc08ff1758b2bb6c8 + .quad 0x3d22280434bda911 + .quad 0xc08ff17a9fec7d06 + .quad 0x3d1108740d92f890 + .quad 0xc08ff17fb27199de + .quad 0xbd416d18135d3266 + .quad 0xc08ff184c2bd02f0 + .quad 0xbd1d97ee9124773b + .quad 0xc08ff189d0d0ab42 + .quad 0xbd40ccd0edd00e4c + .quad 0xc08ff18edcae8352 + .quad 0xbd36d76b9a843329 + .quad 0xc08ff193e6587910 + .quad 0xbd210f7ac89c6f2d + .quad 0xc08ff198edd077e6 + .quad 0xbd40df02face8ca9 + .quad 0xc08ff19df31868c0 + .quad 0xbd41d4cc2f68b868 + .quad 0xc08ff1a2f632320c + .quad 0x3d2e54d71deb636a + .quad 0xc08ff1a7f71fb7ba + .quad 0xbd373af6b5487f35 + .quad 0xc08ff1acf5e2db4e + .quad 0xbd3927dfc23d9780 + .quad 0xc08ff1b1f27d7bd8 + .quad 0x3d2601ccfac2b557 + .quad 0xc08ff1b6ecf175f8 + .quad 0xbd45e96bed8cce30 + .quad 0xc08ff1bbe540a3f0 + .quad 0xbd1b76a46f31880a + .quad 0xc08ff1c0db6cdd94 + .quad 0xbd3bdc81c4db3134 + .quad 0xc08ff1c5cf77f860 + .quad 0xbd304cc6600a133e + .quad 0xc08ff1cac163c770 + .quad 0xbd3b912d8994b162 + .quad 0xc08ff1cfb1321b8c + .quad 0xbd20009770ea1465 + .quad 0xc08ff1d49ee4c326 + .quad 0x3d2a40dc2d2a6bf7 + .quad 0xc08ff1d98a7d8a60 + .quad 0xbd269affffe47644 + .quad 0xc08ff1de73fe3b14 + .quad 0xbd301dc37c84e79a + .quad 0xc08ff1e35b689cd2 + .quad 0xbd2953e61f15bd9b + .quad 0xc08ff1e840be74e6 + .quad 0xbd34998f93e7aa3c + .quad 0xc08ff1ed2401865e + .quad 0x3cf5c14e55f57802 + .quad 0xc08ff1f205339208 + .quad 0xbd3e4e8eea54ce63 + .quad 0xc08ff1f6e4565680 + .quad 0x3d0aaa72ba2c6ba2 + .quad 0xc08ff1fbc16b9026 + .quad 0xbd30144751b3314f + .quad 0xc08ff2009c74f930 + .quad 0x3d2a15a5b343a140 + .quad 0xc08ff205757449a0 + .quad 0xbd398eec5e85b29f + .quad 0xc08ff20a4c6b3756 + .quad 0xbd1b361c7dddadb6 + .quad 0xc08ff20f215b7606 + .quad 0xbcc2de0634d33aa9 + .quad 0xc08ff213f446b744 + .quad 0xbce024b5b4e89254 + .quad 0xc08ff218c52eaa84 + .quad 0xbd451d49f63f4830 + .quad 0xc08ff21d9414fd24 + .quad 0x3d1f4c2417f39394 + .quad 0xc08ff22260fb5a60 + .quad 0xbd46eb9612e0b4f3 + .quad 0xc08ff2272be36b6c + .quad 0xbd1a5bd9bcda22fd + .quad 0xc08ff22bf4ced760 + .quad 0xbd41feb2fc708a78 + .quad 0xc08ff230bbbf4350 + .quad 0x3d13045428f88499 + .quad 0xc08ff23580b6523e + .quad 0xbcfc14a31ce1b7e3 + .quad 0xc08ff23a43b5a52a + .quad 0xbd38c9a2f2dbcaf9 + .quad 0xc08ff23f04bedb12 + .quad 0x3d1ecd417972c083 + .quad 0xc08ff243c3d390ee + .quad 0xbd38e36471414f76 + .quad 0xc08ff24880f561c0 + .quad 0xbd3ce60916e52e91 + .quad 0xc08ff24d3c25e68e + .quad 0x3d1d406db502402d + .quad 0xc08ff251f566b664 + .quad 0xbd3a0d8c0e85a909 + .quad 0xc08ff256acb96662 + .quad 0xbd2dafbfd96d5335 + .quad 0xc08ff25b621f89b2 + .quad 0xbd455ede26f47b19 + .quad 0xc08ff260159ab196 + .quad 0xbd461f2e47488cf1 + .quad 0xc08ff264c72c6d64 + .quad 0xbd406b35c7c781db + .quad 0xc08ff26976d64a8c + .quad 0xbd20c369fc5a3d9b + .quad 0xc08ff26e2499d49a + .quad 0x3d20993376649b50 + .quad 0xc08ff272d078953a + .quad 0x3d1664deafdbfed5 + .quad 0xc08ff2777a74143c + .quad 0x3d282b53e791792d + .quad 0xc08ff27c228dd794 + .quad 0x3ccc79237996a42b + .quad 0xc08ff280c8c76360 + .quad 0xbd3125d6cbcd1095 + .quad 0xc08ff2856d2239ea + .quad 0xbd3194cfcc6c23cf + .quad 0xc08ff28a0f9fdbaa + .quad 0x3cee35952fb0019c + .quad 0xc08ff28eb041c748 + .quad 0xbd2286fbc7f749ff + .quad 0xc08ff2934f0979a2 + .quad 
0xbd4715fc9257edff + .quad 0xc08ff297ebf86dd0 + .quad 0xbd35dcccaf649933 + .quad 0xc08ff29c87101d1e + .quad 0xbd46d3f77ae3858b + .quad 0xc08ff2a12051ff1c + .quad 0xbd0432648cfc8738 + .quad 0xc08ff2a5b7bf8992 + .quad 0xbd3acdf73d83987f + .quad 0xc08ff2aa4d5a3092 + .quad 0xbd2e6c522ceda3fb + .quad 0xc08ff2aee123666e + .quad 0xbd4195620f0359d8 + .quad 0xc08ff2b3731c9bc4 + .quad 0xbd3c70f15d3ebabd + .quad 0xc08ff2b803473f7a + .quad 0xbd3a1e7e802c4828 + .quad 0xc08ff2bc91a4bec4 + .quad 0xbd4572ca23a96c48 + .quad 0xc08ff2c11e368528 + .quad 0xbd415b2de01cea41 + .quad 0xc08ff2c5a8fdfc7c + .quad 0xbd47dc11ebf92a98 + .quad 0xc08ff2ca31fc8cee + .quad 0xbd474dca44f1db91 + .quad 0xc08ff2ceb9339d04 + .quad 0x3cfb88755d6ca189 + .quad 0xc08ff2d33ea4919a + .quad 0xbd32e1a3152150d3 + .quad 0xc08ff2d7c250cdf0 + .quad 0xbd206adfcaa4bcf5 + .quad 0xc08ff2dc4439b3a2 + .quad 0x3d290d43956fa5d8 + .quad 0xc08ff2e0c460a2ae + .quad 0x3d27158a37417c3a + .quad 0xc08ff2e542c6f978 + .quad 0xbd1829434d994a2a + .quad 0xc08ff2e9bf6e14cc + .quad 0xbd2c3e1e30d370ea + .quad 0xc08ff2ee3a574fde + .quad 0xbd4677c8dfd9aa24 + .quad 0xc08ff2f2b3840452 + .quad 0xbd2788eba5c173ee + .quad 0xc08ff2f72af58a34 + .quad 0xbd4588aec6dfa7dc + .quad 0xc08ff2fba0ad3808 + .quad 0xbd47fe42f19c5879 + .quad 0xc08ff30014ac62c4 + .quad 0x3d2d5e6a8a4fb059 + .quad 0xc08ff30486f45dce + .quad 0xbd0edb9d09608783 + .quad 0xc08ff308f7867b0c + .quad 0xbd18dc7c094eee51 + .quad 0xc08ff30d66640ada + .quad 0xbd46028f37225746 + .quad 0xc08ff311d38e5c16 + .quad 0xbd212d25b3252647 + .quad 0xc08ff3163f06bc16 + .quad 0xbd3906944ba567f4 + .quad 0xc08ff31aa8ce76b8 + .quad 0xbd2b8d59e8492d6e + .quad 0xc08ff31f10e6d65a + .quad 0xbd339eec34ce3ce3 + .quad 0xc08ff323775123e2 + .quad 0xbd3c22d2cad415ae + .quad 0xc08ff327dc0ea6be + .quad 0xbd42ce2af5839ab8 + .quad 0xc08ff32c3f20a4e8 + .quad 0xbd03719eb3af5b8d + .quad 0xc08ff330a08862e2 + .quad 0xbd3feed12980ee19 + .quad 0xc08ff335004723c4 + .quad 0xbd2979a5db68721d + .quad 0xc08ff3395e5e2932 + .quad 0x3cf7159b944f7fd7 + .quad 0xc08ff33dbaceb364 + .quad 0xbd377e236c73e71b + .quad 0xc08ff342159a012a + .quad 0xbd4568bb43ac99bb + .quad 0xc08ff3466ec14fec + .quad 0xbcf4275f1035e5e8 + .quad 0xc08ff34ac645dba6 + .quad 0xbd3cc58a505d117a + .quad 0xc08ff34f1c28def8 + .quad 0x3d10bad7dfa568f7 + .quad 0xc08ff353706b9318 + .quad 0xbd3c27e675df639d + .quad 0xc08ff357c30f2fe4 + .quad 0x3d06e3cb71b554e7 + .quad 0xc08ff35c1414ebd4 + .quad 0xbd40c353cb7112a5 + .quad 0xc08ff360637dfc0c + .quad 0xbd30d199805b0aec + .quad 0xc08ff364b14b9450 + .quad 0xbd381e2a51761f86 + .quad 0xc08ff368fd7ee710 + .quad 0xbd250520a377c7ec + .quad 0xc08ff36d48192564 + .quad 0xbcef941453836236 + .quad 0xc08ff371911b7f10 + .quad 0xbd39e65cd77582e2 + .quad 0xc08ff375d887228a + .quad 0x3d201640f615fa5c + .quad 0xc08ff37a1e5d3cf2 + .quad 0xbce855a216719009 + .quad 0xc08ff37e629efa1e + .quad 0xbd3ae66b65d78df9 + .quad 0xc08ff382a54d8498 + .quad 0xbd45cb804b949696 + .quad 0xc08ff386e66a05a0 + .quad 0xbd33de15e265b5d9 + .quad 0xc08ff38b25f5a52a + .quad 0xbd46acfcfdca95de + .quad 0xc08ff38f63f189ea + .quad 0xbd1a3f6c066ebdd4 + .quad 0xc08ff393a05ed948 + .quad 0xbd3ecf4dff1e8ea2 + .quad 0xc08ff397db3eb770 + .quad 0xbd40d40bb2010158 + .quad 0xc08ff39c1492474a + .quad 0xbd40f992ba145dcf + .quad 0xc08ff3a04c5aaa80 + .quad 0xbd346fab3fa1a144 + .quad 0xc08ff3a48299017e + .quad 0xbd23ea90adf6a54a + .quad 0xc08ff3a8b74e6b74 + .quad 0xbd449e1389f86468 + .quad 0xc08ff3acea7c065c + .quad 0xbd441dfc7d7c3321 + .quad 0xc08ff3b11c22eef6 + .quad 0xbd148ad9b560f3b7 + .quad 
0xc08ff3b54c4440ca + .quad 0x3cf1bfb62d6a3aa8 + .quad 0xc08ff3b97ae1162e + .quad 0xbd2ac444ea257ffa + .quad 0xc08ff3bda7fa8846 + .quad 0xbd39313aec658458 + .quad 0xc08ff3c1d391af06 + .quad 0x3d2a140de4db9aae + .quad 0xc08ff3c5fda7a12e + .quad 0xbd24c06f912ab9d1 + .quad 0xc08ff3ca263d7456 + .quad 0xbd426152c271eb36 + .quad 0xc08ff3ce4d543cea + .quad 0xbd33483146784bd2 + .quad 0xc08ff3d272ed0e28 + .quad 0xbd44640a8fec6a2e + .quad 0xc08ff3d69708fa2a + .quad 0xbd479ca7cb93cc08 + .quad 0xc08ff3dab9a911e2 + .quad 0xbd3cc65b96825ec6 + .quad 0xc08ff3dedace651c + .quad 0xbd2103e8f00d41c8 + .quad 0xc08ff3e2fa7a0280 + .quad 0xbd3ebdb1bbaf9ab0 + .quad 0xc08ff3e718acf798 + .quad 0xbd350343f8df4b43 + .quad 0xc08ff3eb356850ca + .quad 0xbd3db11aa6a7cdea + .quad 0xc08ff3ef50ad1960 + .quad 0xbd3b3b3864c60011 + .quad 0xc08ff3f36a7c5b86 + .quad 0xbd3310f9839f068a + .quad 0xc08ff3f782d7204c + .quad 0xbd40144751b3314f + .quad 0xc08ff3fb99be6faa + .quad 0xbd429875b0e43fd8 + .quad 0xc08ff3ffaf335080 + .quad 0x3cf9518ce032f41d + .quad 0xc08ff403c336c894 + .quad 0x3d29ab66b62c5ca8 + .quad 0xc08ff407d5c9dc98 + .quad 0xbd437fc8cafdef46 + .quad 0xc08ff40be6ed9030 + .quad 0xbd2515e1cacac36e + .quad 0xc08ff40ff6a2e5e6 + .quad 0xbd27f33943464056 + .quad 0xc08ff41404eadf38 + .quad 0xbd1cb6f70109b0f1 + .quad 0xc08ff41811c67c94 + .quad 0x3d24dc166e0e0c68 + .quad 0xc08ff41c1d36bd58 + .quad 0xbd3d990d1e0f6657 + .quad 0xc08ff420273c9fdc + .quad 0xbcfea92d9e0e8ac2 + .quad 0xc08ff4242fd92166 + .quad 0xbd303cf98ab4e537 + .quad 0xc08ff428370d3e38 + .quad 0xbd2fbc00d8d6cbcf + .quad 0xc08ff42c3cd9f18a + .quad 0xbd2fd3fe3499ea9f + .quad 0xc08ff4304140358e + .quad 0xbd3532c412ba94db + .quad 0xc08ff43444410372 + .quad 0xbd1f5ab329b483ec + .quad 0xc08ff43845dd535e + .quad 0xbd40444ebaaf2894 + .quad 0xc08ff43c46161c7c + .quad 0xbd35897d184aaac4 + .quad 0xc08ff44044ec54f2 + .quad 0xbd1d4f639bb5cdf6 + .quad 0xc08ff4444260f1e6 + .quad 0xbd467d28344c2ff0 + .quad 0xc08ff4483e74e786 + .quad 0xbcccb52b4581174d + .quad 0xc08ff44c392928fa + .quad 0xbd449eb852b25382 + .quad 0xc08ff450327ea878 + .quad 0xbd450e785694a8c6 + .quad 0xc08ff4542a765738 + .quad 0xbd2410f5d3161a62 + .quad 0xc08ff45821112578 + .quad 0xbcc81e2b378ff59d + .quad 0xc08ff45c16500280 + .quad 0xbd3e6009faee4be8 + .quad 0xc08ff4600a33dca6 + .quad 0x3d12b628e2d05d76 + .quad 0xc08ff463fcbda144 + .quad 0xbd3cbb828084fcb1 + .quad 0xc08ff467edee3cc8 + .quad 0xbd4085c5870d5301 + .quad 0xc08ff46bddc69aaa + .quad 0xbd4475780e47156b + .quad 0xc08ff46fcc47a574 + .quad 0xbcdbc76a2753b99b + .quad 0xc08ff473b97246bc + .quad 0xbd2012f1593ee62a + .quad 0xc08ff477a547672e + .quad 0xbd3d30c3d2643639 + .quad 0xc08ff47b8fc7ee8a + .quad 0xbd062c45c4bc31c9 + .quad 0xc08ff47f78f4c3a0 + .quad 0xbd22642415d47384 + .quad 0xc08ff48360cecc5a + .quad 0x3d2372fd3ff3197b + .quad 0xc08ff4874756edb4 + .quad 0xbd4668c543d0b42b + .quad 0xc08ff48b2c8e0bca + .quad 0xbd33f65cadbe0d26 + .quad 0xc08ff48f107509ca + .quad 0x3cfbfbf899cf2b3c + .quad 0xc08ff492f30cc9fe + .quad 0xbd307470f69809cc + .quad 0xc08ff496d4562dce + .quad 0xbd44115a1a340462 + .quad 0xc08ff49ab45215c0 + .quad 0xbcff5369fdf426cf + .quad 0xc08ff49e93016172 + .quad 0xbd3fc02bc277071d + .quad 0xc08ff4a27064efa8 + .quad 0xbd4728da988cc139 + .quad 0xc08ff4a64c7d9e44 + .quad 0xbd458147cf67745e + .quad 0xc08ff4aa274c4a4a + .quad 0xbd22100986691daa + .quad 0xc08ff4ae00d1cfde + .quad 0xbd36879fa00b120a + .quad 0xc08ff4b1d90f0a4c + .quad 0xbd40b68fc634db41 + .quad 0xc08ff4b5b004d404 + .quad 0xbd3c03254a7145e3 + .quad 0xc08ff4b985b4069c + .quad 
0xbcf4f144da6e4533 + .quad 0xc08ff4bd5a1d7ad0 + .quad 0x3d1b3d7b0e65d2ce + .quad 0xc08ff4c12d420886 + .quad 0x3d0dd3d30f5deaa7 + .quad 0xc08ff4c4ff2286ce + .quad 0x3d20dc60dc5befec + .quad 0xc08ff4c8cfbfcbe0 + .quad 0xbd47f6a1ab3efbbe + .quad 0xc08ff4cc9f1aad26 + .quad 0xbd429b21ae4817e9 + .quad 0xc08ff4d06d33ff32 + .quad 0x3d256a9ae5dca5a3 + .quad 0xc08ff4d43a0c95c2 + .quad 0x3cf38bc99b3611ce + .quad 0xc08ff4d805a543c8 + .quad 0xbd0c6d2c37daf317 + .quad 0xc08ff4dbcffedb64 + .quad 0xbd262404772a151d + .quad 0xc08ff4df991a2de8 + .quad 0xbd11c0de7b779cb3 + .quad 0xc08ff4e360f80bd6 + .quad 0xbd4424a06f870b9e + .quad 0xc08ff4e7279944e8 + .quad 0xbd3a69393bab4fd0 + .quad 0xc08ff4eaecfea808 + .quad 0xbd266cccab240e90 + .quad 0xc08ff4eeb1290356 + .quad 0xbd38e9b57298d22f + .quad 0xc08ff4f27419242c + .quad 0x3d2eddd33ea4d6f1 + .quad 0xc08ff4f635cfd714 + .quad 0xbd476e0ed8a042be + .quad 0xc08ff4f9f64de7dc + .quad 0xbce66ae2a7ada553 + .quad 0xc08ff4fdb5942180 + .quad 0xbd0cd57d9d86514e + .quad 0xc08ff50173a34e3c + .quad 0xbd42efafb4bec72b + .quad 0xc08ff505307c378a + .quad 0xbd1a46dbdcc762d3 + .quad 0xc08ff508ec1fa61a + .quad 0xbd354b383b0e8a55 + .quad 0xc08ff50ca68e61e0 + .quad 0x3d2c7d469ea019ad + .quad 0xc08ff5105fc93208 + .quad 0xbd264adb1adca9a8 + .quad 0xc08ff51417d0dd04 + .quad 0x3ce5c601f0626dc8 + .quad 0xc08ff517cea62882 + .quad 0x3d18eb650003fb32 + .quad 0xc08ff51b8449d972 + .quad 0xbd326baaf0b591f8 + .quad 0xc08ff51f38bcb408 + .quad 0xbd461b8d0e43a37f + .quad 0xc08ff522ebff7bbc + .quad 0xbd33859a74f0d148 + .quad 0xc08ff5269e12f346 + .quad 0xbd3c57f2495fb7fa + .quad 0xc08ff52a4ef7dca8 + .quad 0xbcd5dc21a39bf974 + .quad 0xc08ff52dfeaef926 + .quad 0x3d0aa0e9e6bca777 + .quad 0xc08ff531ad39094c + .quad 0xbd47d0fa4fa0c208 + .quad 0xc08ff5355a96ccf4 + .quad 0x3d23bb5921006679 + .quad 0xc08ff53906c90336 + .quad 0xbd21f3e0c466e8f9 + .quad 0xc08ff53cb1d06a7c + .quad 0xbd39f3ba83f85c08 + .quad 0xc08ff5405badc07a + .quad 0x3d2e77ad7a4b71c0 + .quad 0xc08ff5440461c22a + .quad 0xbd1f1bbd2926f164 + .quad 0xc08ff547abed2bd8 + .quad 0xbd44479667bb79bf + .quad 0xc08ff54b5250b91e + .quad 0xbd2094ef49b8484b + .quad 0xc08ff54ef78d24de + .quad 0xbd41fb87566dd18c + .quad 0xc08ff5529ba32950 + .quad 0xbd3c6d8d86531d56 + .quad 0xc08ff5563e937ff8 + .quad 0xbd323e7492de8d74 + .quad 0xc08ff559e05ee1ac + .quad 0xbcf63d8bd35fdc18 + .quad 0xc08ff55d81060692 + .quad 0xbd3cc78dae939320 + .quad 0xc08ff5612089a626 + .quad 0xbd44cf0e362f4a36 + .quad 0xc08ff564beea7736 + .quad 0xbd3a96d7a36f1545 + .quad 0xc08ff5685c292fe2 + .quad 0xbd4570af1a0bc9f4 + .quad 0xc08ff56bf84685a4 + .quad 0x3d1bdc90791aef03 + .quad 0xc08ff56f93432d44 + .quad 0xbd40d2abacfc0489 + .quad 0xc08ff5732d1fdaea + .quad 0xbd39e35c1aa7693f + .quad 0xc08ff576c5dd4210 + .quad 0xbd23c49c247ab6af + .quad 0xc08ff57a5d7c1588 + .quad 0xbd4374da167aead5 + .quad 0xc08ff57df3fd0782 + .quad 0xbd2aeb8cb1ac05cd + .quad 0xc08ff5818960c982 + .quad 0xbd3b1b8ae4633046 + .quad 0xc08ff5851da80c6c + .quad 0xbd20899cee46ebe4 + .quad 0xc08ff588b0d3807c + .quad 0xbcfc4413fd83dec1 + .quad 0xc08ff58c42e3d54c + .quad 0xbd02101a9685c779 + .quad 0xc08ff58fd3d9b9d2 + .quad 0xbd45c074c957d037 + .quad 0xc08ff59363b5dc66 + .quad 0xbd3f7cc3df8803d1 + .quad 0xc08ff596f278eaba + .quad 0xbd3961ecab44052e + .quad 0xc08ff59a802391e2 + .quad 0xbd1979a5db68721d + .quad 0xc08ff59e0cb67e50 + .quad 0xbd3e4ce321e589a9 + .quad 0xc08ff5a198325bdc + .quad 0x3d0e321d11f8a0ce + .quad 0xc08ff5a52297d5ba + .quad 0x3d227ae8037b21bf + .quad 0xc08ff5a8abe79684 + .quad 0x3d1ebefecd51a1be + .quad 
0xc08ff5ac34224836 + .quad 0xbd372c2fed3f759f + .quad 0xc08ff5afbb489432 + .quad 0xbd46b82e2a9e810c + .quad 0xc08ff5b3415b2340 + .quad 0x3d2e59ad84a6a593 + .quad 0xc08ff5b6c65a9d86 + .quad 0xbd249d97df07e357 + .quad 0xc08ff5ba4a47aa98 + .quad 0xbd46d25a5b8a19b2 + .quad 0xc08ff5bdcd22f172 + .quad 0x3d2e859780f0cdc7 + .quad 0xc08ff5c14eed186e + .quad 0xbd4171cf05a99915 + .quad 0xc08ff5c4cfa6c55a + .quad 0xbd41ef9459fef720 + .quad 0xc08ff5c84f509d68 + .quad 0x3d145ccfb66fabd2 + .quad 0xc08ff5cbcdeb4530 + .quad 0xbd46bf2e7459b97d + .quad 0xc08ff5cf4b7760be + .quad 0xbd36132520b9d027 + .quad 0xc08ff5d2c7f59382 + .quad 0x3d15872350f805d6 + .quad 0xc08ff5d643668058 + .quad 0xbd41835d469035a9 + .quad 0xc08ff5d9bdcac98e + .quad 0xbd47b7378ad99d2e + .quad 0xc08ff5dd372310dc + .quad 0xbd472d51ea7c162e + .quad 0xc08ff5e0af6ff76a + .quad 0x3d2a8843781eda15 + .quad 0xc08ff5e426b21dc8 + .quad 0xbd44ea36d76b0bd8 + .quad 0xc08ff5e79cea2402 + .quad 0x3d2e03b336c24b74 + .quad 0xc08ff5eb1218a986 + .quad 0xbd45a7bfdb3c98b0 + .quad 0xc08ff5ee863e4d40 + .quad 0xbd37204f55bbf90d + .quad 0xc08ff5f1f95bad84 + .quad 0xbd41b72e122257f1 + .quad 0xc08ff5f56b71681e + .quad 0xbd1488084776534a + .quad 0xc08ff5f8dc801a48 + .quad 0xbd2866405210e49e + .quad 0xc08ff5fc4c8860b4 + .quad 0x3d1d45da26510032 + .quad 0xc08ff5ffbb8ad784 + .quad 0xbd2f386200388584 + .quad 0xc08ff60329881a52 + .quad 0xbd47e32446892fb9 + .quad 0xc08ff6069680c42e + .quad 0xbd4330c4c4a27e40 + .quad 0xc08ff60a02756f9c + .quad 0xbd0cb6f70109b0f1 + .quad 0xc08ff60d6d66b694 + .quad 0xbd4777531ab1b43f + .quad 0xc08ff610d755328e + .quad 0x3d118906313e79cf + .quad 0xc08ff61440417c70 + .quad 0x3d0a5b363a6f499c + .quad 0xc08ff617a82c2c9e + .quad 0xbd39308437e74325 + .quad 0xc08ff61b0f15daf6 + .quad 0xbd3fef5f3fc61899 + .quad 0xc08ff61e74ff1ece + .quad 0xbd3b85f3204507b9 + .quad 0xc08ff621d9e88ef6 + .quad 0xbd42fc8ea3276ba0 + .quad 0xc08ff6253dd2c1bc + .quad 0x3d0d2fe4574e09b9 + .quad 0xc08ff628a0be4ce4 + .quad 0xbd3245829ca653e6 + .quad 0xc08ff62c02abc5b4 + .quad 0xbd42a385b236e315 + .quad 0xc08ff62f639bc0ee + .quad 0xbd301f1e98d8979c + .quad 0xc08ff632c38ed2ce + .quad 0xbd3ded9b44542fd9 + .quad 0xc08ff63622858f12 + .quad 0xbd3d400fd651da9a + .quad 0xc08ff639808088f6 + .quad 0x3d29f78153fcfec0 + .quad 0xc08ff63cdd805330 + .quad 0xbd46af859d47a29a + .quad 0xc08ff64039858000 + .quad 0xbd3667f21fa8423f + .quad 0xc08ff6439490a11e + .quad 0xbd1b254cabaa042b + .quad 0xc08ff646eea247c6 + .quad 0x3d1ee969a95f528f + .quad 0xc08ff64a47bb04b4 + .quad 0xbd3821d36e0b7548 + .quad 0xc08ff64d9fdb682a + .quad 0xbd3974e6432d9ee8 + .quad 0xc08ff650f70401ea + .quad 0xbd1d74d044558154 + .quad 0xc08ff6544d356138 + .quad 0xbd371b3a63cddadf + .quad 0xc08ff657a27014e0 + .quad 0x3d17b6aad08dc210 + .quad 0xc08ff65af6b4ab2c + .quad 0xbd47d7bfb12454c5 + .quad 0xc08ff65e4a03b1f4 + .quad 0xbd373647bf25fa5f + .quad 0xc08ff6619c5db68e + .quad 0xbcf742a6b2827cf0 + .quad 0xc08ff664edc345d8 + .quad 0xbd02d3bbd925734c + .quad 0xc08ff6683e34ec38 + .quad 0xbd03f7a55cd2af4c + .quad 0xc08ff66b8db3359a + .quad 0xbd308364fa508035 + .quad 0xc08ff66edc3ead74 + .quad 0x3d2b37bd36337985 + .quad 0xc08ff67229d7dec0 + .quad 0x3d22a424c693063d + .quad 0xc08ff675767f5404 + .quad 0xbd166cccab240e90 + .quad 0xc08ff678c2359750 + .quad 0x3d2bce65acc07927 + .quad 0xc08ff67c0cfb323a + .quad 0xbd25651ccd0e0880 + .quad 0xc08ff67f56d0ade6 + .quad 0xbd4533d5b4542c99 + .quad 0xc08ff6829fb69304 + .quad 0xbd22ce6312ebb81d + .quad 0xc08ff685e7ad69ca + .quad 0xbd2b6967f02b01d8 + .quad 0xc08ff6892eb5b9fe + .quad 
0xbd3bb55730409355 + .quad 0xc08ff68c74d00af2 + .quad 0xbd4352b18e47fcd2 + .quad 0xc08ff68fb9fce386 + .quad 0xbceed0798d1aa216 + .quad 0xc08ff692fe3cca22 + .quad 0xbd464b702b56565e + .quad 0xc08ff696419044c4 + .quad 0xbd45909799f95e23 + .quad 0xc08ff69983f7d8f4 + .quad 0xbd2bebde1ac6e983 + .quad 0xc08ff69cc5740bc8 + .quad 0xbd18f7aac147fdc1 + .quad 0xc08ff6a0060561e8 + .quad 0x3d2653a2eb403f26 + .quad 0xc08ff6a345ac5f8a + .quad 0x3d1769a8e6b40f5e + .quad 0xc08ff6a684698876 + .quad 0xbd1770535b322bbf + .quad 0xc08ff6a9c23d6004 + .quad 0xbd434df378df21ad + .quad 0xc08ff6acff286920 + .quad 0xbd398cc3b5d08e15 + .quad 0xc08ff6b03b2b2644 + .quad 0xbd39d941e9e746a4 + .quad 0xc08ff6b376461980 + .quad 0x3d2fd2e802de76ad + .quad 0xc08ff6b6b079c472 + .quad 0xbcf968ab16b0d7ba + .quad 0xc08ff6b9e9c6a850 + .quad 0xbd3fa4a9eb6b8621 + .quad 0xc08ff6bd222d45e4 + .quad 0xbd36ad5bac74b87f + .quad 0xc08ff6c059ae1d8a + .quad 0x3d057c1b79ee9964 + .quad 0xc08ff6c39049af32 + .quad 0xbd0af5e9bb5386c2 + .quad 0xc08ff6c6c6007a64 + .quad 0xbce8467191344d58 + .quad 0xc08ff6c9fad2fe3c + .quad 0xbd1148dad646cb9d + .quad 0xc08ff6cd2ec1b96c + .quad 0xbd4149540d5fceb9 + .quad 0xc08ff6d061cd2a40 + .quad 0xbd117b2f1731efbe + .quad 0xc08ff6d393f5ce96 + .quad 0x3d25005be8c5610b + .quad 0xc08ff6d6c53c23e6 + .quad 0x3d29a1979619fe2f + .quad 0xc08ff6d9f5a0a740 + .quad 0x3d15ebe99c4f6416 + .quad 0xc08ff6dd2523d54c + .quad 0xbd36d25a5b8a19b2 + .quad 0xc08ff6e053c62a4c + .quad 0xbd47f3f2612caf97 + .quad 0xc08ff6e38188221c + .quad 0xbd3848e9d1d92d88 + .quad 0xc08ff6e6ae6a382e + .quad 0xbd3b4aada7453897 + .quad 0xc08ff6e9da6ce792 + .quad 0xbd2640ef87ede14b + .quad 0xc08ff6ed0590aaf0 + .quad 0xbd2da89e835cc3d2 + .quad 0xc08ff6f02fd5fc8e + .quad 0x3d2fa6e2ac948d1a + .quad 0xc08ff6f3593d5648 + .quad 0xbd44bf3775fde250 + .quad 0xc08ff6f681c731a0 + .quad 0x3d2924ae921f7eca + .quad 0xc08ff6f9a97407a8 + .quad 0xbd32994b351f388c + .quad 0xc08ff6fcd0445118 + .quad 0xbd429af37d1edf2f + .quad 0xc08ff6fff6388644 + .quad 0x3d2ed5a8a2de89da + .quad 0xc08ff7031b511f16 + .quad 0xbd474d8b66a69572 + .quad 0xc08ff7063f8e9322 + .quad 0xbd3b20d190c69cff + .quad 0xc08ff70962f15992 + .quad 0xbcf455bedf4083bc + .quad 0xc08ff70c8579e930 + .quad 0xbd215844900583de + .quad 0xc08ff70fa728b868 + .quad 0xbd054cda62d3926e + .quad 0xc08ff712c7fe3d44 + .quad 0x3d2143e9a0cbd481 + .quad 0xc08ff715e7faed6e + .quad 0x3d2a82ed66976b91 + .quad 0xc08ff719071f3e30 + .quad 0xbd318c64f0672cf9 + .quad 0xc08ff71c256ba478 + .quad 0xbd2c760bc9b188c4 + .quad 0xc08ff71f42e094d2 + .quad 0xbd2b88ca364674ac + .quad 0xc08ff7225f7e836c + .quad 0xbd46361ccd8974a5 + .quad 0xc08ff7257b45e41a + .quad 0xbd24e3eb5884aae7 + .quad 0xc08ff72896372a4c + .quad 0xbd38b1aff71c8605 + .quad 0xc08ff72bb052c91a + .quad 0xbd429a0a140ddd8a + .quad 0xc08ff72ec999333e + .quad 0xbd43d6bb35ec114f + .quad 0xc08ff731e20adb16 + .quad 0xbd2bd849ce4dc635 + .quad 0xc08ff734f9a832a2 + .quad 0xbd206c243749114c + .quad 0xc08ff7381071ab88 + .quad 0xbd3595f2f68d91fd + .quad 0xc08ff73b2667b714 + .quad 0xbd3017eb15bb7de4 + .quad 0xc08ff73e3b8ac636 + .quad 0x3d1c28798c12cc39 + .quad 0xc08ff7414fdb4982 + .quad 0xbd12ce6312ebb81d + .quad 0xc08ff7446359b134 + .quad 0xbd4395510d1e3f81 + .quad 0xc08ff74776066d30 + .quad 0xbd3f86493917b407 + .quad 0xc08ff74a87e1ecfe + .quad 0xbd10be3a57487484 + .quad 0xc08ff74d98ec9fcc + .quad 0xbd2d5297837adb4b + .quad 0xc08ff750a926f472 + .quad 0xbd43ae4d308b33a5 + .quad 0xc08ff753b8915972 + .quad 0x3d2d54d244e2aaee + .quad 0xc08ff756c72c3cee + .quad 0xbd35f097b0fe80a3 + .quad 
0xc08ff759d4f80cba + .quad 0xbd3077f1f5f0cc83 + .quad 0xc08ff75ce1f5364e + .quad 0x3d19367107b8e917 + .quad 0xc08ff75fee2426ca + .quad 0xbd33623c81400bcf + .quad 0xc08ff762f9854afc + .quad 0xbd33b55bcb161bac + .quad 0xc08ff76604190f5a + .quad 0x3d2eb3c3bf914b9c + .quad 0xc08ff7690ddfe000 + .quad 0xbd45a6a7f43f6ec0 + .quad 0xc08ff76c16da28be + .quad 0xbd3b253dff5e0495 + .quad 0xc08ff76f1f085508 + .quad 0x3d1b08127eec65d2 + .quad 0xc08ff772266acffc + .quad 0xbd45b1799ceaeb51 + .quad 0xc08ff7752d02046c + .quad 0xbd2e63bd0fcda210 + .quad 0xc08ff77832ce5cce + .quad 0xbd148cd0a7bb24b2 + .quad 0xc08ff77b37d04348 + .quad 0x3d11ef56fa3d37b4 + .quad 0xc08ff77e3c0821ac + .quad 0x3d1a768216f872eb + .quad 0xc08ff7813f766178 + .quad 0xbd44b4a15a96316e + .quad 0xc08ff784421b6bdc + .quad 0xbd4258a7b2336919 + .quad 0xc08ff78743f7a9b2 + .quad 0x3d03f659faac5a20 + .quad 0xc08ff78a450b8380 + .quad 0xbd2401fbaaa67e3c + .quad 0xc08ff78d4557617e + .quad 0xbd476fa81cf6a494 + .quad 0xc08ff79044dbab94 + .quad 0xbd44f46b93eece0a + .quad 0xc08ff7934398c956 + .quad 0xbd3c91f073716495 + .quad 0xc08ff796418f2208 + .quad 0xbd3672b0c88d4dd6 + .quad 0xc08ff7993ebf1c9e + .quad 0xbd3fb554647678d1 + .quad 0xc08ff79c3b291fbe + .quad 0xbd0bb98afdf33295 + .quad 0xc08ff79f36cd91ba + .quad 0xbd3a1c40753a869f + .quad 0xc08ff7a231acd89a + .quad 0xbd3395510d1e3f81 + .quad 0xc08ff7a52bc75a14 + .quad 0xbcf98fd2dca61c14 + .quad 0xc08ff7a8251d7b8e + .quad 0xbd40e7b8e7574248 + .quad 0xc08ff7ab1dafa224 + .quad 0xbd43f88ff2576e98 + .quad 0xc08ff7ae157e32a2 + .quad 0xbd1f61a96b8ce776 + .quad 0xc08ff7b10c899184 + .quad 0x3cde66be73b9da04 + .quad 0xc08ff7b402d222fa + .quad 0xbd408d5c3f1d5c0d + .quad 0xc08ff7b6f8584aea + .quad 0xbd3cbebea25ecd9e + .quad 0xc08ff7b9ed1c6cea + .quad 0xbd2507d6dc1f27ef + .quad 0xc08ff7bce11eec44 + .quad 0x3d2794d4c6c8f327 + .quad 0xc08ff7bfd4602bf4 + .quad 0xbd3f1e32799da52d + .quad 0xc08ff7c2c6e08eb0 + .quad 0xbd35c01818adf4af + .quad 0xc08ff7c5b8a076de + .quad 0x3d2cfc4de6d73dea + .quad 0xc08ff7c8a9a04696 + .quad 0xbd4227264a17d460 + .quad 0xc08ff7cb99e05fae + .quad 0xbd0142b08bb672e8 + .quad 0xc08ff7ce896123a8 + .quad 0xbd2564fcfaea5fb3 + .quad 0xc08ff7d17822f3c2 + .quad 0x3d2aab1b2a41b090 + .quad 0xc08ff7d4662630ea + .quad 0xbd46ac3b83ef359a + .quad 0xc08ff7d7536b3bce + .quad 0x3d241a2f220ccf53 + .quad 0xc08ff7da3ff274c6 + .quad 0xbd38f5d37680fd7c + .quad 0xc08ff7dd2bbc3bec + .quad 0x3d048a179268271d + .quad 0xc08ff7e016c8f108 + .quad 0xbd471e548b69f12a + .quad 0xc08ff7e30118f3a2 + .quad 0xbd41a23946dfa58c + .quad 0xc08ff7e5eaaca2f4 + .quad 0xbd25330d5605f2a6 + .quad 0xc08ff7e8d3845df0 + .quad 0xbd319b14945cf6ba + .quad 0xc08ff7ebbba08342 + .quad 0xbd4702e1863f7c92 + .quad 0xc08ff7eea3017150 + .quad 0xbd437cfeba9ff979 + .quad 0xc08ff7f189a78636 + .quad 0xbd3df6e958e938b0 + .quad 0xc08ff7f46f931fca + .quad 0xbd37ca15910e7069 + .quad 0xc08ff7f754c49b9c + .quad 0xbd15cfd00d77e6ec + .quad 0xc08ff7fa393c56f4 + .quad 0xbd2a025d9e2442e6 + .quad 0xc08ff7fd1cfaaed6 + .quad 0xbd3258e9a821b7cc + .quad 0xc08ff80000000000 + .quad 0x0000000000000000 + .rept 48 + .byte 0 + .endr + +/* Lookup exp(2) table (for HSW): */ +.if .-__svml_dpow_data != _hsw_dTe +.err +.endif + .quad 0x3ff0000000000000 + .quad 0x3ff00b1afa5abcbf + .quad 0x3ff0163da9fb3335 + .quad 0x3ff02168143b0281 + .quad 0x3ff02c9a3e778061 + .quad 0x3ff037d42e11bbcc + .quad 0x3ff04315e86e7f85 + .quad 0x3ff04e5f72f654b1 + .quad 0x3ff059b0d3158574 + .quad 0x3ff0650a0e3c1f89 + .quad 0x3ff0706b29ddf6de + .quad 0x3ff07bd42b72a836 + .quad 0x3ff0874518759bc8 + .quad 
0x3ff092bdf66607e0 + .quad 0x3ff09e3ecac6f383 + .quad 0x3ff0a9c79b1f3919 + .quad 0x3ff0b5586cf9890f + .quad 0x3ff0c0f145e46c85 + .quad 0x3ff0cc922b7247f7 + .quad 0x3ff0d83b23395dec + .quad 0x3ff0e3ec32d3d1a2 + .quad 0x3ff0efa55fdfa9c5 + .quad 0x3ff0fb66affed31b + .quad 0x3ff1073028d7233e + .quad 0x3ff11301d0125b51 + .quad 0x3ff11edbab5e2ab6 + .quad 0x3ff12abdc06c31cc + .quad 0x3ff136a814f204ab + .quad 0x3ff1429aaea92de0 + .quad 0x3ff14e95934f312e + .quad 0x3ff15a98c8a58e51 + .quad 0x3ff166a45471c3c2 + .quad 0x3ff172b83c7d517b + .quad 0x3ff17ed48695bbc0 + .quad 0x3ff18af9388c8dea + .quad 0x3ff1972658375d2f + .quad 0x3ff1a35beb6fcb75 + .quad 0x3ff1af99f8138a1c + .quad 0x3ff1bbe084045cd4 + .quad 0x3ff1c82f95281c6b + .quad 0x3ff1d4873168b9aa + .quad 0x3ff1e0e75eb44027 + .quad 0x3ff1ed5022fcd91d + .quad 0x3ff1f9c18438ce4d + .quad 0x3ff2063b88628cd6 + .quad 0x3ff212be3578a819 + .quad 0x3ff21f49917ddc96 + .quad 0x3ff22bdda27912d1 + .quad 0x3ff2387a6e756238 + .quad 0x3ff2451ffb82140a + .quad 0x3ff251ce4fb2a63f + .quad 0x3ff25e85711ece75 + .quad 0x3ff26b4565e27cdd + .quad 0x3ff2780e341ddf29 + .quad 0x3ff284dfe1f56381 + .quad 0x3ff291ba7591bb70 + .quad 0x3ff29e9df51fdee1 + .quad 0x3ff2ab8a66d10f13 + .quad 0x3ff2b87fd0dad990 + .quad 0x3ff2c57e39771b2f + .quad 0x3ff2d285a6e4030b + .quad 0x3ff2df961f641589 + .quad 0x3ff2ecafa93e2f56 + .quad 0x3ff2f9d24abd886b + .quad 0x3ff306fe0a31b715 + .quad 0x3ff31432edeeb2fd + .quad 0x3ff32170fc4cd831 + .quad 0x3ff32eb83ba8ea32 + .quad 0x3ff33c08b26416ff + .quad 0x3ff3496266e3fa2d + .quad 0x3ff356c55f929ff1 + .quad 0x3ff36431a2de883b + .quad 0x3ff371a7373aa9cb + .quad 0x3ff37f26231e754a + .quad 0x3ff38cae6d05d866 + .quad 0x3ff39a401b7140ef + .quad 0x3ff3a7db34e59ff7 + .quad 0x3ff3b57fbfec6cf4 + .quad 0x3ff3c32dc313a8e5 + .quad 0x3ff3d0e544ede173 + .quad 0x3ff3dea64c123422 + .quad 0x3ff3ec70df1c5175 + .quad 0x3ff3fa4504ac801c + .quad 0x3ff40822c367a024 + .quad 0x3ff4160a21f72e2a + .quad 0x3ff423fb2709468a + .quad 0x3ff431f5d950a897 + .quad 0x3ff43ffa3f84b9d4 + .quad 0x3ff44e086061892d + .quad 0x3ff45c2042a7d232 + .quad 0x3ff46a41ed1d0057 + .quad 0x3ff4786d668b3237 + .quad 0x3ff486a2b5c13cd0 + .quad 0x3ff494e1e192aed2 + .quad 0x3ff4a32af0d7d3de + .quad 0x3ff4b17dea6db7d7 + .quad 0x3ff4bfdad5362a27 + .quad 0x3ff4ce41b817c114 + .quad 0x3ff4dcb299fddd0d + .quad 0x3ff4eb2d81d8abff + .quad 0x3ff4f9b2769d2ca7 + .quad 0x3ff508417f4531ee + .quad 0x3ff516daa2cf6642 + .quad 0x3ff5257de83f4eef + .quad 0x3ff5342b569d4f82 + .quad 0x3ff542e2f4f6ad27 + .quad 0x3ff551a4ca5d920f + .quad 0x3ff56070dde910d2 + .quad 0x3ff56f4736b527da + .quad 0x3ff57e27dbe2c4cf + .quad 0x3ff58d12d497c7fd + .quad 0x3ff59c0827ff07cc + .quad 0x3ff5ab07dd485429 + .quad 0x3ff5ba11fba87a03 + .quad 0x3ff5c9268a5946b7 + .quad 0x3ff5d84590998b93 + .quad 0x3ff5e76f15ad2148 + .quad 0x3ff5f6a320dceb71 + .quad 0x3ff605e1b976dc09 + .quad 0x3ff6152ae6cdf6f4 + .quad 0x3ff6247eb03a5585 + .quad 0x3ff633dd1d1929fd + .quad 0x3ff6434634ccc320 + .quad 0x3ff652b9febc8fb7 + .quad 0x3ff6623882552225 + .quad 0x3ff671c1c70833f6 + .quad 0x3ff68155d44ca973 + .quad 0x3ff690f4b19e9538 + .quad 0x3ff6a09e667f3bcd + .quad 0x3ff6b052fa75173e + .quad 0x3ff6c012750bdabf + .quad 0x3ff6cfdcddd47645 + .quad 0x3ff6dfb23c651a2f + .quad 0x3ff6ef9298593ae5 + .quad 0x3ff6ff7df9519484 + .quad 0x3ff70f7466f42e87 + .quad 0x3ff71f75e8ec5f74 + .quad 0x3ff72f8286ead08a + .quad 0x3ff73f9a48a58174 + .quad 0x3ff74fbd35d7cbfd + .quad 0x3ff75feb564267c9 + .quad 0x3ff77024b1ab6e09 + .quad 0x3ff780694fde5d3f + .quad 0x3ff790b938ac1cf6 + .quad 
0x3ff7a11473eb0187 + .quad 0x3ff7b17b0976cfdb + .quad 0x3ff7c1ed0130c132 + .quad 0x3ff7d26a62ff86f0 + .quad 0x3ff7e2f336cf4e62 + .quad 0x3ff7f3878491c491 + .quad 0x3ff80427543e1a12 + .quad 0x3ff814d2add106d9 + .quad 0x3ff82589994cce13 + .quad 0x3ff8364c1eb941f7 + .quad 0x3ff8471a4623c7ad + .quad 0x3ff857f4179f5b21 + .quad 0x3ff868d99b4492ed + .quad 0x3ff879cad931a436 + .quad 0x3ff88ac7d98a6699 + .quad 0x3ff89bd0a478580f + .quad 0x3ff8ace5422aa0db + .quad 0x3ff8be05bad61778 + .quad 0x3ff8cf3216b5448c + .quad 0x3ff8e06a5e0866d9 + .quad 0x3ff8f1ae99157736 + .quad 0x3ff902fed0282c8a + .quad 0x3ff9145b0b91ffc6 + .quad 0x3ff925c353aa2fe2 + .quad 0x3ff93737b0cdc5e5 + .quad 0x3ff948b82b5f98e5 + .quad 0x3ff95a44cbc8520f + .quad 0x3ff96bdd9a7670b3 + .quad 0x3ff97d829fde4e50 + .quad 0x3ff98f33e47a22a2 + .quad 0x3ff9a0f170ca07ba + .quad 0x3ff9b2bb4d53fe0d + .quad 0x3ff9c49182a3f090 + .quad 0x3ff9d674194bb8d5 + .quad 0x3ff9e86319e32323 + .quad 0x3ff9fa5e8d07f29e + .quad 0x3ffa0c667b5de565 + .quad 0x3ffa1e7aed8eb8bb + .quad 0x3ffa309bec4a2d33 + .quad 0x3ffa42c980460ad8 + .quad 0x3ffa5503b23e255d + .quad 0x3ffa674a8af46052 + .quad 0x3ffa799e1330b358 + .quad 0x3ffa8bfe53c12e59 + .quad 0x3ffa9e6b5579fdbf + .quad 0x3ffab0e521356eba + .quad 0x3ffac36bbfd3f37a + .quad 0x3ffad5ff3a3c2774 + .quad 0x3ffae89f995ad3ad + .quad 0x3ffafb4ce622f2ff + .quad 0x3ffb0e07298db666 + .quad 0x3ffb20ce6c9a8952 + .quad 0x3ffb33a2b84f15fb + .quad 0x3ffb468415b749b1 + .quad 0x3ffb59728de5593a + .quad 0x3ffb6c6e29f1c52a + .quad 0x3ffb7f76f2fb5e47 + .quad 0x3ffb928cf22749e4 + .quad 0x3ffba5b030a1064a + .quad 0x3ffbb8e0b79a6f1f + .quad 0x3ffbcc1e904bc1d2 + .quad 0x3ffbdf69c3f3a207 + .quad 0x3ffbf2c25bd71e09 + .quad 0x3ffc06286141b33d + .quad 0x3ffc199bdd85529c + .quad 0x3ffc2d1cd9fa652c + .quad 0x3ffc40ab5fffd07a + .quad 0x3ffc544778fafb22 + .quad 0x3ffc67f12e57d14b + .quad 0x3ffc7ba88988c933 + .quad 0x3ffc8f6d9406e7b5 + .quad 0x3ffca3405751c4db + .quad 0x3ffcb720dcef9069 + .quad 0x3ffccb0f2e6d1675 + .quad 0x3ffcdf0b555dc3fa + .quad 0x3ffcf3155b5bab74 + .quad 0x3ffd072d4a07897c + .quad 0x3ffd1b532b08c968 + .quad 0x3ffd2f87080d89f2 + .quad 0x3ffd43c8eacaa1d6 + .quad 0x3ffd5818dcfba487 + .quad 0x3ffd6c76e862e6d3 + .quad 0x3ffd80e316c98398 + .quad 0x3ffd955d71ff6075 + .quad 0x3ffda9e603db3285 + .quad 0x3ffdbe7cd63a8315 + .quad 0x3ffdd321f301b460 + .quad 0x3ffde7d5641c0658 + .quad 0x3ffdfc97337b9b5f + .quad 0x3ffe11676b197d17 + .quad 0x3ffe264614f5a129 + .quad 0x3ffe3b333b16ee12 + .quad 0x3ffe502ee78b3ff6 + .quad 0x3ffe653924676d76 + .quad 0x3ffe7a51fbc74c83 + .quad 0x3ffe8f7977cdb740 + .quad 0x3ffea4afa2a490da + .quad 0x3ffeb9f4867cca6e + .quad 0x3ffecf482d8e67f1 + .quad 0x3ffee4aaa2188510 + .quad 0x3ffefa1bee615a27 + .quad 0x3fff0f9c1cb6412a + .quad 0x3fff252b376bba97 + .quad 0x3fff3ac948dd7274 + .quad 0x3fff50765b6e4540 + .quad 0x3fff6632798844f8 + .quad 0x3fff7bfdad9cbe14 + .quad 0x3fff91d802243c89 + .quad 0x3fffa7c1819e90d8 + .quad 0x3fffbdba3692d514 + .quad 0x3fffd3c22b8f71f1 + .quad 0x3fffe9d96b2a23d9 + +/* General purpose constants: + * hsw_dMantMask */ +double_vector _hsw_dMantMask 0x000fffffffffffff + +/* hsw_dOne */ +double_vector _hsw_dOne 0x3ff0000000000000 + +/* hsw_dCvtMask */ +double_vector _hsw_dCvtMask 0x4338000000000000 + +/* hsw_dMinNorm */ +double_vector _hsw_dMinNorm 0x0010000000000000 + +/* hsw_dMaxNorm */ +double_vector _hsw_dMaxNorm 0x7fefffffffffffff + +/* hsw_lRndBit */ +double_vector _hsw_lRndBit 0x0000040000000000 + +/* hsw_lRndMask */ +double_vector _hsw_lRndMask 0xfffff80000000000 + +/* Log polynomial: + * 
hsw_dc6 */ +double_vector _hsw_dc6 0xbfcec1cfbbc5c90c + +/* hsw_dc5 */ +double_vector _hsw_dc5 0x3fd2776da3d26e6a + +/* hsw_dc4 */ +double_vector _hsw_dc4 0xbfd71547655d37e0 + +/* hsw_dc3 */ +double_vector _hsw_dc3 0x3fdec709dc39fb02 + +/* hsw_dc1 */ +double_vector _hsw_dc1 0x3c777a3a2c24613d + +/* hsw_dc1h */ +double_vector _hsw_dc1h 0x3ff71547652b82fe + +/* hsw_dc2 */ +double_vector _hsw_dc2 0xbfe71547652b82fe + +/* Additional constants: + * hsw_AbsMask */ +double_vector _hsw_dAbsMask 0x7fffffffffffffff + +/* hsw_dDomainRange */ +double_vector _hsw_dDomainRange 0x408fec0000000000 + +/* hsw_dShifter */ +double_vector _hsw_dShifter 0x42b800000003ff00 + +/* hsw_dIndexMask */ +double_vector _hsw_dIndexMask 0x00000000000007f8 + +/* Exp polynomial: + * hsw_dce4 */ +double_vector _hsw_dce4 0x3f83b2ab930f15f9 + +/* hsw_dce3 */ +double_vector _hsw_dce3 0x3fac6b090da1e0a9 + +/* hsw_dce2 */ +double_vector _hsw_dce2 0x3fcebfbdff82c54d + +/* hsw_dce1 */ +double_vector _hsw_dce1 0x3fe62e42fefa39b9 + +/* Reciprocal lookup table for log part (non HSW): */ +.if .-__svml_dpow_data != _rcp_t1 +.err +.endif + .quad 0x3ff7154740000000 + .quad 0x3ff70f8340000000 + .quad 0x3ff709c240000000 + .quad 0x3ff7040440000000 + .quad 0x3ff6fe4900000000 + .quad 0x3ff6f89080000000 + .quad 0x3ff6f2db00000000 + .quad 0x3ff6ed2840000000 + .quad 0x3ff6e77840000000 + .quad 0x3ff6e1cb40000000 + .quad 0x3ff6dc2100000000 + .quad 0x3ff6d67980000000 + .quad 0x3ff6d0d4c0000000 + .quad 0x3ff6cb32c0000000 + .quad 0x3ff6c593c0000000 + .quad 0x3ff6bff780000000 + .quad 0x3ff6ba5dc0000000 + .quad 0x3ff6b4c700000000 + .quad 0x3ff6af32c0000000 + .quad 0x3ff6a9a180000000 + .quad 0x3ff6a41300000000 + .quad 0x3ff69e8700000000 + .quad 0x3ff698fdc0000000 + .quad 0x3ff6937740000000 + .quad 0x3ff68df380000000 + .quad 0x3ff6887280000000 + .quad 0x3ff682f400000000 + .quad 0x3ff67d7840000000 + .quad 0x3ff677ff40000000 + .quad 0x3ff67288c0000000 + .quad 0x3ff66d1540000000 + .quad 0x3ff667a400000000 + .quad 0x3ff6623580000000 + .quad 0x3ff65cc9c0000000 + .quad 0x3ff6576080000000 + .quad 0x3ff651fa00000000 + .quad 0x3ff64c9600000000 + .quad 0x3ff6473480000000 + .quad 0x3ff641d5c0000000 + .quad 0x3ff63c7980000000 + .quad 0x3ff6372000000000 + .quad 0x3ff631c900000000 + .quad 0x3ff62c7480000000 + .quad 0x3ff6272280000000 + .quad 0x3ff621d340000000 + .quad 0x3ff61c8640000000 + .quad 0x3ff6173c00000000 + .quad 0x3ff611f440000000 + .quad 0x3ff60caf00000000 + .quad 0x3ff6076c40000000 + .quad 0x3ff6022c00000000 + .quad 0x3ff5fcee80000000 + .quad 0x3ff5f7b340000000 + .quad 0x3ff5f27a80000000 + .quad 0x3ff5ed4440000000 + .quad 0x3ff5e81040000000 + .quad 0x3ff5e2df00000000 + .quad 0x3ff5ddb040000000 + .quad 0x3ff5d883c0000000 + .quad 0x3ff5d359c0000000 + .quad 0x3ff5ce3240000000 + .quad 0x3ff5c90d40000000 + .quad 0x3ff5c3ea80000000 + .quad 0x3ff5beca40000000 + .quad 0x3ff5b9ac80000000 + .quad 0x3ff5b49100000000 + .quad 0x3ff5af7800000000 + .quad 0x3ff5aa6180000000 + .quad 0x3ff5a54d40000000 + .quad 0x3ff5a03b40000000 + .quad 0x3ff59b2bc0000000 + .quad 0x3ff5961ec0000000 + .quad 0x3ff59113c0000000 + .quad 0x3ff58c0b80000000 + .quad 0x3ff5870540000000 + .quad 0x3ff58201c0000000 + .quad 0x3ff57d0040000000 + .quad 0x3ff5780140000000 + .quad 0x3ff5730480000000 + .quad 0x3ff56e0a00000000 + .quad 0x3ff56911c0000000 + .quad 0x3ff5641c00000000 + .quad 0x3ff55f2880000000 + .quad 0x3ff55a3740000000 + .quad 0x3ff5554840000000 + .quad 0x3ff5505bc0000000 + .quad 0x3ff54b7140000000 + .quad 0x3ff5468900000000 + .quad 0x3ff541a340000000 + .quad 0x3ff53cbf80000000 + .quad 
0x3ff537de40000000 + .quad 0x3ff532ff00000000 + .quad 0x3ff52e2240000000 + .quad 0x3ff5294780000000 + .quad 0x3ff5246f00000000 + .quad 0x3ff51f98c0000000 + .quad 0x3ff51ac4c0000000 + .quad 0x3ff515f300000000 + .quad 0x3ff5112340000000 + .quad 0x3ff50c5600000000 + .quad 0x3ff5078ac0000000 + .quad 0x3ff502c1c0000000 + .quad 0x3ff4fdfac0000000 + .quad 0x3ff4f93600000000 + .quad 0x3ff4f47380000000 + .quad 0x3ff4efb340000000 + .quad 0x3ff4eaf500000000 + .quad 0x3ff4e638c0000000 + .quad 0x3ff4e17ec0000000 + .quad 0x3ff4dcc700000000 + .quad 0x3ff4d81180000000 + .quad 0x3ff4d35dc0000000 + .quad 0x3ff4ceac80000000 + .quad 0x3ff4c9fd00000000 + .quad 0x3ff4c54fc0000000 + .quad 0x3ff4c0a4c0000000 + .quad 0x3ff4bbfbc0000000 + .quad 0x3ff4b754c0000000 + .quad 0x3ff4b2b000000000 + .quad 0x3ff4ae0d40000000 + .quad 0x3ff4a96c80000000 + .quad 0x3ff4a4ce00000000 + .quad 0x3ff4a03140000000 + .quad 0x3ff49b9700000000 + .quad 0x3ff496fe80000000 + .quad 0x3ff4926800000000 + .quad 0x3ff48dd3c0000000 + .quad 0x3ff4894180000000 + .quad 0x3ff484b100000000 + .quad 0x3ff48022c0000000 + .quad 0x3ff47b96c0000000 + .quad 0x3ff4770c80000000 + .quad 0x3ff4728440000000 + .quad 0x3ff46dfe00000000 + .quad 0x3ff46979c0000000 + .quad 0x3ff464f780000000 + .quad 0x3ff4607780000000 + .quad 0x3ff45bf940000000 + .quad 0x3ff4577d00000000 + .quad 0x3ff45302c0000000 + .quad 0x3ff44e8a40000000 + .quad 0x3ff44a1400000000 + .quad 0x3ff4459f80000000 + .quad 0x3ff4412d40000000 + .quad 0x3ff43cbcc0000000 + .quad 0x3ff4384e40000000 + .quad 0x3ff433e180000000 + .quad 0x3ff42f7700000000 + .quad 0x3ff42b0e40000000 + .quad 0x3ff426a780000000 + .quad 0x3ff4224280000000 + .quad 0x3ff41ddf80000000 + .quad 0x3ff4197e80000000 + .quad 0x3ff4151f40000000 + .quad 0x3ff410c200000000 + .quad 0x3ff40c66c0000000 + .quad 0x3ff4080d40000000 + .quad 0x3ff403b5c0000000 + .quad 0x3ff3ff6000000000 + .quad 0x3ff3fb0c00000000 + .quad 0x3ff3f6ba40000000 + .quad 0x3ff3f26a00000000 + .quad 0x3ff3ee1bc0000000 + .quad 0x3ff3e9cf80000000 + .quad 0x3ff3e58500000000 + .quad 0x3ff3e13c40000000 + .quad 0x3ff3dcf580000000 + .quad 0x3ff3d8b080000000 + .quad 0x3ff3d46d40000000 + .quad 0x3ff3d02c00000000 + .quad 0x3ff3cbec80000000 + .quad 0x3ff3c7aec0000000 + .quad 0x3ff3c37300000000 + .quad 0x3ff3bf3900000000 + .quad 0x3ff3bb00c0000000 + .quad 0x3ff3b6ca40000000 + .quad 0x3ff3b29580000000 + .quad 0x3ff3ae62c0000000 + .quad 0x3ff3aa3180000000 + .quad 0x3ff3a60240000000 + .quad 0x3ff3a1d4c0000000 + .quad 0x3ff39da900000000 + .quad 0x3ff3997f40000000 + .quad 0x3ff3955700000000 + .quad 0x3ff3913080000000 + .quad 0x3ff38d0bc0000000 + .quad 0x3ff388e900000000 + .quad 0x3ff384c7c0000000 + .quad 0x3ff380a840000000 + .quad 0x3ff37c8ac0000000 + .quad 0x3ff3786ec0000000 + .quad 0x3ff3745480000000 + .quad 0x3ff3703c00000000 + .quad 0x3ff36c2540000000 + .quad 0x3ff3681040000000 + .quad 0x3ff363fcc0000000 + .quad 0x3ff35feb40000000 + .quad 0x3ff35bdb40000000 + .quad 0x3ff357cd00000000 + .quad 0x3ff353c080000000 + .quad 0x3ff34fb5c0000000 + .quad 0x3ff34bac80000000 + .quad 0x3ff347a540000000 + .quad 0x3ff3439f80000000 + .quad 0x3ff33f9b40000000 + .quad 0x3ff33b9900000000 + .quad 0x3ff3379840000000 + .quad 0x3ff3339900000000 + .quad 0x3ff32f9bc0000000 + .quad 0x3ff32b9fc0000000 + .quad 0x3ff327a5c0000000 + .quad 0x3ff323ad40000000 + .quad 0x3ff31fb680000000 + .quad 0x3ff31bc140000000 + .quad 0x3ff317cdc0000000 + .quad 0x3ff313dbc0000000 + .quad 0x3ff30feb80000000 + .quad 0x3ff30bfd00000000 + .quad 0x3ff3080fc0000000 + .quad 0x3ff3042480000000 + .quad 0x3ff3003ac0000000 + .quad 
0x3ff2fc5280000000 + .quad 0x3ff2f86bc0000000 + .quad 0x3ff2f48700000000 + .quad 0x3ff2f0a380000000 + .quad 0x3ff2ecc1c0000000 + .quad 0x3ff2e8e180000000 + .quad 0x3ff2e502c0000000 + .quad 0x3ff2e125c0000000 + .quad 0x3ff2dd4a40000000 + .quad 0x3ff2d97080000000 + .quad 0x3ff2d59840000000 + .quad 0x3ff2d1c180000000 + .quad 0x3ff2cdec40000000 + .quad 0x3ff2ca1880000000 + .quad 0x3ff2c64680000000 + .quad 0x3ff2c27600000000 + .quad 0x3ff2bea700000000 + .quad 0x3ff2bad9c0000000 + .quad 0x3ff2b70dc0000000 + .quad 0x3ff2b34380000000 + .quad 0x3ff2af7ac0000000 + .quad 0x3ff2abb340000000 + .quad 0x3ff2a7ed80000000 + .quad 0x3ff2a42980000000 + .quad 0x3ff2a066c0000000 + .quad 0x3ff29ca580000000 + .quad 0x3ff298e5c0000000 + .quad 0x3ff29527c0000000 + .quad 0x3ff2916b00000000 + .quad 0x3ff28dafc0000000 + .quad 0x3ff289f640000000 + .quad 0x3ff2863e00000000 + .quad 0x3ff2828740000000 + .quad 0x3ff27ed240000000 + .quad 0x3ff27b1e80000000 + .quad 0x3ff2776c40000000 + .quad 0x3ff273bb80000000 + .quad 0x3ff2700c40000000 + .quad 0x3ff26c5e80000000 + .quad 0x3ff268b200000000 + .quad 0x3ff2650740000000 + .quad 0x3ff2615dc0000000 + .quad 0x3ff25db5c0000000 + .quad 0x3ff25a0f40000000 + .quad 0x3ff2566a40000000 + .quad 0x3ff252c6c0000000 + .quad 0x3ff24f2480000000 + .quad 0x3ff24b83c0000000 + .quad 0x3ff247e480000000 + .quad 0x3ff24446c0000000 + .quad 0x3ff240aa40000000 + .quad 0x3ff23d0f40000000 + .quad 0x3ff23975c0000000 + .quad 0x3ff235dd80000000 + .quad 0x3ff23246c0000000 + .quad 0x3ff22eb180000000 + .quad 0x3ff22b1d80000000 + .quad 0x3ff2278b00000000 + .quad 0x3ff223fa00000000 + .quad 0x3ff2206a40000000 + .quad 0x3ff21cdc00000000 + .quad 0x3ff2194f00000000 + .quad 0x3ff215c380000000 + .quad 0x3ff2123940000000 + .quad 0x3ff20eb080000000 + .quad 0x3ff20b2940000000 + .quad 0x3ff207a340000000 + .quad 0x3ff2041ec0000000 + .quad 0x3ff2009b80000000 + .quad 0x3ff1fd1980000000 + .quad 0x3ff1f99900000000 + .quad 0x3ff1f619c0000000 + .quad 0x3ff1f29c00000000 + .quad 0x3ff1ef1fc0000000 + .quad 0x3ff1eba480000000 + .quad 0x3ff1e82ac0000000 + .quad 0x3ff1e4b280000000 + .quad 0x3ff1e13b80000000 + .quad 0x3ff1ddc5c0000000 + .quad 0x3ff1da5180000000 + .quad 0x3ff1d6de80000000 + .quad 0x3ff1d36cc0000000 + .quad 0x3ff1cffc40000000 + .quad 0x3ff1cc8d40000000 + .quad 0x3ff1c91f80000000 + .quad 0x3ff1c5b340000000 + .quad 0x3ff1c24840000000 + .quad 0x3ff1bede40000000 + .quad 0x3ff1bb7600000000 + .quad 0x3ff1b80ec0000000 + .quad 0x3ff1b4a900000000 + .quad 0x3ff1b14480000000 + .quad 0x3ff1ade140000000 + .quad 0x3ff1aa7f40000000 + .quad 0x3ff1a71e80000000 + .quad 0x3ff1a3bf40000000 + .quad 0x3ff1a06140000000 + .quad 0x3ff19d0480000000 + .quad 0x3ff199a900000000 + .quad 0x3ff1964ec0000000 + .quad 0x3ff192f5c0000000 + .quad 0x3ff18f9e00000000 + .quad 0x3ff18c47c0000000 + .quad 0x3ff188f280000000 + .quad 0x3ff1859ec0000000 + .quad 0x3ff1824c00000000 + .quad 0x3ff17efac0000000 + .quad 0x3ff17baa80000000 + .quad 0x3ff1785bc0000000 + .quad 0x3ff1750e40000000 + .quad 0x3ff171c1c0000000 + .quad 0x3ff16e76c0000000 + .quad 0x3ff16b2d00000000 + .quad 0x3ff167e440000000 + .quad 0x3ff1649d00000000 + .quad 0x3ff16156c0000000 + .quad 0x3ff15e11c0000000 + .quad 0x3ff15ace40000000 + .quad 0x3ff1578bc0000000 + .quad 0x3ff1544a80000000 + .quad 0x3ff1510a80000000 + .quad 0x3ff14dcbc0000000 + .quad 0x3ff14a8e40000000 + .quad 0x3ff14751c0000000 + .quad 0x3ff14416c0000000 + .quad 0x3ff140dcc0000000 + .quad 0x3ff13da400000000 + .quad 0x3ff13a6c80000000 + .quad 0x3ff1373600000000 + .quad 0x3ff1340100000000 + .quad 0x3ff130cd00000000 + .quad 
0x3ff12d9a40000000 + .quad 0x3ff12a68c0000000 + .quad 0x3ff1273840000000 + .quad 0x3ff1240900000000 + .quad 0x3ff120db00000000 + .quad 0x3ff11dae40000000 + .quad 0x3ff11a8280000000 + .quad 0x3ff1175800000000 + .quad 0x3ff1142ec0000000 + .quad 0x3ff11106c0000000 + .quad 0x3ff10ddfc0000000 + .quad 0x3ff10ab9c0000000 + .quad 0x3ff1079540000000 + .quad 0x3ff10471c0000000 + .quad 0x3ff1014f80000000 + .quad 0x3ff0fe2e40000000 + .quad 0x3ff0fb0e40000000 + .quad 0x3ff0f7ef40000000 + .quad 0x3ff0f4d180000000 + .quad 0x3ff0f1b500000000 + .quad 0x3ff0ee9980000000 + .quad 0x3ff0eb7f40000000 + .quad 0x3ff0e86600000000 + .quad 0x3ff0e54e00000000 + .quad 0x3ff0e23700000000 + .quad 0x3ff0df2140000000 + .quad 0x3ff0dc0c80000000 + .quad 0x3ff0d8f900000000 + .quad 0x3ff0d5e6c0000000 + .quad 0x3ff0d2d540000000 + .quad 0x3ff0cfc540000000 + .quad 0x3ff0ccb640000000 + .quad 0x3ff0c9a840000000 + .quad 0x3ff0c69b40000000 + .quad 0x3ff0c38f80000000 + .quad 0x3ff0c08500000000 + .quad 0x3ff0bd7b80000000 + .quad 0x3ff0ba7300000000 + .quad 0x3ff0b76bc0000000 + .quad 0x3ff0b46580000000 + .quad 0x3ff0b16040000000 + .quad 0x3ff0ae5c40000000 + .quad 0x3ff0ab5940000000 + .quad 0x3ff0a85780000000 + .quad 0x3ff0a556c0000000 + .quad 0x3ff0a25700000000 + .quad 0x3ff09f5880000000 + .quad 0x3ff09c5ac0000000 + .quad 0x3ff0995e80000000 + .quad 0x3ff0966300000000 + .quad 0x3ff09368c0000000 + .quad 0x3ff0906f80000000 + .quad 0x3ff08d7740000000 + .quad 0x3ff08a8000000000 + .quad 0x3ff0878a00000000 + .quad 0x3ff0849500000000 + .quad 0x3ff081a100000000 + .quad 0x3ff07eae40000000 + .quad 0x3ff07bbc40000000 + .quad 0x3ff078cb80000000 + .quad 0x3ff075dbc0000000 + .quad 0x3ff072ed00000000 + .quad 0x3ff06fff80000000 + .quad 0x3ff06d12c0000000 + .quad 0x3ff06a2740000000 + .quad 0x3ff0673cc0000000 + .quad 0x3ff0645340000000 + .quad 0x3ff0616ac0000000 + .quad 0x3ff05e8340000000 + .quad 0x3ff05b9d00000000 + .quad 0x3ff058b780000000 + .quad 0x3ff055d340000000 + .quad 0x3ff052f000000000 + .quad 0x3ff0500d80000000 + .quad 0x3ff04d2c40000000 + .quad 0x3ff04a4c00000000 + .quad 0x3ff0476cc0000000 + .quad 0x3ff0448e80000000 + .quad 0x3ff041b140000000 + .quad 0x3ff03ed500000000 + .quad 0x3ff03bf9c0000000 + .quad 0x3ff0391fc0000000 + .quad 0x3ff0364680000000 + .quad 0x3ff0336e40000000 + .quad 0x3ff0309700000000 + .quad 0x3ff02dc0c0000000 + .quad 0x3ff02aeb80000000 + .quad 0x3ff0281740000000 + .quad 0x3ff0254400000000 + .quad 0x3ff02271c0000000 + .quad 0x3ff01fa080000000 + .quad 0x3ff01cd040000000 + .quad 0x3ff01a00c0000000 + .quad 0x3ff0173280000000 + .quad 0x3ff0146540000000 + .quad 0x3ff01198c0000000 + .quad 0x3ff00ecd80000000 + .quad 0x3ff00c0300000000 + .quad 0x3ff0093980000000 + .quad 0x3ff0067100000000 + .quad 0x3ff003a980000000 + .quad 0x3ff000e300000000 + .quad 0x3feffc3a80000000 + .quad 0x3feff6b140000000 + .quad 0x3feff129c0000000 + .quad 0x3fefeba480000000 + .quad 0x3fefe620c0000000 + .quad 0x3fefe09f40000000 + .quad 0x3fefdb1f80000000 + .quad 0x3fefd5a180000000 + .quad 0x3fefd02580000000 + .quad 0x3fefcaab80000000 + .quad 0x3fefc53340000000 + .quad 0x3fefbfbd00000000 + .quad 0x3fefba4880000000 + .quad 0x3fefb4d600000000 + .quad 0x3fefaf6540000000 + .quad 0x3fefa9f680000000 + .quad 0x3fefa48980000000 + .quad 0x3fef9f1e40000000 + .quad 0x3fef99b500000000 + .quad 0x3fef944dc0000000 + .quad 0x3fef8ee800000000 + .quad 0x3fef898440000000 + .quad 0x3fef842280000000 + .quad 0x3fef7ec280000000 + .quad 0x3fef796440000000 + .quad 0x3fef7407c0000000 + .quad 0x3fef6ead40000000 + .quad 0x3fef695480000000 + .quad 0x3fef63fd80000000 + .quad 
0x3fef5ea880000000 + .quad 0x3fef595540000000 + .quad 0x3fef5403c0000000 + .quad 0x3fef4eb400000000 + .quad 0x3fef496640000000 + .quad 0x3fef441a00000000 + .quad 0x3fef3ecfc0000000 + .quad 0x3fef398740000000 + .quad 0x3fef344080000000 + .quad 0x3fef2efb80000000 + .quad 0x3fef29b880000000 + .quad 0x3fef247700000000 + .quad 0x3fef1f3780000000 + .quad 0x3fef19f980000000 + .quad 0x3fef14bd80000000 + .quad 0x3fef0f8340000000 + .quad 0x3fef0a4ac0000000 + .quad 0x3fef0513c0000000 + .quad 0x3feeffdec0000000 + .quad 0x3feefaab80000000 + .quad 0x3feef57a00000000 + .quad 0x3feef04a00000000 + .quad 0x3feeeb1c00000000 + .quad 0x3feee5ef80000000 + .quad 0x3feee0c500000000 + .quad 0x3feedb9c00000000 + .quad 0x3feed67500000000 + .quad 0x3feed14f80000000 + .quad 0x3feecc2bc0000000 + .quad 0x3feec709c0000000 + .quad 0x3feec1e940000000 + .quad 0x3feebccac0000000 + .quad 0x3feeb7adc0000000 + .quad 0x3feeb29280000000 + .quad 0x3feead7900000000 + .quad 0x3feea86140000000 + .quad 0x3feea34b40000000 + .quad 0x3fee9e36c0000000 + .quad 0x3fee992400000000 + .quad 0x3fee941300000000 + .quad 0x3fee8f0380000000 + .quad 0x3fee89f5c0000000 + .quad 0x3fee84e9c0000000 + .quad 0x3fee7fdf40000000 + .quad 0x3fee7ad680000000 + .quad 0x3fee75cf80000000 + .quad 0x3fee70ca00000000 + .quad 0x3fee6bc640000000 + .quad 0x3fee66c440000000 + .quad 0x3fee61c3c0000000 + .quad 0x3fee5cc500000000 + .quad 0x3fee57c7c0000000 + .quad 0x3fee52cc40000000 + .quad 0x3fee4dd280000000 + .quad 0x3fee48da00000000 + .quad 0x3fee43e380000000 + .quad 0x3fee3eee80000000 + .quad 0x3fee39fb00000000 + .quad 0x3fee350940000000 + .quad 0x3fee301940000000 + .quad 0x3fee2b2ac0000000 + .quad 0x3fee263dc0000000 + .quad 0x3fee215280000000 + .quad 0x3fee1c68c0000000 + .quad 0x3fee178080000000 + .quad 0x3fee129a00000000 + .quad 0x3fee0db540000000 + .quad 0x3fee08d1c0000000 + .quad 0x3fee03f000000000 + .quad 0x3fedff1000000000 + .quad 0x3fedfa3140000000 + .quad 0x3fedf55440000000 + .quad 0x3fedf07900000000 + .quad 0x3fedeb9f00000000 + .quad 0x3fede6c6c0000000 + .quad 0x3fede1f040000000 + .quad 0x3feddd1b00000000 + .quad 0x3fedd84780000000 + .quad 0x3fedd37580000000 + .quad 0x3fedcea500000000 + .quad 0x3fedc9d600000000 + .quad 0x3fedc508c0000000 + .quad 0x3fedc03d00000000 + .quad 0x3fedbb72c0000000 + .quad 0x3fedb6aa00000000 + .quad 0x3fedb1e2c0000000 + .quad 0x3fedad1d00000000 + .quad 0x3feda85900000000 + .quad 0x3feda39680000000 + .quad 0x3fed9ed540000000 + .quad 0x3fed9a15c0000000 + .quad 0x3fed9557c0000000 + .quad 0x3fed909b40000000 + .quad 0x3fed8be040000000 + .quad 0x3fed8726c0000000 + .quad 0x3fed826f00000000 + .quad 0x3fed7db880000000 + .quad 0x3fed790380000000 + .quad 0x3fed745000000000 + .quad 0x3fed6f9e40000000 + .quad 0x3fed6aedc0000000 + .quad 0x3fed663ec0000000 + .quad 0x3fed619140000000 + .quad 0x3fed5ce540000000 + .quad 0x3fed583ac0000000 + .quad 0x3fed5391c0000000 + .quad 0x3fed4eea40000000 + .quad 0x3fed4a4440000000 + .quad 0x3fed459f80000000 + .quad 0x3fed40fc80000000 + .quad 0x3fed3c5ac0000000 + .quad 0x3fed37bac0000000 + .quad 0x3fed331c00000000 + .quad 0x3fed2e7ec0000000 + .quad 0x3fed29e300000000 + .quad 0x3fed254880000000 + .quad 0x3fed20afc0000000 + .quad 0x3fed1c1840000000 + .quad 0x3fed178240000000 + .quad 0x3fed12edc0000000 + .quad 0x3fed0e5ac0000000 + .quad 0x3fed09c900000000 + .quad 0x3fed0538c0000000 + .quad 0x3fed00aa00000000 + .quad 0x3fecfc1c80000000 + .quad 0x3fecf790c0000000 + .quad 0x3fecf30600000000 + .quad 0x3fecee7d00000000 + .quad 0x3fece9f540000000 + .quad 0x3fece56f00000000 + .quad 0x3fece0ea40000000 + .quad 
0x3fecdc66c0000000 + .quad 0x3fecd7e4c0000000 + .quad 0x3fecd36440000000 + .quad 0x3feccee500000000 + .quad 0x3fecca6740000000 + .quad 0x3fecc5eac0000000 + .quad 0x3fecc16fc0000000 + .quad 0x3fecbcf640000000 + .quad 0x3fecb87e00000000 + .quad 0x3fecb40740000000 + .quad 0x3fecaf91c0000000 + .quad 0x3fecab1dc0000000 + .quad 0x3feca6ab00000000 + .quad 0x3feca239c0000000 + .quad 0x3fec9dc9c0000000 + .quad 0x3fec995b40000000 + .quad 0x3fec94ee00000000 + .quad 0x3fec908240000000 + .quad 0x3fec8c17c0000000 + .quad 0x3fec87aec0000000 + .quad 0x3fec834700000000 + .quad 0x3fec7ee0c0000000 + .quad 0x3fec7a7bc0000000 + .quad 0x3fec761800000000 + .quad 0x3fec71b5c0000000 + .quad 0x3fec6d54c0000000 + .quad 0x3fec68f540000000 + .quad 0x3fec649700000000 + .quad 0x3fec603a00000000 + .quad 0x3fec5bde80000000 + .quad 0x3fec578440000000 + .quad 0x3fec532b80000000 + .quad 0x3fec4ed3c0000000 + .quad 0x3fec4a7dc0000000 + .quad 0x3fec4628c0000000 + .quad 0x3fec41d540000000 + .quad 0x3fec3d8300000000 + .quad 0x3fec393200000000 + .quad 0x3fec34e240000000 + .quad 0x3fec309400000000 + .quad 0x3fec2c4700000000 + .quad 0x3fec27fb80000000 + .quad 0x3fec23b100000000 + .quad 0x3fec1f6800000000 + .quad 0x3fec1b2040000000 + .quad 0x3fec16d9c0000000 + .quad 0x3fec1294c0000000 + .quad 0x3fec0e50c0000000 + .quad 0x3fec0a0e40000000 + .quad 0x3fec05cd00000000 + .quad 0x3fec018d00000000 + .quad 0x3febfd4e40000000 + .quad 0x3febf91100000000 + .quad 0x3febf4d4c0000000 + .quad 0x3febf09a00000000 + .quad 0x3febec6080000000 + .quad 0x3febe82840000000 + .quad 0x3febe3f140000000 + .quad 0x3febdfbb80000000 + .quad 0x3febdb8700000000 + .quad 0x3febd753c0000000 + .quad 0x3febd32200000000 + .quad 0x3febcef140000000 + .quad 0x3febcac1c0000000 + .quad 0x3febc693c0000000 + .quad 0x3febc266c0000000 + .quad 0x3febbe3b40000000 + .quad 0x3febba10c0000000 + .quad 0x3febb5e7c0000000 + .quad 0x3febb1bfc0000000 + .quad 0x3febad9940000000 + .quad 0x3feba973c0000000 + .quad 0x3feba54fc0000000 + .quad 0x3feba12cc0000000 + .quad 0x3feb9d0b00000000 + .quad 0x3feb98eac0000000 + .quad 0x3feb94cb80000000 + .quad 0x3feb90ad80000000 + .quad 0x3feb8c90c0000000 + .quad 0x3feb887540000000 + .quad 0x3feb845b00000000 + .quad 0x3feb8041c0000000 + .quad 0x3feb7c2a00000000 + .quad 0x3feb781340000000 + .quad 0x3feb73fe00000000 + .quad 0x3feb6fe9c0000000 + .quad 0x3feb6bd6c0000000 + .quad 0x3feb67c500000000 + .quad 0x3feb63b440000000 + .quad 0x3feb5fa500000000 + .quad 0x3feb5b96c0000000 + .quad 0x3feb5789c0000000 + .quad 0x3feb537e00000000 + .quad 0x3feb4f7380000000 + .quad 0x3feb4b6a00000000 + .quad 0x3feb476200000000 + .quad 0x3feb435b00000000 + .quad 0x3feb3f5540000000 + .quad 0x3feb3b5080000000 + .quad 0x3feb374d00000000 + .quad 0x3feb334ac0000000 + .quad 0x3feb2f49c0000000 + .quad 0x3feb2b49c0000000 + .quad 0x3feb274b40000000 + .quad 0x3feb234d80000000 + .quad 0x3feb1f5140000000 + .quad 0x3feb1b5600000000 + .quad 0x3feb175c00000000 + .quad 0x3feb136300000000 + .quad 0x3feb0f6b80000000 + .quad 0x3feb0b74c0000000 + .quad 0x3feb077f80000000 + .quad 0x3feb038b40000000 + .quad 0x3feaff9840000000 + .quad 0x3feafba640000000 + .quad 0x3feaf7b580000000 + .quad 0x3feaf3c600000000 + .quad 0x3feaefd780000000 + .quad 0x3feaebea40000000 + .quad 0x3feae7fe00000000 + .quad 0x3feae41300000000 + .quad 0x3feae02900000000 + .quad 0x3feadc4040000000 + .quad 0x3fead858c0000000 + .quad 0x3fead47240000000 + .quad 0x3fead08cc0000000 + .quad 0x3feacca8c0000000 + .quad 0x3feac8c580000000 + .quad 0x3feac4e380000000 + .quad 0x3feac102c0000000 + .quad 0x3feabd2300000000 + .quad 
0x3feab94480000000 + .quad 0x3feab56700000000 + .quad 0x3feab18a80000000 + .quad 0x3feaadaf80000000 + .quad 0x3feaa9d540000000 + .quad 0x3feaa5fc40000000 + .quad 0x3feaa22440000000 + .quad 0x3fea9e4d80000000 + .quad 0x3fea9a77c0000000 + .quad 0x3fea96a340000000 + .quad 0x3fea92cfc0000000 + .quad 0x3fea8efd80000000 + .quad 0x3fea8b2c40000000 + .quad 0x3fea875c00000000 + .quad 0x3fea838cc0000000 + .quad 0x3fea7fbec0000000 + .quad 0x3fea7bf200000000 + .quad 0x3fea782640000000 + .quad 0x3fea745b80000000 + .quad 0x3fea7091c0000000 + .quad 0x3fea6cc940000000 + .quad 0x3fea6901c0000000 + .quad 0x3fea653b40000000 + .quad 0x3fea617600000000 + .quad 0x3fea5db1c0000000 + .quad 0x3fea59ee80000000 + .quad 0x3fea562c80000000 + .quad 0x3fea526b80000000 + .quad 0x3fea4eab80000000 + .quad 0x3fea4aecc0000000 + .quad 0x3fea472ec0000000 + .quad 0x3fea437200000000 + .quad 0x3fea3fb640000000 + .quad 0x3fea3bfbc0000000 + .quad 0x3fea384240000000 + .quad 0x3fea3489c0000000 + .quad 0x3fea30d240000000 + .quad 0x3fea2d1bc0000000 + .quad 0x3fea296680000000 + .quad 0x3fea25b200000000 + .quad 0x3fea21fec0000000 + .quad 0x3fea1e4cc0000000 + .quad 0x3fea1a9b80000000 + .quad 0x3fea16eb40000000 + .quad 0x3fea133c40000000 + .quad 0x3fea0f8e40000000 + .quad 0x3fea0be140000000 + .quad 0x3fea083540000000 + .quad 0x3fea048a40000000 + .quad 0x3fea00e080000000 + .quad 0x3fe9fd3780000000 + .quad 0x3fe9f98fc0000000 + .quad 0x3fe9f5e900000000 + .quad 0x3fe9f24340000000 + .quad 0x3fe9ee9e80000000 + .quad 0x3fe9eafac0000000 + .quad 0x3fe9e75800000000 + .quad 0x3fe9e3b640000000 + .quad 0x3fe9e01580000000 + .quad 0x3fe9dc7600000000 + .quad 0x3fe9d8d740000000 + .quad 0x3fe9d539c0000000 + .quad 0x3fe9d19d00000000 + .quad 0x3fe9ce0180000000 + .quad 0x3fe9ca66c0000000 + .quad 0x3fe9c6cd40000000 + .quad 0x3fe9c33480000000 + .quad 0x3fe9bf9d00000000 + .quad 0x3fe9bc0680000000 + .quad 0x3fe9b870c0000000 + .quad 0x3fe9b4dc40000000 + .quad 0x3fe9b148c0000000 + .quad 0x3fe9adb600000000 + .quad 0x3fe9aa2480000000 + .quad 0x3fe9a693c0000000 + .quad 0x3fe9a30440000000 + .quad 0x3fe99f7580000000 + .quad 0x3fe99be7c0000000 + .quad 0x3fe9985b40000000 + .quad 0x3fe994cf80000000 + .quad 0x3fe99144c0000000 + .quad 0x3fe98dbb00000000 + .quad 0x3fe98a3240000000 + .quad 0x3fe986aa80000000 + .quad 0x3fe98323c0000000 + .quad 0x3fe97f9e00000000 + .quad 0x3fe97c1900000000 + .quad 0x3fe9789540000000 + .quad 0x3fe9751240000000 + .quad 0x3fe9719080000000 + .quad 0x3fe96e0f80000000 + .quad 0x3fe96a8f80000000 + .quad 0x3fe9671040000000 + .quad 0x3fe9639240000000 + .quad 0x3fe9601540000000 + .quad 0x3fe95c9900000000 + .quad 0x3fe9591dc0000000 + .quad 0x3fe955a380000000 + .quad 0x3fe9522a40000000 + .quad 0x3fe94eb200000000 + .quad 0x3fe94b3a80000000 + .quad 0x3fe947c400000000 + .quad 0x3fe9444e80000000 + .quad 0x3fe940da00000000 + .quad 0x3fe93d6640000000 + .quad 0x3fe939f3c0000000 + .quad 0x3fe9368200000000 + .quad 0x3fe9331140000000 + .quad 0x3fe92fa140000000 + .quad 0x3fe92c3280000000 + .quad 0x3fe928c480000000 + .quad 0x3fe9255780000000 + .quad 0x3fe921eb40000000 + .quad 0x3fe91e8040000000 + .quad 0x3fe91b1600000000 + .quad 0x3fe917ac80000000 + .quad 0x3fe9144440000000 + .quad 0x3fe910dcc0000000 + .quad 0x3fe90d7640000000 + .quad 0x3fe90a1080000000 + .quad 0x3fe906abc0000000 + .quad 0x3fe9034800000000 + .quad 0x3fe8ffe540000000 + .quad 0x3fe8fc8340000000 + .quad 0x3fe8f92240000000 + .quad 0x3fe8f5c200000000 + .quad 0x3fe8f26300000000 + .quad 0x3fe8ef0480000000 + .quad 0x3fe8eba740000000 + .quad 0x3fe8e84ac0000000 + .quad 0x3fe8e4ef40000000 + .quad 
0x3fe8e19480000000 + .quad 0x3fe8de3ac0000000 + .quad 0x3fe8dae1c0000000 + .quad 0x3fe8d78a00000000 + .quad 0x3fe8d432c0000000 + .quad 0x3fe8d0dcc0000000 + .quad 0x3fe8cd8780000000 + .quad 0x3fe8ca3300000000 + .quad 0x3fe8c6df80000000 + .quad 0x3fe8c38d00000000 + .quad 0x3fe8c03b40000000 + .quad 0x3fe8bcea80000000 + .quad 0x3fe8b99a80000000 + .quad 0x3fe8b64b80000000 + .quad 0x3fe8b2fd40000000 + .quad 0x3fe8afb000000000 + .quad 0x3fe8ac63c0000000 + .quad 0x3fe8a91840000000 + .quad 0x3fe8a5cd80000000 + .quad 0x3fe8a283c0000000 + .quad 0x3fe89f3b00000000 + .quad 0x3fe89bf300000000 + .quad 0x3fe898abc0000000 + .quad 0x3fe8956580000000 + .quad 0x3fe8922040000000 + .quad 0x3fe88edbc0000000 + .quad 0x3fe88b9800000000 + .quad 0x3fe8885540000000 + .quad 0x3fe8851380000000 + .quad 0x3fe881d240000000 + .quad 0x3fe87e9240000000 + .quad 0x3fe87b52c0000000 + .quad 0x3fe8781480000000 + .quad 0x3fe874d6c0000000 + .quad 0x3fe8719a00000000 + .quad 0x3fe86e5e40000000 + .quad 0x3fe86b2340000000 + .quad 0x3fe867e900000000 + .quad 0x3fe864afc0000000 + .quad 0x3fe8617740000000 + .quad 0x3fe85e3f80000000 + .quad 0x3fe85b08c0000000 + .quad 0x3fe857d300000000 + .quad 0x3fe8549dc0000000 + .quad 0x3fe8516980000000 + .quad 0x3fe84e3640000000 + .quad 0x3fe84b03c0000000 + .quad 0x3fe847d200000000 + .quad 0x3fe844a100000000 + .quad 0x3fe8417100000000 + .quad 0x3fe83e4200000000 + .quad 0x3fe83b1380000000 + .quad 0x3fe837e600000000 + .quad 0x3fe834b940000000 + .quad 0x3fe8318d80000000 + .quad 0x3fe82e6280000000 + .quad 0x3fe82b3840000000 + .quad 0x3fe8280f00000000 + .quad 0x3fe824e640000000 + .quad 0x3fe821bec0000000 + .quad 0x3fe81e97c0000000 + .quad 0x3fe81b71c0000000 + .quad 0x3fe8184c80000000 + .quad 0x3fe8152800000000 + .quad 0x3fe8120480000000 + .quad 0x3fe80ee1c0000000 + .quad 0x3fe80bbfc0000000 + .quad 0x3fe8089e80000000 + .quad 0x3fe8057e40000000 + .quad 0x3fe8025ec0000000 + .quad 0x3fe7ff4000000000 + .quad 0x3fe7fc2200000000 + .quad 0x3fe7f90500000000 + .quad 0x3fe7f5e8c0000000 + .quad 0x3fe7f2cd40000000 + .quad 0x3fe7efb280000000 + .quad 0x3fe7ec9880000000 + .quad 0x3fe7e97f80000000 + .quad 0x3fe7e66740000000 + .quad 0x3fe7e34fc0000000 + .quad 0x3fe7e03940000000 + .quad 0x3fe7dd2340000000 + .quad 0x3fe7da0e40000000 + .quad 0x3fe7d6fa00000000 + .quad 0x3fe7d3e680000000 + .quad 0x3fe7d0d3c0000000 + .quad 0x3fe7cdc1c0000000 + .quad 0x3fe7cab0c0000000 + .quad 0x3fe7c7a080000000 + .quad 0x3fe7c49100000000 + .quad 0x3fe7c18240000000 + .quad 0x3fe7be7440000000 + .quad 0x3fe7bb6700000000 + .quad 0x3fe7b85ac0000000 + .quad 0x3fe7b54f00000000 + .quad 0x3fe7b24440000000 + .quad 0x3fe7af3a40000000 + .quad 0x3fe7ac3100000000 + .quad 0x3fe7a92880000000 + .quad 0x3fe7a620c0000000 + .quad 0x3fe7a319c0000000 + .quad 0x3fe7a013c0000000 + .quad 0x3fe79d0e40000000 + .quad 0x3fe79a09c0000000 + .quad 0x3fe7970600000000 + .quad 0x3fe79402c0000000 + .quad 0x3fe7910080000000 + .quad 0x3fe78dff00000000 + .quad 0x3fe78afe40000000 + .quad 0x3fe787fe40000000 + .quad 0x3fe784ff00000000 + .quad 0x3fe7820080000000 + .quad 0x3fe77f02c0000000 + .quad 0x3fe77c05c0000000 + .quad 0x3fe77909c0000000 + .quad 0x3fe7760e40000000 + .quad 0x3fe7731380000000 + .quad 0x3fe77019c0000000 + .quad 0x3fe76d2080000000 + .quad 0x3fe76a2800000000 + .quad 0x3fe7673080000000 + .quad 0x3fe7643980000000 + .quad 0x3fe7614340000000 + .quad 0x3fe75e4e00000000 + .quad 0x3fe75b5940000000 + .quad 0x3fe7586580000000 + .quad 0x3fe7557240000000 + .quad 0x3fe7527fc0000000 + .quad 0x3fe74f8e40000000 + .quad 0x3fe74c9d40000000 + .quad 0x3fe749ad00000000 + .quad 
0x3fe746bd80000000 + .quad 0x3fe743cec0000000 + .quad 0x3fe740e100000000 + .quad 0x3fe73df3c0000000 + .quad 0x3fe73b0740000000 + .quad 0x3fe7381b80000000 + .quad 0x3fe7353080000000 + .quad 0x3fe7324600000000 + .quad 0x3fe72f5c80000000 + .quad 0x3fe72c73c0000000 + .quad 0x3fe7298b80000000 + .quad 0x3fe726a440000000 + .quad 0x3fe723bd80000000 + .quad 0x3fe720d7c0000000 + .quad 0x3fe71df280000000 + .quad 0x3fe71b0e00000000 + .quad 0x3fe7182a40000000 + .quad 0x3fe7154740000000 + .quad 0x0000000000000000 + .rept 48 + .byte 0 + .endr + +/* Log(2) lookup table for log part (non HSW): */ +.if .-__svml_dpow_data != _log2_t1 +.err +.endif + .rept 2 + .quad 0x0000000000000000 + .endr + .quad 0x3f5712e100000000 + .quad 0x3e0ee8a22f7c5987 + .quad 0x3f670fc100000000 + .quad 0x3e17e16043fd7529 + .quad 0x3f71497700000000 + .quad 0x3e239efb866b119c + .quad 0x3f7709bb00000000 + .quad 0x3e1b5ea7ee997dc0 + .quad 0x3f7cc8aa00000000 + .quad 0x3e2efad156451e8d + .quad 0x3f81430200000000 + .quad 0x3e204975bf955ee8 + .quad 0x3f84210300000000 + .quad 0x3e2e526353333f9a + .quad 0x3f86fe5800000000 + .quad 0x3e2dbbc5d9986525 + .quad 0x3f89dae000000000 + .quad 0x3e211ae127d370f8 + .quad 0x3f8cb6ba00000000 + .quad 0x3e2af44e8a20fe77 + .quad 0x3f8f91e600000000 + .quad 0x3e1f77bd1cd9fbc7 + .quad 0x3f91363100000000 + .quad 0x3e40f52f789c83a3 + .quad 0x3f92a31800000000 + .quad 0x3e172308c2064b24 + .quad 0x3f940f9600000000 + .quad 0x3e2f342d9eb8aeed + .quad 0x3f957bbb00000000 + .quad 0x3e4abb9a144866b7 + .quad 0x3f96e79800000000 + .quad 0x3e48b85ac72b0200 + .quad 0x3f98530c00000000 + .quad 0x3e2d1e01fbc85d86 + .quad 0x3f99be3600000000 + .quad 0x3e37d26f00cda0dd + .quad 0x3f9b28f600000000 + .quad 0x3e3433218e840f16 + .quad 0x3f9c935b00000000 + .quad 0x3e4f50a107fb8c37 + .quad 0x3f9dfd7700000000 + .quad 0x3e3604e609a9e948 + .quad 0x3f9f673700000000 + .quad 0x3e489f0de52d1118 + .quad 0x3fa0684e00000000 + .quad 0x3e4d127bd17abd42 + .quad 0x3fa11cd300000000 + .quad 0x3e3a899b4ece6057 + .quad 0x3fa1d12900000000 + .quad 0x3e5f0d0f99858cfa + .quad 0x3fa2855a00000000 + .quad 0x3e58b94e89d977a4 + .quad 0x3fa3395d00000000 + .quad 0x3e402a7f6bf76796 + .quad 0x3fa3ed3100000000 + .quad 0x3e3e342da3e0aab6 + .quad 0x3fa4a0de00000000 + .quad 0x3e58cae94cd5496b + .quad 0x3fa5545500000000 + .quad 0x3e3fdc64d89d4032 + .quad 0x3fa607ad00000000 + .quad 0x3e37dfd30f154124 + .quad 0x3fa6bad500000000 + .quad 0x3e5eb1e05460b0e3 + .quad 0x3fa76dcf00000000 + .quad 0x3e490ead14c7109d + .quad 0x3fa820a100000000 + .quad 0x3e5258eaf10715e3 + .quad 0x3fa8d34400000000 + .quad 0x3e242a28e25fb4d0 + .quad 0x3fa985bf00000000 + .quad 0x3dfa4a83c146ec0f + .quad 0x3faa381200000000 + .quad 0x3e3c7de45fe856f6 + .quad 0x3faaea3500000000 + .quad 0x3e408258f0914a28 + .quad 0x3fab9c3000000000 + .quad 0x3e3f9589c628dfe0 + .quad 0x3fac4dfa00000000 + .quad 0x3e5721556bde9f1f + .quad 0x3facff9c00000000 + .quad 0x3e5a8867f80f2a46 + .quad 0x3fadb11600000000 + .quad 0x3e4a583c979a598e + .quad 0x3fae626700000000 + .quad 0x3e443847800c1405 + .quad 0x3faf138700000000 + .quad 0x3e1664a168a10688 + .quad 0x3fafc48600000000 + .quad 0x3e2eb49173242e2e + .quad 0x3fb03aa900000000 + .quad 0x3e6b1b90df1d2899 + .quad 0x3fb092fb00000000 + .quad 0x3e6f4828dce8ef96 + .quad 0x3fb0eb3900000000 + .quad 0x3e57e8a84071ed7c + .quad 0x3fb1436100000000 + .quad 0x3e6ea26e46fc50e3 + .quad 0x3fb19b7500000000 + .quad 0x3e64d3ec52377554 + .quad 0x3fb1f37000000000 + .quad 0x3e46a5728109990d + .quad 0x3fb24b5900000000 + .quad 0x3e6b426b10e12ca0 + .quad 0x3fb2a32e00000000 + .quad 0x3e59bbba7c1b46c7 + .quad 
0x3fb2faed00000000 + .quad 0x3e67f99638784faf + .quad 0x3fb3529c00000000 + .quad 0x3e1e52f196858161 + .quad 0x3fb3aa3000000000 + .quad 0x3e67a4fe6def19e6 + .quad 0x3fb401b000000000 + .quad 0x3e0302a326e6a3dc + .quad 0x3fb4591d00000000 + .quad 0x3e6fa21b2e435f49 + .quad 0x3fb4b07600000000 + .quad 0x3e58415e51626967 + .quad 0x3fb507b900000000 + .quad 0x3e3a033d6c5941c4 + .quad 0x3fb55ee600000000 + .quad 0x3e33c8467c54296b + .quad 0x3fb5b60100000000 + .quad 0x3e5e02f5a12fe65d + .quad 0x3fb60d0600000000 + .quad 0x3e6ecfc86d9ed70d + .quad 0x3fb663f600000000 + .quad 0x3e5eb24497a376b8 + .quad 0x3fb6bad400000000 + .quad 0x3e48c77f72e2b40f + .quad 0x3fb7119b00000000 + .quad 0x3e68ed7d5e52d89e + .quad 0x3fb7684d00000000 + .quad 0x3e43fa7ea9d3799b + .quad 0x3fb7beec00000000 + .quad 0x3e60571414f770db + .quad 0x3fb8157900000000 + .quad 0x3e68c7d07f316ee3 + .quad 0x3fb86bf000000000 + .quad 0x3e6360f420c77bec + .quad 0x3fb8c25000000000 + .quad 0x3e6d91c947d50fa1 + .quad 0x3fb918a300000000 + .quad 0x3e4b231ba93bd154 + .quad 0x3fb96eda00000000 + .quad 0x3e61d38c8099fddd + .quad 0x3fb9c50300000000 + .quad 0x3e677eeb9b0174ac + .quad 0x3fba1b1100000000 + .quad 0x3e69d6ddd016014c + .quad 0x3fba711100000000 + .quad 0x3e626690842b7789 + .quad 0x3fbac6fa00000000 + .quad 0x3e5830b93095c531 + .quad 0x3fbb1cd000000000 + .quad 0x3e5c2b99518e0d2c + .quad 0x3fbb729300000000 + .quad 0x3e66279b91823620 + .quad 0x3fbbc84400000000 + .quad 0x3e30adafc9057ecc + .quad 0x3fbc1ddd00000000 + .quad 0x3e461ce45269682a + .quad 0x3fbc736300000000 + .quad 0x3e5044ef5f2fe276 + .quad 0x3fbcc8d600000000 + .quad 0x3e4eb3dbd5234ce7 + .quad 0x3fbd1e3600000000 + .quad 0x3e2eb70a6e724019 + .quad 0x3fbd737e00000000 + .quad 0x3e5403a5977b9a51 + .quad 0x3fbdc8b700000000 + .quad 0x3e62d343b2886c33 + .quad 0x3fbe1ddd00000000 + .quad 0x3e5f443cfbd572a9 + .quad 0x3fbe72eb00000000 + .quad 0x3e632ff4a08c00ad + .quad 0x3fbec7ea00000000 + .quad 0x3e611d934f5c870b + .quad 0x3fbf1cd100000000 + .quad 0x3e610afc18ecc7fd + .quad 0x3fbf71a900000000 + .quad 0x3e4c5db9d4383f15 + .quad 0x3fbfc66800000000 + .quad 0x3e6a615fe5dcf50a + .quad 0x3fc00d8c00000000 + .quad 0x3e6f8684b8524b4d + .quad 0x3fc037da00000000 + .quad 0x3e7471e52c396096 + .quad 0x3fc0621e00000000 + .quad 0x3e7a1aad94d3758a + .quad 0x3fc08c5800000000 + .quad 0x3e7f9b4f573cd19d + .quad 0x3fc0b68900000000 + .quad 0x3e4e88e925a98afd + .quad 0x3fc0e0b100000000 + .quad 0x3e677212d0eeb433 + .quad 0x3fc10acd00000000 + .quad 0x3e63ff48e459228f + .quad 0x3fc134e100000000 + .quad 0x3e63a241697adc33 + .quad 0x3fc15eeb00000000 + .quad 0x3e4f4a7ae82699a0 + .quad 0x3fc188ec00000000 + .quad 0x3e7d83a2e1fe8196 + .quad 0x3fc1b2e400000000 + .quad 0x3e6e765c52c5b577 + .quad 0x3fc1dcd100000000 + .quad 0x3e77eaa5780399be + .quad 0x3fc206b400000000 + .quad 0x3e766c5ef95ab1fc + .quad 0x3fc2308f00000000 + .quad 0x3e703a52d5db6084 + .quad 0x3fc25a6200000000 + .quad 0x3e51786d7d82f6f1 + .quad 0x3fc2842a00000000 + .quad 0x3e6641ea2ded60b8 + .quad 0x3fc2ade800000000 + .quad 0x3e4addfbeaa772f7 + .quad 0x3fc2d79b00000000 + .quad 0x3e67cdfbbc061e04 + .quad 0x3fc3014800000000 + .quad 0x3e717ad775a7481b + .quad 0x3fc32ae800000000 + .quad 0x3e7e4f15a673baf4 + .quad 0x3fc3548300000000 + .quad 0x3e58eca1813fa934 + .quad 0x3fc37e1200000000 + .quad 0x3e7a3622382e96fb + .quad 0x3fc3a79700000000 + .quad 0x3e7916bb2a2cea0a + .quad 0x3fc3d11400000000 + .quad 0x3e61e6a28aaa11cb + .quad 0x3fc3fa8800000000 + .quad 0x3e61a3ceca68f920 + .quad 0x3fc423f100000000 + .quad 0x3e705825c8caf8ed + .quad 0x3fc44d5200000000 + .quad 
0x3e572d6f71f4b037 + .quad 0x3fc476aa00000000 + .quad 0x3e6060fdf3cabb49 + .quad 0x3fc49ff700000000 + .quad 0x3e6df855c48e67aa + .quad 0x3fc4c93e00000000 + .quad 0x3e60854767c83d89 + .quad 0x3fc4f27700000000 + .quad 0x3e7c27d2adfa3cf1 + .quad 0x3fc51bab00000000 + .quad 0x3e21e96f77a9b8ff + .quad 0x3fc544d500000000 + .quad 0x3e69b89066da0127 + .quad 0x3fc56df400000000 + .quad 0x3e7831ab063f0639 + .quad 0x3fc5970b00000000 + .quad 0x3e62a3ff97f4402e + .quad 0x3fc5c01b00000000 + .quad 0x3e5cfdec6aa61224 + .quad 0x3fc5e92000000000 + .quad 0x3e30bf99a341739b + .quad 0x3fc6121900000000 + .quad 0x3e7589025c069af7 + .quad 0x3fc63b0c00000000 + .quad 0x3e73e7c70dc28176 + .quad 0x3fc663f600000000 + .quad 0x3e7319225255ed92 + .quad 0x3fc68cd700000000 + .quad 0x3e721d999e92e626 + .quad 0x3fc6b5af00000000 + .quad 0x3e6feaba3c111c8a + .quad 0x3fc6de7e00000000 + .quad 0x3e67408ffba276e0 + .quad 0x3fc7074100000000 + .quad 0x3e7b9de032cb0fd0 + .quad 0x3fc72ffe00000000 + .quad 0x3e6fbab18df0f78e + .quad 0x3fc758b100000000 + .quad 0x3e7eed8f544cc58a + .quad 0x3fc7815c00000000 + .quad 0x3e5f34382f992a55 + .quad 0x3fc7a9ff00000000 + .quad 0x3e723a0bf2565894 + .quad 0x3fc7d29700000000 + .quad 0x3e6784d72660bf64 + .quad 0x3fc7fb2800000000 + .quad 0x3e53cef9f2a00fda + .quad 0x3fc823ad00000000 + .quad 0x3e6636827e73660e + .quad 0x3fc84c2b00000000 + .quad 0x3e6e0bc0ce905e5f + .quad 0x3fc874a000000000 + .quad 0x3e5b40d32ca21b4f + .quad 0x3fc89d0d00000000 + .quad 0x3e7a968650124684 + .quad 0x3fc8c56f00000000 + .quad 0x3e7724c9f4c54dc2 + .quad 0x3fc8edca00000000 + .quad 0x3e6b8d4ab3e3b13c + .quad 0x3fc9161b00000000 + .quad 0x3e74576bcfdafe5e + .quad 0x3fc93e6500000000 + .quad 0x3e7332208c376c3f + .quad 0x3fc966a600000000 + .quad 0x3df175e083c82deb + .quad 0x3fc98edc00000000 + .quad 0x3e79efce11aa7d30 + .quad 0x3fc9b70c00000000 + .quad 0x3e62ae7840b35985 + .quad 0x3fc9df3200000000 + .quad 0x3e4e8c13081d57dc + .quad 0x3fca074e00000000 + .quad 0x3e60b028bf61097b + .quad 0x3fca2f6200000000 + .quad 0x3e7fa41706304e8f + .quad 0x3fca576d00000000 + .quad 0x3e7f0e5f94377493 + .quad 0x3fca7f7100000000 + .quad 0x3e6edeeabeeeab1a + .quad 0x3fcaa76d00000000 + .quad 0x3e6fdf22f0ca6c0d + .quad 0x3fcacf5d00000000 + .quad 0x3e676d3aee892f9c + .quad 0x3fcaf74700000000 + .quad 0x3e7fbc37f3121ab7 + .quad 0x3fcb1f2800000000 + .quad 0x3e7717af8e5dd5b2 + .quad 0x3fcb46ff00000000 + .quad 0x3e70c006784d6d72 + .quad 0x3fcb6ece00000000 + .quad 0x3e75ebf2abe7a8f0 + .quad 0x3fcb969600000000 + .quad 0x3e570772e1aa6f94 + .quad 0x3fcbbe5300000000 + .quad 0x3e7507e05d60e5c4 + .quad 0x3fcbe60900000000 + .quad 0x3e6a479c1c7622d5 + .quad 0x3fcc0db700000000 + .quad 0x3e6a7653cad63a6a + .quad 0x3fcc355b00000000 + .quad 0x3e63c6576ac08e77 + .quad 0x3fcc5cf700000000 + .quad 0x3e696181ff9674a7 + .quad 0x3fcc848b00000000 + .quad 0x3e74c88b88cb08d4 + .quad 0x3fccac1500000000 + .quad 0x3e768ee1a3f58613 + .quad 0x3fccd39700000000 + .quad 0x3e7bc7d00e53901c + .quad 0x3fccfb1200000000 + .quad 0x3e4cb8c314503175 + .quad 0x3fcd228400000000 + .quad 0x3e6a40646984129b + .quad 0x3fcd49ee00000000 + .quad 0x3e77864b48c32b3c + .quad 0x3fcd714e00000000 + .quad 0x3e76dc470f22f1ee + .quad 0x3fcd98a900000000 + .quad 0x3e153043b87205ac + .quad 0x3fcdbff800000000 + .quad 0x3e7ce2096f5baed1 + .quad 0x3fcde74000000000 + .quad 0x3e76b6293b0e2ea0 + .quad 0x3fce0e8000000000 + .quad 0x3e69e5c03298a8d0 + .quad 0x3fce35b500000000 + .quad 0x3e7359a4add9086c + .quad 0x3fce5ce400000000 + .quad 0x3e7fbba6e4320b0b + .quad 0x3fce840c00000000 + .quad 0x3e57a7356760bf17 + .quad 
0x3fceab2b00000000 + .quad 0x3e5412dd4c71d4aa + .quad 0x3fced23f00000000 + .quad 0x3e708cbbd3de4f64 + .quad 0x3fcef94d00000000 + .quad 0x3e7ed1ec6fb9ef8f + .quad 0x3fcf205400000000 + .quad 0x3e4b20911d7e37db + .quad 0x3fcf474f00000000 + .quad 0x3e7192aee74aaf85 + .quad 0x3fcf6e4500000000 + .quad 0x3de9ff7395251cf5 + .quad 0x3fcf953200000000 + .quad 0x3e418fcf45710fc3 + .quad 0x3fcfbc1600000000 + .quad 0x3e77204d0144751b + .quad 0x3fcfe2f200000000 + .quad 0x3e7df662b4d59d8e + .quad 0x3fd004e300000000 + .quad 0x3e75d25f17b09d21 + .quad 0x3fd0184a00000000 + .quad 0x3e64044284485ca5 + .quad 0x3fd02bab00000000 + .quad 0x3e80a9a0c732cb2c + .quad 0x3fd03f0900000000 + .quad 0x3e89a98ad1490635 + .quad 0x3fd0526300000000 + .quad 0x3e897756562a827f + .quad 0x3fd065b900000000 + .quad 0x3e7f42d1cecd3768 + .quad 0x3fd0790a00000000 + .quad 0x3e8bb6060195a070 + .quad 0x3fd08c5900000000 + .quad 0x3e5c5a7b3a2bd335 + .quad 0x3fd09fa100000000 + .quad 0x3e8a2743f6a4cd20 + .quad 0x3fd0b2e700000000 + .quad 0x3e775f83f99025b0 + .quad 0x3fd0c62900000000 + .quad 0x3e87ca856421a674 + .quad 0x3fd0d96600000000 + .quad 0x3e814d2830ef12fd + .quad 0x3fd0eca000000000 + .quad 0x3e62348eca90f220 + .quad 0x3fd0ffd600000000 + .quad 0x3e812fcf75d18b23 + .quad 0x3fd1130700000000 + .quad 0x3e73b4c2bf9f9dd3 + .quad 0x3fd1263600000000 + .quad 0x3e499ef30070a508 + .quad 0x3fd1395f00000000 + .quad 0x3e61edb0d9e8da9b + .quad 0x3fd14c8400000000 + .quad 0x3e8f23ac3152c264 + .quad 0x3fd15fa600000000 + .quad 0x3e752ec233b712ad + .quad 0x3fd172c400000000 + .quad 0x3e7a163986a7b84c + .quad 0x3fd185dd00000000 + .quad 0x3e8f734fda450672 + .quad 0x3fd198f400000000 + .quad 0x3e7028962c15f52b + .quad 0x3fd1ac0500000000 + .quad 0x3e8fd23e213f6416 + .quad 0x3fd1bf1300000000 + .quad 0x3e68e4e3166c3339 + .quad 0x3fd1d21e00000000 + .quad 0x3e70ea55e7da3fec + .quad 0x3fd1e52300000000 + .quad 0x3e81b9e3403df05d + .quad 0x3fd1f82500000000 + .quad 0x3e7e762367a00f4a + .quad 0x3fd20b2400000000 + .quad 0x3e3388b4dd9d8704 + .quad 0x3fd21e1f00000000 + .quad 0x3e6603bbc7b763e4 + .quad 0x3fd2311400000000 + .quad 0x3e7f38b9f767e1c9 + .quad 0x3fd2440700000000 + .quad 0x3e8361c0e424306b + .quad 0x3fd256f600000000 + .quad 0x3e53e15a0763e5f5 + .quad 0x3fd269e100000000 + .quad 0x3e5c346e0f5542ab + .quad 0x3fd27cc800000000 + .quad 0x3e8623bac0f6e8e5 + .quad 0x3fd28fab00000000 + .quad 0x3e82d664ea511964 + .quad 0x3fd2a28b00000000 + .quad 0x3e244827751649e1 + .quad 0x3fd2b56500000000 + .quad 0x3e870662732a8325 + .quad 0x3fd2c83c00000000 + .quad 0x3e8db880f0396c05 + .quad 0x3fd2db1000000000 + .quad 0x3e8409b34923f5d0 + .quad 0x3fd2ede000000000 + .quad 0x3e899c121e8496e6 + .quad 0x3fd300ad00000000 + .quad 0x3e7c232f22d20f20 + .quad 0x3fd3137500000000 + .quad 0x3e73683d6c58ca0d + .quad 0x3fd3263900000000 + .quad 0x3e836d65141862cf + .quad 0x3fd338fa00000000 + .quad 0x3e75be12efc2f601 + .quad 0x3fd34bb600000000 + .quad 0x3e70751869f3b7a6 + .quad 0x3fd35e6f00000000 + .quad 0x3e89f95043bbfc91 + .quad 0x3fd3712400000000 + .quad 0x3e80d499b29f7615 + .quad 0x3fd383d500000000 + .quad 0x3e83dd8f4de52902 + .quad 0x3fd3968400000000 + .quad 0x3e748a73fa7e46e2 + .quad 0x3fd3a92e00000000 + .quad 0x3e6252112c0e2155 + .quad 0x3fd3bbd300000000 + .quad 0x3e52a1dc831e5ad7 + .quad 0x3fd3ce7500000000 + .quad 0x3e825d1013e78284 + .quad 0x3fd3e11400000000 + .quad 0x3e796f27f8ed6ab1 + .quad 0x3fd3f3af00000000 + .quad 0x3e81043c4e106f6a + .quad 0x3fd4064500000000 + .quad 0x3e8723607a748d45 + .quad 0x3fd418d900000000 + .quad 0x3e7c5a76f3c6b991 + .quad 0x3fd42b6900000000 + .quad 
0x3e7c13d54b6ede12 + .quad 0x3fd43df400000000 + .quad 0x3e7d02dc433313ae + .quad 0x3fd4507c00000000 + .quad 0x3e8edba9f6e1776c + .quad 0x3fd4630100000000 + .quad 0x3e86e864bf1d1aaa + .quad 0x3fd4758100000000 + .quad 0x3e7cae90765abc31 + .quad 0x3fd487fe00000000 + .quad 0x3e849fe23646e5a5 + .quad 0x3fd49a7800000000 + .quad 0x3e479a36743be41d + .quad 0x3fd4aced00000000 + .quad 0x3e8483e03299b840 + .quad 0x3fd4bf5f00000000 + .quad 0x3e7abba144c6b22b + .quad 0x3fd4d1cd00000000 + .quad 0x3e774d20fdd9f23b + .quad 0x3fd4e43800000000 + .quad 0x3e871d1f7aa47e01 + .quad 0x3fd4f69e00000000 + .quad 0x3e8f2860ba3b3db5 + .quad 0x3fd5090200000000 + .quad 0x3e83af1c17099bfe + .quad 0x3fd51b6200000000 + .quad 0x3e785ff9de74a1b4 + .quad 0x3fd52dbe00000000 + .quad 0x3e709325cfafa80f + .quad 0x3fd5401600000000 + .quad 0x3e6e6947ccf73d7a + .quad 0x3fd5526a00000000 + .quad 0x3e738124d5db9ad7 + .quad 0x3fd564bb00000000 + .quad 0x3e86b2911c62b3a2 + .quad 0x3fd5770900000000 + .quad 0x3e6719bc759ee891 + .quad 0x3fd5895200000000 + .quad 0x3e869a322d9370bc + .quad 0x3fd59b9800000000 + .quad 0x3e719789a94340e2 + .quad 0x3fd5addb00000000 + .quad 0x3e61c3d9786a1c1a + .quad 0x3fd5c01a00000000 + .quad 0x3e37ef590a213419 + .quad 0x3fd5d25400000000 + .quad 0x3e8d54eb1103130f + .quad 0x3fd5e48d00000000 + .quad 0x3e52f62a9cc12fd0 + .quad 0x3fd5f6c100000000 + .quad 0x3e6be9b244784641 + .quad 0x3fd608f100000000 + .quad 0x3e758a521184b277 + .quad 0x3fd61b1e00000000 + .quad 0x3e86042873323471 + .quad 0x3fd62d4700000000 + .quad 0x3e8fbc7d80b47bcf + .quad 0x3fd63f6d00000000 + .quad 0x3e6e2c82077ea756 + .quad 0x3fd6518f00000000 + .quad 0x3e85ccef6bf767f4 + .quad 0x3fd663ae00000000 + .quad 0x3e46ead81df81e8f + .quad 0x3fd675c900000000 + .quad 0x3e82dd03f10cd685 + .quad 0x3fd687e100000000 + .quad 0x3e3e902c6dbc1f0c + .quad 0x3fd699f500000000 + .quad 0x3e84319abac9c4b2 + .quad 0x3fd6ac0600000000 + .quad 0x3e5b055166c24b15 + .quad 0x3fd6be1200000000 + .quad 0x3e7c3be07b4f7882 + .quad 0x3fd6d01b00000000 + .quad 0x3e8cfd93dd847e5d + .quad 0x3fd6e22100000000 + .quad 0x3e6ace863358e8d7 + .quad 0x3fd6f42300000000 + .quad 0x3e83e40c6242bfe9 + .quad 0x3fd7062300000000 + .quad 0x3e610ab6a8479b5d + .quad 0x3fd7181e00000000 + .quad 0x3e7cd689bcfd9cf6 + .quad 0x3fd72a1600000000 + .quad 0x3e8b1978624662cc + .quad 0x3fd73c0b00000000 + .quad 0x3e3b1a8d9a80c213 + .quad 0x3fd74dfa00000000 + .quad 0x3e8f44cc629fadc5 + .quad 0x3fd75fe900000000 + .quad 0x3e70d17562376005 + .quad 0x3fd771d300000000 + .quad 0x3e731fbf269b0088 + .quad 0x3fd783b900000000 + .quad 0x3e52ab13f0273736 + .quad 0x3fd7959b00000000 + .quad 0x3e8ba45253b127d6 + .quad 0x3fd7a77b00000000 + .quad 0x3e852fa4783a4dfd + .quad 0x3fd7b95700000000 + .quad 0x3e6528d527430d54 + .quad 0x3fd7cb2f00000000 + .quad 0x3e84f6c8a8c54418 + .quad 0x3fd7dd0500000000 + .quad 0x3e5f404ba538c133 + .quad 0x3fd7eed700000000 + .quad 0x3e81d08a084632f9 + .quad 0x3fd800a500000000 + .quad 0x3e84e2c39b578d96 + .quad 0x3fd8127000000000 + .quad 0x3e8641178f2c2b02 + .quad 0x3fd8243700000000 + .quad 0x3e781b9c28ee919e + .quad 0x3fd835fa00000000 + .quad 0x3e8f7b17b6d5775c + .quad 0x3fd847bc00000000 + .quad 0x3e89db0c612f1b2e + .quad 0x3fd8597800000000 + .quad 0x3e8dffaae2cbad0f + .quad 0x3fd86b3300000000 + .quad 0x3e70f5b6d0513247 + .quad 0x3fd87ce900000000 + .quad 0x3e6699b2d0c42cca + .quad 0x3fd88e9b00000000 + .quad 0x3e8edc16362782b3 + .quad 0x3fd8a04b00000000 + .quad 0x3e83cd771d49fb4b + .quad 0x3fd8b1f800000000 + .quad 0x3e60b05b11747e4c + .quad 0x3fd8c3a100000000 + .quad 0x3e7f52c9816db2c1 + .quad 
0x3fd8d54600000000 + .quad 0x3e782d70d541d6c1 + .quad 0x3fd8e6e800000000 + .quad 0x3e57aa91cc153dde + .quad 0x3fd8f88600000000 + .quad 0x3e83f65a8e01affc + .quad 0x3fd90a2100000000 + .quad 0x3e8ecae2475966df + .quad 0x3fd91bba00000000 + .quad 0x3e591f169848d269 + .quad 0x3fd92d4f00000000 + .quad 0x3e3647c7943a8d23 + .quad 0x3fd93ee000000000 + .quad 0x3e8726bf3db3e718 + .quad 0x3fd9506d00000000 + .quad 0x3e8c1a18fafa10d5 + .quad 0x3fd961f900000000 + .quad 0x3e5b2740c198f220 + .quad 0x3fd9737f00000000 + .quad 0x3e887fb1536242b8 + .quad 0x3fd9850400000000 + .quad 0x3e7ec5c619b71f3e + .quad 0x3fd9968400000000 + .quad 0x3e8366d3eb0e5d24 + .quad 0x3fd9a80200000000 + .quad 0x3e88a3c48f5901ad + .quad 0x3fd9b97c00000000 + .quad 0x3e74a3bb2d70054b + .quad 0x3fd9caf200000000 + .quad 0x3e825931e77b3ed9 + .quad 0x3fd9dc6600000000 + .quad 0x3e8ac1bd72bb6920 + .quad 0x3fd9edd600000000 + .quad 0x3e7d26c9777b80e6 + .quad 0x3fd9ff4200000000 + .quad 0x3e87cdf6b003fe44 + .quad 0x3fda10ad00000000 + .quad 0x3e32256c5f5257da + .quad 0x3fda221200000000 + .quad 0x3e83b4a3ff1466d0 + .quad 0x3fda337600000000 + .quad 0x3e673fb048cd2b2f + .quad 0x3fda44d600000000 + .quad 0x3e7844f0a7da3c13 + .quad 0x3fda563100000000 + .quad 0x3e8bcba6da5b37e1 + .quad 0x3fda678b00000000 + .quad 0x3e7325816e447b2d + .quad 0x3fda78e100000000 + .quad 0x3e753defc2fb5aa0 + .quad 0x3fda8a3300000000 + .quad 0x3e8e9f688620242e + .quad 0x3fda9b8300000000 + .quad 0x3e650c63633bbec2 + .quad 0x3fdaacce00000000 + .quad 0x3e8e38f926facedd + .quad 0x3fdabe1800000000 + .quad 0x3e83efe3f1bc83ea + .quad 0x3fdacf5d00000000 + .quad 0x3e809e9d83cd28e8 + .quad 0x3fdae0a000000000 + .quad 0x3e72f7a9feea5b2a + .quad 0x3fdaf1df00000000 + .quad 0x3e83762377a3c900 + .quad 0x3fdb031b00000000 + .quad 0x3e7c7818efde9c0a + .quad 0x3fdb145500000000 + .quad 0x3e618ff8ce39a19e + .quad 0x3fdb258900000000 + .quad 0x3e8fd450b400cdc5 + .quad 0x3fdb36bc00000000 + .quad 0x3e861347926aa708 + .quad 0x3fdb47eb00000000 + .quad 0x3e8be7104fa3a380 + .quad 0x3fdb591700000000 + .quad 0x3e80fdc35b90ee8d + .quad 0x3fdb6a4100000000 + .quad 0x3e056415269e9adc + .quad 0x3fdb7b6600000000 + .quad 0x3e8ddbe05932e271 + .quad 0x3fdb8c8900000000 + .quad 0x3e73fe21df4fea38 + .quad 0x3fdb9da800000000 + .quad 0x3e60b2e6d80d2ce6 + .quad 0x3fdbaec400000000 + .quad 0x3e874289e4e1d49c + .quad 0x3fdbbfdd00000000 + .quad 0x3e87ce1b050aa700 + .quad 0x3fdbd0f300000000 + .quad 0x3e65f3c859448338 + .quad 0x3fdbe20400000000 + .quad 0x3e8ffc7f79678a39 + .quad 0x3fdbf31400000000 + .quad 0x3e824a1ec9be7496 + .quad 0x3fdc042100000000 + .quad 0x3e8c2b16ec00f182 + .quad 0x3fdc152a00000000 + .quad 0x3e6a92654ec891d7 + .quad 0x3fdc263000000000 + .quad 0x3e7037888b90c7f8 + .quad 0x3fdc373200000000 + .quad 0x3e84e5a090419bc8 + .quad 0x3fdc483200000000 + .quad 0x3e882722e066f64d + .quad 0x3fdc592f00000000 + .quad 0x3e6894ad710aef0c + .quad 0x3fdc6a2900000000 + .quad 0x3e74290c06a50919 + .quad 0x3fdc7b1f00000000 + .quad 0x3e8829ea41109e48 + .quad 0x3fdc8c1200000000 + .quad 0x3e8011fb6ad70668 + .quad 0x3fdc9d0200000000 + .quad 0x3e8d1948f3cb0098 + .quad 0x3fdcadef00000000 + .quad 0x3e835c4dc117de0d + .quad 0x3fdcbed900000000 + .quad 0x3e8e37710c7563b4 + .quad 0x3fdccfc000000000 + .quad 0x3e81b705b8191331 + .quad 0x3fdce0a400000000 + .quad 0x3e89474b1cfe31f4 + .quad 0x3fdcf18500000000 + .quad 0x3e71c8d86ee32d3b + .quad 0x3fdd026300000000 + .quad 0x3e7815019917c831 + .quad 0x3fdd133d00000000 + .quad 0x3e86a58c1d40a370 + .quad 0x3fdd241400000000 + .quad 0x3e70c2fc81bc79c2 + .quad 0x3fdd34e900000000 + .quad 
0x3e88ba3405adb567 + .quad 0x3fdd45ba00000000 + .quad 0x3e5ddba9ecf26bb9 + .quad 0x3fdd568800000000 + .quad 0x3e3d1ef9e850540f + .quad 0x3fdd675300000000 + .quad 0x3e80065d34ca0dce + .quad 0x3fdd781c00000000 + .quad 0x3e80d733e02d0dd1 + .quad 0x3fdd88e100000000 + .quad 0x3e870ef65b098f9c + .quad 0x3fdd99a300000000 + .quad 0x3e52c86102e26030 + .quad 0x3fddaa6100000000 + .quad 0x3e8e80c9ef4c81d3 + .quad 0x3fddbb1e00000000 + .quad 0x3e7692e19cb2b425 + .quad 0x3fddcbd600000000 + .quad 0x3e8c462e64521547 + .quad 0x3fdddc8c00000000 + .quad 0x3e8d5a1dd411035e + .quad 0x3fdded4000000000 + .quad 0x3e7c908df47a8f92 + .quad 0x3fddfdf000000000 + .quad 0x3e545cf17f40aa9d + .quad 0x3fde0e9d00000000 + .quad 0x3e687c172ac42c55 + .quad 0x3fde1f4600000000 + .quad 0x3e78da98936314cf + .quad 0x3fde2fed00000000 + .quad 0x3e4812e4ac4e8487 + .quad 0x3fde409100000000 + .quad 0x3e64755453322906 + .quad 0x3fde513100000000 + .quad 0x3e7528ae2e3ef4fa + .quad 0x3fde61d000000000 + .quad 0x3e7501716cf4be90 + .quad 0x3fde726900000000 + .quad 0x3e8f3cea8b8b9869 + .quad 0x3fde830200000000 + .quad 0x3e7be69828149b31 + .quad 0x3fde939600000000 + .quad 0x3e8d5e2937a72435 + .quad 0x3fdea42800000000 + .quad 0x3e89bfbbe2698141 + .quad 0x3fdeb4b800000000 + .quad 0x3e56d15b8c6d35e8 + .quad 0x3fdec54400000000 + .quad 0x3e886f8d094b9a13 + .quad 0x3fded5cd00000000 + .quad 0x3e7b23c5dca4eff0 + .quad 0x3fdee65300000000 + .quad 0x3e7d463bf0218027 + .quad 0x3fdef6d600000000 + .quad 0x3e8b651c6050e055 + .quad 0x3fdf075600000000 + .quad 0x3e6b46a793b8e626 + .quad 0x3fdf17d400000000 + .quad 0x3e74650236b11f5f + .quad 0x3fdf284e00000000 + .quad 0x3e77629298efa0ad + .quad 0x3fdf38c500000000 + .quad 0x3e87d798bebcb6ab + .quad 0x3fdf493a00000000 + .quad 0x3e7ccde6d2f4c9f7 + .quad 0x3fdf59ab00000000 + .quad 0x3e5186572a5ff9c8 + .quad 0x3fdf6a1a00000000 + .quad 0x3e799d006591c907 + .quad 0x3fdf7a8500000000 + .quad 0x3e841960e73ec979 + .quad 0x3fdf8aee00000000 + .quad 0x3e630aa8521479fd + .quad 0x3fdf9b5300000000 + .quad 0x3e8e8b869c429d94 + .quad 0x3fdfabb700000000 + .quad 0x3e4350fc25c8a13b + .quad 0x3fdfbc1700000000 + .quad 0x3e79009a6ef5d48a + .quad 0x3fdfcc7300000000 + .quad 0x3e8306349a8abfef + .quad 0x3fdfdcce00000000 + .quad 0x3e7d9f569f06bc1e + .quad 0x3fdfed2500000000 + .quad 0x3e65160ec1d12919 + .quad 0x3fdffd7900000000 + .quad 0x3e5a83ff2555a494 + .quad 0x3fe006e500000000 + .quad 0x3e9afca83644de26 + .quad 0x3fe00f0d00000000 + .quad 0x3e53c49d9079d468 + .quad 0x3fe0173200000000 + .quad 0x3e9ae76be763882e + .quad 0x3fe01f5700000000 + .quad 0x3e7f793285e25c81 + .quad 0x3fe0277a00000000 + .quad 0x3e800243639826ee + .quad 0x3fe02f9b00000000 + .quad 0x3e9b301832f2c8a9 + .quad 0x3fe037bc00000000 + .quad 0x3e54b54b5457ab7c + .quad 0x3fe03fda00000000 + .quad 0x3e9a32f3449fa7a6 + .quad 0x3fe047f700000000 + .quad 0x3e8e060e91d41da5 + .quad 0x3fe0501300000000 + .quad 0x3e8a3f382aa1e82b + .quad 0x3fe0582d00000000 + .quad 0x3e9da8b4318c1dd2 + .quad 0x3fe0604700000000 + .quad 0x3e3f9274a07c17a6 + .quad 0x3fe0685e00000000 + .quad 0x3e95804ec5f0fe6d + .quad 0x3fe0707400000000 + .quad 0x3e9c8eac786d0112 + .quad 0x3fe0788900000000 + .quad 0x3e958943fb66416b + .quad 0x3fe0809d00000000 + .quad 0x3e33fb82cede51e0 + .quad 0x3fe088ae00000000 + .quad 0x3e9cc27b15563034 + .quad 0x3fe090bf00000000 + .quad 0x3e8581667ca3348d + .quad 0x3fe098ce00000000 + .quad 0x3e8454acd057fbfa + .quad 0x3fe0a0dc00000000 + .quad 0x3e91cf1c5c53f37d + .quad 0x3fe0a8e800000000 + .quad 0x3e93b2b423f481d0 + .quad 0x3fe0b0f300000000 + .quad 0x3e7a8314e3b62474 + .quad 
0x3fe0b8fd00000000 + .quad 0x3e574eeba208d495 + .quad 0x3fe0c10400000000 + .quad 0x3e961ac74d5ada6a + .quad 0x3fe0c90b00000000 + .quad 0x3e926ddde7aa78b1 + .quad 0x3fe0d11000000000 + .quad 0x3e9f51b91d907509 + .quad 0x3fe0d91400000000 + .quad 0x3e9ca5d77a3bf837 + .quad 0x3fe0e11700000000 + .quad 0x3e84935ef97f078e + .quad 0x3fe0e91800000000 + .quad 0x3e80395f3d5449d6 + .quad 0x3fe0f11800000000 + .quad 0x3e8a2c7cb38d9ed1 + .quad 0x3fe0f91600000000 + .quad 0x3e9677ba0152cbb4 + .quad 0x3fe1011300000000 + .quad 0x3e9b3a7927aec2fd + .quad 0x3fe1090f00000000 + .quad 0x3e707f2889e8b7a9 + .quad 0x3fe1110900000000 + .quad 0x3e93bcf3ba17fb1f + .quad 0x3fe1190200000000 + .quad 0x3e7cecd182c0b1e4 + .quad 0x3fe120f900000000 + .quad 0x3e95a3c2fb2785b2 + .quad 0x3fe128ef00000000 + .quad 0x3e9edbce6a636a11 + .quad 0x3fe130e400000000 + .quad 0x3e972c7da9b832d3 + .quad 0x3fe138d700000000 + .quad 0x3e9e74efeb672a03 + .quad 0x3fe140ca00000000 + .quad 0x3e2a1e54f6b89e31 + .quad 0x3fe148ba00000000 + .quad 0x3e90ad737019fd24 + .quad 0x3fe150a900000000 + .quad 0x3e9b639c287d2824 + .quad 0x3fe1589700000000 + .quad 0x3e9495b6dd3ddabd + .quad 0x3fe1608400000000 + .quad 0x3e7f2aeffe31b5d0 + .quad 0x3fe1686f00000000 + .quad 0x3e827b385c52cc9f + .quad 0x3fe1705900000000 + .quad 0x3e71e501d3944026 + .quad 0x3fe1784100000000 + .quad 0x3e99628a2c0e2602 + .quad 0x3fe1802800000000 + .quad 0x3e9c2e52f159a4bf + .quad 0x3fe1880e00000000 + .quad 0x3e8976d9b0f3dfdd + .quad 0x3fe18ff300000000 + .quad 0x3e628513cd04695c + .quad 0x3fe197d600000000 + .quad 0x3e75b2da605bddf8 + .quad 0x3fe19fb700000000 + .quad 0x3e95ee648263ee18 + .quad 0x3fe1a79700000000 + .quad 0x3e9f6e601ac91256 + .quad 0x3fe1af7700000000 + .quad 0x3e5d155a178b90cd + .quad 0x3fe1b75400000000 + .quad 0x3e9cfbe9de667b41 + .quad 0x3fe1bf3100000000 + .quad 0x3e744ae80f899fbd + .quad 0x3fe1c70c00000000 + .quad 0x3e76d96ff1c879c9 + .quad 0x3fe1cee500000000 + .quad 0x3e9ecb5e2c072eb0 + .quad 0x3fe1d6be00000000 + .quad 0x3e71c11dbe1db818 + .quad 0x3fe1de9500000000 + .quad 0x3e625cbb9559d10f + .quad 0x3fe1e66a00000000 + .quad 0x3e9841c66176bdde + .quad 0x3fe1ee3f00000000 + .quad 0x3e78dd143c97c211 + .quad 0x3fe1f61200000000 + .quad 0x3e309f38f10515b8 + .quad 0x3fe1fde300000000 + .quad 0x3e9de1d02b7acb55 + .quad 0x3fe205b400000000 + .quad 0x3e7d6e666f069f9f + .quad 0x3fe20d8300000000 + .quad 0x3e80c459b58a9a68 + .quad 0x3fe2155100000000 + .quad 0x3e4b3ac6c4422b43 + .quad 0x3fe21d1d00000000 + .quad 0x3e90a6dabdf57c13 + .quad 0x3fe224e800000000 + .quad 0x3e87a6f05e2e66b4 + .quad 0x3fe22cb200000000 + .quad 0x3e83ebcaaaa786ff + .quad 0x3fe2347a00000000 + .quad 0x3e933c5177ae38be + .quad 0x3fe23c4100000000 + .quad 0x3e9f44e5029b8b1d + .quad 0x3fe2440700000000 + .quad 0x3e9635c0e894df30 + .quad 0x3fe24bcc00000000 + .quad 0x3e6e87f9f1f3590c + .quad 0x3fe2538f00000000 + .quad 0x3e7feacb86a3b429 + .quad 0x3fe25b5100000000 + .quad 0x3e8cfdcf4e10a41a + .quad 0x3fe2631100000000 + .quad 0x3e9f73a21fdde641 + .quad 0x3fe26ad100000000 + .quad 0x3e7a8b8011d56d3b + .quad 0x3fe2728f00000000 + .quad 0x3e6f84bf7d5b34d0 + .quad 0x3fe27a4c00000000 + .quad 0x3e6985cc1c8f11b0 + .quad 0x3fe2820700000000 + .quad 0x3e88d25a6a02c803 + .quad 0x3fe289c100000000 + .quad 0x3e975fd4c3433e76 + .quad 0x3fe2917a00000000 + .quad 0x3e8825154781d2c2 + .quad 0x3fe2993200000000 + .quad 0x3e62791595e60d25 + .quad 0x3fe2a0e800000000 + .quad 0x3e605b4c41d5635b + .quad 0x3fe2a89d00000000 + .quad 0x3e68e92900528496 + .quad 0x3fe2b05000000000 + .quad 0x3e9970145df6a281 + .quad 0xbfda8ff900000000 + .quad 
0xbe86302155df0de3 + .quad 0xbfda809600000000 + .quad 0xbe8d2b316176fad0 + .quad 0xbfda713700000000 + .quad 0xbe824db2f6aceb96 + .quad 0xbfda61da00000000 + .quad 0xbe67117a804da234 + .quad 0xbfda527f00000000 + .quad 0xbe7f97f60ff5807b + .quad 0xbfda432700000000 + .quad 0xbe809d5c44adaa28 + .quad 0xbfda33d200000000 + .quad 0xbe70e2c7de9ac83b + .quad 0xbfda247f00000000 + .quad 0xbe8781011952fb40 + .quad 0xbfda152f00000000 + .quad 0xbe6794c0edaf9f16 + .quad 0xbfda05e100000000 + .quad 0xbe77ddf6e9895b08 + .quad 0xbfd9f69600000000 + .quad 0xbe73aef455ae3da8 + .quad 0xbfd9e74d00000000 + .quad 0xbe6eaf442c7ba9be + .quad 0xbfd9d80600000000 + .quad 0xbe8dc93243f14070 + .quad 0xbfd9c8c300000000 + .quad 0xbe78d1ba7956f02d + .quad 0xbfd9b98100000000 + .quad 0xbe8b8c1e78260310 + .quad 0xbfd9aa4300000000 + .quad 0xbe5ce27fc9d31391 + .quad 0xbfd99b0700000000 + .quad 0xbe634b6355f4087a + .quad 0xbfd98bcd00000000 + .quad 0xbe6c94b4572fef43 + .quad 0xbfd97c9600000000 + .quad 0xbe5846721de94267 + .quad 0xbfd96d6100000000 + .quad 0xbe88b74acdde1f6a + .quad 0xbfd95e2f00000000 + .quad 0xbe801a3e03f6b280 + .quad 0xbfd94f0000000000 + .quad 0xbe4b35095482043f + .quad 0xbfd93fd200000000 + .quad 0xbe856437d9bb4a5c + .quad 0xbfd930a800000000 + .quad 0xbe5db5b388b06a65 + .quad 0xbfd9218000000000 + .quad 0xbe79c93768c0e5d4 + .quad 0xbfd9125a00000000 + .quad 0xbe27f0e9d0aaf77a + .quad 0xbfd9033700000000 + .quad 0xbe6e085f7c5942f1 + .quad 0xbfd8f41600000000 + .quad 0xbe81b98df5f47569 + .quad 0xbfd8e4f700000000 + .quad 0xbe8f3428ac4ddeec + .quad 0xbfd8d5dc00000000 + .quad 0xbe7127ef6092650e + .quad 0xbfd8c6c300000000 + .quad 0xbe7c262e6c66cdb8 + .quad 0xbfd8b7ac00000000 + .quad 0xbe876faffff4af15 + .quad 0xbfd8a89800000000 + .quad 0xbe635fdead9ef9a2 + .quad 0xbfd8998600000000 + .quad 0xbe7dfc6109e45ceb + .quad 0xbfd88a7600000000 + .quad 0xbe8d94a9416e4721 + .quad 0xbfd87b6900000000 + .quad 0xbe80c9bd35322fa9 + .quad 0xbfd86c5f00000000 + .quad 0xbe45bd4714c8ffcf + .quad 0xbfd85d5700000000 + .quad 0xbe7f0ac6abba5180 + .quad 0xbfd84e5100000000 + .quad 0xbe74a1d4fc76c4e2 + .quad 0xbfd83f4e00000000 + .quad 0xbe58c7bbd43ea059 + .quad 0xbfd8304d00000000 + .quad 0xbe8a18240481523a + .quad 0xbfd8214e00000000 + .quad 0xbe8e4115e0e87309 + .quad 0xbfd8125300000000 + .quad 0xbe4067fcc9c54454 + .quad 0xbfd8035a00000000 + .quad 0xbe5519044060b3ca + .quad 0xbfd7f46200000000 + .quad 0xbe81f1c2bab3efa5 + .quad 0xbfd7e56e00000000 + .quad 0xbe2f4f8116a92f1f + .quad 0xbfd7d67c00000000 + .quad 0xbe7d00ebaf755412 + .quad 0xbfd7c78c00000000 + .quad 0xbe757cb332aa9b04 + .quad 0xbfd7b89f00000000 + .quad 0xbe6b67957924a221 + .quad 0xbfd7a9b400000000 + .quad 0xbe749441f289397f + .quad 0xbfd79acb00000000 + .quad 0xbe853e207739b243 + .quad 0xbfd78be500000000 + .quad 0xbe6f940fb688810d + .quad 0xbfd77d0100000000 + .quad 0xbe8b3df7ad1f744b + .quad 0xbfd76e2000000000 + .quad 0xbe86b033ad082bc9 + .quad 0xbfd75f4100000000 + .quad 0xbe8a6afc121884da + .quad 0xbfd7506500000000 + .quad 0xbe6a7683b47c1884 + .quad 0xbfd7418a00000000 + .quad 0xbe8b777e34575fd6 + .quad 0xbfd732b200000000 + .quad 0xbe8927fbbcb9ee5d + .quad 0xbfd723dd00000000 + .quad 0xbe88c68d7090566b + .quad 0xbfd7150b00000000 + .quad 0xbe4a2b2a2a0eb191 + .quad 0xbfd7063900000000 + .quad 0xbe8afbf68de6383b + .quad 0xbfd6f76b00000000 + .quad 0xbe86ddf093045ea8 + .quad 0xbfd6e89f00000000 + .quad 0xbe8c8c435cc0756e + .quad 0xbfd6d9d600000000 + .quad 0xbe786d3ae8f9661f + .quad 0xbfd6cb0f00000000 + .quad 0xbe6832e43f6d9d88 + .quad 0xbfd6bc4a00000000 + .quad 0xbe747cb81361877f + .quad 
0xbfd6ad8800000000 + .quad 0xbe82035808f1c0f3 + .quad 0xbfd69ec800000000 + .quad 0xbe76ff1399db6922 + .quad 0xbfd6900a00000000 + .quad 0xbe7fcdb431863dd3 + .quad 0xbfd6814e00000000 + .quad 0xbe8f693d13fbb8d9 + .quad 0xbfd6729600000000 + .quad 0xbe834eb29036fad3 + .quad 0xbfd663df00000000 + .quad 0xbe899b456a12ce2e + .quad 0xbfd6552b00000000 + .quad 0xbe772618a503c189 + .quad 0xbfd6467900000000 + .quad 0xbe72cc529275c5a3 + .quad 0xbfd637c900000000 + .quad 0xbe8344c9b19a2513 + .quad 0xbfd6291c00000000 + .quad 0xbe72be4c963d47b8 + .quad 0xbfd61a7100000000 + .quad 0xbe77cb0653b68de6 + .quad 0xbfd60bc800000000 + .quad 0xbe8b082faedc50d1 + .quad 0xbfd5fd2200000000 + .quad 0xbe86f7868080f7bc + .quad 0xbfd5ee7e00000000 + .quad 0xbe6a9fb569e79a60 + .quad 0xbfd5dfdc00000000 + .quad 0xbe8cbdd5bf453a04 + .quad 0xbfd5d13d00000000 + .quad 0xbe6bb6ee545183dc + .quad 0xbfd5c2a000000000 + .quad 0xbe87ec26c29aa221 + .quad 0xbfd5b40500000000 + .quad 0xbe8d5da983e3cbed + .quad 0xbfd5a56d00000000 + .quad 0xbe80b6e1bfe5ec04 + .quad 0xbfd596d700000000 + .quad 0xbe8228784608b2df + .quad 0xbfd5884300000000 + .quad 0xbe7116419622027e + .quad 0xbfd579b200000000 + .quad 0xbe6aee6a38f29592 + .quad 0xbfd56b2200000000 + .quad 0xbe8a36af180d0f15 + .quad 0xbfd55c9500000000 + .quad 0xbe8c853372ca57cc + .quad 0xbfd54e0b00000000 + .quad 0xbe7bb00ee04486c4 + .quad 0xbfd53f8300000000 + .quad 0xbe7cc02b891628da + .quad 0xbfd530fd00000000 + .quad 0xbe63794fe93c7f63 + .quad 0xbfd5227900000000 + .quad 0xbe75d7854e0de2c5 + .quad 0xbfd513f800000000 + .quad 0xbe372da45519dce0 + .quad 0xbfd5057800000000 + .quad 0xbe79f8d2da727bf4 + .quad 0xbfd4f6fc00000000 + .quad 0xbe56cec60358c3fd + .quad 0xbfd4e88000000000 + .quad 0xbe8602e65c350140 + .quad 0xbfd4da0800000000 + .quad 0xbe8328c92737a9b0 + .quad 0xbfd4cb9200000000 + .quad 0xbe6dc3078767b5b5 + .quad 0xbfd4bd1e00000000 + .quad 0xbe79203927cd12cc + .quad 0xbfd4aead00000000 + .quad 0xbe55c17da1b07b42 + .quad 0xbfd4a03d00000000 + .quad 0xbe80825c25cbdda8 + .quad 0xbfd491d000000000 + .quad 0xbe7f601ba1cb823b + .quad 0xbfd4836600000000 + .quad 0xbe2caebe06773e1b + .quad 0xbfd474fd00000000 + .quad 0xbe72afc887224809 + .quad 0xbfd4669700000000 + .quad 0xbe60b454dababfee + .quad 0xbfd4583200000000 + .quad 0xbe8777e382ef584f + .quad 0xbfd449d000000000 + .quad 0xbe8d0defa65e43f7 + .quad 0xbfd43b7100000000 + .quad 0xbe8520e465f01125 + .quad 0xbfd42d1400000000 + .quad 0xbe68a9db3066f3ad + .quad 0xbfd41eb900000000 + .quad 0xbe7418cd285c77e6 + .quad 0xbfd4106000000000 + .quad 0xbe6ce1f66985cea7 + .quad 0xbfd4020900000000 + .quad 0xbe8798904973ef89 + .quad 0xbfd3f3b500000000 + .quad 0xbe4967d2ab8251d8 + .quad 0xbfd3e56200000000 + .quad 0xbe8f242d496e3d08 + .quad 0xbfd3d71200000000 + .quad 0xbe86a393bba964c4 + .quad 0xbfd3c8c500000000 + .quad 0xbe507570cacef7bf + .quad 0xbfd3ba7900000000 + .quad 0xbe6efe0fa4f69a96 + .quad 0xbfd3ac3000000000 + .quad 0xbe4b827373e0a286 + .quad 0xbfd39de800000000 + .quad 0xbe864ab3e2fb43d9 + .quad 0xbfd38fa300000000 + .quad 0xbe8f81504eb31318 + .quad 0xbfd3816100000000 + .quad 0xbe5d3164fb917590 + .quad 0xbfd3732000000000 + .quad 0xbe8ccb836b329f7f + .quad 0xbfd364e200000000 + .quad 0xbe8133990d5010c8 + .quad 0xbfd356a600000000 + .quad 0xbe404bc113420455 + .quad 0xbfd3486c00000000 + .quad 0xbe697514cf0a57dc + .quad 0xbfd33a3400000000 + .quad 0xbe6dce5b769a0eb8 + .quad 0xbfd32bfe00000000 + .quad 0xbe8e6e1dd018cc95 + .quad 0xbfd31dcb00000000 + .quad 0xbe817b505f20e7f3 + .quad 0xbfd30f9a00000000 + .quad 0xbe3835df86199ab1 + .quad 0xbfd3016b00000000 + .quad 
0xbe69cf10d769bddb + .quad 0xbfd2f33e00000000 + .quad 0xbe7168482a60bb7c + .quad 0xbfd2e51400000000 + .quad 0xbe4bd6cdf5bcf5c4 + .quad 0xbfd2d6ea00000000 + .quad 0xbe8d924633fff084 + .quad 0xbfd2c8c500000000 + .quad 0xbe7542c49a05ee8f + .quad 0xbfd2baa000000000 + .quad 0xbe8ac97c411279db + .quad 0xbfd2ac7f00000000 + .quad 0xbe536acce9910bf7 + .quad 0xbfd29e5f00000000 + .quad 0xbe6e5f25492f16f4 + .quad 0xbfd2904100000000 + .quad 0xbe74df4847fe96f4 + .quad 0xbfd2822500000000 + .quad 0xbe763798f43090eb + .quad 0xbfd2740c00000000 + .quad 0xbe5fb975ad3295a5 + .quad 0xbfd265f400000000 + .quad 0xbe8afcc065467993 + .quad 0xbfd257e000000000 + .quad 0xbe751f024a4452fe + .quad 0xbfd249cc00000000 + .quad 0xbe8e6279a0249a31 + .quad 0xbfd23bbc00000000 + .quad 0xbe7631798bcda203 + .quad 0xbfd22dad00000000 + .quad 0xbe869d668ff512cd + .quad 0xbfd21fa100000000 + .quad 0xbe4179cae9beee0d + .quad 0xbfd2119700000000 + .quad 0xbe63fa3a108ec52d + .quad 0xbfd2038e00000000 + .quad 0xbe7bbae8d6fb8a1c + .quad 0xbfd1f58800000000 + .quad 0xbe807f90e4c2ec69 + .quad 0xbfd1e78400000000 + .quad 0xbe82bc2f5babe119 + .quad 0xbfd1d98200000000 + .quad 0xbe84baa4d8e71f1c + .quad 0xbfd1cb8200000000 + .quad 0xbe86a24fc7020b2b + .quad 0xbfd1bd8500000000 + .quad 0xbe8302982dfe3735 + .quad 0xbfd1af8900000000 + .quad 0xbe8536eece3209fa + .quad 0xbfd1a19000000000 + .quad 0xbe823ace8fc13621 + .quad 0xbfd1939900000000 + .quad 0xbe7f9b761181cc82 + .quad 0xbfd185a400000000 + .quad 0xbe7c2e82add30fbf + .quad 0xbfd177b100000000 + .quad 0xbe7a7defb44845fc + .quad 0xbfd169c000000000 + .quad 0xbe7ad8fc5efe4b5f + .quad 0xbfd15bd100000000 + .quad 0xbe7d8efa5836733a + .quad 0xbfd14de400000000 + .quad 0xbe8177a6d8101fb8 + .quad 0xbfd13ffa00000000 + .quad 0xbe8030b69ab39bd3 + .quad 0xbfd1321100000000 + .quad 0xbe86067085d42483 + .quad 0xbfd1242a00000000 + .quad 0xbe8da8a239a3d693 + .quad 0xbfd1164700000000 + .quad 0xbe4d72294066a603 + .quad 0xbfd1086400000000 + .quad 0xbe7b1ba1dc449b96 + .quad 0xbfd0fa8400000000 + .quad 0xbe862896725de3dd + .quad 0xbfd0eca600000000 + .quad 0xbe6a4d928a11e457 + .quad 0xbfd0deca00000000 + .quad 0xbe843a36b9d55575 + .quad 0xbfd0d0f000000000 + .quad 0xbe73f2208d19fe75 + .quad 0xbfd0c31800000000 + .quad 0xbe8d4bfe81a344c0 + .quad 0xbfd0b54200000000 + .quad 0xbe88ff16f1f6621d + .quad 0xbfd0a76f00000000 + .quad 0xbe829e78b22b06aa + .quad 0xbfd0999d00000000 + .quad 0xbe84e64b365fec9a + .quad 0xbfd08bcd00000000 + .quad 0xbe8ab2bf39987eff + .quad 0xbfd07e0000000000 + .quad 0xbe8ef00e6f310240 + .quad 0xbfd0703500000000 + .quad 0xbe7884f5dd34e44b + .quad 0xbfd0626b00000000 + .quad 0xbe8d92500f14b471 + .quad 0xbfd054a400000000 + .quad 0xbe8307e1dd3ad028 + .quad 0xbfd046df00000000 + .quad 0xbe79971a63342c6a + .quad 0xbfd0391c00000000 + .quad 0xbe760b6f55e8db61 + .quad 0xbfd02b5a00000000 + .quad 0xbe8302cf89e64237 + .quad 0xbfd01d9b00000000 + .quad 0xbe8a9f4c3efc935a + .quad 0xbfd00fde00000000 + .quad 0xbe788f5a8dc51cdf + .quad 0xbfd0022300000000 + .quad 0xbe8de87b8de45c1c + .quad 0xbfcfe8d500000000 + .quad 0xbe73bc8feab63684 + .quad 0xbfcfcd6700000000 + .quad 0xbe766b590d531889 + .quad 0xbfcfb1fe00000000 + .quad 0xbe50ba5e451bff1a + .quad 0xbfcf969700000000 + .quad 0xbe5d9e85a4fc1ce1 + .quad 0xbfcf7b3600000000 + .quad 0xbe687fbdab298db0 + .quad 0xbfcf5fd800000000 + .quad 0xbe5c831eaf201561 + .quad 0xbfcf447e00000000 + .quad 0xbe6c97cc28a0c985 + .quad 0xbfcf292900000000 + .quad 0xbe4096a784f160c8 + .quad 0xbfcf0dd800000000 + .quad 0xbe463a00e430058b + .quad 0xbfcef28900000000 + .quad 0xbe7a9ae40adf8036 + .quad 
0xbfced74100000000 + .quad 0xbe76178f7389c2b3 + .quad 0xbfcebbfc00000000 + .quad 0xbe628e408a6030db + .quad 0xbfcea0bb00000000 + .quad 0xbe65370cfca139e2 + .quad 0xbfce857d00000000 + .quad 0xbe509b099c44098a + .quad 0xbfce6a4300000000 + .quad 0xbe68d5caf2faef74 + .quad 0xbfce4f0e00000000 + .quad 0xbe4dd08f036b132f + .quad 0xbfce33dd00000000 + .quad 0xbe64ccf4cb32e460 + .quad 0xbfce18af00000000 + .quad 0xbe64c4c42c4e4661 + .quad 0xbfcdfd8700000000 + .quad 0xbe70b81de05729de + .quad 0xbfcde26000000000 + .quad 0xbe7a821176a0fe0e + .quad 0xbfcdc74000000000 + .quad 0xbe669566643c24c3 + .quad 0xbfcdac2200000000 + .quad 0xbe767c88339625fc + .quad 0xbfcd910900000000 + .quad 0xbe72da2735aa6c86 + .quad 0xbfcd75f300000000 + .quad 0xbe644c6d4a5f5ad6 + .quad 0xbfcd5ae300000000 + .quad 0xbe6396dd21fe2514 + .quad 0xbfcd3fd400000000 + .quad 0xbe6ca92ae56a4fcf + .quad 0xbfcd24cb00000000 + .quad 0xbe7bdc846e0ed386 + .quad 0xbfcd09c600000000 + .quad 0xbe55b88be3ae865a + .quad 0xbfcceec500000000 + .quad 0xbe3fc6a072116830 + .quad 0xbfccd3c600000000 + .quad 0xbe7b1a6214562c52 + .quad 0xbfccb8cd00000000 + .quad 0xbe5f2c91c96636d8 + .quad 0xbfcc9dd800000000 + .quad 0xbe60c3b48651cf97 + .quad 0xbfcc82e600000000 + .quad 0xbe5966f235766ddb + .quad 0xbfcc67f800000000 + .quad 0xbe78ce14eae5dca8 + .quad 0xbfcc4d0e00000000 + .quad 0xbe625479353b5c4a + .quad 0xbfcc322800000000 + .quad 0xbe6d333a7b285ac2 + .quad 0xbfcc174500000000 + .quad 0xbe7277affe5d329a + .quad 0xbfcbfc6700000000 + .quad 0xbe67fffd12834efc + .quad 0xbfcbe18d00000000 + .quad 0xbe7b862223583bcf + .quad 0xbfcbc6b700000000 + .quad 0xbe649b874647b1f2 + .quad 0xbfcbabe300000000 + .quad 0xbe78929bf1c864a7 + .quad 0xbfcb911600000000 + .quad 0xbe74d074968f73d7 + .quad 0xbfcb764a00000000 + .quad 0xbe79fb251b935310 + .quad 0xbfcb5b8300000000 + .quad 0xbe769696568e41b9 + .quad 0xbfcb40c100000000 + .quad 0xbe65ed80b7eb91e0 + .quad 0xbfcb260200000000 + .quad 0xbe07d52c3932a2e4 + .quad 0xbfcb0b4700000000 + .quad 0xbe6b8ad7d7a99fe6 + .quad 0xbfcaf08f00000000 + .quad 0xbe7cbc2b9155b770 + .quad 0xbfcad5db00000000 + .quad 0xbe6aa03f2514a52b + .quad 0xbfcabb2d00000000 + .quad 0xbe6cfb1d524b6daf + .quad 0xbfcaa08000000000 + .quad 0xbe7a78cd1fbb1e99 + .quad 0xbfca85d900000000 + .quad 0xbe119017e37d4667 + .quad 0xbfca6b3400000000 + .quad 0xbe6184b897951f46 + .quad 0xbfca509400000000 + .quad 0xbe675349e1651fc0 + .quad 0xbfca35f700000000 + .quad 0xbe71c8acc30679dd + .quad 0xbfca1b5f00000000 + .quad 0xbe72ec1682bf9837 + .quad 0xbfca00ca00000000 + .quad 0xbe77d09336233c90 + .quad 0xbfc9e63a00000000 + .quad 0xbe7852e40017e39c + .quad 0xbfc9cbad00000000 + .quad 0xbe7d1fd8802fb817 + .quad 0xbfc9b12400000000 + .quad 0xbe59d13fae79743c + .quad 0xbfc9969d00000000 + .quad 0xbe748d385e0277cf + .quad 0xbfc97c1b00000000 + .quad 0xbe7f678fa8388a68 + .quad 0xbfc9619f00000000 + .quad 0xbe5d6188e89480ec + .quad 0xbfc9472500000000 + .quad 0xbe74e4cb139c1e95 + .quad 0xbfc92caf00000000 + .quad 0xbe6093e9a4239741 + .quad 0xbfc9123c00000000 + .quad 0xbe3c518d850f7ba8 + .quad 0xbfc8f7cd00000000 + .quad 0xbe797b7fc86f1c0c + .quad 0xbfc8dd6200000000 + .quad 0xbe77d280a0117cfd + .quad 0xbfc8c2fa00000000 + .quad 0xbe7d078174c6928f + .quad 0xbfc8a89800000000 + .quad 0xbe357f7a64ccd537 + .quad 0xbfc88e3800000000 + .quad 0xbe6a22cd1f2e8f29 + .quad 0xbfc873dc00000000 + .quad 0xbe1c582d297ff644 + .quad 0xbfc8598400000000 + .quad 0xbe73cd87ce24f758 + .quad 0xbfc83f3000000000 + .quad 0xbe6eb716bac42623 + .quad 0xbfc824df00000000 + .quad 0xbe73592a0f410400 + .quad 0xbfc80a9300000000 + .quad 
0xbe78343174876ba5 + .quad 0xbfc7f04900000000 + .quad 0xbe6ba4f9b930430e + .quad 0xbfc7d60400000000 + .quad 0xbe5367dd3b0b6b0b + .quad 0xbfc7bbc200000000 + .quad 0xbe556265a1dc7a8e + .quad 0xbfc7a18500000000 + .quad 0xbe5f71aca38241c4 + .quad 0xbfc7874b00000000 + .quad 0xbe746381f987646b + .quad 0xbfc76d1500000000 + .quad 0xbe665804bc056069 + .quad 0xbfc752e200000000 + .quad 0xbe68e83e5955bbc6 + .quad 0xbfc738b200000000 + .quad 0xbe787a19887d1e81 + .quad 0xbfc71e8800000000 + .quad 0xbe5fd1054d6e1895 + .quad 0xbfc7045f00000000 + .quad 0xbe6471e7650be845 + .quad 0xbfc6ea3b00000000 + .quad 0xbe707e9d9296377f + .quad 0xbfc6d01c00000000 + .quad 0xbe7b1bb94e9cc3b2 + .quad 0xbfc6b5ff00000000 + .quad 0xbe7936ceca9afdc8 + .quad 0xbfc69be600000000 + .quad 0xbe4cb3a881abfdf7 + .quad 0xbfc681d100000000 + .quad 0xbe732151a8286c6f + .quad 0xbfc667c000000000 + .quad 0xbe6efc2e3e9ced23 + .quad 0xbfc64db200000000 + .quad 0xbe78eb86ac9ef252 + .quad 0xbfc633a800000000 + .quad 0xbe6f50df1abe0fc9 + .quad 0xbfc619a100000000 + .quad 0xbe73f3aefe930c8f + .quad 0xbfc5ff9f00000000 + .quad 0xbe7edc30c01b141d + .quad 0xbfc5e59f00000000 + .quad 0xbe7f08ed31fe1628 + .quad 0xbfc5cba500000000 + .quad 0xbe5983b170e6c68f + .quad 0xbfc5b1ad00000000 + .quad 0xbe7c5342ddbb7371 + .quad 0xbfc597ba00000000 + .quad 0xbe31f13b9ecb2da6 + .quad 0xbfc57dc900000000 + .quad 0xbe75038fc82fbc24 + .quad 0xbfc563dc00000000 + .quad 0xbe783ff5ad081783 + .quad 0xbfc549f300000000 + .quad 0xbe662723a6715875 + .quad 0xbfc5300d00000000 + .quad 0xbe6b7b7cc9af768a + .quad 0xbfc5162b00000000 + .quad 0xbe1f78d1162b410d + .quad 0xbfc4fc4d00000000 + .quad 0xbe7cb37679326801 + .quad 0xbfc4e27200000000 + .quad 0xbe7065fa9470590b + .quad 0xbfc4c89c00000000 + .quad 0xbe6c3a0233eda037 + .quad 0xbfc4aec800000000 + .quad 0xbe4e014055897901 + .quad 0xbfc494f900000000 + .quad 0xbe4fb8e003c2f3b1 + .quad 0xbfc47b2b00000000 + .quad 0xbe7c8996199d6eea + .quad 0xbfc4616400000000 + .quad 0xbe0faf0bc81e4b94 + .quad 0xbfc4479d00000000 + .quad 0xbe7cc047f1f25c83 + .quad 0xbfc42ddd00000000 + .quad 0xbe53d0da516b147f + .quad 0xbfc4141f00000000 + .quad 0xbe7fcb190acb1c29 + .quad 0xbfc3fa6400000000 + .quad 0xbe7414ec0c60bad1 + .quad 0xbfc3e0ae00000000 + .quad 0xbe74e9ba984a9a60 + .quad 0xbfc3c6fc00000000 + .quad 0xbe624337ccc1362d + .quad 0xbfc3ad4b00000000 + .quad 0xbe7774b4cc0ec2a8 + .quad 0xbfc393a000000000 + .quad 0xbe732b380b7efc7c + .quad 0xbfc379f700000000 + .quad 0xbe62dac931c2e190 + .quad 0xbfc3605300000000 + .quad 0xbe6b470fa43dc529 + .quad 0xbfc346b100000000 + .quad 0xbe69abf6162bfc32 + .quad 0xbfc32d1300000000 + .quad 0xbe2ba4b334a02879 + .quad 0xbfc3137a00000000 + .quad 0xbe4d8be297e30d03 + .quad 0xbfc2f9e300000000 + .quad 0xbe415bfda1644c22 + .quad 0xbfc2e04f00000000 + .quad 0xbe763bbe948b1ac0 + .quad 0xbfc2c6c000000000 + .quad 0xbe016a3f42b0e0f2 + .quad 0xbfc2ad3400000000 + .quad 0xbe00b500d8b4466e + .quad 0xbfc293ab00000000 + .quad 0xbe767834aad3c38f + .quad 0xbfc27a2700000000 + .quad 0xbe4b3fb7ded60421 + .quad 0xbfc260a600000000 + .quad 0xbe5cc6018f3bcd49 + .quad 0xbfc2472700000000 + .quad 0xbe603b59bc184860 + .quad 0xbfc22dad00000000 + .quad 0xbe7a556695fca0d7 + .quad 0xbfc2143600000000 + .quad 0xbe64434576d52cb7 + .quad 0xbfc1fac400000000 + .quad 0xbe6796ca377ea74e + .quad 0xbfc1e15400000000 + .quad 0xbe66f7798c85559d + .quad 0xbfc1c7e800000000 + .quad 0xbe4bde34965f6984 + .quad 0xbfc1ae7d00000000 + .quad 0xbe79e4ab7003a0e6 + .quad 0xbfc1951900000000 + .quad 0xbe49fd11e39abaac + .quad 0xbfc17bb800000000 + .quad 0xbe56b7b48b95c15b + .quad 
0xbfc1625900000000 + .quad 0xbe5cc36d3e3cca65 + .quad 0xbfc148fe00000000 + .quad 0xbe41ce485761f69c + .quad 0xbfc12fa600000000 + .quad 0xbe770a1f05316811 + .quad 0xbfc1165300000000 + .quad 0xbe578d49dc1afe94 + .quad 0xbfc0fd0300000000 + .quad 0xbe6e0dca31cd9e54 + .quad 0xbfc0e3b500000000 + .quad 0xbe784e650e0a2fd5 + .quad 0xbfc0ca6b00000000 + .quad 0xbe7c536d57d9dab9 + .quad 0xbfc0b12500000000 + .quad 0xbe7b57a5578d01fd + .quad 0xbfc097e300000000 + .quad 0xbe759cc0cf3da52a + .quad 0xbfc07ea300000000 + .quad 0xbe70dc7f7c36aab7 + .quad 0xbfc0656900000000 + .quad 0xbe43057726eea6f9 + .quad 0xbfc04c3000000000 + .quad 0xbe75532713b0b555 + .quad 0xbfc032fc00000000 + .quad 0xbe51f736f8234297 + .quad 0xbfc019c900000000 + .quad 0xbe757a9427127e28 + .quad 0xbfc0009c00000000 + .quad 0xbe7dd37909d634e1 + .quad 0xbfbfcee400000000 + .quad 0xbe60e50b92227f37 + .quad 0xbfbf9c9700000000 + .quad 0xbe10744b2bbd5c34 + .quad 0xbfbf6a4d00000000 + .quad 0xbe6576fb1ab66ad7 + .quad 0xbfbf380f00000000 + .quad 0xbe6b5374d31a91ee + .quad 0xbfbf05d600000000 + .quad 0xbe4db610eee1b81b + .quad 0xbfbed3a000000000 + .quad 0xbe6a19b7978e8bb8 + .quad 0xbfbea17600000000 + .quad 0xbe6f4cb6bf56f18e + .quad 0xbfbe6f5100000000 + .quad 0xbe57f67e0bd3b63f + .quad 0xbfbe3d3300000000 + .quad 0xbe666a27d6a83d6c + .quad 0xbfbe0b1a00000000 + .quad 0xbe523cbf0c85fa27 + .quad 0xbfbdd90800000000 + .quad 0xbe6a7ced811f7da6 + .quad 0xbfbda6ff00000000 + .quad 0xbe5615e1bd550182 + .quad 0xbfbd74fd00000000 + .quad 0xbe6b4da043725d03 + .quad 0xbfbd430000000000 + .quad 0xbe658a49aa2dca64 + .quad 0xbfbd110b00000000 + .quad 0xbe6066543ad84ef1 + .quad 0xbfbcdf1a00000000 + .quad 0xbe66073d700e9f19 + .quad 0xbfbcad3500000000 + .quad 0xbe63a29cd758d759 + .quad 0xbfbc7b5100000000 + .quad 0xbe49b8777d6bbc9d + .quad 0xbfbc497800000000 + .quad 0xbe623f87f4487fe4 + .quad 0xbfbc17a400000000 + .quad 0xbe55196cb4c66620 + .quad 0xbfbbe5d800000000 + .quad 0xbe496e785a0317a3 + .quad 0xbfbbb41000000000 + .quad 0xbe5ee49501957b40 + .quad 0xbfbb825000000000 + .quad 0xbe6cf6df4849748b + .quad 0xbfbb509500000000 + .quad 0xbe688f964bd70c8f + .quad 0xbfbb1ee600000000 + .quad 0xbe6072c317519bb4 + .quad 0xbfbaed3800000000 + .quad 0xbe05b3290a662bd0 + .quad 0xbfbabb9500000000 + .quad 0xbe5b246ad0582c09 + .quad 0xbfba89f700000000 + .quad 0xbe55372721811f66 + .quad 0xbfba585d00000000 + .quad 0xbe67c995fe88bce3 + .quad 0xbfba26cc00000000 + .quad 0xbe596605e161e768 + .quad 0xbfb9f54300000000 + .quad 0xbe53bd6ea8cdcabf + .quad 0xbfb9c3be00000000 + .quad 0xbe6873a6488f239e + .quad 0xbfb9924200000000 + .quad 0xbe6038db2539e54e + .quad 0xbfb960ca00000000 + .quad 0xbe6a3576f0eb47ea + .quad 0xbfb92f5b00000000 + .quad 0xbe5ca16578e782d8 + .quad 0xbfb8fdf000000000 + .quad 0xbe6571dd058c9404 + .quad 0xbfb8cc8e00000000 + .quad 0xbe4e8172926b3912 + .quad 0xbfb89b3400000000 + .quad 0xbe458eb8a49a1ed9 + .quad 0xbfb869de00000000 + .quad 0xbe67736434037b3e + .quad 0xbfb8388d00000000 + .quad 0xbe6e2728b7069e85 + .quad 0xbfb8074500000000 + .quad 0xbe61c6bcd5b504de + .quad 0xbfb7d60500000000 + .quad 0xbe62d9f791fd12f7 + .quad 0xbfb7a4ca00000000 + .quad 0xbe53b18b476f88bf + .quad 0xbfb7739300000000 + .quad 0xbe671b2ad71bba2e + .quad 0xbfb7426500000000 + .quad 0xbe6329422bbd68e8 + .quad 0xbfb7113f00000000 + .quad 0xbe6e8b3c2fe4ecae + .quad 0xbfb6e01f00000000 + .quad 0xbe2795edd5ed58e9 + .quad 0xbfb6af0200000000 + .quad 0xbe6c4c07447a13fa + .quad 0xbfb67def00000000 + .quad 0xbe4f2ea58340e81e + .quad 0xbfb64ce400000000 + .quad 0xbe4203398a8ffda4 + .quad 0xbfb61bda00000000 + .quad 
0xbe2d4147ad124eaa + .quad 0xbfb5eadc00000000 + .quad 0xbe539c66835b9867 + .quad 0xbfb5b9df00000000 + .quad 0xbe6317f3d15a9860 + .quad 0xbfb588ef00000000 + .quad 0xbe503474104b244e + .quad 0xbfb557ff00000000 + .quad 0xbe6f1dfae0bd2e94 + .quad 0xbfb5271900000000 + .quad 0xbe541889ef09d7c8 + .quad 0xbfb4f63b00000000 + .quad 0xbe52dc76d475d4d1 + .quad 0xbfb4c56200000000 + .quad 0xbe433458770a1735 + .quad 0xbfb4948d00000000 + .quad 0xbe6c8223b5c8b49b + .quad 0xbfb463c200000000 + .quad 0xbe540d91e2302042 + .quad 0xbfb432fb00000000 + .quad 0xbe64b47f064d986f + .quad 0xbfb4023900000000 + .quad 0xbe6ce4d526c81e43 + .quad 0xbfb3d18000000000 + .quad 0xbe6c41714a091d46 + .quad 0xbfb3a0d000000000 + .quad 0xbe63540db8c80703 + .quad 0xbfb3702100000000 + .quad 0xbe5f8cf1a845a25c + .quad 0xbfb33f7b00000000 + .quad 0xbe430a65c7a2686f + .quad 0xbfb30edd00000000 + .quad 0xbe62d26a7215665c + .quad 0xbfb2de4500000000 + .quad 0xbe1bff57e3bab991 + .quad 0xbfb2adb100000000 + .quad 0xbe5e8adfc156e82d + .quad 0xbfb27d2200000000 + .quad 0xbe6e5d041c5f1a05 + .quad 0xbfb24c9d00000000 + .quad 0xbe50a21095df344c + .quad 0xbfb21c2000000000 + .quad 0xbe5b57c218054e22 + .quad 0xbfb1eba400000000 + .quad 0xbe6b1806f4988888 + .quad 0xbfb1bb3200000000 + .quad 0xbe430029dc60a716 + .quad 0xbfb18ac400000000 + .quad 0xbe611e8ed29c4bea + .quad 0xbfb15a5f00000000 + .quad 0xbe6aae4e1e1cd7e9 + .quad 0xbfb12a0000000000 + .quad 0xbe4f2855166a96d5 + .quad 0xbfb0f9a500000000 + .quad 0xbe68ccc743692647 + .quad 0xbfb0c95400000000 + .quad 0xbe50c2b8ff93eea0 + .quad 0xbfb0990400000000 + .quad 0xbe329700306849f4 + .quad 0xbfb068c000000000 + .quad 0xbe661c7597dfa0cf + .quad 0xbfb0387e00000000 + .quad 0xbe64f950c199fdd6 + .quad 0xbfb0084500000000 + .quad 0xbe6434bda55a11e5 + .quad 0xbfafb02300000000 + .quad 0xbe537435dba745c1 + .quad 0xbfaf4fc600000000 + .quad 0xbe4793720209c664 + .quad 0xbfaeef7b00000000 + .quad 0xbe3e845c9d0173b4 + .quad 0xbfae8f3a00000000 + .quad 0xbe527188bd53b8bf + .quad 0xbfae2f0400000000 + .quad 0xbe49e4e1f2d00cb9 + .quad 0xbfadced800000000 + .quad 0xbe57db5b6132809a + .quad 0xbfad6ebf00000000 + .quad 0xbe43c7fbabdf571f + .quad 0xbfad0eb000000000 + .quad 0xbe4c086873f1531f + .quad 0xbfacaeac00000000 + .quad 0xbe33d01264312288 + .quad 0xbfac4eb200000000 + .quad 0xbe4ed73a1b11c287 + .quad 0xbfabeecb00000000 + .quad 0xbe328d5761ea48d2 + .quad 0xbfab8eee00000000 + .quad 0xbe4e2759579ac08a + .quad 0xbfab2f1c00000000 + .quad 0xbe4eea927b8de26e + .quad 0xbfaacf5500000000 + .quad 0xbe3a03ec4341a4ac + .quad 0xbfaa6f9800000000 + .quad 0xbe54efb9656181bf + .quad 0xbfaa0fee00000000 + .quad 0xbe529aa680456564 + .quad 0xbfa9b04f00000000 + .quad 0xbe42b60fbbf05015 + .quad 0xbfa950ba00000000 + .quad 0xbe59ea4d388956ac + .quad 0xbfa8f13800000000 + .quad 0xbe5c820f8ddadcd6 + .quad 0xbfa891ba00000000 + .quad 0xbe27e05a334c58f7 + .quad 0xbfa8324d00000000 + .quad 0xbe5d3229b2ba0376 + .quad 0xbfa7d2ec00000000 + .quad 0xbe545e77c08ed94c + .quad 0xbfa7739600000000 + .quad 0xbe427656b6f95551 + .quad 0xbfa7144a00000000 + .quad 0xbe5c82a193d30405 + .quad 0xbfa6b50a00000000 + .quad 0xbe4ddebd1f3c284a + .quad 0xbfa655dc00000000 + .quad 0xbe599c108199cfd8 + .quad 0xbfa5f6ba00000000 + .quad 0xbe348e1f3828f0d8 + .quad 0xbfa597a200000000 + .quad 0xbe5240beb8df56ca + .quad 0xbfa5389600000000 + .quad 0xbe1aed65370b9099 + .quad 0xbfa4d99400000000 + .quad 0xbe5429166d091c5d + .quad 0xbfa47a9e00000000 + .quad 0xbe44d5db06b75692 + .quad 0xbfa41bba00000000 + .quad 0xbe5e4ff2e670387a + .quad 0xbfa3bcda00000000 + .quad 0xbe5e73df6e675ed2 + .quad 
0xbfa35e0d00000000 + .quad 0xbe5df2994af6bbf0 + .quad 0xbfa2ff4c00000000 + .quad 0xbe31a09f65bfdef1 + .quad 0xbfa2a09500000000 + .quad 0xbe5290bafe6a7061 + .quad 0xbfa241ea00000000 + .quad 0xbe425151c43b4181 + .quad 0xbfa1e34a00000000 + .quad 0xbe41d8dbc0646431 + .quad 0xbfa184b500000000 + .quad 0xbe5298ac777c8c9d + .quad 0xbfa1263400000000 + .quad 0xbe10a2f9d7e8035a + .quad 0xbfa0c7b600000000 + .quad 0xbe0bbc4c660fd088 + .quad 0xbfa0694b00000000 + .quad 0xbe3cc374b7950d13 + .quad 0xbfa00aeb00000000 + .quad 0xbe5aa058acdc0265 + .quad 0xbf9f592000000000 + .quad 0xbe149b4d7e5df2c0 + .quad 0xbf9e9c8f00000000 + .quad 0xbe10a7a7e78bdba3 + .quad 0xbf9de01500000000 + .quad 0xbde02a1d978db2f1 + .quad 0xbf9d23b100000000 + .quad 0xbe4e9227a287068e + .quad 0xbf9c676500000000 + .quad 0xbe4e8561096793f8 + .quad 0xbf9bab3100000000 + .quad 0xbe0968e122179f22 + .quad 0xbf9aef1300000000 + .quad 0xbe328465c0dba24f + .quad 0xbf9a330c00000000 + .quad 0xbe47051e31e0d70b + .quad 0xbf99771d00000000 + .quad 0xbe38b8d275ff3a9a + .quad 0xbf98bb5500000000 + .quad 0xbe122bdb89883925 + .quad 0xbf97ff9400000000 + .quad 0xbe36fbf85d50fecb + .quad 0xbf9743eb00000000 + .quad 0xbdf87cba8eccac44 + .quad 0xbf96886800000000 + .quad 0xbe4bd57d800c1470 + .quad 0xbf95ccee00000000 + .quad 0xbe3be2933856d62e + .quad 0xbf95118b00000000 + .quad 0xbe409620e0f1be7b + .quad 0xbf94564f00000000 + .quad 0xbe4e4325cf62b811 + .quad 0xbf939b1c00000000 + .quad 0xbe2adee9af6a25c0 + .quad 0xbf92e00000000000 + .quad 0xbe20ce46d28f63c9 + .quad 0xbf92250b00000000 + .quad 0xbe41f6aa9fb6fe0b + .quad 0xbf916a1e00000000 + .quad 0xbe4e41409957601b + .quad 0xbf90af5900000000 + .quad 0xbe4e53e5a63658ad + .quad 0xbf8fe93900000000 + .quad 0xbe3eded24d629d7d + .quad 0xbf8e73ef00000000 + .quad 0xbe3a29d2ea7d362b + .quad 0xbf8cfef500000000 + .quad 0xbe1e2e79fe4aa765 + .quad 0xbf8b8a0a00000000 + .quad 0xbe3e8785027a216b + .quad 0xbf8a155000000000 + .quad 0xbe37a174d5a8bded + .quad 0xbf88a0c600000000 + .quad 0xbe35dde88f39d7ce + .quad 0xbf872c6c00000000 + .quad 0xbe3c41ea3f44a785 + .quad 0xbf85b86300000000 + .quad 0xbe194c69ffd7f42d + .quad 0xbf84446a00000000 + .quad 0xbe1a5e4e0d24af39 + .quad 0xbf82d0a100000000 + .quad 0xbe381611eb6c3818 + .quad 0xbf815d0900000000 + .quad 0xbe3dd5da9cc5f987 + .quad 0xbf7fd34500000000 + .quad 0xbe25bd80e0b0590e + .quad 0xbf7cec9900000000 + .quad 0xbe1ce47bb0eea510 + .quad 0xbf7a068e00000000 + .quad 0xbe26dbe100877575 + .quad 0xbf7720e600000000 + .quad 0xbd9aa4f614b9e1ac + .quad 0xbf743b5f00000000 + .quad 0xbe271a96b1eb7842 + .quad 0xbf71567b00000000 + .quad 0xbe2318f60005710d + .quad 0xbf6ce37400000000 + .quad 0xbe0c7a4e122b1762 + .quad 0xbf671b3600000000 + .quad 0xbe1c85d1e3d214d1 + .quad 0xbf61533f00000000 + .quad 0xbe0e793b61aa1f54 + .quad 0xbf57181c00000000 + .quad 0xbe01296a4555af78 + .quad 0xbf47168e00000000 + .quad 0xbdf30d6f34ebfa1c + .rept 2 + .quad 0x0000000000000000 + .endr + .rept 48 + .byte 0 + .endr + +/* Exp(2) lookup table for exp part (non HSW) */ +.if .-__svml_dpow_data != _exp2_tbl +.err +.endif + .quad 0x3ff0000000000000 + .quad 0x0000000000000000 + .quad 0x3ff0163da9fb3335 + .quad 0x3c9b61299ab8cdb7 + .quad 0x3ff02c9a3e778061 + .quad 0xbc719083535b085d + .quad 0x3ff04315e86e7f85 + .quad 0xbc90a31c1977c96e + .quad 0x3ff059b0d3158574 + .quad 0x3c8d73e2a475b465 + .quad 0x3ff0706b29ddf6de + .quad 0xbc8c91dfe2b13c26 + .quad 0x3ff0874518759bc8 + .quad 0x3c6186be4bb284ff + .quad 0x3ff09e3ecac6f383 + .quad 0x3c91487818316135 + .quad 0x3ff0b5586cf9890f + .quad 0x3c98a62e4adc610a + .quad 0x3ff0cc922b7247f7 + .quad 
0x3c901edc16e24f71 + .quad 0x3ff0e3ec32d3d1a2 + .quad 0x3c403a1727c57b52 + .quad 0x3ff0fb66affed31b + .quad 0xbc6b9bedc44ebd7b + .quad 0x3ff11301d0125b51 + .quad 0xbc96c51039449b39 + .quad 0x3ff12abdc06c31cc + .quad 0xbc51b514b36ca5c7 + .quad 0x3ff1429aaea92de0 + .quad 0xbc932fbf9af1369e + .quad 0x3ff15a98c8a58e51 + .quad 0x3c82406ab9eeab09 + .quad 0x3ff172b83c7d517b + .quad 0xbc819041b9d78a75 + .quad 0x3ff18af9388c8dea + .quad 0xbc911023d1970f6b + .quad 0x3ff1a35beb6fcb75 + .quad 0x3c8e5b4c7b4968e4 + .quad 0x3ff1bbe084045cd4 + .quad 0xbc995386352ef607 + .quad 0x3ff1d4873168b9aa + .quad 0x3c9e016e00a2643c + .quad 0x3ff1ed5022fcd91d + .quad 0xbc91df98027bb78b + .quad 0x3ff2063b88628cd6 + .quad 0x3c8dc775814a8494 + .quad 0x3ff21f49917ddc96 + .quad 0x3c82a97e9494a5ed + .quad 0x3ff2387a6e756238 + .quad 0x3c99b07eb6c70572 + .quad 0x3ff251ce4fb2a63f + .quad 0x3c8ac155bef4f4a4 + .quad 0x3ff26b4565e27cdd + .quad 0x3c82bd339940e9d9 + .quad 0x3ff284dfe1f56381 + .quad 0xbc9a4c3a8c3f0d7d + .quad 0x3ff29e9df51fdee1 + .quad 0x3c8612e8afad1255 + .quad 0x3ff2b87fd0dad990 + .quad 0xbc410adcd6381aa3 + .quad 0x3ff2d285a6e4030b + .quad 0x3c90024754db41d4 + .quad 0x3ff2ecafa93e2f56 + .quad 0x3c71ca0f45d52383 + .quad 0x3ff306fe0a31b715 + .quad 0x3c86f46ad23182e4 + .quad 0x3ff32170fc4cd831 + .quad 0x3c8a9ce78e18047c + .quad 0x3ff33c08b26416ff + .quad 0x3c932721843659a5 + .quad 0x3ff356c55f929ff1 + .quad 0xbc8b5cee5c4e4628 + .quad 0x3ff371a7373aa9cb + .quad 0xbc963aeabf42eae1 + .quad 0x3ff38cae6d05d866 + .quad 0xbc9e958d3c9904bc + .quad 0x3ff3a7db34e59ff7 + .quad 0xbc75e436d661f5e2 + .quad 0x3ff3c32dc313a8e5 + .quad 0xbc9efff8375d29c3 + .quad 0x3ff3dea64c123422 + .quad 0x3c8ada0911f09ebb + .quad 0x3ff3fa4504ac801c + .quad 0xbc97d023f956f9f3 + .quad 0x3ff4160a21f72e2a + .quad 0xbc5ef3691c309278 + .quad 0x3ff431f5d950a897 + .quad 0xbc81c7dde35f7998 + .quad 0x3ff44e086061892d + .quad 0x3c489b7a04ef80cf + .quad 0x3ff46a41ed1d0057 + .quad 0x3c9c944bd1648a76 + .quad 0x3ff486a2b5c13cd0 + .quad 0x3c73c1a3b69062f0 + .quad 0x3ff4a32af0d7d3de + .quad 0x3c99cb62f3d1be56 + .quad 0x3ff4bfdad5362a27 + .quad 0x3c7d4397afec42e2 + .quad 0x3ff4dcb299fddd0d + .quad 0x3c98ecdbbc6a7833 + .quad 0x3ff4f9b2769d2ca7 + .quad 0xbc94b309d25957e3 + .quad 0x3ff516daa2cf6642 + .quad 0xbc8f768569bd93ee + .quad 0x3ff5342b569d4f82 + .quad 0xbc807abe1db13cac + .quad 0x3ff551a4ca5d920f + .quad 0xbc8d689cefede59a + .quad 0x3ff56f4736b527da + .quad 0x3c99bb2c011d93ac + .quad 0x3ff58d12d497c7fd + .quad 0x3c8295e15b9a1de7 + .quad 0x3ff5ab07dd485429 + .quad 0x3c96324c054647ac + .quad 0x3ff5c9268a5946b7 + .quad 0x3c3c4b1b816986a2 + .quad 0x3ff5e76f15ad2148 + .quad 0x3c9ba6f93080e65d + .quad 0x3ff605e1b976dc09 + .quad 0xbc93e2429b56de47 + .quad 0x3ff6247eb03a5585 + .quad 0xbc9383c17e40b496 + .quad 0x3ff6434634ccc320 + .quad 0xbc8c483c759d8932 + .quad 0x3ff6623882552225 + .quad 0xbc9bb60987591c33 + .quad 0x3ff68155d44ca973 + .quad 0x3c6038ae44f73e64 + .quad 0x3ff6a09e667f3bcd + .quad 0xbc9bdd3413b26455 + .quad 0x3ff6c012750bdabf + .quad 0xbc72895667ff0b0c + .quad 0x3ff6dfb23c651a2f + .quad 0xbc6bbe3a683c88aa + .quad 0x3ff6ff7df9519484 + .quad 0xbc883c0f25860ef6 + .quad 0x3ff71f75e8ec5f74 + .quad 0xbc816e4786887a99 + .quad 0x3ff73f9a48a58174 + .quad 0xbc90a8d96c65d53b + .quad 0x3ff75feb564267c9 + .quad 0xbc90245957316dd3 + .quad 0x3ff780694fde5d3f + .quad 0x3c9866b80a02162c + .quad 0x3ff7a11473eb0187 + .quad 0xbc841577ee04992f + .quad 0x3ff7c1ed0130c132 + .quad 0x3c9f124cd1164dd5 + .quad 0x3ff7e2f336cf4e62 + .quad 0x3c705d02ba15797e + .quad 
0x3ff80427543e1a12 + .quad 0xbc927c86626d972a + .quad 0x3ff82589994cce13 + .quad 0xbc9d4c1dd41532d7 + .quad 0x3ff8471a4623c7ad + .quad 0xbc88d684a341cdfb + .quad 0x3ff868d99b4492ed + .quad 0xbc9fc6f89bd4f6ba + .quad 0x3ff88ac7d98a6699 + .quad 0x3c9994c2f37cb53a + .quad 0x3ff8ace5422aa0db + .quad 0x3c96e9f156864b26 + .quad 0x3ff8cf3216b5448c + .quad 0xbc70d55e32e9e3aa + .quad 0x3ff8f1ae99157736 + .quad 0x3c85cc13a2e3976c + .quad 0x3ff9145b0b91ffc6 + .quad 0xbc9dd6792e582523 + .quad 0x3ff93737b0cdc5e5 + .quad 0xbc675fc781b57ebb + .quad 0x3ff95a44cbc8520f + .quad 0xbc764b7c96a5f039 + .quad 0x3ff97d829fde4e50 + .quad 0xbc9d185b7c1b85d0 + .quad 0x3ff9a0f170ca07ba + .quad 0xbc9173bd91cee632 + .quad 0x3ff9c49182a3f090 + .quad 0x3c7c7c46b071f2be + .quad 0x3ff9e86319e32323 + .quad 0x3c7824ca78e64c6e + .quad 0x3ffa0c667b5de565 + .quad 0xbc9359495d1cd532 + .quad 0x3ffa309bec4a2d33 + .quad 0x3c96305c7ddc36ab + .quad 0x3ffa5503b23e255d + .quad 0xbc9d2f6edb8d41e1 + .quad 0x3ffa799e1330b358 + .quad 0x3c9bcb7ecac563c6 + .quad 0x3ffa9e6b5579fdbf + .quad 0x3c90fac90ef7fd31 + .quad 0x3ffac36bbfd3f37a + .quad 0xbc8f9234cae76cd0 + .quad 0x3ffae89f995ad3ad + .quad 0x3c97a1cd345dcc81 + .quad 0x3ffb0e07298db666 + .quad 0xbc9bdef54c80e424 + .quad 0x3ffb33a2b84f15fb + .quad 0xbc62805e3084d707 + .quad 0x3ffb59728de5593a + .quad 0xbc9c71dfbbba6de3 + .quad 0x3ffb7f76f2fb5e47 + .quad 0xbc75584f7e54ac3a + .quad 0x3ffba5b030a1064a + .quad 0xbc9efcd30e54292e + .quad 0x3ffbcc1e904bc1d2 + .quad 0x3c823dd07a2d9e84 + .quad 0x3ffbf2c25bd71e09 + .quad 0xbc9efdca3f6b9c72 + .quad 0x3ffc199bdd85529c + .quad 0x3c811065895048dd + .quad 0x3ffc40ab5fffd07a + .quad 0x3c9b4537e083c60a + .quad 0x3ffc67f12e57d14b + .quad 0x3c92884dff483cac + .quad 0x3ffc8f6d9406e7b5 + .quad 0x3c71acbc48805c44 + .quad 0x3ffcb720dcef9069 + .quad 0x3c7503cbd1e949db + .quad 0x3ffcdf0b555dc3fa + .quad 0xbc8dd83b53829d72 + .quad 0x3ffd072d4a07897c + .quad 0xbc9cbc3743797a9c + .quad 0x3ffd2f87080d89f2 + .quad 0xbc9d487b719d8577 + .quad 0x3ffd5818dcfba487 + .quad 0x3c82ed02d75b3706 + .quad 0x3ffd80e316c98398 + .quad 0xbc911ec18beddfe8 + .quad 0x3ffda9e603db3285 + .quad 0x3c9c2300696db532 + .quad 0x3ffdd321f301b460 + .quad 0x3c92da5778f018c2 + .quad 0x3ffdfc97337b9b5f + .quad 0xbc91a5cd4f184b5b + .quad 0x3ffe264614f5a129 + .quad 0xbc97b627817a1496 + .quad 0x3ffe502ee78b3ff6 + .quad 0x3c839e8980a9cc8f + .quad 0x3ffe7a51fbc74c83 + .quad 0x3c92d522ca0c8de1 + .quad 0x3ffea4afa2a490da + .quad 0xbc9e9c23179c2893 + .quad 0x3ffecf482d8e67f1 + .quad 0xbc9c93f3b411ad8c + .quad 0x3ffefa1bee615a27 + .quad 0x3c9dc7f486a4b6b0 + .quad 0x3fff252b376bba97 + .quad 0x3c93a1a5bf0d8e43 + .quad 0x3fff50765b6e4540 + .quad 0x3c99d3e12dd8a18a + .quad 0x3fff7bfdad9cbe14 + .quad 0xbc9dbb12d0063509 + .quad 0x3fffa7c1819e90d8 + .quad 0x3c874853f3a5931e + .quad 0x3fffd3c22b8f71f1 + .quad 0x3c62eb74966579e7 + +/* log2 polynomial coefficients: + * clv7 */ +double_vector _clv_1 0x3f903950cf599c56 + +/* clv6 */ +double_vector _clv_2 0xbf9b4ea0e9419f52 + +/* clv5 */ +double_vector _clv_3 0x3fa7a334ddfc9f86 + +/* clv4 */ +double_vector _clv_4 0xbfb550472a8bb463 + +/* clv3 */ +double_vector _clv_5 0x3fc47fd462b3b816 + +/* clv2 */ +double_vector _clv_6 0xbfd62e4346694107 + +/* clv1 */ +double_vector _clv_7 0x3e79c3a6966457ee + +/* exponential polynomial coefficients: + * cev5 */ +double_vector _cev_1 0x3f55d87fe78a6731 + +/* cev4 */ +double_vector _cev_2 0x3f83b2ab6fba4e77 + +/* cev3 */ +double_vector _cev_3 0x3fac6b08d704a0bf + +/* cev2 */ +double_vector _cev_4 0x3fcebfbdff82c58e + +/* cev1 */ 
+double_vector _cev_5 0x3fe62e42fefa39ef + +/* General purpose constants: + * iMantissaMask */ +double_vector _iMantissaMask 0x000fffffffffffff + +/* i3fe7fe0000000000 */ +double_vector _i3fe7fe0000000000 0x3fe7fe0000000000 + +/* dbOne */ +double_vector _dbOne 0x3ff0000000000000 + +/* iffffffff00000000 */ +double_vector _iffffffff00000000 0xffffffff00000000 + +/* db2p20_2p19 = 2^20+2^19 */ +double_vector _db2p20_2p19 0x4138000000000000 + +/* iHighMask */ +double_vector _iHighMask 0xfffffffff8000000 + +/* LHN = -log2(e) truncated to 22 bits */ +double_vector _LHN 0xbff7154740000000 + +/* ifff0000000000000 */ +double_vector _ifff0000000000000 0xfff0000000000000 + +/* db2p45_2p44 */ +double_vector _db2p45_2p44 0x42c8000000000000 + +/* NEG_INF */ +double_vector _NEG_INF 0xfff0000000000000 + +/* NEG_ZERO */ +double_vector _NEG_ZERO 0x8000000000000000 + +/* 2pow52 */ +double_vector _d2pow52 0x4330000000000000 + +/* 1div2pow111 */ +double_vector _d1div2pow111 0x3900000000000000 + +/* HIDELTA */ +float_vector _HIDELTA 0x00100000 + +/* LORANGE */ +float_vector _LORANGE 0x00200000 + +/* ABSMASK */ +float_vector _ABSMASK 0x7fffffff + +/* INF */ +float_vector _INF 0x7f800000 + +/* DOMAINRANGE */ +float_vector _DOMAINRANGE 0x408f3fff + +/* iIndexMask */ +float_vector _iIndexMask 0x000ffe00 + +/* iIndexAdd */ +float_vector _iIndexAdd 0x00000200 + +/* i3fe7fe00 */ +float_vector _i3fe7fe00 0x3fe7fe00 + +/* i2p20_2p19 */ +float_vector _i2p20_2p19 0x41380000 + +/* iOne */ +float_vector _iOne 0x3ff00000 + +/* jIndexMask */ +float_vector _jIndexMask 0x0000007f + .type __svml_dpow_data,@object + .size __svml_dpow_data,.-__svml_dpow_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.h new file mode 100644 index 0000000000..ce90d8546b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.h @@ -0,0 +1,104 @@ +/* Offsets for data table for function pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#ifndef D_POW_DATA_H +#define D_POW_DATA_H + +#define _hsw_log2_table 0 +#define _hsw_dTe 8256 +#define _hsw_dMantMask 10304 +#define _hsw_dOne 10368 +#define _hsw_dCvtMask 10432 +#define _hsw_dMinNorm 10496 +#define _hsw_dMaxNorm 10560 +#define _hsw_lRndBit 10624 +#define _hsw_lRndMask 10688 +#define _hsw_dc6 10752 +#define _hsw_dc5 10816 +#define _hsw_dc4 10880 +#define _hsw_dc3 10944 +#define _hsw_dc1 11008 +#define _hsw_dc1h 11072 +#define _hsw_dc2 11136 +#define _hsw_dAbsMask 11200 +#define _hsw_dDomainRange 11264 +#define _hsw_dShifter 11328 +#define _hsw_dIndexMask 11392 +#define _hsw_dce4 11456 +#define _hsw_dce3 11520 +#define _hsw_dce2 11584 +#define _hsw_dce1 11648 +#define _rcp_t1 11712 +#define _log2_t1 19968 +#define _exp2_tbl 36416 +#define _clv_1 38464 +#define _clv_2 38528 +#define _clv_3 38592 +#define _clv_4 38656 +#define _clv_5 38720 +#define _clv_6 38784 +#define _clv_7 38848 +#define _cev_1 38912 +#define _cev_2 38976 +#define _cev_3 39040 +#define _cev_4 39104 +#define _cev_5 39168 +#define _iMantissaMask 39232 +#define _i3fe7fe0000000000 39296 +#define _dbOne 39360 +#define _iffffffff00000000 39424 +#define _db2p20_2p19 39488 +#define _iHighMask 39552 +#define _LHN 39616 +#define _ifff0000000000000 39680 +#define _db2p45_2p44 39744 +#define _NEG_INF 39808 +#define _NEG_ZERO 39872 +#define _d2pow52 39936 +#define _d1div2pow111 40000 +#define _HIDELTA 40064 +#define _LORANGE 40128 +#define _ABSMASK 40192 +#define _INF 40256 +#define _DOMAINRANGE 40320 +#define _iIndexMask 40384 +#define _iIndexAdd 40448 +#define _i3fe7fe00 40512 +#define _i2p20_2p19 40576 +#define _iOne 40640 +#define _jIndexMask 40704 + +.macro double_vector offset value +.if .-__svml_dpow_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +.macro float_vector offset value +.if .-__svml_dpow_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin2_core.S new file mode 100644 index 0000000000..85990833be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin2_core.S @@ -0,0 +1,29 @@ +/* Function sin vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_sin) +WRAPPER_IMPL_SSE2 sin +END (_ZGVbN2v_sin) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_sin) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core.S new file mode 100644 index 0000000000..7b9211d8c7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core.S @@ -0,0 +1,29 @@ +/* Function sin vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_sin) +WRAPPER_IMPL_AVX _ZGVbN2v_sin +END (_ZGVdN4v_sin) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_sin) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S new file mode 100644 index 0000000000..3edf88a047 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S @@ -0,0 +1,25 @@ +/* Function sin vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_sin) +WRAPPER_IMPL_AVX _ZGVbN2v_sin +END (_ZGVcN4v_sin) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin8_core.S new file mode 100644 index 0000000000..8e67f3cbbe --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin8_core.S @@ -0,0 +1,25 @@ +/* Function sin vectorized with AVX-512, wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_sin) +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +END (_ZGVeN8v_sin) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos2_core.S new file mode 100644 index 0000000000..e8023e8e8e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos2_core.S @@ -0,0 +1,110 @@ +/* Function sincos vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2vl8l8_sincos) +WRAPPER_IMPL_SSE2_fFF sincos +END (_ZGVbN2vl8l8_sincos) +libmvec_hidden_def (_ZGVbN2vl8l8_sincos) + +/* SSE2 ISA version as wrapper to scalar (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_SSE2_fFF_vvv callee +#ifndef __ILP32__ + subq $88, %rsp + cfi_adjust_cfa_offset(88) + movaps %xmm0, 64(%rsp) + lea (%rsp), %rdi + movdqa %xmm1, 32(%rdi) + lea 16(%rsp), %rsi + movdqa %xmm2, 32(%rsi) + call JUMPTARGET(\callee) + movsd 72(%rsp), %xmm0 + lea 8(%rsp), %rdi + lea 24(%rsp), %rsi + call JUMPTARGET(\callee) + movq 32(%rsp), %rdx + movq 48(%rsp), %rsi + movq 40(%rsp), %r8 + movq 56(%rsp), %r10 + movq (%rsp), %rax + movq 16(%rsp), %rcx + movq 8(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq %r9, (%r10) + addq $88, %rsp + cfi_adjust_cfa_offset(-88) + ret +#else + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + pushq %rbx + .cfi_def_cfa_offset 24 + .cfi_offset 3, -24 + subl $88, %esp + .cfi_def_cfa_offset 112 + leal 64(%rsp), %esi + movaps %xmm1, 32(%esp) + leal 48(%rsp), %edi + movaps %xmm2, 16(%esp) + movq %rsi, %rbp + movq %rdi, %rbx + movaps %xmm0, (%esp) + call JUMPTARGET(\callee) + movupd 8(%esp), %xmm0 + leal 8(%rbp), %esi + leal 8(%rbx), %edi + call JUMPTARGET(\callee) + movdqa 32(%esp), %xmm1 + movsd 48(%esp), %xmm0 + movq %xmm1, %rax + movdqa 16(%esp), %xmm2 + movsd %xmm0, (%eax) + movsd 56(%esp), %xmm0 + pextrd $1, %xmm1, %eax + movsd %xmm0, (%eax) + movsd 64(%esp), %xmm0 + movq %xmm2, %rax + movsd %xmm0, (%eax) + movsd 72(%esp), %xmm0 + pextrd $1, %xmm2, %eax + movsd %xmm0, (%eax) + addl $88, %esp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + ret +#endif +.endm + +ENTRY (_ZGVbN2vvv_sincos) +WRAPPER_IMPL_SSE2_fFF_vvv sincos +END (_ZGVbN2vvv_sincos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2vvv_sincos) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core.S new file mode 100644 index 0000000000..3bcd09b62d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core.S @@ -0,0 +1,152 @@ +/* Function sincos vectorized with AVX2, wrapper 
version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4vl8l8_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos +END (_ZGVdN4vl8l8_sincos) +libmvec_hidden_def (_ZGVdN4vl8l8_sincos) + +/* AVX2 ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX2_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $160, %rsp + vmovupd %ymm0, 128(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovupd 144(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 64(%rsp), %rdx + movq 96(%rsp), %rsi + movq 72(%rsp), %r8 + movq 104(%rsp), %r10 + movq (%rsp), %rax + movq 32(%rsp), %rcx + movq 8(%rsp), %rdi + movq 40(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 80(%rsp), %rax + movq 112(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 88(%rsp), %rdi + movq 120(%rsp), %r9 + movq 16(%rsp), %r11 + movq 48(%rsp), %rdx + movq 24(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $152, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovapd %ymm0, -176(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovapd -160(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovsd -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm5 + vmovdqa -144(%ebp), %xmm1 + vmovsd %xmm0, (%eax) + vmovsd -104(%ebp), %xmm0 + vpextrd $1, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -120(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -88(%ebp), %xmm0 + vpextrd $3, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -144(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -72(%ebp), %xmm0 + vpextrd $1, %xmm1, %eax + vmovsd %xmm0, (%eax) + movq -136(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -56(%ebp), %xmm0 + vpextrd $3, %xmm1, %eax + vmovsd %xmm0, (%eax) + addl $152, %esp + popq %rbx + popq %r10 
+ .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVdN4vvv_sincos) +WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN2vl8l8_sincos +END (_ZGVdN4vvv_sincos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4vvv_sincos) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S new file mode 100644 index 0000000000..1164ae7a74 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S @@ -0,0 +1,143 @@ +/* Function sincos vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4vl8l8_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos +END (_ZGVcN4vl8l8_sincos) + +/* AVX ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + movq %rsp, %rbp + andq $-32, %rsp + subq $160, %rsp + vmovupd %ymm0, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %xmm1, 96(%rdi) + vmovdqu %xmm2, 112(%rdi) + vmovdqu %xmm3, 128(%rdi) + vmovdqu %xmm4, 144(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 80(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 96(%rsp), %rdx + movq 104(%rsp), %rsi + movq 112(%rsp), %r8 + movq 120(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 128(%rsp), %rax + movq 136(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 144(%rsp), %rdi + movq 152(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + popq %rbp + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $152, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovapd %ymm0, -176(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovupd -160(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovsd -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm5 + vmovdqa -144(%ebp), %xmm1 + vmovsd %xmm0, (%eax) + vmovsd -104(%ebp), %xmm0 + vpextrd $1, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -120(%ebp), 
%rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -88(%ebp), %xmm0 + vpextrd $3, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -144(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -72(%ebp), %xmm0 + vpextrd $1, %xmm1, %eax + vmovsd %xmm0, (%eax) + movq -136(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -56(%ebp), %xmm0 + vpextrd $3, %xmm1, %eax + vmovsd %xmm0, (%eax) + addl $152, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVcN4vvv_sincos) +WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN2vl8l8_sincos +END (_ZGVcN4vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos8_core.S new file mode 100644 index 0000000000..c104539821 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos8_core.S @@ -0,0 +1,224 @@ +/* Function sincos vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8vl8l8_sincos) +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +END (_ZGVeN8vl8l8_sincos) + +/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX512_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + /* Encoding for vmovups %zmm0, 256(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x44 + .byte 0x24 + .byte 0x04 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm1, 128(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4f + .byte 0x02 + /* Encoding for vmovups %zmm2, 192(%rdi). 
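+      These stores are emitted as raw .byte sequences, presumably so
+      that the file still assembles with binutils versions that lack
+      AVX-512 support: 0x62 is the EVEX prefix, 0x11 the vmovups
+      store opcode, and the final byte the displacement compressed
+      in units of the 64-byte vector length (0x03 * 64 = 192 here).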
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 288(%rsp), %ymm0 + lea 32(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 192(%rsp), %rsi + movq 136(%rsp), %r8 + movq 200(%rsp), %r10 + movq (%rsp), %rax + movq 64(%rsp), %rcx + movq 8(%rsp), %rdi + movq 72(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 144(%rsp), %rax + movq 208(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 152(%rsp), %rdi + movq 216(%rsp), %r9 + movq 16(%rsp), %r11 + movq 80(%rsp), %rdx + movq 24(%rsp), %rsi + movq 88(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 160(%rsp), %r11 + movq 224(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 168(%rsp), %rsi + movq 232(%rsp), %r8 + movq 32(%rsp), %r10 + movq 96(%rsp), %rax + movq 40(%rsp), %rcx + movq 104(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 176(%rsp), %r10 + movq 240(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 184(%rsp), %rcx + movq 248(%rsp), %rdi + movq 48(%rsp), %r9 + movq 112(%rsp), %r11 + movq 56(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -112(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -176(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $280, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + /* Encoding for vmovapd %zmm0, -304(%ebp). 
*/ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x29 + .byte 0x85 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + leal 32(%r12), %esi + vmovupd -272(%ebp), %ymm0 + leal 32(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -208(%ebp), %eax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -204(%ebp), %eax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -200(%ebp), %eax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -196(%ebp), %eax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -192(%ebp), %eax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -188(%ebp), %eax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -184(%ebp), %eax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -180(%ebp), %eax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -240(%ebp), %eax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -236(%ebp), %eax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -232(%ebp), %eax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -228(%ebp), %eax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -224(%ebp), %eax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -220(%ebp), %eax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -216(%ebp), %eax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -212(%ebp), %eax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $280, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN8vvv_sincos) +WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN4vl8l8_sincos +END (_ZGVeN8vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.S new file mode 100644 index 0000000000..f7cf6c0a08 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.S @@ -0,0 +1,130 @@ +/* Data for vectorized sin, cos, sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_trig_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations. + The table may contain polynomial, reduction, lookup + coefficients and other constants obtained through different + methods of research and experimental work. 
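+
+   Every entry below is one scalar constant replicated eight times by
+   the double_vector macro, so each field is a 64-byte block that a
+   kernel can load whole, at any vector width, from its fixed offset.
+   The values are raw IEEE-754 bit patterns; a small, self-contained
+   C program (illustrative only, not part of the table) shows how to
+   read one back as a double:
+
+     #include <stdio.h>
+     #include <string.h>
+
+     int main (void)
+     {
+       unsigned long long bits = 0x3ff921fb54442d18ULL; /* __dHalfPI */
+       double d;
+       memcpy (&d, &bits, sizeof d);      /* bit pattern -> double */
+       printf ("%.17g\n", d);             /* 1.5707963267948966, pi/2 */
+       return 0;
+     }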
+ */ + .globl __svml_d_trig_data +__svml_d_trig_data: + +/* General purpose constants: + absolute value mask + */ +double_vector __dAbsMask 0x7fffffffffffffff + +/* working range threshold */ +double_vector __dRangeVal 0x4160000000000000 + +/* working range threshold */ +double_vector __dRangeVal_sin 0x4170000000000000 + +/* PI/2 */ +double_vector __dHalfPI 0x3ff921fb54442d18 + +/* 1/PI */ +double_vector __dInvPI 0x3fd45f306dc9c883 + +/* right-shifter constant */ +double_vector __dRShifter 0x4338000000000000 + +/* 0.0 */ +double_vector __dZero 0x0000000000000000 + +/* -0.0 */ +double_vector __lNZero 0x8000000000000000 + +/* 0.5 */ +double_vector __dOneHalf 0x3fe0000000000000 + +/* Range reduction PI-based constants: + PI high part + */ +double_vector __dPI1 0x400921fb40000000 + +/* PI mid part 1 */ +double_vector __dPI2 0x3e84442d00000000 + +/* PI mid part 2 */ +double_vector __dPI3 0x3d08469880000000 + +/* PI low part */ +double_vector __dPI4 0x3b88cc51701b839a + +/* Range reduction PI-based constants if FMA available: + PI high part (FMA available) + */ +double_vector __dPI1_FMA 0x400921fb54442d18 + +/* PI mid part (FMA available) */ +double_vector __dPI2_FMA 0x3ca1a62633145c06 + +/* PI low part (FMA available) */ +double_vector __dPI3_FMA 0x395c1cd129024e09 + +/* HalfPI1 */ +double_vector __dHalfPI1 0x3ff921fc00000000 + +/* HalfPI2 */ +double_vector __dHalfPI2 0xbea5777a00000000 + +/* HalfPI3 */ +double_vector __dHalfPI3 0xbd473dcc00000000 + +/* HalfPI4 */ +double_vector __dHalfPI4 0x3bf898cc51701b84 + +/* Polynomial coefficients (relative error 2^(-52.115)): */ +double_vector __dC1 0xbfc55555555554a7 +double_vector __dC2 0x3f8111111110a4a8 +double_vector __dC3 0xbf2a01a019a5b86d +double_vector __dC4 0x3ec71de38030fea0 +double_vector __dC5 0xbe5ae63546002231 +double_vector __dC6 0x3de60e6857a2f220 +double_vector __dC7 0xbd69f0d60811aac8 + +/* Polynomial coefficients (relative error 2^(-52.115)): */ +double_vector __dC1_sin 0xbfc55555555554a8 +double_vector __dC2_sin 0x3f8111111110a573 +double_vector __dC3_sin 0xbf2a01a019a659dd +double_vector __dC4_sin 0x3ec71de3806add1a +double_vector __dC5_sin 0xbe5ae6355aaa4a53 +double_vector __dC6_sin 0x3de60e6bee01d83e +double_vector __dC7_sin 0xbd69f1517e9f65f0 + +/* + Additional constants for the low accuracy version: + */ +/* right-shifter for low accuracy version */ +double_vector __dRShifter_la 0x4330000000000000 + +/* right-shifter-1.0 for low accuracy version */ +double_vector __dRShifterm5_la 0x432fffffffffffff + +/* right-shifter with low mask for low accuracy version */ +double_vector __dRXmax_la 0x43300000007ffffe + + .type __svml_d_trig_data,@object + .size __svml_d_trig_data,.-__svml_d_trig_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.h new file mode 100644 index 0000000000..ccdff7edb8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.h @@ -0,0 +1,72 @@ +/* Offsets for data table for vectorized sin, cos, sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef D_TRIG_DATA_H +#define D_TRIG_DATA_H + +#define __dAbsMask 0 +#define __dRangeVal 64 +#define __dRangeVal_sin 64*2 +#define __dHalfPI 64*3 +#define __dInvPI 64*4 +#define __dRShifter 64*5 +#define __dZero 64*6 +#define __lNZero 64*7 +#define __dOneHalf 64*8 +#define __dPI1 64*9 +#define __dPI2 64*10 +#define __dPI3 64*11 +#define __dPI4 64*12 +#define __dPI1_FMA 64*13 +#define __dPI2_FMA 64*14 +#define __dPI3_FMA 64*15 +#define __dHalfPI1 64*16 +#define __dHalfPI2 64*17 +#define __dHalfPI3 64*18 +#define __dHalfPI4 64*19 +#define __dC1 64*20 +#define __dC2 64*21 +#define __dC3 64*22 +#define __dC4 64*23 +#define __dC5 64*24 +#define __dC6 64*25 +#define __dC7 64*26 +#define __dC1_sin 64*27 +#define __dC2_sin 64*28 +#define __dC3_sin 64*29 +#define __dC4_sin 64*30 +#define __dC5_sin 64*31 +#define __dC6_sin 64*32 +#define __dC7_sin 64*33 +#define __dRShifter_la 64*34 +#define __dRShifterm5_la 64*35 +#define __dRXmax_la 64*36 +#define __dAbsMask_la __dAbsMask +#define __dInvPI_la __dInvPI +#define __dSignMask __lNZero + +.macro double_vector offset value +.if .-__svml_d_trig_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h new file mode 100644 index 0000000000..625eb6642b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h @@ -0,0 +1,335 @@ +/* Wrapper implementations of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2 callee + subq $40, %rsp + cfi_adjust_cfa_offset(40) + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + movsd %xmm0, 16(%rsp) + movsd 8(%rsp), %xmm0 + call JUMPTARGET(\callee) + movsd 16(%rsp), %xmm1 + movsd %xmm0, 24(%rsp) + unpcklpd %xmm0, %xmm1 + movaps %xmm1, %xmm0 + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + ret +.endm + +/* 2 argument SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2_ff callee + subq $56, %rsp + cfi_adjust_cfa_offset(56) + movaps %xmm0, (%rsp) + movaps %xmm1, 16(%rsp) + call JUMPTARGET(\callee) + movsd %xmm0, 32(%rsp) + movsd 8(%rsp), %xmm0 + movsd 24(%rsp), %xmm1 + call JUMPTARGET(\callee) + movsd 32(%rsp), %xmm1 + movsd %xmm0, 40(%rsp) + unpcklpd %xmm0, %xmm1 + movaps %xmm1, %xmm0 + addq $56, %rsp + cfi_adjust_cfa_offset(-56) + ret +.endm + +/* 3 argument SSE2 ISA version as wrapper to scalar. 
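+   The "fFF" wrappers implement the sincos-style contract: one input
+   vector plus two pointers through which the sine and cosine result
+   vectors are stored.  At the C level the SSE2 variant corresponds
+   roughly to the prototype below (a sketch only; the real binding
+   comes from the SIMD declarations in math.h, and the "l8l8" part of
+   the mangled name encodes the two linear, stride-8 pointer
+   parameters per the x86 vector ABI):
+
+     #include <emmintrin.h>
+     void _ZGVbN2vl8l8_sincos (__m128d x, double *sinp, double *cosp);
+
+   The macro saves the two pointer arguments, calls the scalar callee
+   once per lane with stack temporaries, then copies each lane's pair
+   of results out through the saved pointers.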
*/ +.macro WRAPPER_IMPL_SSE2_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + pushq %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + movq %rdi, %rbp + movq %rsi, %rbx + subq $40, %rsp + cfi_adjust_cfa_offset(40) + leaq 16(%rsp), %rsi + leaq 24(%rsp), %rdi + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + leaq 16(%rsp), %rsi + leaq 24(%rsp), %rdi + movsd 24(%rsp), %xmm0 + movapd (%rsp), %xmm1 + movsd %xmm0, 0(%rbp) + unpckhpd %xmm1, %xmm1 + movsd 16(%rsp), %xmm0 + movsd %xmm0, (%rbx) + movapd %xmm1, %xmm0 + call JUMPTARGET(\callee) + movsd 24(%rsp), %xmm0 + movsd %xmm0, 8(%rbp) + movsd 16(%rsp), %xmm0 + movsd %xmm0, 8(%rbx) + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + popq %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $32, %rsp + vextractf128 $1, %ymm0, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovapd %xmm0, 16(%rsp) + vmovaps (%rsp), %xmm0 + call HIDDEN_JUMPTARGET(\callee) + vmovapd %xmm0, %xmm1 + vmovapd 16(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $64, %rsp + vextractf128 $1, %ymm0, 16(%rsp) + vextractf128 $1, %ymm1, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, 32(%rsp) + vmovaps 16(%rsp), %xmm0 + vmovaps (%rsp), %xmm1 + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, %xmm1 + vmovaps 32(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) + pushq %r14 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r14, 0) + subq $48, %rsp + movq %rsi, %r14 + movq %rdi, %r13 + vextractf128 $1, %ymm0, 32(%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps 32(%rsp), %xmm0 + lea (%rsp), %rdi + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovapd (%rsp), %xmm0 + vmovapd 16(%rsp), %xmm1 + vmovapd %xmm0, 16(%r13) + vmovapd %xmm1, 16(%r14) + addq $48, %rsp + popq %r14 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r14) + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512 callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $128, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). 
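+   The 512-bit argument is spilled to the aligned stack slot so that
+   the AVX2 callee can be run once on each 256-bit half: the halves
+   are reloaded with vmovupd from (%rsp) and 32(%rsp), the two ymm
+   results are parked at 64(%rsp) and 96(%rsp), and the byte-encoded
+   vmovups at the end reassembles them into zmm0.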
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 64(%rsp) + vmovupd 32(%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 96(%rsp) +/* Below is encoding for vmovups 64(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x01 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $192, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovups %zmm1, 64(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x01 + vmovupd (%rsp), %ymm0 + vmovupd 64(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 128(%rsp) + vmovupd 32(%rsp), %ymm0 + vmovupd 96(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 160(%rsp) +/* Below is encoding for vmovups 128(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x02 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + pushq %r12 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r12, 0) + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) + subq $176, %rsp + movq %rsi, %r13 +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + movq %rdi, %r12 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd 32(%rsp), %ymm0 + lea 64(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovupd 64(%rsp), %ymm0 + vmovupd 96(%rsp), %ymm1 + vmovupd %ymm0, 32(%r12) + vmovupd %ymm1, 32(%r13) + vzeroupper + addq $176, %rsp + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + popq %r12 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r12) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_finite_alias.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_finite_alias.S new file mode 100644 index 0000000000..7e39e7801d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_finite_alias.S @@ -0,0 +1,58 @@ +/* These aliases added as workaround to exclude unnecessary symbol + aliases in libmvec.so while compiler creates the vector names + based on scalar asm name. Corresponding discussion is at + <https://gcc.gnu.org/ml/gcc/2015-06/msg00173.html>. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define ALIAS_IMPL(alias, target) \ +ENTRY (alias); \ + jmp *target@GOTPCREL(%rip); \ +END (alias) + + .text +ALIAS_IMPL (_ZGVbN2v___log_finite, _ZGVbN2v_log) +ALIAS_IMPL (_ZGVcN4v___log_finite, _ZGVcN4v_log) +ALIAS_IMPL (_ZGVdN4v___log_finite, _ZGVdN4v_log) +ALIAS_IMPL (_ZGVeN8v___log_finite, _ZGVeN8v_log) + +ALIAS_IMPL (_ZGVbN4v___logf_finite, _ZGVbN4v_logf) +ALIAS_IMPL (_ZGVcN8v___logf_finite, _ZGVcN8v_logf) +ALIAS_IMPL (_ZGVdN8v___logf_finite, _ZGVdN8v_logf) +ALIAS_IMPL (_ZGVeN16v___logf_finite, _ZGVeN16v_logf) + +ALIAS_IMPL (_ZGVbN2v___exp_finite, _ZGVbN2v_exp) +ALIAS_IMPL (_ZGVcN4v___exp_finite, _ZGVcN4v_exp) +ALIAS_IMPL (_ZGVdN4v___exp_finite, _ZGVdN4v_exp) +ALIAS_IMPL (_ZGVeN8v___exp_finite, _ZGVeN8v_exp) + +ALIAS_IMPL (_ZGVbN4v___expf_finite, _ZGVbN4v_expf) +ALIAS_IMPL (_ZGVcN8v___expf_finite, _ZGVcN8v_expf) +ALIAS_IMPL (_ZGVdN8v___expf_finite, _ZGVdN8v_expf) +ALIAS_IMPL (_ZGVeN16v___expf_finite, _ZGVeN16v_expf) + +ALIAS_IMPL (_ZGVbN2vv___pow_finite, _ZGVbN2vv_pow) +ALIAS_IMPL (_ZGVcN4vv___pow_finite, _ZGVcN4vv_pow) +ALIAS_IMPL (_ZGVdN4vv___pow_finite, _ZGVdN4vv_pow) +ALIAS_IMPL (_ZGVeN8vv___pow_finite, _ZGVeN8vv_pow) + +ALIAS_IMPL (_ZGVbN4vv___powf_finite, _ZGVbN4vv_powf) +ALIAS_IMPL (_ZGVcN8vv___powf_finite, _ZGVcN8vv_powf) +ALIAS_IMPL (_ZGVdN8vv___powf_finite, _ZGVdN8vv_powf) +ALIAS_IMPL (_ZGVeN16vv___powf_finite, _ZGVeN16vv_powf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf16_core.S new file mode 100644 index 0000000000..127eb82ae0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf16_core.S @@ -0,0 +1,25 @@ +/* Function cosf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_cosf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +END (_ZGVeN16v_cosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf4_core.S new file mode 100644 index 0000000000..800766cc4e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf4_core.S @@ -0,0 +1,29 @@ +/* Function cosf vectorized with SSE2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_cosf) +WRAPPER_IMPL_SSE2 cosf +END (_ZGVbN4v_cosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_cosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core.S new file mode 100644 index 0000000000..46c588074c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core.S @@ -0,0 +1,29 @@ +/* Function cosf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_cosf) +WRAPPER_IMPL_AVX _ZGVbN4v_cosf +END (_ZGVdN8v_cosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_cosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S new file mode 100644 index 0000000000..459685ee6a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function cosf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVcN8v_cosf) +WRAPPER_IMPL_AVX _ZGVbN4v_cosf +END (_ZGVcN8v_cosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf16_core.S new file mode 100644 index 0000000000..a32f03e1a7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf16_core.S @@ -0,0 +1,25 @@ +/* Function expf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_expf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +END (_ZGVeN16v_expf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf4_core.S new file mode 100644 index 0000000000..c8ec8f97b7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf4_core.S @@ -0,0 +1,30 @@ +/* Function expf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_expf) +WRAPPER_IMPL_SSE2 __expf_finite +END (_ZGVbN4v_expf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_expf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core.S new file mode 100644 index 0000000000..f5e1be62eb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core.S @@ -0,0 +1,29 @@ +/* Function expf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_expf) +WRAPPER_IMPL_AVX _ZGVbN4v_expf +END (_ZGVdN8v_expf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_expf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S new file mode 100644 index 0000000000..f3557f8c19 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function expf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8v_expf) +WRAPPER_IMPL_AVX _ZGVbN4v_expf +END(_ZGVcN8v_expf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.S new file mode 100644 index 0000000000..226104f5f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.S @@ -0,0 +1,63 @@ +/* Data for function expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_expf_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function expf. + The table may contain polynomial, reduction, lookup coefficients and + other coefficients obtained through different methods of research and + experimental work. 
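+
+   The constants that follow implement the usual reduction
+   x = n * ln(2) + r: n is obtained with the right-shifter trick
+   (adding __sShifter, 2^23 + 2^22, forces the rounded quotient into
+   the low mantissa bits), r is computed with a two-part ln(2) to
+   limit rounding error, the polynomial __sPC0..__sPC5 approximates
+   e^r on roughly [-ln(2)/2, ln(2)/2], and the result is rescaled by
+   2^n through the exponent field (__iBias).  A minimal scalar sketch
+   of the same scheme in C, using the bit patterns below as hex-float
+   literals (an illustration, not the vector code, which also handles
+   the __iDomainRange special cases):
+
+     #include <math.h>
+
+     static float
+     expf_sketch (float x)
+     {
+       float n = nearbyintf (x * 0x1.715476p+0f);   /* __sInvLn2 */
+       float r = (x - n * 0x1.62e4p-1f)             /* __sLn2hi  */
+                 - n * 0x1.7f7d1cp-20f;             /* __sLn2lo  */
+       float p = 1.0f + r * (0x1.fffffcp-1f         /* __sPC0, 1 */
+                 + r * (0x1.fffe68p-2f              /* __sPC2    */
+                 + r * (0x1.555958p-3f              /* __sPC3    */
+                 + r * (0x1.570724p-5f              /* __sPC4    */
+                 + r * 0x1.0fb3fcp-7f))));          /* __sPC5    */
+       return ldexpf (p, (int) n);                  /* scale 2^n */
+     }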
*/ + + .globl __svml_sexp_data +__svml_sexp_data: + +/* Range reduction coefficients: + * log(2) inverted */ +float_vector __sInvLn2 0x3fb8aa3b + +/* right shifter constant */ +float_vector __sShifter 0x4b400000 + +/* log(2) high part */ +float_vector __sLn2hi 0x3f317200 + +/* log(2) low part */ +float_vector __sLn2lo 0x35bfbe8e + +/* bias */ +float_vector __iBias 0x0000007f + +/* Polynomial coefficients: + * Here we approximate 2^x on [-0.5, 0.5] */ +float_vector __sPC0 0x3f800000 +float_vector __sPC1 0x3f7ffffe +float_vector __sPC2 0x3effff34 +float_vector __sPC3 0x3e2aacac +float_vector __sPC4 0x3d2b8392 +float_vector __sPC5 0x3c07d9fe + +/* absolute value mask */ +float_vector __iAbsMask 0x7fffffff + +/* working domain range */ +float_vector __iDomainRange 0x42aeac4f + .type __svml_sexp_data,@object + .size __svml_sexp_data,.-__svml_sexp_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.h new file mode 100644 index 0000000000..5badb84b14 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.h @@ -0,0 +1,45 @@ +/* Offsets for data table for vector function expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef S_EXPF_DATA_H +#define S_EXPF_DATA_H + +#define __sInvLn2 0 +#define __sShifter 64 +#define __sLn2hi 128 +#define __sLn2lo 192 +#define __iBias 256 +#define __sPC0 320 +#define __sPC1 384 +#define __sPC2 448 +#define __sPC3 512 +#define __sPC4 576 +#define __sPC5 640 +#define __iAbsMask 704 +#define __iDomainRange 768 + +.macro float_vector offset value +.if .-__svml_sexp_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf16_core.S new file mode 100644 index 0000000000..081c449f42 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf16_core.S @@ -0,0 +1,25 @@ +/* Function logf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_logf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +END (_ZGVeN16v_logf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf4_core.S new file mode 100644 index 0000000000..fab301db1e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf4_core.S @@ -0,0 +1,30 @@ +/* Function logf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_logf) +WRAPPER_IMPL_SSE2 __logf_finite +END (_ZGVbN4v_logf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_logf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core.S new file mode 100644 index 0000000000..e1aa2f363c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core.S @@ -0,0 +1,29 @@ +/* Function logf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_logf) +WRAPPER_IMPL_AVX _ZGVbN4v_logf +END (_ZGVdN8v_logf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_logf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S new file mode 100644 index 0000000000..e74e47c152 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function logf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
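These wrapper stanzas all follow one pattern, and the symbol names encode it per the x86_64 vector-function ABI: _ZGV, an ISA letter (b = SSE, c = AVX, d = AVX2, e = AVX-512), N for the unmasked variant, the lane count, then one v per vector argument. User code never spells these names out; the compiler emits them when it vectorizes a loop over the scalar function. A minimal usage sketch, with the flags and the exact symbol chosen being illustrative rather than guaranteed:

#include <math.h>

/* Built with something like
     gcc -O2 -ffast-math -fopenmp-simd -mavx2 -lmvec
   this loop may be compiled into calls to _ZGVdN8v_logf; with
   -mavx512f the 16-lane _ZGVeN16v_logf wrapper above can be
   selected instead.  */
void
vlogf (float *restrict out, const float *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = logf (in[i]);
}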
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8v_logf) +WRAPPER_IMPL_AVX _ZGVbN4v_logf +END(_ZGVcN8v_logf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.S new file mode 100644 index 0000000000..487c439120 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.S @@ -0,0 +1,102 @@ +/* Data for vector function logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_logf_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function logf. + The table may contain polynomial, reduction, lookup coefficients and + other coefficients obtained through different methods of research and + experimental work. 
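The fields that follow (_iBrkValue, _iOffExpoMask, _sOne, _sLn2, _sPoly_1 .. _sPoly_7) support the standard logf reduction: renormalize x = 2^k * m with the mantissa break point at 2/3 so that m lands in [2/3, 4/3) and r = m - 1 stays small, then log(x) = k*ln2 + r + r^2*P(r), where P's leading coefficients (-1/2, ~1/3, ~-1/4, ...) are visibly the log(1+r) series. A hedged scalar reconstruction from those named fields, assuming an arithmetic right shift and ignoring the special cases the _iHiDelta/_iLoRange check filters out:

#include <stdint.h>
#include <string.h>

static float
logf_sketch (float x)
{
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint32_t i = ix - 0x3f2aaaabu;          /* _iBrkValue: ~2/3 */
  uint32_t iexp = i & ~0x007fffffu;       /* complement of _iOffExpoMask */
  int k = (int32_t) iexp >> 23;           /* unbiased exponent */
  uint32_t im = ix - iexp;                /* mantissa rescaled into [2/3, 4/3) */
  float m;
  memcpy (&m, &im, sizeof m);
  float r = m - 1.0f;                     /* _sOne */
  float p = -0.15177205f;                 /* _sPoly_7 */
  p = p * r + 0.16964881f;                /* _sPoly_6 */
  p = p * r - 0.16462457f;                /* _sPoly_5 */
  p = p * r + 0.19822504f;                /* _sPoly_4 */
  p = p * r - 0.25004664f;                /* _sPoly_3 */
  p = p * r + 0.33336565f;                /* _sPoly_2 */
  p = p * r - 0.5f;                       /* _sPoly_1 */
  return (r + (r * r) * p) + (float) k * 0.69314718f;  /* _sLn2 */
}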
*/ + + .globl __svml_slog_data +__svml_slog_data: + +/* Polynomial sPoly[] coefficients: + * -5.0000000000000000000000000e-01 */ +float_vector _sPoly_1 0xbf000000 + +/* 3.3336564898490905761718750e-01 */ +float_vector _sPoly_2 0x3eaaaee7 + +/* -2.5004664063453674316406250e-01 */ +float_vector _sPoly_3 0xbe80061d + +/* 1.9822503626346588134765625e-01 */ +float_vector _sPoly_4 0x3e4afb81 + +/* -1.6462457180023193359375000e-01 */ +float_vector _sPoly_5 0xbe289358 + +/* 1.6964881122112274169921875e-01 */ +float_vector _sPoly_6 0x3e2db86b + +/* -1.5177205204963684082031250e-01 */ +float_vector _sPoly_7 0xbe1b6a22 + +/* Constant for work range check: Delta 80000000-7f800000 */ +float_vector _iHiDelta 0x00800000 + +/* Constant for work range check: 00800000 + Delta */ +float_vector _iLoRange 0x01000000 + +/* Mantissa break point SP 2/3 */ +float_vector _iBrkValue 0x3f2aaaab + +/* SP significand mask */ +float_vector _iOffExpoMask 0x007fffff + +/* 1.0f */ +float_vector _sOne 0x3f800000 + +/* SP log(2) */ +float_vector _sLn2 0x3f317218 + +/* SP infinity, +/- */ +.if .-__svml_slog_data != _sInfs +.err +.endif + .long 0x7f800000 + .long 0xff800000 + .rept 56 + .byte 0 + .endr + +/* SP one, +/- */ +.if .-__svml_slog_data != _sOnes +.err +.endif + .long 0x3f800000 + .long 0xbf800000 + .rept 56 + .byte 0 + .endr + +/* SP zero +/- */ +.if .-__svml_slog_data != _sZeros +.err +.endif + .long 0x00000000 + .long 0x80000000 + .rept 56 + .byte 0 + .endr + .type __svml_slog_data,@object + .size __svml_slog_data,.-__svml_slog_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.h new file mode 100644 index 0000000000..52612e3ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.h @@ -0,0 +1,48 @@ +/* Offsets for data table for vectorized function logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef S_LOGF_DATA_H +#define S_LOGF_DATA_H + +#define _sPoly_1 0 +#define _sPoly_2 64 +#define _sPoly_3 128 +#define _sPoly_4 192 +#define _sPoly_5 256 +#define _sPoly_6 320 +#define _sPoly_7 384 +#define _iHiDelta 448 +#define _iLoRange 512 +#define _iBrkValue 576 +#define _iOffExpoMask 640 +#define _sOne 704 +#define _sLn2 768 +#define _sInfs 832 +#define _sOnes 896 +#define _sZeros 960 + +.macro float_vector offset value +.if .-__svml_slog_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf16_core.S new file mode 100644 index 0000000000..ac041df507 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf16_core.S @@ -0,0 +1,25 @@ +/* Function powf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16vv_powf) +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +END (_ZGVeN16vv_powf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf4_core.S new file mode 100644 index 0000000000..61d336e160 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf4_core.S @@ -0,0 +1,29 @@ +/* Function powf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4vv_powf) +WRAPPER_IMPL_SSE2_ff __powf_finite +END (_ZGVbN4vv_powf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4vv_powf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core.S new file mode 100644 index 0000000000..2ae28051c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core.S @@ -0,0 +1,29 @@ +/* Function powf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
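The doubled v in _ZGVbN4vv_powf records the two vector arguments, and the _ff flavors of the wrapper macros exist for exactly that shape. What WRAPPER_IMPL_SSE2_ff amounts to, sketched in C rather than the literal register-level macro expansion: feed each (x, y) lane pair to the scalar finite-only kernel and repack the results.

#include <xmmintrin.h>

extern float __powf_finite (float, float);  /* the scalar kernel named above */

__m128
powf4_sketch (__m128 x, __m128 y)
{
  float xs[4], ys[4], rs[4];
  _mm_storeu_ps (xs, x);
  _mm_storeu_ps (ys, y);
  for (int i = 0; i < 4; i++)               /* one scalar call per lane */
    rs[i] = __powf_finite (xs[i], ys[i]);
  return _mm_loadu_ps (rs);
}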
*/ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8vv_powf) +WRAPPER_IMPL_AVX_ff _ZGVbN4vv_powf +END (_ZGVdN8vv_powf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8vv_powf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S new file mode 100644 index 0000000000..0522865ef1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function powf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8vv_powf) +WRAPPER_IMPL_AVX_ff _ZGVbN4vv_powf +END(_ZGVcN8vv_powf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.S new file mode 100644 index 0000000000..630baa62a8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.S @@ -0,0 +1,3759 @@ +/* Data for function powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_powf_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function powf. + The table may contain polynomial, reduction, lookup coefficients and + other coefficients obtained through different methods of research + and experimental work. 
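A useful reading of the table that follows: powf goes through the logarithm, essentially 2^(y * log2(x)), and any error in the logarithm is multiplied by y before the final exponentiation, which is why the logarithmic part is tabulated as High+Low pairs (_Log_HA_table) carrying extra precision. A scalar sketch of that overall shape, with double-precision library calls standing in for the table lookups and polynomials (illustrative only, not the kernel's actual arithmetic):

#include <math.h>

static float
powf_sketch (float x, float y)
{
  /* The H+L tables deliver log2(x) to well beyond single precision;
     log2() is only a stand-in for that machinery.  */
  double l = log2 ((double) x);
  /* An absolute error e in l becomes a relative error of roughly
     ln(2) * |y| * e in the result, hence the extra table precision.  */
  return (float) exp2 ((double) y * l);
}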
*/ + + .globl __svml_spow_data +__svml_spow_data: + +/* General purpose constants for H+L multiplication: + * NMINNORM */ +float_vector _NMINNORM 0x80800000 + +/* NMAXVAL */ +float_vector _NMAXVAL 0xfeffffff + +/* INF */ +float_vector _INF 0x7f800000 + +/* ABSMASK */ +float_vector _ABSMASK 0x7fffffff + +/* DOMAINRANGE */ +float_vector _DOMAINRANGE 0x42ae9a00 + +/* Log(2) lookup High+Low table for logarithmic part */ +.if .-__svml_spow_data != _Log_HA_table +.err +.endif + .quad 0xc086232bdd7a8300 + .quad 0xbe1ce91eef3fb100 + .quad 0xc086232fdc7ad828 + .quad 0xbe1cefcffda73b6a + .quad 0xc0862333d97d2ba0 + .quad 0xbe1cef406748f1ff + .quad 0xc0862337d48378e0 + .quad 0xbe1cef2a9429925a + .quad 0xc086233bcd8fb878 + .quad 0xbe1cf138d17ebecb + .quad 0xc086233fc4a3e018 + .quad 0xbe1ceff2dbbbb29e + .quad 0xc0862343b9c1e270 + .quad 0xbe1cf1a42aae437b + .quad 0xc0862347acebaf68 + .quad 0xbe1cef3b152048af + .quad 0xc086234b9e2333f0 + .quad 0xbe1cef20e127805e + .quad 0xc086234f8d6a5a30 + .quad 0xbe1cf00ad6052cf4 + .quad 0xc08623537ac30980 + .quad 0xbe1cefc4642ee597 + .quad 0xc0862357662f2660 + .quad 0xbe1cf1f277d36e16 + .quad 0xc086235b4fb092a0 + .quad 0xbe1ceed009e8d8e6 + .quad 0xc086235f37492d28 + .quad 0xbe1cf1e4038cb362 + .quad 0xc08623631cfad250 + .quad 0xbe1cf0b0873b8557 + .quad 0xc086236700c75b98 + .quad 0xbe1cf15bb3227c0b + .quad 0xc086236ae2b09fe0 + .quad 0xbe1cf151ef8ca9ed + .quad 0xc086236ec2b87358 + .quad 0xbe1cefe1dc2cd2ed + .quad 0xc0862372a0e0a780 + .quad 0xbe1cf0d1eec5454f + .quad 0xc08623767d2b0b48 + .quad 0xbe1ceeefd570bbce + .quad 0xc086237a57996af0 + .quad 0xbe1cee99ae91b3a7 + .quad 0xc086237e302d9028 + .quad 0xbe1cf0412830fbd1 + .quad 0xc086238206e94218 + .quad 0xbe1ceee898588610 + .quad 0xc0862385dbce4548 + .quad 0xbe1cee9a1fbcaaea + .quad 0xc0862389aede5bc0 + .quad 0xbe1ceed8e7cc1ad6 + .quad 0xc086238d801b4500 + .quad 0xbe1cf10c8d059da6 + .quad 0xc08623914f86be18 + .quad 0xbe1ceee6c63a8165 + .quad 0xc08623951d228180 + .quad 0xbe1cf0c3592d2ff1 + .quad 0xc0862398e8f04758 + .quad 0xbe1cf0026cc4cb1b + .quad 0xc086239cb2f1c538 + .quad 0xbe1cf15d48d8e670 + .quad 0xc08623a07b28ae60 + .quad 0xbe1cef359363787c + .quad 0xc08623a44196b390 + .quad 0xbe1cefdf1ab2e82c + .quad 0xc08623a8063d8338 + .quad 0xbe1cefe43c02aa84 + .quad 0xc08623abc91ec960 + .quad 0xbe1cf044f5ae35b7 + .quad 0xc08623af8a3c2fb8 + .quad 0xbe1cf0b0b4001e1b + .quad 0xc08623b349975d98 + .quad 0xbe1cf1bae76dfbcf + .quad 0xc08623b70731f810 + .quad 0xbe1cef0a72e13a62 + .quad 0xc08623bac30da1c8 + .quad 0xbe1cf184007d2b6b + .quad 0xc08623be7d2bfb40 + .quad 0xbe1cf16f4b239e98 + .quad 0xc08623c2358ea2a0 + .quad 0xbe1cf0976acada87 + .quad 0xc08623c5ec3733d0 + .quad 0xbe1cf066318a16ff + .quad 0xc08623c9a1274880 + .quad 0xbe1ceffaa7148798 + .quad 0xc08623cd54607820 + .quad 0xbe1cf23ab02e9b6e + .quad 0xc08623d105e45800 + .quad 0xbe1cefdfef7d4fde + .quad 0xc08623d4b5b47b20 + .quad 0xbe1cf17fece44f2b + .quad 0xc08623d863d27270 + .quad 0xbe1cf18f907d0d7c + .quad 0xc08623dc103fccb0 + .quad 0xbe1cee61fe072c98 + .quad 0xc08623dfbafe1668 + .quad 0xbe1cf022dd891e2f + .quad 0xc08623e3640eda20 + .quad 0xbe1ceecc1daf4358 + .quad 0xc08623e70b73a028 + .quad 0xbe1cf0173c4fa380 + .quad 0xc08623eab12deec8 + .quad 0xbe1cf16a2150c2f4 + .quad 0xc08623ee553f4a30 + .quad 0xbe1cf1bf980b1f4b + .quad 0xc08623f1f7a93480 + .quad 0xbe1cef8b731663c2 + .quad 0xc08623f5986d2dc0 + .quad 0xbe1cee9a664d7ef4 + .quad 0xc08623f9378cb3f0 + .quad 0xbe1cf1eda2af6400 + .quad 0xc08623fcd5094320 + .quad 0xbe1cf1923f9d68d7 + .quad 0xc086240070e45548 + .quad 0xbe1cf0747cd3e03a + 
.quad 0xc08624040b1f6260 + .quad 0xbe1cf22ee855bd6d + .quad 0xc0862407a3bbe078 + .quad 0xbe1cf0d57360c00b + .quad 0xc086240b3abb4398 + .quad 0xbe1ceebc815cd575 + .quad 0xc086240ed01efdd0 + .quad 0xbe1cf03bfb970951 + .quad 0xc086241263e87f50 + .quad 0xbe1cf16e74768529 + .quad 0xc0862415f6193658 + .quad 0xbe1cefec64b8becb + .quad 0xc086241986b28f30 + .quad 0xbe1cf0838d210baa + .quad 0xc086241d15b5f448 + .quad 0xbe1cf0ea86e75b11 + .quad 0xc0862420a324ce28 + .quad 0xbe1cf1708d11d805 + .quad 0xc08624242f008380 + .quad 0xbe1ceea988c5a417 + .quad 0xc0862427b94a7910 + .quad 0xbe1cef166a7bbca5 + .quad 0xc086242b420411d0 + .quad 0xbe1cf0c9d9e86a38 + .quad 0xc086242ec92eaee8 + .quad 0xbe1cef0946455411 + .quad 0xc08624324ecbaf98 + .quad 0xbe1cefea60907739 + .quad 0xc0862435d2dc7160 + .quad 0xbe1cf1ed0934ce42 + .quad 0xc086243955624ff8 + .quad 0xbe1cf191ba746c7d + .quad 0xc086243cd65ea548 + .quad 0xbe1ceeec78cf2a7e + .quad 0xc086244055d2c968 + .quad 0xbe1cef345284c119 + .quad 0xc0862443d3c012b8 + .quad 0xbe1cf24f77355219 + .quad 0xc08624475027d5e8 + .quad 0xbe1cf05bf087e114 + .quad 0xc086244acb0b65d0 + .quad 0xbe1cef3504a32189 + .quad 0xc086244e446c1398 + .quad 0xbe1ceff54b2a406f + .quad 0xc0862451bc4b2eb8 + .quad 0xbe1cf0757d54ed4f + .quad 0xc086245532aa04f0 + .quad 0xbe1cf0c8099fdfd5 + .quad 0xc0862458a789e250 + .quad 0xbe1cf0b173796a31 + .quad 0xc086245c1aec1138 + .quad 0xbe1cf11d8734540d + .quad 0xc086245f8cd1da60 + .quad 0xbe1cf1916a723ceb + .quad 0xc0862462fd3c84d8 + .quad 0xbe1cf19a911e1da7 + .quad 0xc08624666c2d5608 + .quad 0xbe1cf23a9ef72e4f + .quad 0xc0862469d9a591c0 + .quad 0xbe1cef503d947663 + .quad 0xc086246d45a67a18 + .quad 0xbe1cf0fceeb1a0b2 + .quad 0xc0862470b0314fa8 + .quad 0xbe1cf107e27e4fbc + .quad 0xc086247419475160 + .quad 0xbe1cf03dd9922331 + .quad 0xc086247780e9bc98 + .quad 0xbe1cefce1a10e129 + .quad 0xc086247ae719cd18 + .quad 0xbe1ceea47f73c4f6 + .quad 0xc086247e4bd8bd10 + .quad 0xbe1ceec0ac56d100 + .quad 0xc0862481af27c528 + .quad 0xbe1cee8a6593278a + .quad 0xc086248511081c70 + .quad 0xbe1cf2231dd9dec7 + .quad 0xc0862488717af888 + .quad 0xbe1cf0b4b8ed7da8 + .quad 0xc086248bd0818d68 + .quad 0xbe1cf1bd8d835002 + .quad 0xc086248f2e1d0d98 + .quad 0xbe1cf259acc107f4 + .quad 0xc08624928a4eaa20 + .quad 0xbe1cee897636b00c + .quad 0xc0862495e5179270 + .quad 0xbe1cee757f20c326 + .quad 0xc08624993e78f490 + .quad 0xbe1cefafd3aa54a4 + .quad 0xc086249c9673fd10 + .quad 0xbe1cee7298d38b97 + .quad 0xc086249fed09d6f8 + .quad 0xbe1ceedc158d4ceb + .quad 0xc08624a3423babe0 + .quad 0xbe1cf2282987cb2e + .quad 0xc08624a6960aa400 + .quad 0xbe1cefe7381ecc4b + .quad 0xc08624a9e877e600 + .quad 0xbe1cef328dbbce80 + .quad 0xc08624ad39849728 + .quad 0xbe1cefde45f3cc71 + .quad 0xc08624b08931db58 + .quad 0xbe1cefa8b89433b9 + .quad 0xc08624b3d780d500 + .quad 0xbe1cef6773c0b139 + .quad 0xc08624b72472a528 + .quad 0xbe1cf031c931c11f + .quad 0xc08624ba70086b78 + .quad 0xbe1cf088f49275e7 + .quad 0xc08624bdba434630 + .quad 0xbe1cf17de0eaa86d + .quad 0xc08624c103245238 + .quad 0xbe1cefd492f1ba75 + .quad 0xc08624c44aacab08 + .quad 0xbe1cf1253e154466 + .quad 0xc08624c790dd6ad0 + .quad 0xbe1cf0fb09ee6d55 + .quad 0xc08624cad5b7aa58 + .quad 0xbe1cf1f08dd048fe + .quad 0xc08624ce193c8120 + .quad 0xbe1ceeca0809697f + .quad 0xc08624d15b6d0538 + .quad 0xbe1cef8d5662d968 + .quad 0xc08624d49c4a4b78 + .quad 0xbe1cee97b556ed78 + .quad 0xc08624d7dbd56750 + .quad 0xbe1cf1b14b6acb75 + .quad 0xc08624db1a0f6b00 + .quad 0xbe1cef1e860623f2 + .quad 0xc08624de56f96758 + .quad 0xbe1ceeaf4d156f3d + .quad 0xc08624e192946bf0 + .quad 
0xbe1ceecc12b400ed + .quad 0xc08624e4cce18710 + .quad 0xbe1cf180c40c794f + .quad 0xc08624e805e1c5c8 + .quad 0xbe1cf185a08f7f65 + .quad 0xc08624eb3d9633d8 + .quad 0xbe1cef45fc924078 + .quad 0xc08624ee73ffdbb0 + .quad 0xbe1cf1e4f457f32a + .quad 0xc08624f1a91fc6a0 + .quad 0xbe1cf040147b8a5a + .quad 0xc08624f4dcf6fc98 + .quad 0xbe1cf1effca0dfb2 + .quad 0xc08624f80f868468 + .quad 0xbe1cf0470146e5bc + .quad 0xc08624fb40cf6390 + .quad 0xbe1cef4dd186e501 + .quad 0xc08624fe70d29e60 + .quad 0xbe1ceebe257f66c7 + .quad 0xc08625019f9137f0 + .quad 0xbe1ceefb7a1c395c + .quad 0xc0862504cd0c3220 + .quad 0xbe1cf209dedfed8c + .quad 0xc0862507f9448db0 + .quad 0xbe1cf082da464994 + .quad 0xc086250b243b4a18 + .quad 0xbe1cee88694a73cf + .quad 0xc086250e4df165a0 + .quad 0xbe1cf0b61e8f0531 + .quad 0xc08625117667dd78 + .quad 0xbe1cf1106599c962 + .quad 0xc08625149d9fad98 + .quad 0xbe1ceff1ee88af1f + .quad 0xc0862517c399d0c8 + .quad 0xbe1cf0f746994ef6 + .quad 0xc086251ae85740b8 + .quad 0xbe1cefe8a1d077e4 + .quad 0xc086251e0bd8f5e0 + .quad 0xbe1cf1a1da036092 + .quad 0xc08625212e1fe7a8 + .quad 0xbe1cf0f8a7786fcd + .quad 0xc08625244f2d0c48 + .quad 0xbe1cefa1174a07a7 + .quad 0xc08625276f0158d8 + .quad 0xbe1cef1043aa5b25 + .quad 0xc086252a8d9dc150 + .quad 0xbe1cf15d521c169d + .quad 0xc086252dab033898 + .quad 0xbe1cf220bba8861f + .quad 0xc0862530c732b078 + .quad 0xbe1cef51e310eae2 + .quad 0xc0862533e22d1988 + .quad 0xbe1cf222fcedd8ae + .quad 0xc0862536fbf36370 + .quad 0xbe1cefdb4da4bda8 + .quad 0xc086253a14867ca0 + .quad 0xbe1ceeafc1112171 + .quad 0xc086253d2be75280 + .quad 0xbe1cee99dfb4b408 + .quad 0xc08625404216d160 + .quad 0xbe1cf22d2536f06b + .quad 0xc08625435715e498 + .quad 0xbe1cef6abbf2e268 + .quad 0xc08625466ae57648 + .quad 0xbe1cf093a14789f5 + .quad 0xc08625497d866fa0 + .quad 0xbe1cf0f93655603c + .quad 0xc086254c8ef9b8b8 + .quad 0xbe1cf1cc40c9aafc + .quad 0xc086254f9f4038a8 + .quad 0xbe1ceeea5f4e9157 + .quad 0xc0862552ae5ad568 + .quad 0xbe1cefa9f52d4997 + .quad 0xc0862555bc4a7400 + .quad 0xbe1cefa490a638ff + .quad 0xc0862558c90ff868 + .quad 0xbe1cef7fcf797d6f + .quad 0xc086255bd4ac4590 + .quad 0xbe1cf1b4c51113c9 + .quad 0xc086255edf203d78 + .quad 0xbe1cef55e5b4a55d + .quad 0xc0862561e86cc100 + .quad 0xbe1cf0d37a25f9dc + .quad 0xc0862564f092b028 + .quad 0xbe1ceebe9efc19d9 + .quad 0xc0862567f792e9d8 + .quad 0xbe1cee8ad30a57b5 + .quad 0xc086256afd6e4c08 + .quad 0xbe1cef4e1817b90b + .quad 0xc086256e0225b3b8 + .quad 0xbe1cee7fa9229996 + .quad 0xc086257105b9fce0 + .quad 0xbe1cf0b54963d945 + .quad 0xc0862574082c0298 + .quad 0xbe1cee5f2f3c7995 + .quad 0xc0862577097c9ee0 + .quad 0xbe1cf0828e303a2c + .quad 0xc086257a09acaae0 + .quad 0xbe1cf172c3078947 + .quad 0xc086257d08bcfec0 + .quad 0xbe1cf189252afa22 + .quad 0xc086258006ae71b8 + .quad 0xbe1cefdb80426923 + .quad 0xc08625830381da08 + .quad 0xbe1ceef1391a0372 + .quad 0xc0862585ff380d00 + .quad 0xbe1cf17720c78d13 + .quad 0xc0862588f9d1df18 + .quad 0xbe1ceef1f9027d83 + .quad 0xc086258bf35023b8 + .quad 0xbe1cf06fac99dec9 + .quad 0xc086258eebb3ad78 + .quad 0xbe1cf1373eeb45c0 + .quad 0xc0862591e2fd4e00 + .quad 0xbe1cef777536bb81 + .quad 0xc0862594d92dd600 + .quad 0xbe1cf0f43ca40766 + .quad 0xc0862597ce461558 + .quad 0xbe1cefb2cfc6766b + .quad 0xc086259ac246daf0 + .quad 0xbe1ceea49e64ffa2 + .quad 0xc086259db530f4c8 + .quad 0xbe1cf250fa457dec + .quad 0xc08625a0a7053018 + .quad 0xbe1cf17d8bb2a44e + .quad 0xc08625a397c45918 + .quad 0xbe1cf1d5906d54b7 + .quad 0xc08625a6876f3b30 + .quad 0xbe1cf08fe7b31780 + .quad 0xc08625a97606a0e0 + .quad 0xbe1cef13edfc9d11 + .quad 
0xc08625ac638b53c8 + .quad 0xbe1cef9d2b107219 + .quad 0xc08625af4ffe1cb0 + .quad 0xbe1cf1ddd4ff6160 + .quad 0xc08625b23b5fc390 + .quad 0xbe1cefa02a996495 + .quad 0xc08625b525b10f68 + .quad 0xbe1cf166a7e37ee5 + .quad 0xc08625b80ef2c680 + .quad 0xbe1cef0b171068a5 + .quad 0xc08625baf725ae28 + .quad 0xbe1cf05c80779283 + .quad 0xc08625bdde4a8af0 + .quad 0xbe1cf1bbfbffb889 + .quad 0xc08625c0c4622090 + .quad 0xbe1cf0b8666c0124 + .quad 0xc08625c3a96d31e0 + .quad 0xbe1cf0a8fcf47a86 + .quad 0xc08625c68d6c80f0 + .quad 0xbe1cef46e18cb092 + .quad 0xc08625c97060cef0 + .quad 0xbe1cf1458a350efb + .quad 0xc08625cc524adc58 + .quad 0xbe1ceeea1dadce12 + .quad 0xc08625cf332b68b0 + .quad 0xbe1cf0a1bfdc44c7 + .quad 0xc08625d2130332d0 + .quad 0xbe1cef96d02da73e + .quad 0xc08625d4f1d2f8a8 + .quad 0xbe1cf2451c3c7701 + .quad 0xc08625d7cf9b7778 + .quad 0xbe1cf10d08f83812 + .quad 0xc08625daac5d6ba0 + .quad 0xbe1ceec5b4895c5e + .quad 0xc08625dd881990b0 + .quad 0xbe1cf14e1325c5e4 + .quad 0xc08625e062d0a188 + .quad 0xbe1cf21d0904be12 + .quad 0xc08625e33c835838 + .quad 0xbe1ceed0839bcf21 + .quad 0xc08625e615326df0 + .quad 0xbe1cf1bb944889d2 + .quad 0xc08625e8ecde9b48 + .quad 0xbe1cee738e85eece + .quad 0xc08625ebc38897e0 + .quad 0xbe1cf25c2bc6ef12 + .quad 0xc08625ee99311ac8 + .quad 0xbe1cf132b70a41ad + .quad 0xc08625f16dd8da28 + .quad 0xbe1cf1984236a6e3 + .quad 0xc08625f441808b78 + .quad 0xbe1cf19ae74998f9 + .quad 0xc08625f71428e370 + .quad 0xbe1cef3e175d61a1 + .quad 0xc08625f9e5d295f8 + .quad 0xbe1cf101f9868fd9 + .quad 0xc08625fcb67e5658 + .quad 0xbe1cee69db83dcd2 + .quad 0xc08625ff862cd6f8 + .quad 0xbe1cf081b636af51 + .quad 0xc086260254dec9a8 + .quad 0xbe1cee62c7d59b3e + .quad 0xc08626052294df58 + .quad 0xbe1cf1b745c57716 + .quad 0xc0862607ef4fc868 + .quad 0xbe1cef3d2800ea23 + .quad 0xc086260abb103458 + .quad 0xbe1cef480ff1acd2 + .quad 0xc086260d85d6d200 + .quad 0xbe1cf2424c9a17ef + .quad 0xc08626104fa44f90 + .quad 0xbe1cf12cfde90fd5 + .quad 0xc086261318795a68 + .quad 0xbe1cf21f590dd5b6 + .quad 0xc0862615e0569f48 + .quad 0xbe1cf0c50f9cd28a + .quad 0xc0862618a73cca30 + .quad 0xbe1ceedbdb520545 + .quad 0xc086261b6d2c8668 + .quad 0xbe1cf0b030396011 + .quad 0xc086261e32267e98 + .quad 0xbe1cf19917010e96 + .quad 0xc0862620f62b5cb0 + .quad 0xbe1cf07331355985 + .quad 0xc0862623b93bc9e8 + .quad 0xbe1cf01ae921a1c3 + .quad 0xc08626267b586ed0 + .quad 0xbe1cefe5cf0dbf0c + .quad 0xc08626293c81f348 + .quad 0xbe1cf01b258aeb50 + .quad 0xc086262bfcb8fe88 + .quad 0xbe1cee6b9e7f4c68 + .quad 0xc086262ebbfe3710 + .quad 0xbe1cee684a9b21c9 + .quad 0xc08626317a5242b8 + .quad 0xbe1cf1f8bcde9a8b + .quad 0xc086263437b5c6c0 + .quad 0xbe1cf1d063d36238 + .quad 0xc0862636f42967a8 + .quad 0xbe1cf1e31a19075e + .quad 0xc0862639afadc950 + .quad 0xbe1cf1d8efdf7e7d + .quad 0xc086263c6a438ef0 + .quad 0xbe1cf1812ee72dba + .quad 0xc086263f23eb5b18 + .quad 0xbe1cf1449a9a2279 + .quad 0xc0862641dca5cfb8 + .quad 0xbe1cee96edce5085 + .quad 0xc086264494738e08 + .quad 0xbe1cf06797bd03b2 + .quad 0xc08626474b5536b8 + .quad 0xbe1cef91b9b7ffc1 + .quad 0xc086264a014b69c0 + .quad 0xbe1cef4b6721278f + .quad 0xc086264cb656c678 + .quad 0xbe1cf1942925eb4a + .quad 0xc086264f6a77eba8 + .quad 0xbe1cefa2c7bc2e39 + .quad 0xc08626521daf7758 + .quad 0xbe1cf252595aceb3 + .quad 0xc0862654cffe0718 + .quad 0xbe1cee8e9ae47ec2 + .quad 0xc0862657816437a8 + .quad 0xbe1cf1bf913828fa + .quad 0xc086265a31e2a558 + .quad 0xbe1cf23475d6b366 + .quad 0xc086265ce179ebc8 + .quad 0xbe1cef8df00a922b + .quad 0xc086265f902aa5f0 + .quad 0xbe1cef279bfa43e0 + .quad 0xc08626623df56e38 + .quad 
0xbe1cf080e10b8365 + .quad 0xc0862664eadade70 + .quad 0xbe1cf1a518f9b544 + .quad 0xc086266796db8fd0 + .quad 0xbe1cef9308fed9e9 + .quad 0xc086266a41f81ae8 + .quad 0xbe1ceea3ae6b19c9 + .quad 0xc086266cec3117b8 + .quad 0xbe1ceef06003d4c2 + .quad 0xc086266f95871da8 + .quad 0xbe1cf0b8457ffb0c + .quad 0xc08626723dfac390 + .quad 0xbe1cf0c526745ad6 + .quad 0xc0862674e58c9fa8 + .quad 0xbe1cf0cf91ff7b5d + .quad 0xc08626778c3d4798 + .quad 0xbe1cefe260819380 + .quad 0xc086267a320d5070 + .quad 0xbe1ceebd90aa27a3 + .quad 0xc086267cd6fd4ea8 + .quad 0xbe1cf0388121dffa + .quad 0xc086267f7b0dd630 + .quad 0xbe1cf1a3881435f1 + .quad 0xc08626821e3f7a68 + .quad 0xbe1cef28e9d9ac52 + .quad 0xc0862684c092ce08 + .quad 0xbe1cf02d300062dd + .quad 0xc086268762086350 + .quad 0xbe1cefaee1edfa35 + .quad 0xc086268a02a0cbe0 + .quad 0xbe1cf0a5a052e936 + .quad 0xc086268ca25c98d8 + .quad 0xbe1cee60a4a497ed + .quad 0xc086268f413c5ab0 + .quad 0xbe1cf0e4a5d0cf49 + .quad 0xc0862691df40a170 + .quad 0xbe1cf149235a4e6e + .quad 0xc08626947c69fc80 + .quad 0xbe1cf215180b9fcc + .quad 0xc086269718b8fac8 + .quad 0xbe1cef9b156a9840 + .quad 0xc0862699b42e2a90 + .quad 0xbe1cf054c91441be + .quad 0xc086269c4eca19a8 + .quad 0xbe1cf13ded26512c + .quad 0xc086269ee88d5550 + .quad 0xbe1cf22ea4d8ac06 + .quad 0xc08626a181786a40 + .quad 0xbe1cf2354666ee2e + .quad 0xc08626a4198be4a8 + .quad 0xbe1cefef936752b3 + .quad 0xc08626a6b0c85020 + .quad 0xbe1cf1e360a9db68 + .quad 0xc08626a9472e37d8 + .quad 0xbe1ceed6aeb812c5 + .quad 0xc08626abdcbe2650 + .quad 0xbe1cf227340b4986 + .quad 0xc08626ae7178a5b0 + .quad 0xbe1cf0215a0cbe0d + .quad 0xc08626b1055e3f70 + .quad 0xbe1cf256adf0ae26 + .quad 0xc08626b3986f7ca8 + .quad 0xbe1ceff3c67aed06 + .quad 0xc08626b62aace5c8 + .quad 0xbe1cf2159fb93652 + .quad 0xc08626b8bc1702e0 + .quad 0xbe1cf01e6dbd1c7f + .quad 0xc08626bb4cae5b60 + .quad 0xbe1cf009e75d1c0c + .quad 0xc08626bddc737648 + .quad 0xbe1ceec10a020e73 + .quad 0xc08626c06b66da08 + .quad 0xbe1cf06d5783eee7 + .quad 0xc08626c2f9890ca0 + .quad 0xbe1cf0cb8f169ffe + .quad 0xc08626c586da9388 + .quad 0xbe1cef7de2452430 + .quad 0xc08626c8135bf3b0 + .quad 0xbe1cf05da6f783ae + .quad 0xc08626ca9f0db198 + .quad 0xbe1cefcc877d681d + .quad 0xc08626cd29f05138 + .quad 0xbe1cef0531954ab3 + .quad 0xc08626cfb4045608 + .quad 0xbe1cf06b8565ea3d + .quad 0xc08626d23d4a4310 + .quad 0xbe1cefdc455d9d7e + .quad 0xc08626d4c5c29ad0 + .quad 0xbe1ceefc47e8fa64 + .quad 0xc08626d74d6ddf48 + .quad 0xbe1cf1872bf033f2 + .quad 0xc08626d9d44c9210 + .quad 0xbe1cf19d91087f9d + .quad 0xc08626dc5a5f3438 + .quad 0xbe1cf012d444c6ab + .quad 0xc08626dedfa64650 + .quad 0xbe1cf0ba528ee153 + .quad 0xc08626e164224880 + .quad 0xbe1ceeb431709788 + .quad 0xc08626e3e7d3ba60 + .quad 0xbe1cf0b9af31a6a5 + .quad 0xc08626e66abb1b28 + .quad 0xbe1cf168fb2e135b + .quad 0xc08626e8ecd8e990 + .quad 0xbe1cef9097461c93 + .quad 0xc08626eb6e2da3d0 + .quad 0xbe1cee7a434735d8 + .quad 0xc08626edeeb9c7a8 + .quad 0xbe1cf235732b86f2 + .quad 0xc08626f06e7dd280 + .quad 0xbe1cefe1510b89e6 + .quad 0xc08626f2ed7a4120 + .quad 0xbe1cf1f64b9b80ef + .quad 0xc08626f56baf9000 + .quad 0xbe1cf08f320ca339 + .quad 0xc08626f7e91e3b08 + .quad 0xbe1cf1b1de2808a1 + .quad 0xc08626fa65c6bdc0 + .quad 0xbe1cf1976d778b28 + .quad 0xc08626fce1a99338 + .quad 0xbe1ceef40a4f076f + .quad 0xc08626ff5cc73600 + .quad 0xbe1cef3e45869ce3 + .quad 0xc0862701d7202048 + .quad 0xbe1ceef601b4c9d6 + .quad 0xc086270450b4cbc0 + .quad 0xbe1cf1eaf0b57fd6 + .quad 0xc0862706c985b1c0 + .quad 0xbe1cef82a44990f3 + .quad 0xc086270941934b10 + .quad 0xbe1ceefe32981f2c + .quad 
0xc086270bb8de1018 + .quad 0xbe1cefbf6f5a0445 + .quad 0xc086270e2f6678d0 + .quad 0xbe1cf18dba75792c + .quad 0xc0862710a52cfcc8 + .quad 0xbe1cf0da64ce995f + .quad 0xc08627131a321318 + .quad 0xbe1cef04ac0fb802 + .quad 0xc08627158e763268 + .quad 0xbe1cee9d4e2ad9bd + .quad 0xc086271801f9d0f8 + .quad 0xbe1cefa9b55407b5 + .quad 0xc086271a74bd64a0 + .quad 0xbe1cefe6bd329570 + .quad 0xc086271ce6c162c8 + .quad 0xbe1cef0b1205dc85 + .quad 0xc086271f58064068 + .quad 0xbe1cef092a785e3f + .quad 0xc0862721c88c7210 + .quad 0xbe1cf050dcdaac30 + .quad 0xc086272438546be8 + .quad 0xbe1cf210907ded8b + .quad 0xc0862726a75ea1b8 + .quad 0xbe1cee760be44f99 + .quad 0xc086272915ab86c0 + .quad 0xbe1ceeeee07c2bcc + .quad 0xc086272b833b8df0 + .quad 0xbe1cf06874992df5 + .quad 0xc086272df00f29d0 + .quad 0xbe1cef8fac5d4899 + .quad 0xc08627305c26cc70 + .quad 0xbe1cf1103241cc99 + .quad 0xc0862732c782e788 + .quad 0xbe1cf1d35fef83fe + .quad 0xc08627353223ec68 + .quad 0xbe1cef3ec8133e1d + .quad 0xc08627379c0a4be8 + .quad 0xbe1cef7261daccd8 + .quad 0xc086273a05367688 + .quad 0xbe1cf18656c50806 + .quad 0xc086273c6da8dc68 + .quad 0xbe1cf1c8736e049a + .quad 0xc086273ed561ed38 + .quad 0xbe1cf1f93bff4911 + .quad 0xc08627413c621848 + .quad 0xbe1cf188a4ea680c + .quad 0xc0862743a2a9cc80 + .quad 0xbe1cf1d270930c80 + .quad 0xc086274608397868 + .quad 0xbe1cf25a328c28e2 + .quad 0xc08627486d118a28 + .quad 0xbe1cf106f90aa3b8 + .quad 0xc086274ad1326f80 + .quad 0xbe1cee5e9d2e885a + .quad 0xc086274d349c95c0 + .quad 0xbe1cf1c0bac27228 + .quad 0xc086274f975069f8 + .quad 0xbe1cf1a1500f9b1c + .quad 0xc0862751f94e58c0 + .quad 0xbe1cefc30663ac44 + .quad 0xc08627545a96ce48 + .quad 0xbe1cf17123e427a2 + .quad 0xc0862756bb2a3678 + .quad 0xbe1cefb92749fea4 + .quad 0xc08627591b08fcc0 + .quad 0xbe1cefa40e1ea74a + .quad 0xc086275b7a338c40 + .quad 0xbe1cee6f4612c3e9 + .quad 0xc086275dd8aa4fa8 + .quad 0xbe1cf1c54a053627 + .quad 0xc0862760366db168 + .quad 0xbe1ceff5eb503d9e + .quad 0xc0862762937e1b70 + .quad 0xbe1cf02e47f10cee + .quad 0xc0862764efdbf768 + .quad 0xbe1ceeb06e1d0dad + .quad 0xc08627674b87ae88 + .quad 0xbe1cf10aadd6dba5 + .quad 0xc0862769a681a9c0 + .quad 0xbe1cf24e9913d30f + .quad 0xc086276c00ca51a0 + .quad 0xbe1cef47b301e312 + .quad 0xc086276e5a620e48 + .quad 0xbe1ceeb1cefc2e85 + .quad 0xc0862770b3494788 + .quad 0xbe1cf16f1fbbe011 + .quad 0xc08627730b8064e8 + .quad 0xbe1ceebdf75174c7 + .quad 0xc08627756307cd70 + .quad 0xbe1cf06e3871a0da + .quad 0xc0862777b9dfe7f0 + .quad 0xbe1cef16799fd554 + .quad 0xc086277a10091ac0 + .quad 0xbe1cf248dabf5377 + .quad 0xc086277c6583cc00 + .quad 0xbe1cf0c78d92a2cd + .quad 0xc086277eba506158 + .quad 0xbe1cf0b911b029f0 + .quad 0xc08627810e6f4028 + .quad 0xbe1cefdc24719766 + .quad 0xc086278361e0cd70 + .quad 0xbe1cefbb6562b7e7 + .quad 0xc0862785b4a56dd8 + .quad 0xbe1cf1e0afb349ec + .quad 0xc086278806bd85c0 + .quad 0xbe1cf008292e52fc + .quad 0xc086278a58297918 + .quad 0xbe1cf053073872bf + .quad 0xc086278ca8e9ab88 + .quad 0xbe1cf17a0a55a947 + .quad 0xc086278ef8fe8068 + .quad 0xbe1ceeffb0b60234 + .quad 0xc086279148685aa0 + .quad 0xbe1cf162204794a8 + .quad 0xc086279397279ce0 + .quad 0xbe1cf24cc8cb48ac + .quad 0xc0862795e53ca978 + .quad 0xbe1cf0c9be68d5c3 + .quad 0xc086279832a7e258 + .quad 0xbe1cf172cd3d7388 + .quad 0xc086279a7f69a930 + .quad 0xbe1ceea2465fbce5 + .quad 0xc086279ccb825f40 + .quad 0xbe1cf0a386d2500f + .quad 0xc086279f16f26590 + .quad 0xbe1cf1e338ddc18a + .quad 0xc08627a161ba1cd0 + .quad 0xbe1cef1f5049867f + .quad 0xc08627a3abd9e548 + .quad 0xbe1cef96c1ea8b1f + .quad 0xc08627a5f5521f00 + .quad 
0xbe1cf138f6fd3c26 + .quad 0xc08627a83e2329b0 + .quad 0xbe1cf0d4fcbfdf3a + .quad 0xc08627aa864d64b0 + .quad 0xbe1cf24870c12c81 + .quad 0xc08627accdd12f18 + .quad 0xbe1cf0ae2a56348d + .quad 0xc08627af14aee7a0 + .quad 0xbe1cee8ca1a9b893 + .quad 0xc08627b15ae6eca8 + .quad 0xbe1cf20414d637b0 + .quad 0xc08627b3a0799c60 + .quad 0xbe1cf0fc6b7b12d8 + .quad 0xc08627b5e5675488 + .quad 0xbe1cf152d93c4a00 + .quad 0xc08627b829b072a0 + .quad 0xbe1cf1073f9b77c2 + .quad 0xc08627ba6d5553d8 + .quad 0xbe1cee694f97d5a4 + .quad 0xc08627bcb0565500 + .quad 0xbe1cf0456b8239d7 + .quad 0xc08627bef2b3d2b0 + .quad 0xbe1cf211497127e3 + .quad 0xc08627c1346e2930 + .quad 0xbe1cf01856c0384d + .quad 0xc08627c37585b468 + .quad 0xbe1cefa7dd05479e + .quad 0xc08627c5b5fad000 + .quad 0xbe1cef3ae8e50b93 + .quad 0xc08627c7f5cdd750 + .quad 0xbe1ceea5f32fdd3a + .quad 0xc08627ca34ff2560 + .quad 0xbe1cef424caeb8d9 + .quad 0xc08627cc738f14f0 + .quad 0xbe1cf0194d07a81f + .quad 0xc08627ceb17e0070 + .quad 0xbe1cf20f452000c1 + .quad 0xc08627d0eecc4210 + .quad 0xbe1cf00e356218e4 + .quad 0xc08627d32b7a33a0 + .quad 0xbe1cef30484b4bcb + .quad 0xc08627d567882eb0 + .quad 0xbe1ceeea11a6641b + .quad 0xc08627d7a2f68c80 + .quad 0xbe1cf13492d5bd7b + .quad 0xc08627d9ddc5a618 + .quad 0xbe1ceeb7048fad96 + .quad 0xc08627dc17f5d418 + .quad 0xbe1ceef0666f0477 + .quad 0xc08627de51876ee8 + .quad 0xbe1cf060d4b8b5c2 + .quad 0xc08627e08a7acea8 + .quad 0xbe1cf0b2a4b6ff8c + .quad 0xc08627e2c2d04b28 + .quad 0xbe1cf0e34809a875 + .quad 0xc08627e4fa883bf0 + .quad 0xbe1cf16bf74a3522 + .quad 0xc08627e731a2f848 + .quad 0xbe1cee6a24623d57 + .quad 0xc08627e96820d718 + .quad 0xbe1cefc7b4f1528e + .quad 0xc08627eb9e022f18 + .quad 0xbe1cf163051f3548 + .quad 0xc08627edd34756b8 + .quad 0xbe1cef36b3366305 + .quad 0xc08627f007f0a408 + .quad 0xbe1cf18134625550 + .quad 0xc08627f23bfe6cf0 + .quad 0xbe1cf0ec32ec1a11 + .quad 0xc08627f46f710700 + .quad 0xbe1ceeb3b64f3edc + .quad 0xc08627f6a248c778 + .quad 0xbe1cf0cd15805bc8 + .quad 0xc08627f8d4860368 + .quad 0xbe1cf20db3bddebe + .quad 0xc08627fb06290f90 + .quad 0xbe1cf25188430e25 + .quad 0xc08627fd37324070 + .quad 0xbe1ceea1713490f9 + .quad 0xc08627ff67a1ea28 + .quad 0xbe1cf159521d234c + .quad 0xc0862801977860b8 + .quad 0xbe1cf24dfe50783b + .quad 0xc0862803c6b5f7d0 + .quad 0xbe1ceef2ef89a60b + .quad 0xc0862805f55b02c8 + .quad 0xbe1cee7fc919d62c + .quad 0xc08628082367d4c0 + .quad 0xbe1cf215a7fb513a + .quad 0xc086280a50dcc0a8 + .quad 0xbe1cf0e4401c5ed4 + .quad 0xc086280c7dba1910 + .quad 0xbe1cf04ec734d256 + .quad 0xc086280eaa003050 + .quad 0xbe1cf010ad787fea + .quad 0xc0862810d5af5880 + .quad 0xbe1cee622478393d + .quad 0xc086281300c7e368 + .quad 0xbe1cf01c7482564f + .quad 0xc08628152b4a22a0 + .quad 0xbe1cf0de20d33536 + .quad 0xc086281755366778 + .quad 0xbe1cef2edae5837d + .quad 0xc08628197e8d02f0 + .quad 0xbe1cf0a345318cc9 + .quad 0xc086281ba74e45d8 + .quad 0xbe1cf20085aa34b8 + .quad 0xc086281dcf7a80c0 + .quad 0xbe1cef5fa845ad83 + .quad 0xc086281ff71203e0 + .quad 0xbe1cf050d1df69c4 + .quad 0xc08628221e151f48 + .quad 0xbe1ceffe43c035b9 + .quad 0xc0862824448422b8 + .quad 0xbe1cf14f3018d3c2 + .quad 0xc08628266a5f5dc0 + .quad 0xbe1cef0a5fbae83d + .quad 0xc08628288fa71f98 + .quad 0xbe1ceff8a95b72a1 + .quad 0xc086282ab45bb750 + .quad 0xbe1cef073aa9849b + .quad 0xc086282cd87d73a8 + .quad 0xbe1cef69b3835c02 + .quad 0xc086282efc0ca328 + .quad 0xbe1cf0bc139379a9 + .quad 0xc08628311f099420 + .quad 0xbe1cef247a9ec596 + .quad 0xc086283341749490 + .quad 0xbe1cef74bbcc488a + .quad 0xc0862835634df248 + .quad 0xbe1cef4bc42e7b8e + .quad 
0xc08628378495fad0 + .quad 0xbe1cf136d4d5a810 + .quad 0xc0862839a54cfb80 + .quad 0xbe1cf0d290b24dd8 + .quad 0xc086283bc5734168 + .quad 0xbe1ceeebde8e0065 + .quad 0xc086283de5091950 + .quad 0xbe1cf1a09f60aa1e + .quad 0xc0862840040ecfe0 + .quad 0xbe1cf0803947a234 + .quad 0xc08628422284b168 + .quad 0xbe1cf0abf7638127 + .quad 0xc0862844406b0a08 + .quad 0xbe1cf0f73ee12058 + .quad 0xc08628465dc225a0 + .quad 0xbe1cf2079971b26c + .quad 0xc08628487a8a4fe0 + .quad 0xbe1cee74957564b1 + .quad 0xc086284a96c3d420 + .quad 0xbe1ceee77c1b7d43 + .quad 0xc086284cb26efd90 + .quad 0xbe1cf23addba6e09 + .quad 0xc086284ecd8c1730 + .quad 0xbe1cf199f4a1da60 + .quad 0xc0862850e81b6bb0 + .quad 0xbe1cf09fdea81393 + .quad 0xc0862853021d4588 + .quad 0xbe1cf176adb417f7 + .quad 0xc08628551b91ef00 + .quad 0xbe1cf0f64f84a8da + .quad 0xc08628573479b220 + .quad 0xbe1ceec34cf49523 + .quad 0xc08628594cd4d8a8 + .quad 0xbe1cf16d60fbe0bb + .quad 0xc086285b64a3ac40 + .quad 0xbe1cee8de7acfc7b + .quad 0xc086285d7be67630 + .quad 0xbe1ceee6256cce8d + .quad 0xc086285f929d7fa0 + .quad 0xbe1cee7d66a3d8a5 + .quad 0xc0862861a8c91170 + .quad 0xbe1cf0bef8265792 + .quad 0xc0862863be697458 + .quad 0xbe1cf097f890c6f8 + .quad 0xc0862865d37ef0c8 + .quad 0xbe1cf09502d5c3fc + .quad 0xc0862867e809cf00 + .quad 0xbe1ceeffb239dac7 + .quad 0xc0862869fc0a56f8 + .quad 0xbe1cf1fbfff95c98 + .quad 0xc086286c0f80d090 + .quad 0xbe1cefa57ad3eef7 + .quad 0xc086286e226d8348 + .quad 0xbe1cf22c58b9183d + .quad 0xc086287034d0b690 + .quad 0xbe1ceff262d0a248 + .quad 0xc086287246aab180 + .quad 0xbe1cefa7bc194186 + .quad 0xc086287457fbbb08 + .quad 0xbe1cf06782d784d9 + .quad 0xc086287668c419e0 + .quad 0xbe1cf1d44d0eaa07 + .quad 0xc086287879041490 + .quad 0xbe1cf034803c8a48 + .quad 0xc086287a88bbf158 + .quad 0xbe1cf08e84916b6f + .quad 0xc086287c97ebf650 + .quad 0xbe1cf0c4d3dc1bc7 + .quad 0xc086287ea6946958 + .quad 0xbe1cefb1e4625943 + .quad 0xc0862880b4b59010 + .quad 0xbe1cf143efdd1fd0 + .quad 0xc0862882c24faff8 + .quad 0xbe1cee9896d016da + .quad 0xc0862884cf630e38 + .quad 0xbe1cf2186072f2cc + .quad 0xc0862886dbefeff0 + .quad 0xbe1cef9217633d34 + .quad 0xc0862888e7f699e0 + .quad 0xbe1cf05603549486 + .quad 0xc086288af37750b0 + .quad 0xbe1cef50fff513d3 + .quad 0xc086288cfe7258c0 + .quad 0xbe1cf127713b32d0 + .quad 0xc086288f08e7f650 + .quad 0xbe1cf05015520f3d + .quad 0xc086289112d86d58 + .quad 0xbe1cf12eb458b26f + .quad 0xc08628931c4401a8 + .quad 0xbe1cf22eae2887ed + .quad 0xc0862895252af6e0 + .quad 0xbe1cefdd6656dd2d + .quad 0xc08628972d8d9058 + .quad 0xbe1cf1048ea4e646 + .quad 0xc0862899356c1150 + .quad 0xbe1ceec4501167e9 + .quad 0xc086289b3cc6bcb8 + .quad 0xbe1cf0ad52becc3f + .quad 0xc086289d439dd568 + .quad 0xbe1cf0daa4e00e35 + .quad 0xc086289f49f19df8 + .quad 0xbe1cf00b80de8d6a + .quad 0xc08628a14fc258c8 + .quad 0xbe1cf1bcf2ea8464 + .quad 0xc08628a355104818 + .quad 0xbe1cf0435e2782b0 + .quad 0xc08628a559dbade0 + .quad 0xbe1cf0e3e1a5f56c + .quad 0xc08628a75e24cbf8 + .quad 0xbe1cefed9d5a721d + .quad 0xc08628a961ebe3f8 + .quad 0xbe1cf0d2d74321e2 + .quad 0xc08628ab65313750 + .quad 0xbe1cf24200eb55e9 + .quad 0xc08628ad67f50740 + .quad 0xbe1cf23e9d7cf979 + .quad 0xc08628af6a3794d0 + .quad 0xbe1cf23a088f421c + .quad 0xc08628b16bf920e0 + .quad 0xbe1cef2c1de1ab32 + .quad 0xc08628b36d39ec08 + .quad 0xbe1cf1abc231f7b2 + .quad 0xc08628b56dfa36d0 + .quad 0xbe1cf2074d5ba303 + .quad 0xc08628b76e3a4180 + .quad 0xbe1cf05cd5eed880 + .rept 48 + .byte 0 + .endr + +/* Log(2) lookup table for logarithmic part */ +.if .-__svml_spow_data != _Log_LA_table +.err +.endif + .quad 0x8000000000000000 
+ .quad 0xbf5ff802a9ab10e6 + .quad 0xbf6ff00aa2b10bc0 + .quad 0xbf77ee11ebd82e94 + .quad 0xbf7fe02a6b106789 + .quad 0xbf83e7295d25a7d9 + .quad 0xbf87dc475f810a77 + .quad 0xbf8bcf712c74384c + .quad 0xbf8fc0a8b0fc03e4 + .quad 0xbf91d7f7eb9eebe7 + .quad 0xbf93cea44346a575 + .quad 0xbf95c45a51b8d389 + .quad 0xbf97b91b07d5b11b + .quad 0xbf99ace7551cc514 + .quad 0xbf9b9fc027af9198 + .quad 0xbf9d91a66c543cc4 + .quad 0xbf9f829b0e783300 + .quad 0xbfa0b94f7c196176 + .quad 0xbfa1b0d98923d980 + .quad 0xbfa2a7ec2214e873 + .quad 0xbfa39e87b9febd60 + .quad 0xbfa494acc34d911c + .quad 0xbfa58a5bafc8e4d5 + .quad 0xbfa67f94f094bd98 + .quad 0xbfa77458f632dcfc + .quad 0xbfa868a83083f6cf + .quad 0xbfa95c830ec8e3eb + .quad 0xbfaa4fe9ffa3d235 + .quad 0xbfab42dd711971bf + .quad 0xbfac355dd0921f2d + .quad 0xbfad276b8adb0b52 + .quad 0xbfae19070c276016 + .quad 0xbfaf0a30c01162a6 + .quad 0xbfaffae9119b9303 + .quad 0xbfb075983598e471 + .quad 0xbfb0ed839b5526fe + .quad 0xbfb16536eea37ae1 + .quad 0xbfb1dcb263db1944 + .quad 0xbfb253f62f0a1417 + .quad 0xbfb2cb0283f5de1f + .quad 0xbfb341d7961bd1d1 + .quad 0xbfb3b87598b1b6ee + .quad 0xbfb42edcbea646f0 + .quad 0xbfb4a50d3aa1b040 + .quad 0xbfb51b073f06183f + .quad 0xbfb590cafdf01c28 + .quad 0xbfb60658a93750c4 + .quad 0xbfb67bb0726ec0fc + .quad 0xbfb6f0d28ae56b4c + .quad 0xbfb765bf23a6be13 + .quad 0xbfb7da766d7b12cd + .quad 0xbfb84ef898e8282a + .quad 0xbfb8c345d6319b21 + .quad 0xbfb9375e55595ede + .quad 0xbfb9ab42462033ad + .quad 0xbfba1ef1d8061cd4 + .quad 0xbfba926d3a4ad563 + .quad 0xbfbb05b49bee43fe + .quad 0xbfbb78c82bb0eda1 + .quad 0xbfbbeba818146765 + .quad 0xbfbc5e548f5bc743 + .quad 0xbfbcd0cdbf8c13e1 + .quad 0xbfbd4313d66cb35d + .quad 0xbfbdb5270187d927 + .quad 0xbfbe27076e2af2e6 + .quad 0xbfbe98b549671467 + .quad 0xbfbf0a30c01162a6 + .quad 0xbfbf7b79fec37ddf + .quad 0xbfbfec9131dbeabb + .quad 0xbfc02ebb42bf3d4b + .quad 0xbfc0671512ca596e + .quad 0xbfc09f561ee719c3 + .quad 0xbfc0d77e7cd08e59 + .quad 0xbfc10f8e422539b1 + .quad 0xbfc14785846742ac + .quad 0xbfc17f6458fca611 + .quad 0xbfc1b72ad52f67a0 + .quad 0xbfc1eed90e2dc2c3 + .quad 0xbfc2266f190a5acb + .quad 0xbfc25ded0abc6ad2 + .quad 0xbfc29552f81ff523 + .quad 0xbfc2cca0f5f5f251 + .quad 0xbfc303d718e47fd3 + .quad 0xbfc33af575770e4f + .quad 0xbfc371fc201e8f74 + .quad 0xbfc3a8eb2d31a376 + .quad 0xbfc3dfc2b0ecc62a + .quad 0xbfc41682bf727bc0 + .quad 0xbfc44d2b6ccb7d1e + .quad 0xbfc483bccce6e3dd + .quad 0xbfc4ba36f39a55e5 + .quad 0xbfc4f099f4a230b2 + .quad 0xbfc526e5e3a1b438 + .quad 0xbfc55d1ad4232d6f + .quad 0xbfc59338d9982086 + .quad 0xbfc5c940075972b9 + .quad 0xbfc5ff3070a793d4 + .quad 0xbfc6350a28aaa758 + .quad 0xbfc66acd4272ad51 + .quad 0xbfc6a079d0f7aad2 + .quad 0xbfc6d60fe719d21d + .quad 0xbfc70b8f97a1aa75 + .quad 0xbfc740f8f54037a5 + .quad 0xbfc7764c128f2127 + .quad 0xbfc7ab890210d909 + .quad 0xbfc7e0afd630c274 + .quad 0xbfc815c0a14357eb + .quad 0xbfc84abb75865139 + .quad 0xbfc87fa06520c911 + .quad 0xbfc8b46f8223625b + .quad 0xbfc8e928de886d41 + .quad 0xbfc91dcc8c340bde + .quad 0xbfc9525a9cf456b4 + .quad 0xbfc986d3228180ca + .quad 0xbfc9bb362e7dfb83 + .quad 0xbfc9ef83d2769a34 + .quad 0xbfca23bc1fe2b563 + .quad 0xbfca57df28244dcd + .quad 0xbfca8becfc882f19 + .quad 0xbfcabfe5ae46124c + .quad 0xbfcaf3c94e80bff3 + .quad 0xbfcb2797ee46320c + .quad 0xbfcb5b519e8fb5a4 + .quad 0xbfcb8ef670420c3b + .quad 0xbfcbc286742d8cd6 + .quad 0xbfcbf601bb0e44e2 + .quad 0xbfcc2968558c18c1 + .quad 0xbfcc5cba543ae425 + .quad 0xbfcc8ff7c79a9a22 + .quad 0xbfccc320c0176502 + .quad 0xbfccf6354e09c5dc + .quad 0xbfcd293581b6b3e7 + .quad 
0xbfcd5c216b4fbb91 + .quad 0xbfcd8ef91af31d5e + .quad 0xbfcdc1bca0abec7d + .quad 0xbfcdf46c0c722d2f + .quad 0xbfce27076e2af2e6 + .quad 0xbfce598ed5a87e2f + .quad 0xbfce8c0252aa5a60 + .quad 0xbfcebe61f4dd7b0b + .quad 0xbfcef0adcbdc5936 + .quad 0xbfcf22e5e72f105d + .quad 0xbfcf550a564b7b37 + .quad 0xbfcf871b28955045 + .quad 0xbfcfb9186d5e3e2b + .quad 0xbfcfeb0233e607cc + .quad 0xbfd00e6c45ad501d + .quad 0xbfd0274dc16c232f + .quad 0xbfd0402594b4d041 + .quad 0xbfd058f3c703ebc6 + .quad 0xbfd071b85fcd590d + .quad 0xbfd08a73667c57af + .quad 0xbfd0a324e27390e3 + .quad 0xbfd0bbccdb0d24bd + .quad 0xbfd0d46b579ab74b + .quad 0xbfd0ed005f657da4 + .quad 0xbfd1058bf9ae4ad5 + .quad 0xbfd11e0e2dad9cb7 + .quad 0xbfd136870293a8b0 + .quad 0xbfd14ef67f88685a + .quad 0xbfd1675cababa60e + .quad 0xbfd17fb98e15095d + .quad 0xbfd1980d2dd4236f + .quad 0xbfd1b05791f07b49 + .quad 0xbfd1c898c16999fb + .quad 0xbfd1e0d0c33716be + .quad 0xbfd1f8ff9e48a2f3 + .quad 0xbfd211255986160c + .quad 0xbfd22941fbcf7966 + .quad 0xbfd241558bfd1404 + .quad 0xbfd2596010df763a + .quad 0xbfd27161913f853d + .quad 0xbfd2895a13de86a3 + .quad 0xbfd2a1499f762bc9 + .quad 0xbfd2b9303ab89d25 + .quad 0xbfd2d10dec508583 + .quad 0xbfd2e8e2bae11d31 + .quad 0xbfd300aead06350c + .quad 0xbfd31871c9544185 + .quad 0xbfd3302c16586588 + .quad 0xbfd347dd9a987d55 + .quad 0xbfd35f865c93293e + .quad 0xbfd3772662bfd85b + .quad 0xbfd38ebdb38ed321 + .quad 0xbfd3a64c556945ea + .quad 0xbfd3bdd24eb14b6a + .quad 0xbfd3d54fa5c1f710 + .quad 0xbfd3ecc460ef5f50 + .quad 0xbfd404308686a7e4 + .quad 0xbfd41b941cce0bee + .quad 0xbfd432ef2a04e814 + .quad 0xbfd44a41b463c47c + .quad 0xbfd4618bc21c5ec2 + .quad 0xbfd478cd5959b3d9 + .quad 0xbfd49006804009d1 + .quad 0xbfd4a7373cecf997 + .quad 0xbfd4be5f957778a1 + .quad 0xbfd4d57f8fefe27f + .quad 0xbfd4ec973260026a + .quad 0xbfd503a682cb1cb3 + .quad 0xbfd51aad872df82d + .quad 0xbfd531ac457ee77e + .quad 0xbfd548a2c3add263 + .quad 0xbfd55f9107a43ee2 + .quad 0xbfd5767717455a6c + .quad 0xbfd58d54f86e02f2 + .quad 0xbfd5a42ab0f4cfe2 + .quad 0xbfd5baf846aa1b19 + .quad 0xbfd5d1bdbf5809ca + .quad 0xbfd5e87b20c2954a + .quad 0xbfd5ff3070a793d4 + .quad 0xbfd615ddb4bec13c + .quad 0xbfd62c82f2b9c795 + .quad 0x3fd61965cdb02c1f + .quad 0x3fd602d08af091ec + .quad 0x3fd5ec433d5c35ae + .quad 0x3fd5d5bddf595f30 + .quad 0x3fd5bf406b543db2 + .quad 0x3fd5a8cadbbedfa1 + .quad 0x3fd5925d2b112a59 + .quad 0x3fd57bf753c8d1fb + .quad 0x3fd565995069514c + .quad 0x3fd54f431b7be1a9 + .quad 0x3fd538f4af8f72fe + .quad 0x3fd522ae0738a3d8 + .quad 0x3fd50c6f1d11b97c + .quad 0x3fd4f637ebba9810 + .quad 0x3fd4e0086dd8baca + .quad 0x3fd4c9e09e172c3c + .quad 0x3fd4b3c077267e9a + .quad 0x3fd49da7f3bcc41f + .quad 0x3fd487970e958770 + .quad 0x3fd4718dc271c41b + .quad 0x3fd45b8c0a17df13 + .quad 0x3fd44591e0539f49 + .quad 0x3fd42f9f3ff62642 + .quad 0x3fd419b423d5e8c7 + .quad 0x3fd403d086cea79c + .quad 0x3fd3edf463c1683e + .quad 0x3fd3d81fb5946dba + .quad 0x3fd3c25277333184 + .quad 0x3fd3ac8ca38e5c5f + .quad 0x3fd396ce359bbf54 + .quad 0x3fd3811728564cb2 + .quad 0x3fd36b6776be1117 + .quad 0x3fd355bf1bd82c8b + .quad 0x3fd3401e12aecba1 + .quad 0x3fd32a84565120a8 + .quad 0x3fd314f1e1d35ce4 + .quad 0x3fd2ff66b04ea9d4 + .quad 0x3fd2e9e2bce12286 + .quad 0x3fd2d46602adccee + .quad 0x3fd2bef07cdc9354 + .quad 0x3fd2a982269a3dbf + .quad 0x3fd2941afb186b7c + .quad 0x3fd27ebaf58d8c9d + .quad 0x3fd269621134db92 + .quad 0x3fd25410494e56c7 + .quad 0x3fd23ec5991eba49 + .quad 0x3fd22981fbef797b + .quad 0x3fd214456d0eb8d4 + .quad 0x3fd1ff0fe7cf47a7 + .quad 0x3fd1e9e1678899f4 + .quad 
0x3fd1d4b9e796c245 + .quad 0x3fd1bf99635a6b95 + .quad 0x3fd1aa7fd638d33f + .quad 0x3fd1956d3b9bc2fa + .quad 0x3fd180618ef18adf + .quad 0x3fd16b5ccbacfb73 + .quad 0x3fd1565eed455fc3 + .quad 0x3fd14167ef367783 + .quad 0x3fd12c77cd00713b + .quad 0x3fd1178e8227e47c + .quad 0x3fd102ac0a35cc1c + .quad 0x3fd0edd060b78081 + .quad 0x3fd0d8fb813eb1ef + .quad 0x3fd0c42d676162e3 + .quad 0x3fd0af660eb9e279 + .quad 0x3fd09aa572e6c6d4 + .quad 0x3fd085eb8f8ae797 + .quad 0x3fd07138604d5862 + .quad 0x3fd05c8be0d9635a + .quad 0x3fd047e60cde83b8 + .quad 0x3fd03346e0106062 + .quad 0x3fd01eae5626c691 + .quad 0x3fd00a1c6adda473 + .quad 0x3fcfeb2233ea07cd + .quad 0x3fcfc218be620a5e + .quad 0x3fcf991c6cb3b379 + .quad 0x3fcf702d36777df0 + .quad 0x3fcf474b134df229 + .quad 0x3fcf1e75fadf9bde + .quad 0x3fcef5ade4dcffe6 + .quad 0x3fceccf2c8fe920a + .quad 0x3fcea4449f04aaf5 + .quad 0x3fce7ba35eb77e2a + .quad 0x3fce530effe71012 + .quad 0x3fce2a877a6b2c12 + .quad 0x3fce020cc6235ab5 + .quad 0x3fcdd99edaf6d7e9 + .quad 0x3fcdb13db0d48940 + .quad 0x3fcd88e93fb2f450 + .quad 0x3fcd60a17f903515 + .quad 0x3fcd38666871f465 + .quad 0x3fcd1037f2655e7b + .quad 0x3fcce816157f1988 + .quad 0x3fccc000c9db3c52 + .quad 0x3fcc97f8079d44ec + .quad 0x3fcc6ffbc6f00f71 + .quad 0x3fcc480c0005ccd1 + .quad 0x3fcc2028ab17f9b4 + .quad 0x3fcbf851c067555f + .quad 0x3fcbd087383bd8ad + .quad 0x3fcba8c90ae4ad19 + .quad 0x3fcb811730b823d2 + .quad 0x3fcb5971a213acdb + .quad 0x3fcb31d8575bce3d + .quad 0x3fcb0a4b48fc1b46 + .quad 0x3fcae2ca6f672bd4 + .quad 0x3fcabb55c31693ad + .quad 0x3fca93ed3c8ad9e3 + .quad 0x3fca6c90d44b704e + .quad 0x3fca454082e6ab05 + .quad 0x3fca1dfc40f1b7f1 + .quad 0x3fc9f6c407089664 + .quad 0x3fc9cf97cdce0ec3 + .quad 0x3fc9a8778debaa38 + .quad 0x3fc981634011aa75 + .quad 0x3fc95a5adcf7017f + .quad 0x3fc9335e5d594989 + .quad 0x3fc90c6db9fcbcd9 + .quad 0x3fc8e588ebac2dbf + .quad 0x3fc8beafeb38fe8c + .quad 0x3fc897e2b17b19a5 + .quad 0x3fc871213750e994 + .quad 0x3fc84a6b759f512f + .quad 0x3fc823c16551a3c2 + .quad 0x3fc7fd22ff599d4f + .quad 0x3fc7d6903caf5ad0 + .quad 0x3fc7b0091651528c + .quad 0x3fc7898d85444c73 + .quad 0x3fc7631d82935a86 + .quad 0x3fc73cb9074fd14d + .quad 0x3fc716600c914054 + .quad 0x3fc6f0128b756abc + .quad 0x3fc6c9d07d203fc7 + .quad 0x3fc6a399dabbd383 + .quad 0x3fc67d6e9d785771 + .quad 0x3fc6574ebe8c133a + .quad 0x3fc6313a37335d76 + .quad 0x3fc60b3100b09476 + .quad 0x3fc5e533144c1719 + .quad 0x3fc5bf406b543db2 + .quad 0x3fc59958ff1d52f1 + .quad 0x3fc5737cc9018cdd + .quad 0x3fc54dabc26105d2 + .quad 0x3fc527e5e4a1b58d + .quad 0x3fc5022b292f6a45 + .quad 0x3fc4dc7b897bc1c8 + .quad 0x3fc4b6d6fefe22a4 + .quad 0x3fc4913d8333b561 + .quad 0x3fc46baf0f9f5db7 + .quad 0x3fc4462b9dc9b3dc + .quad 0x3fc420b32740fdd4 + .quad 0x3fc3fb45a59928cc + .quad 0x3fc3d5e3126bc27f + .quad 0x3fc3b08b6757f2a9 + .quad 0x3fc38b3e9e027479 + .quad 0x3fc365fcb0159016 + .quad 0x3fc340c59741142e + .quad 0x3fc31b994d3a4f85 + .quad 0x3fc2f677cbbc0a96 + .quad 0x3fc2d1610c86813a + .quad 0x3fc2ac55095f5c59 + .quad 0x3fc28753bc11aba5 + .quad 0x3fc2625d1e6ddf57 + .quad 0x3fc23d712a49c202 + .quad 0x3fc2188fd9807263 + .quad 0x3fc1f3b925f25d41 + .quad 0x3fc1ceed09853752 + .quad 0x3fc1aa2b7e23f72a + .quad 0x3fc185747dbecf34 + .quad 0x3fc160c8024b27b1 + .quad 0x3fc13c2605c398c3 + .quad 0x3fc1178e8227e47c + .quad 0x3fc0f301717cf0fb + .quad 0x3fc0ce7ecdccc28d + .quad 0x3fc0aa06912675d5 + .quad 0x3fc08598b59e3a07 + .quad 0x3fc06135354d4b18 + .quad 0x3fc03cdc0a51ec0d + .quad 0x3fc0188d2ecf6140 + .quad 0x3fbfe89139dbd566 + .quad 0x3fbfa01c9db57ce2 + .quad 
0x3fbf57bc7d9005db + .quad 0x3fbf0f70cdd992e3 + .quad 0x3fbec739830a1120 + .quad 0x3fbe7f1691a32d3e + .quad 0x3fbe3707ee30487b + .quad 0x3fbdef0d8d466db9 + .quad 0x3fbda727638446a2 + .quad 0x3fbd5f55659210e2 + .quad 0x3fbd179788219364 + .quad 0x3fbccfedbfee13a8 + .quad 0x3fbc885801bc4b23 + .quad 0x3fbc40d6425a5cb1 + .quad 0x3fbbf968769fca11 + .quad 0x3fbbb20e936d6974 + .quad 0x3fbb6ac88dad5b1c + .quad 0x3fbb23965a52ff00 + .quad 0x3fbadc77ee5aea8c + .quad 0x3fba956d3ecade63 + .quad 0x3fba4e7640b1bc38 + .quad 0x3fba0792e9277cac + .quad 0x3fb9c0c32d4d2548 + .quad 0x3fb97a07024cbe74 + .quad 0x3fb9335e5d594989 + .quad 0x3fb8ecc933aeb6e8 + .quad 0x3fb8a6477a91dc29 + .quad 0x3fb85fd927506a48 + .quad 0x3fb8197e2f40e3f0 + .quad 0x3fb7d33687c293c9 + .quad 0x3fb78d02263d82d3 + .quad 0x3fb746e100226ed9 + .quad 0x3fb700d30aeac0e1 + .quad 0x3fb6bad83c1883b6 + .quad 0x3fb674f089365a7a + .quad 0x3fb62f1be7d77743 + .quad 0x3fb5e95a4d9791cb + .quad 0x3fb5a3abb01ade25 + .quad 0x3fb55e10050e0384 + .quad 0x3fb518874226130a + .quad 0x3fb4d3115d207eac + .quad 0x3fb48dae4bc31018 + .quad 0x3fb4485e03dbdfad + .quad 0x3fb403207b414b7f + .quad 0x3fb3bdf5a7d1ee64 + .quad 0x3fb378dd7f749714 + .quad 0x3fb333d7f8183f4b + .quad 0x3fb2eee507b40301 + .quad 0x3fb2aa04a44717a5 + .quad 0x3fb26536c3d8c369 + .quad 0x3fb2207b5c78549e + .quad 0x3fb1dbd2643d190b + .quad 0x3fb1973bd1465567 + .quad 0x3fb152b799bb3cc9 + .quad 0x3fb10e45b3cae831 + .quad 0x3fb0c9e615ac4e17 + .quad 0x3fb08598b59e3a07 + .quad 0x3fb0415d89e74444 + .quad 0x3faffa6911ab9301 + .quad 0x3faf723b517fc523 + .quad 0x3faeea31c006b87c + .quad 0x3fae624c4a0b5e1b + .quad 0x3fadda8adc67ee4e + .quad 0x3fad52ed6405d86f + .quad 0x3faccb73cdddb2cc + .quad 0x3fac441e06f72a9e + .quad 0x3fabbcebfc68f420 + .quad 0x3fab35dd9b58baad + .quad 0x3faaaef2d0fb10fc + .quad 0x3faa282b8a936171 + .quad 0x3fa9a187b573de7c + .quad 0x3fa91b073efd7314 + .quad 0x3fa894aa149fb343 + .quad 0x3fa80e7023d8ccc4 + .quad 0x3fa788595a3577ba + .quad 0x3fa70265a550e777 + .quad 0x3fa67c94f2d4bb58 + .quad 0x3fa5f6e73078efb8 + .quad 0x3fa5715c4c03ceef + .quad 0x3fa4ebf43349e26f + .quad 0x3fa466aed42de3ea + .quad 0x3fa3e18c1ca0ae92 + .quad 0x3fa35c8bfaa1306b + .quad 0x3fa2d7ae5c3c5bae + .quad 0x3fa252f32f8d183f + .quad 0x3fa1ce5a62bc353a + .quad 0x3fa149e3e4005a8d + .quad 0x3fa0c58fa19dfaaa + .quad 0x3fa0415d89e74444 + .quad 0x3f9f7a9b16782856 + .quad 0x3f9e72bf2813ce51 + .quad 0x3f9d6b2725979802 + .quad 0x3f9c63d2ec14aaf2 + .quad 0x3f9b5cc258b718e6 + .quad 0x3f9a55f548c5c43f + .quad 0x3f994f6b99a24475 + .quad 0x3f98492528c8cabf + .quad 0x3f974321d3d006d3 + .quad 0x3f963d6178690bd6 + .quad 0x3f9537e3f45f3565 + .quad 0x3f9432a925980cc1 + .quad 0x3f932db0ea132e22 + .quad 0x3f9228fb1fea2e28 + .quad 0x3f912487a5507f70 + .quad 0x3f90205658935847 + .quad 0x3f8e38ce3033310c + .quad 0x3f8c317384c75f06 + .quad 0x3f8a2a9c6c170462 + .quad 0x3f882448a388a2aa + .quad 0x3f861e77e8b53fc6 + .quad 0x3f841929f96832f0 + .quad 0x3f82145e939ef1e9 + .quad 0x3f8010157588de71 + .quad 0x3f7c189cbb0e27fb + .quad 0x3f78121214586b54 + .quad 0x3f740c8a747878e2 + .quad 0x3f70080559588b35 + .quad 0x3f680904828985c0 + .quad 0x3f60040155d5889e + .quad 0x3f50020055655889 + .quad 0x0000000000000000 + .rept 56 + .byte 0 + .endr + +/* Polynomial coefficients for log part: + * coeff4 */ +double_vector _poly_coeff_1 0x3fc9999cacdb4d0a + +/* coeff3 */ +double_vector _poly_coeff_2 0xbfd0000148058ee1 + +/* coeff2 */ +double_vector _poly_coeff_3 0x3fd55555555543c5 + +/* coeff1 */ +double_vector _poly_coeff_4 0xbfdffffffffff81f + +/* General purpose 
constants for log part: ExpMask */ +double_vector _ExpMask 0x000fffffffffffff + +/* Two10 */ +double_vector _Two10 0x3f50000000000000 + +/* MinNorm */ +double_vector _MinNorm 0x0010000000000000 + +/* MaxNorm */ +double_vector _MaxNorm 0x7fefffffffffffff + +/* HalfMask */ +double_vector _HalfMask 0xfffffffffc000000 + +/* One */ +double_vector _One 0x3ff0000000000000 + +/* L2H */ +double_vector _L2H 0x3fe62e42fefa0000 + +/* L2L */ +double_vector _L2L 0x3d7cf79abc9e0000 + +/* Threshold */ +double_vector _Threshold 0x4086a00000000000 + +/* Bias */ +double_vector _Bias 0x408ff80000000000 + +/* Bias1 */ +double_vector _Bias1 0x408ff00000000000 + +/* L2L */ +double_vector _L2 0x3fe62e42fefa39ef + +/* dInfs = DP infinity, +/- == */ +.if .-__svml_spow_data != _dInfs +.err +.endif + .quad 0x7ff0000000000000 + .quad 0xfff0000000000000 + .rept 48 + .byte 0 + .endr + +/* dOnes = DP one, +/- == */ +.if .-__svml_spow_data != _dOnes +.err +.endif + .quad 0x3ff0000000000000 + .quad 0xbff0000000000000 + .rept 48 + .byte 0 + .endr + +/* dZeros = DP zero +/- == */ +.if .-__svml_spow_data != _dZeros +.err +.endif + .quad 0x0000000000000000 + .quad 0x8000000000000000 + .rept 48 + .byte 0 + .endr +.if .-__svml_spow_data != __dbT +.err +.endif + .quad 0x3feffffffc27dd9e + .quad 0x3ff00162f1a4047d + .quad 0x3ff002c603f68252 + .quad 0x3ff00429350e12af + .quad 0x3ff0058c84ed6032 + .quad 0x3ff006eff39715b2 + .quad 0x3ff00853810dde41 + .quad 0x3ff009b72d54652f + .quad 0x3ff00b1af86d5604 + .quad 0x3ff00c7ee25b5c86 + .quad 0x3ff00de2eb2124b3 + .quad 0x3ff00f4712c15ac8 + .quad 0x3ff010ab593eab39 + .quad 0x3ff0120fbe9bc2ba + .quad 0x3ff0137442db4e38 + .quad 0x3ff014d8e5fffada + .quad 0x3ff0163da80c7604 + .quad 0x3ff017a289036d56 + .quad 0x3ff0190788e78eab + .quad 0x3ff01a6ca7bb8818 + .quad 0x3ff01bd1e58207ef + .quad 0x3ff01d37423dbcbc + .quad 0x3ff01e9cbdf15549 + .quad 0x3ff02002589f8099 + .quad 0x3ff02168124aedec + .quad 0x3ff022cdeaf64cbc + .quad 0x3ff02433e2a44cc1 + .quad 0x3ff02599f9579ded + .quad 0x3ff027002f12f06d + .quad 0x3ff0286683d8f4ac + .quad 0x3ff029ccf7ac5b4d + .quad 0x3ff02b338a8fd532 + .quad 0x3ff02c9a3c861379 + .quad 0x3ff02e010d91c778 + .quad 0x3ff02f67fdb5a2c4 + .quad 0x3ff030cf0cf4572d + .quad 0x3ff032363b5096bc + .quad 0x3ff0339d88cd13bc + .quad 0x3ff03504f56c80ae + .quad 0x3ff0366c81319053 + .quad 0x3ff037d42c1ef5a2 + .quad 0x3ff0393bf63763d5 + .quad 0x3ff03aa3df7d8e5f + .quad 0x3ff03c0be7f428eb + .quad 0x3ff03d740f9de766 + .quad 0x3ff03edc567d7df7 + .quad 0x3ff04044bc95a0fe + .quad 0x3ff041ad41e9051d + .quad 0x3ff04315e67a5f2a + .quad 0x3ff0447eaa4c643e + .quad 0x3ff045e78d61c9ac + .quad 0x3ff047508fbd4502 + .quad 0x3ff048b9b1618c0b + .quad 0x3ff04a22f25154cd + .quad 0x3ff04b8c528f558b + .quad 0x3ff04cf5d21e44c4 + .quad 0x3ff04e5f7100d935 + .quad 0x3ff04fc92f39c9d4 + .quad 0x3ff051330ccbcdd5 + .quad 0x3ff0529d09b99ca8 + .quad 0x3ff054072605edfb + .quad 0x3ff0557161b379b3 + .quad 0x3ff056dbbcc4f7f8 + .quad 0x3ff05846373d212a + .quad 0x3ff059b0d11eade5 + .quad 0x3ff05b1b8a6c5706 + .quad 0x3ff05c866328d5a2 + .quad 0x3ff05df15b56e30a + .quad 0x3ff05f5c72f938cf + .quad 0x3ff060c7aa1290bd + .quad 0x3ff0623300a5a4db + .quad 0x3ff0639e76b52f6e + .quad 0x3ff0650a0c43eaf6 + .quad 0x3ff06675c1549232 + .quad 0x3ff067e195e9e01a + .quad 0x3ff0694d8a068fe7 + .quad 0x3ff06ab99dad5d0c + .quad 0x3ff06c25d0e10338 + .quad 0x3ff06d9223a43e58 + .quad 0x3ff06efe95f9ca95 + .quad 0x3ff0706b27e46455 + .quad 0x3ff071d7d966c83a + .quad 0x3ff07344aa83b324 + .quad 0x3ff074b19b3de22f + .quad 0x3ff0761eab9812b4 + .quad 
0x3ff0778bdb950247 + .quad 0x3ff078f92b376ebc + .quad 0x3ff07a669a821621 + .quad 0x3ff07bd42977b6c4 + .quad 0x3ff07d41d81b0f2b + .quad 0x3ff07eafa66ede1e + .quad 0x3ff0801d9475e2a0 + .quad 0x3ff0818ba232dbee + .quad 0x3ff082f9cfa88985 + .quad 0x3ff084681cd9ab21 + .quad 0x3ff085d689c900b6 + .quad 0x3ff0874516794a79 + .quad 0x3ff088b3c2ed48d9 + .quad 0x3ff08a228f27bc86 + .quad 0x3ff08b917b2b6667 + .quad 0x3ff08d0086fb07a6 + .quad 0x3ff08e6fb29961a8 + .quad 0x3ff08fdefe09360d + .quad 0x3ff0914e694d46b6 + .quad 0x3ff092bdf46855c0 + .quad 0x3ff0942d9f5d2582 + .quad 0x3ff0959d6a2e7893 + .quad 0x3ff0970d54df11c8 + .quad 0x3ff0987d5f71b432 + .quad 0x3ff099ed89e9231e + .quad 0x3ff09b5dd448221a + .quad 0x3ff09cce3e9174ec + .quad 0x3ff09e3ec8c7df9d + .quad 0x3ff09faf72ee2670 + .quad 0x3ff0a1203d070de5 + .quad 0x3ff0a29127155abd + .quad 0x3ff0a402311bd1f0 + .quad 0x3ff0a5735b1d38bb + .quad 0x3ff0a6e4a51c5493 + .quad 0x3ff0a8560f1beb2c + .quad 0x3ff0a9c7991ec278 + .quad 0x3ff0ab394327a0a7 + .quad 0x3ff0acab0d394c25 + .quad 0x3ff0ae1cf7568b9d + .quad 0x3ff0af8f018225f7 + .quad 0x3ff0b1012bbee259 + .quad 0x3ff0b273760f8825 + .quad 0x3ff0b3e5e076defc + .quad 0x3ff0b5586af7aebc + .quad 0x3ff0b6cb1594bf84 + .quad 0x3ff0b83de050d9ab + .quad 0x3ff0b9b0cb2ec5ca + .quad 0x3ff0bb23d6314cb7 + .quad 0x3ff0bc97015b3783 + .quad 0x3ff0be0a4caf4f81 + .quad 0x3ff0bf7db8305e3f + .quad 0x3ff0c0f143e12d8a + .quad 0x3ff0c264efc4876c + .quad 0x3ff0c3d8bbdd362e + .quad 0x3ff0c54ca82e0455 + .quad 0x3ff0c6c0b4b9bca6 + .quad 0x3ff0c834e1832a24 + .quad 0x3ff0c9a92e8d180e + .quad 0x3ff0cb1d9bda51e1 + .quad 0x3ff0cc92296da35b + .quad 0x3ff0ce06d749d876 + .quad 0x3ff0cf7ba571bd6a + .quad 0x3ff0d0f093e81eab + .quad 0x3ff0d265a2afc8f1 + .quad 0x3ff0d3dad1cb892b + .quad 0x3ff0d550213e2c8c + .quad 0x3ff0d6c5910a8081 + .quad 0x3ff0d83b213352b8 + .quad 0x3ff0d9b0d1bb711b + .quad 0x3ff0db26a2a5a9d4 + .quad 0x3ff0dc9c93f4cb4a + .quad 0x3ff0de12a5aba423 + .quad 0x3ff0df88d7cd0344 + .quad 0x3ff0e0ff2a5bb7cd + .quad 0x3ff0e2759d5a9121 + .quad 0x3ff0e3ec30cc5edd + .quad 0x3ff0e562e4b3f0df + .quad 0x3ff0e6d9b9141745 + .quad 0x3ff0e850adefa265 + .quad 0x3ff0e9c7c34962db + .quad 0x3ff0eb3ef924297d + .quad 0x3ff0ecb64f82c75e + .quad 0x3ff0ee2dc6680dd6 + .quad 0x3ff0efa55dd6ce75 + .quad 0x3ff0f11d15d1db0c + .quad 0x3ff0f294ee5c05ab + .quad 0x3ff0f40ce77820a2 + .quad 0x3ff0f5850128fe7a + .quad 0x3ff0f6fd3b717200 + .quad 0x3ff0f87596544e3f + .quad 0x3ff0f9ee11d4667f + .quad 0x3ff0fb66adf48e46 + .quad 0x3ff0fcdf6ab7995c + .quad 0x3ff0fe5848205bc4 + .quad 0x3ff0ffd14631a9c2 + .quad 0x3ff1014a64ee57d9 + .quad 0x3ff102c3a4593ac9 + .quad 0x3ff1043d04752792 + .quad 0x3ff105b68544f373 + .quad 0x3ff1073026cb73e9 + .quad 0x3ff108a9e90b7eb2 + .quad 0x3ff10a23cc07e9c6 + .quad 0x3ff10b9dcfc38b63 + .quad 0x3ff10d17f44139fe + .quad 0x3ff10e923983cc53 + .quad 0x3ff1100c9f8e1955 + .quad 0x3ff111872662f83e + .quad 0x3ff11301ce054081 + .quad 0x3ff1147c9677c9d2 + .quad 0x3ff115f77fbd6c23 + .quad 0x3ff1177289d8ffa9 + .quad 0x3ff118edb4cd5cd4 + .quad 0x3ff11a69009d5c54 + .quad 0x3ff11be46d4bd71a + .quad 0x3ff11d5ffadba653 + .quad 0x3ff11edba94fa36e + .quad 0x3ff1205778aaa817 + .quad 0x3ff121d368ef8e3b + .quad 0x3ff1234f7a213008 + .quad 0x3ff124cbac4267e5 + .quad 0x3ff12647ff56107f + .quad 0x3ff127c4735f04bd + .quad 0x3ff1294108601fcb + .quad 0x3ff12abdbe5c3d0f + .quad 0x3ff12c3a95563832 + .quad 0x3ff12db78d50ed19 + .quad 0x3ff12f34a64f37ed + .quad 0x3ff130b1e053f513 + .quad 0x3ff1322f3b62012e + .quad 0x3ff133acb77c3927 + .quad 0x3ff1352a54a57a1f + .quad 
0x3ff136a812e0a17c + .quad 0x3ff13825f2308ce0 + .quad 0x3ff139a3f2981a2e + .quad 0x3ff13b22141a278a + .quad 0x3ff13ca056b99356 + .quad 0x3ff13e1eba793c33 + .quad 0x3ff13f9d3f5c0103 + .quad 0x3ff1411be564c0e7 + .quad 0x3ff1429aac965b40 + .quad 0x3ff1441994f3afae + .quad 0x3ff145989e7f9e13 + .quad 0x3ff14717c93d068e + .quad 0x3ff14897152ec980 + .quad 0x3ff14a168257c787 + .quad 0x3ff14b9610bae185 + .quad 0x3ff14d15c05af897 + .quad 0x3ff14e95913aee1e + .quad 0x3ff15015835da3b8 + .quad 0x3ff1519596c5fb46 + .quad 0x3ff15315cb76d6e5 + .quad 0x3ff15496217318f6 + .quad 0x3ff1561698bda417 + .quad 0x3ff1579731595b27 + .quad 0x3ff15917eb492145 + .quad 0x3ff15a98c68fd9d1 + .quad 0x3ff15c19c330686b + .quad 0x3ff15d9ae12db0ef + .quad 0x3ff15f1c208a977f + .quad 0x3ff1609d814a007b + .quad 0x3ff1621f036ed081 + .quad 0x3ff163a0a6fbec71 + .quad 0x3ff165226bf4396d + .quad 0x3ff166a4525a9cd3 + .quad 0x3ff168265a31fc44 + .quad 0x3ff169a8837d3da3 + .quad 0x3ff16b2ace3f4710 + .quad 0x3ff16cad3a7afeeb + .quad 0x3ff16e2fc8334bd8 + .quad 0x3ff16fb2776b14b8 + .quad 0x3ff17135482540ad + .quad 0x3ff172b83a64b71a + .quad 0x3ff1743b4e2c5fa4 + .quad 0x3ff175be837f222d + .quad 0x3ff17741da5fe6da + .quad 0x3ff178c552d1960f + .quad 0x3ff17a48ecd71873 + .quad 0x3ff17bcca87356e9 + .quad 0x3ff17d5085a93a9b + .quad 0x3ff17ed4847bacec + .quad 0x3ff18058a4ed9787 + .quad 0x3ff181dce701e451 + .quad 0x3ff183614abb7d75 + .quad 0x3ff184e5d01d4d5b + .quad 0x3ff1866a772a3ead + .quad 0x3ff187ef3fe53c58 + .quad 0x3ff189742a513185 + .quad 0x3ff18af9367109a1 + .quad 0x3ff18c7e6447b059 + .quad 0x3ff18e03b3d8119c + .quad 0x3ff18f8925251997 + .quad 0x3ff1910eb831b4ba + .quad 0x3ff192946d00cfb6 + .quad 0x3ff1941a4395577c + .quad 0x3ff195a03bf2393e + .quad 0x3ff19726561a626d + .quad 0x3ff198ac9210c0c1 + .quad 0x3ff19a32efd8422c + .quad 0x3ff19bb96f73d4e5 + .quad 0x3ff19d4010e66763 + .quad 0x3ff19ec6d432e85c + .quad 0x3ff1a04db95c46cc + .quad 0x3ff1a1d4c06571ed + .quad 0x3ff1a35be9515937 + .quad 0x3ff1a4e33422ec69 + .quad 0x3ff1a66aa0dd1b81 + .quad 0x3ff1a7f22f82d6bc + .quad 0x3ff1a979e0170e9a + .quad 0x3ff1ab01b29cb3dd + .quad 0x3ff1ac89a716b786 + .quad 0x3ff1ae11bd880ada + .quad 0x3ff1af99f5f39f5d + .quad 0x3ff1b122505c66d5 + .quad 0x3ff1b2aaccc5534b + .quad 0x3ff1b4336b315705 + .quad 0x3ff1b5bc2ba3648e + .quad 0x3ff1b7450e1e6eb3 + .quad 0x3ff1b8ce12a56880 + .quad 0x3ff1ba57393b4544 + .quad 0x3ff1bbe081e2f88e + .quad 0x3ff1bd69ec9f762f + .quad 0x3ff1bef37973b23b + .quad 0x3ff1c07d2862a105 + .quad 0x3ff1c206f96f3724 + .quad 0x3ff1c390ec9c696f + .quad 0x3ff1c51b01ed2cfe + .quad 0x3ff1c6a53964772e + .quad 0x3ff1c82f93053d99 + .quad 0x3ff1c9ba0ed2761e + .quad 0x3ff1cb44accf16dc + .quad 0x3ff1cccf6cfe1634 + .quad 0x3ff1ce5a4f626acb + .quad 0x3ff1cfe553ff0b83 + .quad 0x3ff1d1707ad6ef85 + .quad 0x3ff1d2fbc3ed0e37 + .quad 0x3ff1d4872f445f44 + .quad 0x3ff1d612bcdfda99 + .quad 0x3ff1d79e6cc27863 + .quad 0x3ff1d92a3eef3111 + .quad 0x3ff1dab63368fd56 + .quad 0x3ff1dc424a32d624 + .quad 0x3ff1ddce834fb4b0 + .quad 0x3ff1df5adec29273 + .quad 0x3ff1e0e75c8e6927 + .quad 0x3ff1e273fcb632c5 + .quad 0x3ff1e400bf3ce98b + .quad 0x3ff1e58da42587fa + .quad 0x3ff1e71aab7308d1 + .quad 0x3ff1e8a7d5286717 + .quad 0x3ff1ea3521489e0e + .quad 0x3ff1ebc28fd6a942 + .quad 0x3ff1ed5020d5847a + .quad 0x3ff1eeddd4482bc3 + .quad 0x3ff1f06baa319b6b + .quad 0x3ff1f1f9a294d004 + .quad 0x3ff1f387bd74c660 + .quad 0x3ff1f515fad47b95 + .quad 0x3ff1f6a45ab6ecfa + .quad 0x3ff1f832dd1f1829 + .quad 0x3ff1f9c1820ffafe + .quad 0x3ff1fb50498c9397 + .quad 0x3ff1fcdf3397e057 + .quad 
0x3ff1fe6e4034dfdf + .quad 0x3ff1fffd6f669115 + .quad 0x3ff2018cc12ff324 + .quad 0x3ff2031c35940574 + .quad 0x3ff204abcc95c7b4 + .quad 0x3ff2063b863839d2 + .quad 0x3ff207cb627e5c01 + .quad 0x3ff2095b616b2eb7 + .quad 0x3ff20aeb8301b2aa + .quad 0x3ff20c7bc744e8d5 + .quad 0x3ff20e0c2e37d274 + .quad 0x3ff20f9cb7dd7108 + .quad 0x3ff2112d6438c651 + .quad 0x3ff212be334cd455 + .quad 0x3ff2144f251c9d5b + .quad 0x3ff215e039ab23ee + .quad 0x3ff2177170fb6adb + .quad 0x3ff21902cb107531 + .quad 0x3ff21a9447ed4643 + .quad 0x3ff21c25e794e1a7 + .quad 0x3ff21db7aa0a4b34 + .quad 0x3ff21f498f508707 + .quad 0x3ff220db976a997e + .quad 0x3ff2226dc25b8739 + .quad 0x3ff224001026551c + .quad 0x3ff2259280ce084e + .quad 0x3ff227251455a63b + .quad 0x3ff228b7cac0348e + .quad 0x3ff22a4aa410b938 + .quad 0x3ff22bdda04a3a6b + .quad 0x3ff22d70bf6fbea0 + .quad 0x3ff22f0401844c8d + .quad 0x3ff23097668aeb31 + .quad 0x3ff2322aee86a1ca + .quad 0x3ff233be997a77db + .quad 0x3ff235526769752b + .quad 0x3ff236e65856a1c4 + .quad 0x3ff2387a6c4505ef + .quad 0x3ff23a0ea337aa40 + .quad 0x3ff23ba2fd319789 + .quad 0x3ff23d377a35d6de + .quad 0x3ff23ecc1a47719b + .quad 0x3ff24060dd69715c + .quad 0x3ff241f5c39ee002 + .quad 0x3ff2438acceac7b2 + .quad 0x3ff2451ff95032d1 + .quad 0x3ff246b548d22c0c + .quad 0x3ff2484abb73be50 + .quad 0x3ff249e05137f4cf + .quad 0x3ff24b760a21daff + .quad 0x3ff24d0be6347c98 + .quad 0x3ff24ea1e572e597 + .quad 0x3ff2503807e0223a + .quad 0x3ff251ce4d7f3f08 + .quad 0x3ff25364b65348c6 + .quad 0x3ff254fb425f4c80 + .quad 0x3ff25691f1a65784 + .quad 0x3ff25828c42b7763 + .quad 0x3ff259bfb9f1b9f7 + .quad 0x3ff25b56d2fc2d55 + .quad 0x3ff25cee0f4ddfdd + .quad 0x3ff25e856ee9e031 + .quad 0x3ff2601cf1d33d35 + .quad 0x3ff261b4980d0613 + .quad 0x3ff2634c619a4a36 + .quad 0x3ff264e44e7e1952 + .quad 0x3ff2667c5ebb8358 + .quad 0x3ff2681492559883 + .quad 0x3ff269ace94f694f + .quad 0x3ff26b4563ac067d + .quad 0x3ff26cde016e8111 + .quad 0x3ff26e76c299ea53 + .quad 0x3ff2700fa73153d1 + .quad 0x3ff271a8af37cf5b + .quad 0x3ff27341dab06f07 + .quad 0x3ff274db299e452c + .quad 0x3ff276749c046468 + .quad 0x3ff2780e31e5df9c + .quad 0x3ff279a7eb45c9ef + .quad 0x3ff27b41c82736c8 + .quad 0x3ff27cdbc88d39d8 + .quad 0x3ff27e75ec7ae70f + .quad 0x3ff2801033f352a4 + .quad 0x3ff281aa9ef99111 + .quad 0x3ff283452d90b716 + .quad 0x3ff284dfdfbbd9b7 + .quad 0x3ff2867ab57e0e3a + .quad 0x3ff28815aeda6a2d + .quad 0x3ff289b0cbd4035f + .quad 0x3ff28b4c0c6defe6 + .quad 0x3ff28ce770ab461b + .quad 0x3ff28e82f88f1c9e + .quad 0x3ff2901ea41c8a50 + .quad 0x3ff291ba7356a657 + .quad 0x3ff2935666408820 + .quad 0x3ff294f27cdd475a + .quad 0x3ff2968eb72ffbfc + .quad 0x3ff2982b153bbe3d + .quad 0x3ff299c79703a69e + .quad 0x3ff29b643c8acdde + .quad 0x3ff29d0105d44d08 + .quad 0x3ff29e9df2e33d66 + .quad 0x3ff2a03b03bab88b + .quad 0x3ff2a1d8385dd84a + .quad 0x3ff2a37590cfb6bf + .quad 0x3ff2a5130d136e49 + .quad 0x3ff2a6b0ad2c198d + .quad 0x3ff2a84e711cd374 + .quad 0x3ff2a9ec58e8b729 + .quad 0x3ff2ab8a6492e024 + .quad 0x3ff2ad28941e6a18 + .quad 0x3ff2aec6e78e7104 + .quad 0x3ff2b0655ee61129 + .quad 0x3ff2b203fa28670e + .quad 0x3ff2b3a2b9588f7f + .quad 0x3ff2b5419c79a78c + .quad 0x3ff2b6e0a38ecc8b + .quad 0x3ff2b87fce9b1c18 + .quad 0x3ff2ba1f1da1b412 + .quad 0x3ff2bbbe90a5b29e + .quad 0x3ff2bd5e27aa3627 + .quad 0x3ff2befde2b25d5c + .quad 0x3ff2c09dc1c14733 + .quad 0x3ff2c23dc4da12e4 + .quad 0x3ff2c3ddebffdff0 + .quad 0x3ff2c57e3735ce1a + .quad 0x3ff2c71ea67efd6d + .quad 0x3ff2c8bf39de8e38 + .quad 0x3ff2ca5ff157a10f + .quad 0x3ff2cc00cced56cd + .quad 0x3ff2cda1cca2d08f + .quad 
0x3ff2cf42f07b2fbd + .quad 0x3ff2d0e4387995fe + .quad 0x3ff2d285a4a12544 + .quad 0x3ff2d42734f4ffc3 + .quad 0x3ff2d5c8e97847f6 + .quad 0x3ff2d76ac22e209d + .quad 0x3ff2d90cbf19acbd + .quad 0x3ff2daaee03e0fa3 + .quad 0x3ff2dc51259e6ce0 + .quad 0x3ff2ddf38f3de848 + .quad 0x3ff2df961d1fa5f9 + .quad 0x3ff2e138cf46ca57 + .quad 0x3ff2e2dba5b67a06 + .quad 0x3ff2e47ea071d9f4 + .quad 0x3ff2e621bf7c0f57 + .quad 0x3ff2e7c502d83fa4 + .quad 0x3ff2e9686a89909e + .quad 0x3ff2eb0bf6932845 + .quad 0x3ff2ecafa6f82ce7 + .quad 0x3ff2ee537bbbc512 + .quad 0x3ff2eff774e1179d + .quad 0x3ff2f19b926b4ba5 + .quad 0x3ff2f33fd45d888a + .quad 0x3ff2f4e43abaf5f5 + .quad 0x3ff2f688c586bbd5 + .quad 0x3ff2f82d74c4025c + .quad 0x3ff2f9d24875f205 + .quad 0x3ff2fb77409fb390 + .quad 0x3ff2fd1c5d447003 + .quad 0x3ff2fec19e6750ab + .quad 0x3ff30067040b7f19 + .quad 0x3ff3020c8e342527 + .quad 0x3ff303b23ce46cf4 + .quad 0x3ff30558101f80e3 + .quad 0x3ff306fe07e88ba0 + .quad 0x3ff308a42442b81d + .quad 0x3ff30a4a65313191 + .quad 0x3ff30bf0cab7237a + .quad 0x3ff30d9754d7b99d + .quad 0x3ff30f3e03962005 + .quad 0x3ff310e4d6f58302 + .quad 0x3ff3128bcef90f2b + .quad 0x3ff31432eba3f15f + .quad 0x3ff315da2cf956c2 + .quad 0x3ff3178192fc6cbf + .quad 0x3ff319291db06106 + .quad 0x3ff31ad0cd186190 + .quad 0x3ff31c78a1379c9b + .quad 0x3ff31e209a1140ab + .quad 0x3ff31fc8b7a87c8c + .quad 0x3ff32170fa007f51 + .quad 0x3ff32319611c7851 + .quad 0x3ff324c1ecff972d + .quad 0x3ff3266a9dad0bca + .quad 0x3ff3281373280654 + .quad 0x3ff329bc6d73b741 + .quad 0x3ff32b658c934f48 + .quad 0x3ff32d0ed089ff6c + .quad 0x3ff32eb8395af8f4 + .quad 0x3ff33061c7096d6f + .quad 0x3ff3320b79988eb2 + .quad 0x3ff333b5510b8eda + .quad 0x3ff3355f4d65a04a + .quad 0x3ff337096ea9f5ab + .quad 0x3ff338b3b4dbc1f0 + .quad 0x3ff33a5e1ffe384f + .quad 0x3ff33c08b0148c49 + .quad 0x3ff33db36521f1a3 + .quad 0x3ff33f5e3f299c69 + .quad 0x3ff341093e2ec0f2 + .quad 0x3ff342b4623493d7 + .quad 0x3ff3445fab3e49fa + .quad 0x3ff3460b194f1887 + .quad 0x3ff347b6ac6a34ec + .quad 0x3ff349626492d4e2 + .quad 0x3ff34b0e41cc2e67 + .quad 0x3ff34cba441977c4 + .quad 0x3ff34e666b7de784 + .quad 0x3ff35012b7fcb47d + .quad 0x3ff351bf299915c9 + .quad 0x3ff3536bc05642cf + .quad 0x3ff355187c377337 + .quad 0x3ff356c55d3fdef4 + .quad 0x3ff358726372be40 + .quad 0x3ff35a1f8ed3499b + .quad 0x3ff35bccdf64b9cf + .quad 0x3ff35d7a552a47ea + .quad 0x3ff35f27f0272d45 + .quad 0x3ff360d5b05ea37f + .quad 0x3ff3628395d3e47d + .quad 0x3ff36431a08a2a6f + .quad 0x3ff365dfd084afc9 + .quad 0x3ff3678e25c6af48 + .quad 0x3ff3693ca05363f3 + .quad 0x3ff36aeb402e0915 + .quad 0x3ff36c9a0559da43 + .quad 0x3ff36e48efda1358 + .quad 0x3ff36ff7ffb1f078 + .quad 0x3ff371a734e4ae11 + .quad 0x3ff373568f7588d3 + .quad 0x3ff375060f67bdb9 + .quad 0x3ff376b5b4be8a0a + .quad 0x3ff378657f7d2b4c + .quad 0x3ff37a156fa6df54 + .quad 0x3ff37bc5853ee43d + .quad 0x3ff37d75c0487869 + .quad 0x3ff37f2620c6da82 + .quad 0x3ff380d6a6bd497c + .quad 0x3ff38287522f048e + .quad 0x3ff38438231f4b3f + .quad 0x3ff385e919915d57 + .quad 0x3ff3879a35887ae9 + .quad 0x3ff3894b7707e450 + .quad 0x3ff38afcde12da2f + .quad 0x3ff38cae6aac9d71 + .quad 0x3ff38e601cd86f48 + .quad 0x3ff39011f4999132 + .quad 0x3ff391c3f1f344f1 + .quad 0x3ff3937614e8cc90 + .quad 0x3ff395285d7d6a65 + .quad 0x3ff396dacbb4610c + .quad 0x3ff3988d5f90f36a + .quad 0x3ff39a40191664ac + .quad 0x3ff39bf2f847f847 + .quad 0x3ff39da5fd28f1f8 + .quad 0x3ff39f5927bc95c8 + .quad 0x3ff3a10c78062804 + .quad 0x3ff3a2bfee08ed45 + .quad 0x3ff3a47389c82a68 + .quad 0x3ff3a6274b472498 + .quad 0x3ff3a7db32892144 + .quad 
0x3ff3a98f3f916626 + .quad 0x3ff3ab4372633941 + .quad 0x3ff3acf7cb01e0de + .quad 0x3ff3aeac4970a392 + .quad 0x3ff3b060edb2c837 + .quad 0x3ff3b215b7cb95f3 + .quad 0x3ff3b3caa7be5434 + .quad 0x3ff3b57fbd8e4aae + .quad 0x3ff3b734f93ec163 + .quad 0x3ff3b8ea5ad30097 + .quad 0x3ff3ba9fe24e50dd + .quad 0x3ff3bc558fb3fb0d + .quad 0x3ff3be0b6307484a + .quad 0x3ff3bfc15c4b81ff + .quad 0x3ff3c1777b83f1e0 + .quad 0x3ff3c32dc0b3e1ea + .quad 0x3ff3c4e42bde9c62 + .quad 0x3ff3c69abd076bd8 + .quad 0x3ff3c85174319b24 + .quad 0x3ff3ca0851607565 + .quad 0x3ff3cbbf54974607 + .quad 0x3ff3cd767dd958bd + .quad 0x3ff3cf2dcd29f984 + .quad 0x3ff3d0e5428c749e + .quad 0x3ff3d29cde04169e + .quad 0x3ff3d4549f942c57 + .quad 0x3ff3d60c874002ee + .quad 0x3ff3d7c4950ae7cb + .quad 0x3ff3d97cc8f828a2 + .quad 0x3ff3db35230b136f + .quad 0x3ff3dceda346f679 + .quad 0x3ff3dea649af204f + .quad 0x3ff3e05f1646dfca + .quad 0x3ff3e2180911840c + .quad 0x3ff3e3d122125c7f + .quad 0x3ff3e58a614cb8db + .quad 0x3ff3e743c6c3e91b + .quad 0x3ff3e8fd527b3d8a + .quad 0x3ff3eab7047606b7 + .quad 0x3ff3ec70dcb7957e + .quad 0x3ff3ee2adb433b04 + .quad 0x3ff3efe5001c48b5 + .quad 0x3ff3f19f4b46104c + .quad 0x3ff3f359bcc3e3c8 + .quad 0x3ff3f51454991573 + .quad 0x3ff3f6cf12c8f7e5 + .quad 0x3ff3f889f756ddfa + .quad 0x3ff3fa4502461adb + .quad 0x3ff3fc00339a01f9 + .quad 0x3ff3fdbb8b55e710 + .quad 0x3ff3ff77097d1e26 + .quad 0x3ff40132ae12fb8a + .quad 0x3ff402ee791ad3d5 + .quad 0x3ff404aa6a97fbea + .quad 0x3ff40666828dc8f6 + .quad 0x3ff40822c0ff9071 + .quad 0x3ff409df25f0a81b + .quad 0x3ff40b9bb16465fe + .quad 0x3ff40d58635e2070 + .quad 0x3ff40f153be12e0f + .quad 0x3ff410d23af0e5c5 + .quad 0x3ff4128f60909ec4 + .quad 0x3ff4144cacc3b08a + .quad 0x3ff4160a1f8d72dd + .quad 0x3ff417c7b8f13dd0 + .quad 0x3ff4198578f269be + .quad 0x3ff41b435f944f4c + .quad 0x3ff41d016cda476c + .quad 0x3ff41ebfa0c7ab57 + .quad 0x3ff4207dfb5fd491 + .quad 0x3ff4223c7ca61cea + .quad 0x3ff423fb249dde7b + .quad 0x3ff425b9f34a73a7 + .quad 0x3ff42778e8af371d + .quad 0x3ff4293804cf83d5 + .quad 0x3ff42af747aeb511 + .quad 0x3ff42cb6b1502661 + .quad 0x3ff42e7641b73399 + .quad 0x3ff43035f8e738de + .quad 0x3ff431f5d6e3929c + .quad 0x3ff433b5dbaf9d8b + .quad 0x3ff43576074eb6ac + .quad 0x3ff4373659c43b4c + .quad 0x3ff438f6d3138902 + .quad 0x3ff43ab7733ffdb1 + .quad 0x3ff43c783a4cf784 + .quad 0x3ff43e39283dd4f5 + .quad 0x3ff43ffa3d15f4c3 + .quad 0x3ff441bb78d8b5fc + .quad 0x3ff4437cdb8977f9 + .quad 0x3ff4453e652b9a59 + .quad 0x3ff4470015c27d0e + .quad 0x3ff448c1ed51804b + .quad 0x3ff44a83ebdc0497 + .quad 0x3ff44c4611656abf + .quad 0x3ff44e085df113da + .quad 0x3ff44fcad182614e + .quad 0x3ff4518d6c1cb4c9 + .quad 0x3ff453502dc37045 + .quad 0x3ff455131679f608 + .quad 0x3ff456d62643a8a0 + .quad 0x3ff458995d23eae9 + .quad 0x3ff45a5cbb1e2009 + .quad 0x3ff45c204035ab72 + .quad 0x3ff45de3ec6df0df + .quad 0x3ff45fa7bfca5459 + .quad 0x3ff4616bba4e3a34 + .quad 0x3ff4632fdbfd070c + .quad 0x3ff464f424da1fcc + .quad 0x3ff466b894e8e9a6 + .quad 0x3ff4687d2c2cca1e + .quad 0x3ff46a41eaa926fc + .quad 0x3ff46c06d061665a + .quad 0x3ff46dcbdd58ee98 + .quad 0x3ff46f9111932664 + .quad 0x3ff471566d1374b7 + .quad 0x3ff4731befdd40d6 + .quad 0x3ff474e199f3f251 + .quad 0x3ff476a76b5af103 + .quad 0x3ff4786d6415a514 + .quad 0x3ff47a33842776f6 + .quad 0x3ff47bf9cb93cf67 + .quad 0x3ff47dc03a5e1772 + .quad 0x3ff47f86d089b86d + .quad 0x3ff4814d8e1a1bf8 + .quad 0x3ff483147312ac00 + .quad 0x3ff484db7f76d2be + .quad 0x3ff486a2b349fab7 + .quad 0x3ff4886a0e8f8ebb + .quad 0x3ff48a31914af9e6 + .quad 0x3ff48bf93b7fa79f + .quad 
0x3ff48dc10d31039b + .quad 0x3ff48f89066279da + .quad 0x3ff49151271776a6 + .quad 0x3ff493196f536698 + .quad 0x3ff494e1df19b693 + .quad 0x3ff496aa766dd3c6 + .quad 0x3ff4987335532bad + .quad 0x3ff49a3c1bcd2c0f + .quad 0x3ff49c0529df4300 + .quad 0x3ff49dce5f8cdee0 + .quad 0x3ff49f97bcd96e5a + .quad 0x3ff4a16141c86066 + .quad 0x3ff4a32aee5d2449 + .quad 0x3ff4a4f4c29b2993 + .quad 0x3ff4a6bebe85e020 + .quad 0x3ff4a888e220b819 + .quad 0x3ff4aa532d6f21f4 + .quad 0x3ff4ac1da0748e6f + .quad 0x3ff4ade83b346e9c + .quad 0x3ff4afb2fdb233d4 + .quad 0x3ff4b17de7f14fbb + .quad 0x3ff4b348f9f53446 + .quad 0x3ff4b51433c153b3 + .quad 0x3ff4b6df9559208f + .quad 0x3ff4b8ab1ec00db1 + .quad 0x3ff4ba76cff98e3e + .quad 0x3ff4bc42a90915a7 + .quad 0x3ff4be0ea9f217aa + .quad 0x3ff4bfdad2b80852 + .quad 0x3ff4c1a7235e5bf6 + .quad 0x3ff4c3739be88737 + .quad 0x3ff4c5403c59ff09 + .quad 0x3ff4c70d04b638a6 + .quad 0x3ff4c8d9f500a999 + .quad 0x3ff4caa70d3cc7b9 + .quad 0x3ff4cc744d6e0926 + .quad 0x3ff4ce41b597e454 + .quad 0x3ff4d00f45bdcffe + .quad 0x3ff4d1dcfde3432d + .quad 0x3ff4d3aade0bb539 + .quad 0x3ff4d578e63a9dc2 + .quad 0x3ff4d747167374bd + .quad 0x3ff4d9156eb9b264 + .quad 0x3ff4dae3ef10cf42 + .quad 0x3ff4dcb2977c442f + .quad 0x3ff4de8167ff8a4e + .quad 0x3ff4e050609e1b11 + .quad 0x3ff4e21f815b7036 + .quad 0x3ff4e3eeca3b03c9 + .quad 0x3ff4e5be3b405022 + .quad 0x3ff4e78dd46ecfe6 + .quad 0x3ff4e95d95c9fe0b + .quad 0x3ff4eb2d7f5555ce + .quad 0x3ff4ecfd911452bd + .quad 0x3ff4eecdcb0a70b3 + .quad 0x3ff4f09e2d3b2bd8 + .quad 0x3ff4f26eb7aa00a1 + .quad 0x3ff4f43f6a5a6bd2 + .quad 0x3ff4f610454fea79 + .quad 0x3ff4f7e1488df9f4 + .quad 0x3ff4f9b2741817ee + .quad 0x3ff4fb83c7f1c25e + .quad 0x3ff4fd55441e778b + .quad 0x3ff4ff26e8a1b608 + .quad 0x3ff500f8b57efcb6 + .quad 0x3ff502caaab9cac1 + .quad 0x3ff5049cc8559fa7 + .quad 0x3ff5066f0e55fb31 + .quad 0x3ff508417cbe5d77 + .quad 0x3ff50a14139246db + .quad 0x3ff50be6d2d53812 + .quad 0x3ff50db9ba8ab21a + .quad 0x3ff50f8ccab63642 + .quad 0x3ff51160035b4625 + .quad 0x3ff51333647d63ad + .quad 0x3ff51506ee201112 + .quad 0x3ff516daa046d0d6 + .quad 0x3ff518ae7af525ce + .quad 0x3ff51a827e2e931a + .quad 0x3ff51c56a9f69c2a + .quad 0x3ff51e2afe50c4b9 + .quad 0x3ff51fff7b4090d2 + .quad 0x3ff521d420c984ce + .quad 0x3ff523a8eeef2553 + .quad 0x3ff5257de5b4f757 + .quad 0x3ff52753051e801a + .quad 0x3ff529284d2f4530 + .quad 0x3ff52afdbdeacc76 + .quad 0x3ff52cd357549c19 + .quad 0x3ff52ea919703a95 + .quad 0x3ff5307f04412eb4 + .quad 0x3ff5325517caff8d + .quad 0x3ff5342b54113485 + .quad 0x3ff53601b9175551 + .quad 0x3ff537d846e0e9f5 + .quad 0x3ff539aefd717ac0 + .quad 0x3ff53b85dccc9053 + .quad 0x3ff53d5ce4f5b39a + .quad 0x3ff53f3415f06dd2 + .quad 0x3ff5410b6fc04885 + .quad 0x3ff542e2f268cd8c + .quad 0x3ff544ba9ded870f + .quad 0x3ff546927251ff84 + .quad 0x3ff5486a6f99c1b1 + .quad 0x3ff54a4295c858a6 + .quad 0x3ff54c1ae4e14fc7 + .quad 0x3ff54df35ce832c3 + .quad 0x3ff54fcbfde08d9b + .quad 0x3ff551a4c7cdec9c + .quad 0x3ff5537dbab3dc60 + .quad 0x3ff55556d695e9d6 + .quad 0x3ff557301b77a236 + .quad 0x3ff55909895c9309 + .quad 0x3ff55ae320484a28 + .quad 0x3ff55cbce03e55b7 + .quad 0x3ff55e96c942442b + .quad 0x3ff56070db57a44b + .quad 0x3ff5624b16820529 + .quad 0x3ff564257ac4f625 + .quad 0x3ff56600082406f4 + .quad 0x3ff567dabea2c792 + .quad 0x3ff569b59e44c851 + .quad 0x3ff56b90a70d99ce + .quad 0x3ff56d6bd900ccf5 + .quad 0x3ff56f473421f304 + .quad 0x3ff57122b8749d87 + .quad 0x3ff572fe65fc5e58 + .quad 0x3ff574da3cbcc7a0 + .quad 0x3ff576b63cb96bd8 + .quad 0x3ff5789265f5ddca + .quad 0x3ff57a6eb875b08c + .quad 
0x3ff57c4b343c7786 + .quad 0x3ff57e27d94dc66d + .quad 0x3ff58004a7ad3148 + .quad 0x3ff581e19f5e4c6a + .quad 0x3ff583bec064ac79 + .quad 0x3ff5859c0ac3e669 + .quad 0x3ff587797e7f8f7c + .quad 0x3ff589571b9b3d44 + .quad 0x3ff58b34e21a85a7 + .quad 0x3ff58d12d200fed2 + .quad 0x3ff58ef0eb523f4a + .quad 0x3ff590cf2e11ddde + .quad 0x3ff592ad9a4371af + .quad 0x3ff5948c2fea922e + .quad 0x3ff5966aef0ad71b + .quad 0x3ff59849d7a7d883 + .quad 0x3ff59a28e9c52ec9 + .quad 0x3ff59c0825667299 + .quad 0x3ff59de78a8f3cf4 + .quad 0x3ff59fc719432727 + .quad 0x3ff5a1a6d185cad3 + .quad 0x3ff5a386b35ac1e4 + .quad 0x3ff5a566bec5a699 + .quad 0x3ff5a746f3ca1381 + .quad 0x3ff5a927526ba378 + .quad 0x3ff5ab07daadf1af + .quad 0x3ff5ace88c9499a3 + .quad 0x3ff5aec968233721 + .quad 0x3ff5b0aa6d5d6649 + .quad 0x3ff5b28b9c46c389 + .quad 0x3ff5b46cf4e2eb9d + .quad 0x3ff5b64e77357b97 + .quad 0x3ff5b830234210d3 + .quad 0x3ff5ba11f90c4902 + .quad 0x3ff5bbf3f897c221 + .quad 0x3ff5bdd621e81a81 + .quad 0x3ff5bfb87500f0c1 + .quad 0x3ff5c19af1e5e3d2 + .quad 0x3ff5c37d989a92f2 + .quad 0x3ff5c56069229db4 + .quad 0x3ff5c7436381a3f7 + .quad 0x3ff5c92687bb45ed + .quad 0x3ff5cb09d5d3241a + .quad 0x3ff5cced4dccdf4c + .quad 0x3ff5ced0efac18aa + .quad 0x3ff5d0b4bb7471a6 + .quad 0x3ff5d298b1298c02 + .quad 0x3ff5d47cd0cf09d4 + .quad 0x3ff5d6611a688d81 + .quad 0x3ff5d8458df9b9bf + .quad 0x3ff5da2a2b863193 + .quad 0x3ff5dc0ef3119855 + .quad 0x3ff5ddf3e49f91ad + .quad 0x3ff5dfd90033c193 + .quad 0x3ff5e1be45d1cc4f + .quad 0x3ff5e3a3b57d567d + .quad 0x3ff5e5894f3a0506 + .quad 0x3ff5e76f130b7d27 + .quad 0x3ff5e95500f5646d + .quad 0x3ff5eb3b18fb60b3 + .quad 0x3ff5ed215b21182a + .quad 0x3ff5ef07c76a314e + .quad 0x3ff5f0ee5dda52f4 + .quad 0x3ff5f2d51e752439 + .quad 0x3ff5f4bc093e4c90 + .quad 0x3ff5f6a31e3973bf + .quad 0x3ff5f88a5d6a41d9 + .quad 0x3ff5fa71c6d45f43 + .quad 0x3ff5fc595a7b74b4 + .quad 0x3ff5fe4118632b33 + .quad 0x3ff60029008f2c1b + .quad 0x3ff6021113032116 + .quad 0x3ff603f94fc2b41f + .quad 0x3ff605e1b6d18f82 + .quad 0x3ff607ca48335ddf + .quad 0x3ff609b303ebca24 + .quad 0x3ff60b9be9fe7f93 + .quad 0x3ff60d84fa6f29bf + .quad 0x3ff60f6e3541748a + .quad 0x3ff611579a790c29 + .quad 0x3ff613412a199d25 + .quad 0x3ff6152ae426d453 + .quad 0x3ff61714c8a45edf + .quad 0x3ff618fed795ea44 + .quad 0x3ff61ae910ff244e + .quad 0x3ff61cd374e3bb1b + .quad 0x3ff61ebe03475d1c + .quad 0x3ff620a8bc2db914 + .quad 0x3ff622939f9a7e14 + .quad 0x3ff6247ead915b83 + .quad 0x3ff62669e6160116 + .quad 0x3ff62855492c1ed7 + .quad 0x3ff62a40d6d76521 + .quad 0x3ff62c2c8f1b84a0 + .quad 0x3ff62e1871fc2e52 + .quad 0x3ff630047f7d1386 + .quad 0x3ff631f0b7a1e5e1 + .quad 0x3ff633dd1a6e5753 + .quad 0x3ff635c9a7e61a25 + .quad 0x3ff637b6600ce0ed + .quad 0x3ff639a342e65e97 + .quad 0x3ff63b905076465f + .quad 0x3ff63d7d88c04bd0 + .quad 0x3ff63f6aebc822cd + .quad 0x3ff6415879917f88 + .quad 0x3ff6434632201685 + .quad 0x3ff6453415779c9b + .quad 0x3ff64722239bc6f3 + .quad 0x3ff649105c904b09 + .quad 0x3ff64afec058dea9 + .quad 0x3ff64ced4ef937f3 + .quad 0x3ff64edc08750d5b + .quad 0x3ff650caecd015a3 + .quad 0x3ff652b9fc0e07e3 + .quad 0x3ff654a936329b85 + .quad 0x3ff656989b418844 + .quad 0x3ff658882b3e862e + .quad 0x3ff65a77e62d4da4 + .quad 0x3ff65c67cc119759 + .quad 0x3ff65e57dcef1c54 + .quad 0x3ff6604818c995ed + .quad 0x3ff662387fa4bdce + .quad 0x3ff6642911844df6 + .quad 0x3ff66619ce6c00b4 + .quad 0x3ff6680ab65f90ac + .quad 0x3ff669fbc962b8d3 + .quad 0x3ff66bed07793473 + .quad 0x3ff66dde70a6bf26 + .quad 0x3ff66fd004ef14db + .quad 0x3ff671c1c455f1d2 + .quad 0x3ff673b3aedf129f + .quad 
0x3ff675a5c48e342a + .quad 0x3ff67798056713aa + .quad 0x3ff6798a716d6eaf + .quad 0x3ff67b7d08a50316 + .quad 0x3ff67d6fcb118f12 + .quad 0x3ff67f62b8b6d12a + .quad 0x3ff68155d1988835 + .quad 0x3ff6834915ba7361 + .quad 0x3ff6853c8520522a + .quad 0x3ff687301fcde464 + .quad 0x3ff68923e5c6ea33 + .quad 0x3ff68b17d70f2412 + .quad 0x3ff68d0bf3aa52c9 + .quad 0x3ff68f003b9c3779 + .quad 0x3ff690f4aee89395 + .quad 0x3ff692e94d9328e0 + .quad 0x3ff694de179fb976 + .quad 0x3ff696d30d1207c0 + .quad 0x3ff698c82dedd681 + .quad 0x3ff69abd7a36e8c9 + .quad 0x3ff69cb2f1f101ff + .quad 0x3ff69ea8951fe5e0 + .quad 0x3ff6a09e63c75876 + .quad 0x3ff6a2945deb1e23 + .quad 0x3ff6a48a838efb9d + .quad 0x3ff6a680d4b6b5ec + .quad 0x3ff6a8775166126a + .quad 0x3ff6aa6df9a0d6c8 + .quad 0x3ff6ac64cd6ac90a + .quad 0x3ff6ae5bccc7af86 + .quad 0x3ff6b052f7bb50e6 + .quad 0x3ff6b24a4e497429 + .quad 0x3ff6b441d075e0a1 + .quad 0x3ff6b6397e445df5 + .quad 0x3ff6b83157b8b41e + .quad 0x3ff6ba295cd6ab6a + .quad 0x3ff6bc218da20c7a + .quad 0x3ff6be19ea1ea046 + .quad 0x3ff6c01272503016 + .quad 0x3ff6c20b263a8587 + .quad 0x3ff6c40405e16a8b + .quad 0x3ff6c5fd1148a969 + .quad 0x3ff6c7f648740cb9 + .quad 0x3ff6c9efab675f6a + .quad 0x3ff6cbe93a266cbe + .quad 0x3ff6cde2f4b5004b + .quad 0x3ff6cfdcdb16e5fb + .quad 0x3ff6d1d6ed4fea0d + .quad 0x3ff6d3d12b63d914 + .quad 0x3ff6d5cb95567ff7 + .quad 0x3ff6d7c62b2babf1 + .quad 0x3ff6d9c0ece72a93 + .quad 0x3ff6dbbbda8cc9c0 + .quad 0x3ff6ddb6f42057b0 + .quad 0x3ff6dfb239a5a2f3 + .quad 0x3ff6e1adab207a67 + .quad 0x3ff6e3a94894ad43 + .quad 0x3ff6e5a512060b13 + .quad 0x3ff6e7a1077863b4 + .quad 0x3ff6e99d28ef875a + .quad 0x3ff6eb99766f468d + .quad 0x3ff6ed95effb722a + .quad 0x3ff6ef929597db64 + .quad 0x3ff6f18f674853bf + .quad 0x3ff6f38c6510ad16 + .quad 0x3ff6f5898ef4b99a + .quad 0x3ff6f786e4f84bcd + .quad 0x3ff6f984671f368a + .quad 0x3ff6fb82156d4cfe + .quad 0x3ff6fd7fefe662ac + .quad 0x3ff6ff7df68e4b6c + .quad 0x3ff7017c2968db6b + .quad 0x3ff7037a8879e729 + .quad 0x3ff7057913c5437d + .quad 0x3ff70777cb4ec594 + .quad 0x3ff70976af1a42ec + .quad 0x3ff70b75bf2b915c + .quad 0x3ff70d74fb868710 + .quad 0x3ff70f74642efa85 + .quad 0x3ff71173f928c291 + .quad 0x3ff71373ba77b65f + .quad 0x3ff71573a81fad6d + .quad 0x3ff71773c2247f90 + .quad 0x3ff71974088a04f2 + .quad 0x3ff71b747b541612 + .quad 0x3ff71d751a868bc4 + .quad 0x3ff71f75e6253f32 + .quad 0x3ff72176de3409db + .quad 0x3ff7237802b6c593 + .quad 0x3ff7257953b14c84 + .quad 0x3ff7277ad127792f + .quad 0x3ff7297c7b1d2667 + .quad 0x3ff72b7e51962f56 + .quad 0x3ff72d8054966f7e + .quad 0x3ff72f828421c2b3 + .quad 0x3ff73184e03c0520 + .quad 0x3ff7338768e91346 + .quad 0x3ff7358a1e2cc9fc + .quad 0x3ff7378d000b066d + .quad 0x3ff739900e87a61c + .quad 0x3ff73b9349a686df + .quad 0x3ff73d96b16b86e5 + .quad 0x3ff73f9a45da84b1 + .quad 0x3ff7419e06f75f1a + .quad 0x3ff743a1f4c5f551 + .quad 0x3ff745a60f4a26da + .quad 0x3ff747aa5687d38f + .quad 0x3ff749aeca82dba1 + .quad 0x3ff74bb36b3f1f98 + .quad 0x3ff74db838c0804e + .quad 0x3ff74fbd330adef7 + .quad 0x3ff751c25a221d1c + .quad 0x3ff753c7ae0a1c9b + .quad 0x3ff755cd2ec6bfaa + .quad 0x3ff757d2dc5be8d3 + .quad 0x3ff759d8b6cd7af8 + .quad 0x3ff75bdebe1f5950 + .quad 0x3ff75de4f2556769 + .quad 0x3ff75feb53738927 + .quad 0x3ff761f1e17da2c4 + .quad 0x3ff763f89c7798d0 + .quad 0x3ff765ff84655034 + .quad 0x3ff76806994aae2c + .quad 0x3ff76a0ddb2b984c + .quad 0x3ff76c154a0bf47d + .quad 0x3ff76e1ce5efa903 + .quad 0x3ff77024aeda9c72 + .quad 0x3ff7722ca4d0b5ba + .quad 0x3ff77434c7d5dc1c + .quad 0x3ff7763d17edf738 + .quad 0x3ff77845951ceefb + .quad 
0x3ff77a4e3f66abb0 + .quad 0x3ff77c5716cf15f4 + .quad 0x3ff77e601b5a16bd + .quad 0x3ff780694d0b9758 + .quad 0x3ff78272abe78169 + .quad 0x3ff7847c37f1bee8 + .quad 0x3ff78685f12e3a27 + .quad 0x3ff7888fd7a0ddcc + .quad 0x3ff78a99eb4d94d8 + .quad 0x3ff78ca42c384a9f + .quad 0x3ff78eae9a64eacc + .quad 0x3ff790b935d76165 + .quad 0x3ff792c3fe939ac3 + .quad 0x3ff794cef49d8396 + .quad 0x3ff796da17f908e9 + .quad 0x3ff798e568aa181a + .quad 0x3ff79af0e6b49ee0 + .quad 0x3ff79cfc921c8b49 + .quad 0x3ff79f086ae5cbba + .quad 0x3ff7a11471144eef + .quad 0x3ff7a320a4ac03fa + .quad 0x3ff7a52d05b0da48 + .quad 0x3ff7a7399426c19b + .quad 0x3ff7a9465011aa0b + .quad 0x3ff7ab5339758409 + .quad 0x3ff7ad605056405d + .quad 0x3ff7af6d94b7d027 + .quad 0x3ff7b17b069e24de + .quad 0x3ff7b388a60d3050 + .quad 0x3ff7b5967308e4a3 + .quad 0x3ff7b7a46d953455 + .quad 0x3ff7b9b295b6123a + .quad 0x3ff7bbc0eb6f7180 + .quad 0x3ff7bdcf6ec545aa + .quad 0x3ff7bfde1fbb8295 + .quad 0x3ff7c1ecfe561c73 + .quad 0x3ff7c3fc0a9907d2 + .quad 0x3ff7c60b44883993 + .quad 0x3ff7c81aac27a6f1 + .quad 0x3ff7ca2a417b4580 + .quad 0x3ff7cc3a04870b28 + .quad 0x3ff7ce49f54eee2d + .quad 0x3ff7d05a13d6e52a + .quad 0x3ff7d26a6022e710 + .quad 0x3ff7d47ada36eb2a + .quad 0x3ff7d68b8216e919 + .quad 0x3ff7d89c57c6d8d7 + .quad 0x3ff7daad5b4ab2b8 + .quad 0x3ff7dcbe8ca66f64 + .quad 0x3ff7decfebde07de + .quad 0x3ff7e0e178f57582 + .quad 0x3ff7e2f333f0b201 + .quad 0x3ff7e5051cd3b766 + .quad 0x3ff7e71733a28014 + .quad 0x3ff7e929786106c7 + .quad 0x3ff7eb3beb134693 + .quad 0x3ff7ed4e8bbd3ae3 + .quad 0x3ff7ef615a62df7a + .quad 0x3ff7f17457083077 + .quad 0x3ff7f38781b12a4e + .quad 0x3ff7f59ada61c9cc + .quad 0x3ff7f7ae611e0c17 + .quad 0x3ff7f9c215e9eeae + .quad 0x3ff7fbd5f8c96f66 + .quad 0x3ff7fdea09c08c72 + .quad 0x3ff7fffe48d34457 + .quad 0x3ff80212b60595f7 + .quad 0x3ff80427515b808b + .quad 0x3ff8063c1ad903a4 + .quad 0x3ff8085112821f2e + .quad 0x3ff80a66385ad36d + .quad 0x3ff80c7b8c6720fb + .quad 0x3ff80e910eab08ce + .quad 0x3ff810a6bf2a8c34 + .quad 0x3ff812bc9de9acd3 + .quad 0x3ff814d2aaec6cab + .quad 0x3ff816e8e636ce15 + .quad 0x3ff818ff4fccd3c0 + .quad 0x3ff81b15e7b280b9 + .quad 0x3ff81d2cadebd863 + .quad 0x3ff81f43a27cde79 + .quad 0x3ff8215ac5699711 + .quad 0x3ff8237216b60699 + .quad 0x3ff82589966631da + .quad 0x3ff827a1447e1df3 + .quad 0x3ff829b92101d060 + .quad 0x3ff82bd12bf54ef1 + .quad 0x3ff82de9655c9fd6 + .quad 0x3ff83001cd3bc993 + .quad 0x3ff8321a6396d307 + .quad 0x3ff834332871c36a + .quad 0x3ff8364c1bd0a24e + .quad 0x3ff838653db7779f + .quad 0x3ff83a7e8e2a4ba1 + .quad 0x3ff83c980d2d26f1 + .quad 0x3ff83eb1bac41287 + .quad 0x3ff840cb96f317b4 + .quad 0x3ff842e5a1be4023 + .quad 0x3ff844ffdb2995d7 + .quad 0x3ff8471a4339232f + .quad 0x3ff84934d9f0f2e1 + .quad 0x3ff84b4f9f550fff + .quad 0x3ff84d6a936985f4 + .quad 0x3ff84f85b6326082 + .quad 0x3ff851a107b3abca + .quad 0x3ff853bc87f17443 + .quad 0x3ff855d836efc6bd + .quad 0x3ff857f414b2b067 + .quad 0x3ff85a10213e3ec4 + .quad 0x3ff85c2c5c967fb5 + .quad 0x3ff85e48c6bf8171 + .quad 0x3ff860655fbd528d + .quad 0x3ff86282279401f7 + .quad 0x3ff8649f1e479ef5 + .quad 0x3ff866bc43dc392a + .quad 0x3ff868d99855e08f + .quad 0x3ff86af71bb8a57c + .quad 0x3ff86d14ce08989e + .quad 0x3ff86f32af49cb03 + .quad 0x3ff87150bf804e0b + .quad 0x3ff8736efeb03378 + .quad 0x3ff8758d6cdd8d61 + .quad 0x3ff877ac0a0c6e38 + .quad 0x3ff879cad640e8cc + .quad 0x3ff87be9d17f1044 + .quad 0x3ff87e08fbcaf822 + .quad 0x3ff880285528b444 + .quad 0x3ff88247dd9c58df + .quad 0x3ff884679529fa86 + .quad 0x3ff886877bd5ae23 + .quad 0x3ff888a791a388ff + .quad 
0x3ff88ac7d697a0b9 + .quad 0x3ff88ce84ab60b4f + .quad 0x3ff88f08ee02df15 + .quad 0x3ff89129c08232be + .quad 0x3ff8934ac2381d54 + .quad 0x3ff8956bf328b63f + .quad 0x3ff8978d53581541 + .quad 0x3ff899aee2ca5273 + .quad 0x3ff89bd0a183864e + .quad 0x3ff89df28f87c9a5 + .quad 0x3ff8a014acdb35a2 + .quad 0x3ff8a236f981e3cd + .quad 0x3ff8a459757fee0b + .quad 0x3ff8a67c20d96e96 + .quad 0x3ff8a89efb928009 + .quad 0x3ff8aac205af3d57 + .quad 0x3ff8ace53f33c1ce + .quad 0x3ff8af08a8242919 + .quad 0x3ff8b12c40848f3b + .quad 0x3ff8b35008591095 + .quad 0x3ff8b573ffa5c9e2 + .quad 0x3ff8b798266ed839 + .quad 0x3ff8b9bc7cb8590d + .quad 0x3ff8bbe102866a27 + .quad 0x3ff8be05b7dd29b2 + .quad 0x3ff8c02a9cc0b632 + .quad 0x3ff8c24fb1352e86 + .quad 0x3ff8c474f53eb1e8 + .quad 0x3ff8c69a68e15fed + .quad 0x3ff8c8c00c215887 + .quad 0x3ff8cae5df02bc04 + .quad 0x3ff8cd0be189ab0a + .quad 0x3ff8cf3213ba46a0 + .quad 0x3ff8d1587598b023 + .quad 0x3ff8d37f07290950 + .quad 0x3ff8d5a5c86f743d + .quad 0x3ff8d7ccb970135d + .quad 0x3ff8d9f3da2f097f + .quad 0x3ff8dc1b2ab079ca + .quad 0x3ff8de42aaf887c7 + .quad 0x3ff8e06a5b0b5758 + .quad 0x3ff8e2923aed0cb7 + .quad 0x3ff8e4ba4aa1cc81 + .quad 0x3ff8e6e28a2dbba9 + .quad 0x3ff8e90af994ff81 + .quad 0x3ff8eb3398dbbdb7 + .quad 0x3ff8ed5c68061c54 + .quad 0x3ff8ef85671841bc + .quad 0x3ff8f1ae961654b0 + .quad 0x3ff8f3d7f5047c4f + .quad 0x3ff8f60183e6e012 + .quad 0x3ff8f82b42c1a7cf + .quad 0x3ff8fa553198fbb8 + .quad 0x3ff8fc7f5071045a + .quad 0x3ff8fea99f4deaa1 + .quad 0x3ff900d41e33d7d1 + .quad 0x3ff902fecd26f58f + .quad 0x3ff90529ac2b6dda + .quad 0x3ff90754bb456b0e + .quad 0x3ff9097ffa7917e2 + .quad 0x3ff90bab69ca9f6c + .quad 0x3ff90dd7093e2d1b + .quad 0x3ff91002d8d7ecbd + .quad 0x3ff9122ed89c0a7e + .quad 0x3ff9145b088eb2e4 + .quad 0x3ff9168768b412d0 + .quad 0x3ff918b3f9105783 + .quad 0x3ff91ae0b9a7ae9b + .quad 0x3ff91d0daa7e4610 + .quad 0x3ff91f3acb984c37 + .quad 0x3ff921681cf9efc3 + .quad 0x3ff923959ea75fc4 + .quad 0x3ff925c350a4cba7 + .quad 0x3ff927f132f66333 + .quad 0x3ff92a1f45a05690 + .quad 0x3ff92c4d88a6d63f + .quad 0x3ff92e7bfc0e1323 + .quad 0x3ff930aa9fda3e74 + .quad 0x3ff932d9740f89d1 + .quad 0x3ff9350878b2272d + .quad 0x3ff93737adc648dd + .quad 0x3ff9396713502192 + .quad 0x3ff93b96a953e45b + .quad 0x3ff93dc66fd5c4a2 + .quad 0x3ff93ff666d9f630 + .quad 0x3ff942268e64ad2b + .quad 0x3ff94456e67a1e16 + .quad 0x3ff946876f1e7dd2 + .quad 0x3ff948b82856019b + .quad 0x3ff94ae91224df0d + .quad 0x3ff94d1a2c8f4c1e + .quad 0x3ff94f4b77997f27 + .quad 0x3ff9517cf347aeda + .quad 0x3ff953ae9f9e1246 + .quad 0x3ff955e07ca0e0dd + .quad 0x3ff958128a545266 + .quad 0x3ff95a44c8bc9f0e + .quad 0x3ff95c7737ddff5a + .quad 0x3ff95ea9d7bcac2f + .quad 0x3ff960dca85cdecf + .quad 0x3ff9630fa9c2d0da + .quad 0x3ff96542dbf2bc4e + .quad 0x3ff967763ef0db86 + .quad 0x3ff969a9d2c1693a + .quad 0x3ff96bdd9768a084 + .quad 0x3ff96e118ceabcd7 + .quad 0x3ff97045b34bfa05 + .quad 0x3ff9727a0a90943f + .quad 0x3ff974ae92bcc816 + .quad 0x3ff976e34bd4d273 + .quad 0x3ff9791835dcf0a3 + .quad 0x3ff97b4d50d9604e + .quad 0x3ff97d829cce5f7c + .quad 0x3ff97fb819c02c8f + .quad 0x3ff981edc7b3064d + .quad 0x3ff98423a6ab2bd5 + .quad 0x3ff98659b6acdca7 + .quad 0x3ff9888ff7bc58a2 + .quad 0x3ff98ac669dde001 + .quad 0x3ff98cfd0d15b35d + .quad 0x3ff98f33e16813b0 + .quad 0x3ff9916ae6d94251 + .quad 0x3ff993a21d6d80f4 + .quad 0x3ff995d9852911ae + .quad 0x3ff998111e1036f2 + .quad 0x3ff99a48e827338e + .quad 0x3ff99c80e3724ab5 + .quad 0x3ff99eb90ff5bff1 + .quad 0x3ff9a0f16db5d730 + .quad 0x3ff9a329fcb6d4be + .quad 0x3ff9a562bcfcfd42 + .quad 
0x3ff9a79bae8c95c8 + .quad 0x3ff9a9d4d169e3b4 + .quad 0x3ff9ac0e25992ccd + .quad 0x3ff9ae47ab1eb739 + .quad 0x3ff9b08161fec979 + .quad 0x3ff9b2bb4a3daa71 + .quad 0x3ff9b4f563dfa161 + .quad 0x3ff9b72faee8f5e9 + .quad 0x3ff9b96a2b5df009 + .quad 0x3ff9bba4d942d81f + .quad 0x3ff9bddfb89bf6e9 + .quad 0x3ff9c01ac96d9580 + .quad 0x3ff9c2560bbbfd60 + .quad 0x3ff9c4917f8b7866 + .quad 0x3ff9c6cd24e050c8 + .quad 0x3ff9c908fbbed121 + .quad 0x3ff9cb45042b4467 + .quad 0x3ff9cd813e29f5f2 + .quad 0x3ff9cfbda9bf3179 + .quad 0x3ff9d1fa46ef430e + .quad 0x3ff9d43715be772a + .quad 0x3ff9d67416311aa0 + .quad 0x3ff9d8b1484b7aa2 + .quad 0x3ff9daeeac11e4c5 + .quad 0x3ff9dd2c4188a6fb + .quad 0x3ff9df6a08b40f94 + .quad 0x3ff9e1a801986d45 + .quad 0x3ff9e3e62c3a0f1d + .quad 0x3ff9e624889d448d + .quad 0x3ff9e86316c65d65 + .quad 0x3ff9eaa1d6b9a9d6 + .quad 0x3ff9ece0c87b7a6f + .quad 0x3ff9ef1fec102020 + .quad 0x3ff9f15f417bec36 + .quad 0x3ff9f39ec8c33062 + .quad 0x3ff9f5de81ea3eb2 + .quad 0x3ff9f81e6cf56995 + .quad 0x3ff9fa5e89e903d9 + .quad 0x3ff9fc9ed8c960ac + .quad 0x3ff9fedf599ad39d + .quad 0x3ffa01200c61b09a + .quad 0x3ffa0360f1224bf2 + .quad 0x3ffa05a207e0fa53 + .quad 0x3ffa07e350a210ca + .quad 0x3ffa0a24cb69e4c7 + .quad 0x3ffa0c66783ccc19 + .quad 0x3ffa0ea8571f1ced + .quad 0x3ffa10ea68152dd4 + .quad 0x3ffa132cab2355bc + .quad 0x3ffa156f204debf5 + .quad 0x3ffa17b1c7994830 + .quad 0x3ffa19f4a109c27b + .quad 0x3ffa1c37aca3b348 + .quad 0x3ffa1e7aea6b7367 + .quad 0x3ffa20be5a655c0a + .quad 0x3ffa2301fc95c6c4 + .quad 0x3ffa2545d1010d86 + .quad 0x3ffa2789d7ab8aa3 + .quad 0x3ffa29ce109998cf + .quad 0x3ffa2c127bcf931c + .quad 0x3ffa2e571951d502 + .quad 0x3ffa309be924ba55 + .quad 0x3ffa32e0eb4c9f4a + .quad 0x3ffa35261fcde079 + .quad 0x3ffa376b86acdad9 + .quad 0x3ffa39b11fedebc2 + .quad 0x3ffa3bf6eb9570ef + .quad 0x3ffa3e3ce9a7c878 + .quad 0x3ffa40831a2950d8 + .quad 0x3ffa42c97d1e68ec + .quad 0x3ffa4510128b6ff1 + .quad 0x3ffa4756da74c583 + .quad 0x3ffa499dd4dec9a2 + .quad 0x3ffa4be501cddcad + .quad 0x3ffa4e2c61465f66 + .quad 0x3ffa5073f34cb2f0 + .quad 0x3ffa52bbb7e538cc + .quad 0x3ffa5503af1452e0 + .quad 0x3ffa574bd8de6371 + .quad 0x3ffa59943547cd25 + .quad 0x3ffa5bdcc454f307 + .quad 0x3ffa5e25860a387d + .quad 0x3ffa606e7a6c0154 + .quad 0x3ffa62b7a17eb1b8 + .quad 0x3ffa6500fb46ae37 + .quad 0x3ffa674a87c85bbf + .quad 0x3ffa699447081fa2 + .quad 0x3ffa6bde390a5f91 + .quad 0x3ffa6e285dd3819f + .quad 0x3ffa7072b567ec43 + .quad 0x3ffa72bd3fcc0653 + .quad 0x3ffa7507fd043708 + .quad 0x3ffa7752ed14e5fb + .quad 0x3ffa799e10027b29 + .quad 0x3ffa7be965d15ef0 + .quad 0x3ffa7e34ee85fa0f + .quad 0x3ffa8080aa24b5a6 + .quad 0x3ffa82cc98b1fb3a + .quad 0x3ffa8518ba3234b0 + .quad 0x3ffa87650ea9cc4d + .quad 0x3ffa89b1961d2cbb + .quad 0x3ffa8bfe5090c106 + .quad 0x3ffa8e4b3e08f499 + .quad 0x3ffa90985e8a3344 + .quad 0x3ffa92e5b218e937 + .quad 0x3ffa953338b98307 + .quad 0x3ffa9780f2706da6 + .quad 0x3ffa99cedf42166e + .quad 0x3ffa9c1cff32eb19 + .quad 0x3ffa9e6b524759c1 + .quad 0x3ffaa0b9d883d0e6 + .quad 0x3ffaa30891ecbf66 + .quad 0x3ffaa5577e869486 + .quad 0x3ffaa7a69e55bfea + .quad 0x3ffaa9f5f15eb19b + .quad 0x3ffaac4577a5da02 + .quad 0x3ffaae95312fa9ec + .quad 0x3ffab0e51e009287 + .quad 0x3ffab3353e1d0565 + .quad 0x3ffab5859189747c + .quad 0x3ffab7d6184a5220 + .quad 0x3ffaba26d264110c + .quad 0x3ffabc77bfdb245d + .quad 0x3ffabec8e0b3ff90 + .quad 0x3ffac11a34f31687 + .quad 0x3ffac36bbc9cdd87 + .quad 0x3ffac5bd77b5c936 + .quad 0x3ffac80f66424e9f + .quad 0x3ffaca618846e330 + .quad 0x3ffaccb3ddc7fcb7 + .quad 0x3ffacf0666ca1167 + .quad 
0x3ffad159235197d6 + .quad 0x3ffad3ac136306fc + .quad 0x3ffad5ff3702d636 + .quad 0x3ffad8528e357d43 + .quad 0x3ffadaa618ff7445 + .quad 0x3ffadcf9d76533bf + .quad 0x3ffadf4dc96b349b + .quad 0x3ffae1a1ef15f025 + .quad 0x3ffae3f64869e00c + .quad 0x3ffae64ad56b7e60 + .quad 0x3ffae89f961f4598 + .quad 0x3ffaeaf48a89b08d + .quad 0x3ffaed49b2af3a7a + .quad 0x3ffaef9f0e945eff + .quad 0x3ffaf1f49e3d9a1f + .quad 0x3ffaf44a61af6840 + .quad 0x3ffaf6a058ee462d + .quad 0x3ffaf8f683feb114 + .quad 0x3ffafb4ce2e52685 + .quad 0x3ffafda375a62474 + .quad 0x3ffafffa3c46293a + .quad 0x3ffb025136c9b394 + .quad 0x3ffb04a8653542a2 + .quad 0x3ffb06ffc78d55e6 + .quad 0x3ffb09575dd66d48 + .quad 0x3ffb0baf28150913 + .quad 0x3ffb0e07264da9f8 + .quad 0x3ffb105f5884d106 + .quad 0x3ffb12b7bebeffb8 + .quad 0x3ffb15105900b7e6 + .quad 0x3ffb1769274e7bcf + .quad 0x3ffb19c229acce18 + .quad 0x3ffb1c1b602031c6 + .quad 0x3ffb1e74caad2a44 + .quad 0x3ffb20ce69583b61 + .quad 0x3ffb23283c25e951 + .quad 0x3ffb2582431ab8ab + .quad 0x3ffb27dc7e3b2e6b + .quad 0x3ffb2a36ed8bcff1 + .quad 0x3ffb2c9191112300 + .quad 0x3ffb2eec68cfadc2 + .quad 0x3ffb314774cbf6c3 + .quad 0x3ffb33a2b50a84f5 + .quad 0x3ffb35fe298fdfad + .quad 0x3ffb3859d2608ea7 + .quad 0x3ffb3ab5af811a00 + .quad 0x3ffb3d11c0f60a3b + .quad 0x3ffb3f6e06c3e840 + .quad 0x3ffb41ca80ef3d5d + .quad 0x3ffb44272f7c9343 + .quad 0x3ffb468412707405 + .quad 0x3ffb48e129cf6a20 + .quad 0x3ffb4b3e759e0071 + .quad 0x3ffb4d9bf5e0c23e + .quad 0x3ffb4ff9aa9c3b30 + .quad 0x3ffb525793d4f751 + .quad 0x3ffb54b5b18f8319 + .quad 0x3ffb571403d06b5b + .quad 0x3ffb59728a9c3d55 + .quad 0x3ffb5bd145f786a7 + .quad 0x3ffb5e3035e6d559 + .quad 0x3ffb608f5a6eb7d6 + .quad 0x3ffb62eeb393bcee + .quad 0x3ffb654e415a73d6 + .quad 0x3ffb67ae03c76c2a + .quad 0x3ffb6a0dfadf35e8 + .quad 0x3ffb6c6e26a66177 + .quad 0x3ffb6ece87217fa1 + .quad 0x3ffb712f1c552196 + .quad 0x3ffb738fe645d8e9 + .quad 0x3ffb75f0e4f83795 + .quad 0x3ffb78521870cffb + .quad 0x3ffb7ab380b434df + .quad 0x3ffb7d151dc6f96c + .quad 0x3ffb7f76efadb132 + .quad 0x3ffb81d8f66cf026 + .quad 0x3ffb843b32094aa4 + .quad 0x3ffb869da287556c + .quad 0x3ffb890047eba5a5 + .quad 0x3ffb8b63223ad0da + .quad 0x3ffb8dc631796cfe + .quad 0x3ffb902975ac1068 + .quad 0x3ffb928ceed751d6 + .quad 0x3ffb94f09cffc869 + .quad 0x3ffb9754802a0bab + .quad 0x3ffb99b8985ab38a + .quad 0x3ffb9c1ce596585d + .quad 0x3ffb9e8167e192dc + .quad 0x3ffba0e61f40fc29 + .quad 0x3ffba34b0bb92dca + .quad 0x3ffba5b02d4ec1ab + .quad 0x3ffba81584065220 + .quad 0x3ffbaa7b0fe479e1 + .quad 0x3ffbace0d0edd40c + .quad 0x3ffbaf46c726fc27 + .quad 0x3ffbb1acf2948e1f + .quad 0x3ffbb413533b2643 + .quad 0x3ffbb679e91f614c + .quad 0x3ffbb8e0b445dc58 + .quad 0x3ffbbb47b4b334eb + .quad 0x3ffbbdaeea6c08f0 + .quad 0x3ffbc0165574f6bb + .quad 0x3ffbc27df5d29d00 + .quad 0x3ffbc4e5cb899adf + .quad 0x3ffbc74dd69e8fdc + .quad 0x3ffbc9b617161be5 + .quad 0x3ffbcc1e8cf4df48 + .quad 0x3ffbce87383f7ac1 + .quad 0x3ffbd0f018fa8f6d + .quad 0x3ffbd3592f2abed3 + .quad 0x3ffbd5c27ad4aae0 + .quad 0x3ffbd82bfbfcf5e7 + .quad 0x3ffbda95b2a842a2 + .quad 0x3ffbdcff9edb3432 + .quad 0x3ffbdf69c09a6e20 + .quad 0x3ffbe1d417ea945a + .quad 0x3ffbe43ea4d04b36 + .quad 0x3ffbe6a967503772 + .quad 0x3ffbe9145f6efe30 + .quad 0x3ffbeb7f8d3144fc + .quad 0x3ffbedeaf09bb1c7 + .quad 0x3ffbf05689b2eaec + .quad 0x3ffbf2c2587b9729 + .quad 0x3ffbf52e5cfa5da6 + .quad 0x3ffbf79a9733e5f3 + .quad 0x3ffbfa07072cd804 + .quad 0x3ffbfc73ace9dc39 + .quad 0x3ffbfee0886f9b53 + .quad 0x3ffc014d99c2be80 + .quad 0x3ffc03bae0e7ef53 + .quad 0x3ffc06285de3d7c7 + .quad 
0x3ffc089610bb223d + .quad 0x3ffc0b03f9727980 + .quad 0x3ffc0d72180e88c1 + .quad 0x3ffc0fe06c93fb98 + .quad 0x3ffc124ef7077e06 + .quad 0x3ffc14bdb76dbc74 + .quad 0x3ffc172cadcb63b0 + .quad 0x3ffc199bda2520f2 + .quad 0x3ffc1c0b3c7fa1d9 + .quad 0x3ffc1e7ad4df946e + .quad 0x3ffc20eaa349a71c + .quad 0x3ffc235aa7c288be + .quad 0x3ffc25cae24ee890 + .quad 0x3ffc283b52f37637 + .quad 0x3ffc2aabf9b4e1c5 + .quad 0x3ffc2d1cd697dbaf + .quad 0x3ffc2f8de9a114d2 + .quad 0x3ffc31ff32d53e76 + .quad 0x3ffc3470b2390a49 + .quad 0x3ffc36e267d12a62 + .quad 0x3ffc395453a25140 + .quad 0x3ffc3bc675b131cb + .quad 0x3ffc3e38ce027f50 + .quad 0x3ffc40ab5c9aed89 + .quad 0x3ffc431e217f3095 + .quad 0x3ffc45911cb3fcfd + .quad 0x3ffc48044e3e07b0 + .quad 0x3ffc4a77b6220609 + .quad 0x3ffc4ceb5464adc8 + .quad 0x3ffc4f5f290ab517 + .quad 0x3ffc51d33418d28a + .quad 0x3ffc54477593bd1c + .quad 0x3ffc56bbed802c30 + .quad 0x3ffc59309be2d792 + .quad 0x3ffc5ba580c07778 + .quad 0x3ffc5e1a9c1dc47f + .quad 0x3ffc608fedff77ae + .quad 0x3ffc6305766a4a74 + .quad 0x3ffc657b3562f6a9 + .quad 0x3ffc67f12aee368d + .quad 0x3ffc6a675710c4cc + .quad 0x3ffc6cddb9cf5c77 + .quad 0x3ffc6f54532eb909 + .quad 0x3ffc71cb23339668 + .quad 0x3ffc744229e2b0e1 + .quad 0x3ffc76b96740c52b + .quad 0x3ffc7930db529065 + .quad 0x3ffc7ba8861cd01a + .quad 0x3ffc7e2067a44239 + .quad 0x3ffc80987feda51f + .quad 0x3ffc8310cefdb791 + .quad 0x3ffc858954d938bc + .quad 0x3ffc88021184e837 + .quad 0x3ffc8a7b05058602 + .quad 0x3ffc8cf42f5fd289 + .quad 0x3ffc8f6d90988e9c + .quad 0x3ffc91e728b47b79 + .quad 0x3ffc9460f7b85ac7 + .quad 0x3ffc96dafda8ee95 + .quad 0x3ffc99553a8af95b + .quad 0x3ffc9bcfae633dfe + .quad 0x3ffc9e4a59367fca + .quad 0x3ffca0c53b098273 + .quad 0x3ffca34053e10a1b + .quad 0x3ffca5bba3c1db4b + .quad 0x3ffca8372ab0baf6 + .quad 0x3ffcaab2e8b26e78 + .quad 0x3ffcad2eddcbbb9a + .quad 0x3ffcafab0a01688c + .quad 0x3ffcb2276d583be7 + .quad 0x3ffcb4a407d4fcb3 + .quad 0x3ffcb720d97c725c + .quad 0x3ffcb99de25364bb + .quad 0x3ffcbc1b225e9c14 + .quad 0x3ffcbe9899a2e114 + .quad 0x3ffcc1164824fcd0 + .quad 0x3ffcc3942de9b8ca + .quad 0x3ffcc6124af5deee + .quad 0x3ffcc8909f4e3990 + .quad 0x3ffccb0f2af79372 + .quad 0x3ffccd8dedf6b7bd + .quad 0x3ffcd00ce8507204 + .quad 0x3ffcd28c1a098e48 + .quad 0x3ffcd50b8326d8f2 + .quad 0x3ffcd78b23ad1ed5 + .quad 0x3ffcda0afba12d30 + .quad 0x3ffcdc8b0b07d1aa + .quad 0x3ffcdf0b51e5da58 + .quad 0x3ffce18bd04015b7 + .quad 0x3ffce40c861b52b1 + .quad 0x3ffce68d737c6096 + .quad 0x3ffce90e98680f28 + .quad 0x3ffceb8ff4e32e8c + .quad 0x3ffcee1188f28f58 + .quad 0x3ffcf093549b0289 + .quad 0x3ffcf31557e15988 + .quad 0x3ffcf59792ca6629 + .quad 0x3ffcf81a055afaab + .quad 0x3ffcfa9caf97e9b8 + .quad 0x3ffcfd1f91860666 + .quad 0x3ffcffa2ab2a2432 + .quad 0x3ffd0225fc891709 + .quad 0x3ffd04a985a7b341 + .quad 0x3ffd072d468acd9b + .quad 0x3ffd09b13f373b42 + .quad 0x3ffd0c356fb1d1ce + .quad 0x3ffd0eb9d7ff6743 + .quad 0x3ffd113e7824d20f + .quad 0x3ffd13c35026e90b + .quad 0x3ffd1648600a837b + .quad 0x3ffd18cda7d4790f + .quad 0x3ffd1b532789a1e4 + .quad 0x3ffd1dd8df2ed681 + .quad 0x3ffd205ecec8efd8 + .quad 0x3ffd22e4f65cc746 + .quad 0x3ffd256b55ef3696 + .quad 0x3ffd27f1ed8517fd + .quad 0x3ffd2a78bd23461a + .quad 0x3ffd2cffc4ce9bfe + .quad 0x3ffd2f87048bf51e + .quad 0x3ffd320e7c602d5e + .quad 0x3ffd34962c50210e + .quad 0x3ffd371e1460aced + .quad 0x3ffd39a63496ae1e + .quad 0x3ffd3c2e8cf70237 + .quad 0x3ffd3eb71d868736 + .quad 0x3ffd413fe64a1b88 + .quad 0x3ffd43c8e7469e02 + .quad 0x3ffd46522080edeb + .quad 0x3ffd48db91fdeaf0 + .quad 0x3ffd4b653bc2752c + .quad 
0x3ffd4def1dd36d29 + .quad 0x3ffd50793835b3da + .quad 0x3ffd53038aee2a9f + .quad 0x3ffd558e1601b344 + .quad 0x3ffd5818d9753003 + .quad 0x3ffd5aa3d54d8381 + .quad 0x3ffd5d2f098f90cf + .quad 0x3ffd5fba76403b6b + .quad 0x3ffd62461b64673f + .quad 0x3ffd64d1f900f8a4 + .quad 0x3ffd675e0f1ad45a + .quad 0x3ffd69ea5db6df94 + .quad 0x3ffd6c76e4d9ffed + .quad 0x3ffd6f03a4891b6e + .quad 0x3ffd71909cc9188f + .quad 0x3ffd741dcd9ede30 + .quad 0x3ffd76ab370f53a1 + .quad 0x3ffd7938d91f609f + .quad 0x3ffd7bc6b3d3ed53 + .quad 0x3ffd7e54c731e251 + .quad 0x3ffd80e3133e289e + .quad 0x3ffd837197fda9a8 + .quad 0x3ffd860055754f4c + .quad 0x3ffd888f4baa03d3 + .quad 0x3ffd8b1e7aa0b1f5 + .quad 0x3ffd8dade25e44d5 + .quad 0x3ffd903d82e7a803 + .quad 0x3ffd92cd5c41c77f + .quad 0x3ffd955d6e718fb2 + .quad 0x3ffd97edb97bed76 + .quad 0x3ffd9a7e3d65ce10 + .quad 0x3ffd9d0efa341f33 + .quad 0x3ffd9f9fefebceff + .quad 0x3ffda2311e91cc02 + .quad 0x3ffda4c2862b0536 + .quad 0x3ffda75426bc6a05 + .quad 0x3ffda9e6004aea45 + .quad 0x3ffdac7812db7638 + .quad 0x3ffdaf0a5e72fe91 + .quad 0x3ffdb19ce316746e + .quad 0x3ffdb42fa0cac95a + .quad 0x3ffdb6c29794ef50 + .quad 0x3ffdb955c779d8b8 + .quad 0x3ffdbbe9307e7867 + .quad 0x3ffdbe7cd2a7c1a1 + .quad 0x3ffdc110adfaa815 + .quad 0x3ffdc3a4c27c1fe4 + .quad 0x3ffdc63910311d9a + .quad 0x3ffdc8cd971e9631 + .quad 0x3ffdcb6257497f13 + .quad 0x3ffdcdf750b6ce17 + .quad 0x3ffdd08c836b797f + .quad 0x3ffdd321ef6c7800 + .quad 0x3ffdd5b794bec0bc + .quad 0x3ffdd84d73674b3f + .quad 0x3ffddae38b6b0f89 + .quad 0x3ffddd79dccf0603 + .quad 0x3ffde01067982789 + .quad 0x3ffde2a72bcb6d61 + .quad 0x3ffde53e296dd143 + .quad 0x3ffde7d560844d54 + .quad 0x3ffdea6cd113dc26 + .quad 0x3ffded047b2178bb + .quad 0x3ffdef9c5eb21e83 + .quad 0x3ffdf2347bcac95e + .quad 0x3ffdf4ccd2707596 + .quad 0x3ffdf76562a81feb + .quad 0x3ffdf9fe2c76c585 + .quad 0x3ffdfc972fe163fd + .quad 0x3ffdff306cecf95b + .quad 0x3ffe01c9e39e8418 + .quad 0x3ffe046393fb0315 + .quad 0x3ffe06fd7e0775aa + .quad 0x3ffe0997a1c8db99 + .quad 0x3ffe0c31ff443512 + .quad 0x3ffe0ecc967e82b9 + .quad 0x3ffe1167677cc59c + .quad 0x3ffe14027243ff3b + .quad 0x3ffe169db6d93183 + .quad 0x3ffe193935415ed1 + .quad 0x3ffe1bd4ed8189f2 + .quad 0x3ffe1e70df9eb621 + .quad 0x3ffe210d0b9de709 + .quad 0x3ffe23a9718420c3 + .quad 0x3ffe2646115667d9 + .quad 0x3ffe28e2eb19c142 + .quad 0x3ffe2b7ffed33266 + .quad 0x3ffe2e1d4c87c11e + .quad 0x3ffe30bad43c73ae + .quad 0x3ffe335895f650cf + .quad 0x3ffe35f691ba5fa4 + .quad 0x3ffe3894c78da7c2 + .quad 0x3ffe3b333775312f + .quad 0x3ffe3dd1e176045e + .quad 0x3ffe4070c5952a35 + .quad 0x3ffe430fe3d7ac06 + .quad 0x3ffe45af3c429394 + .quad 0x3ffe484ecedaeb14 + .quad 0x3ffe4aee9ba5bd26 + .quad 0x3ffe4d8ea2a814df + .quad 0x3ffe502ee3e6fdc2 + .quad 0x3ffe52cf5f6783c0 + .quad 0x3ffe5570152eb33c + .quad 0x3ffe581105419909 + .quad 0x3ffe5ab22fa54269 + .quad 0x3ffe5d53945ebd0f + .quad 0x3ffe5ff53373171e + .quad 0x3ffe62970ce75f28 + .quad 0x3ffe653920c0a430 + .quad 0x3ffe67db6f03f5ab + .quad 0x3ffe6a7df7b6637a + .quad 0x3ffe6d20badcfdf3 + .quad 0x3ffe6fc3b87cd5d9 + .quad 0x3ffe7266f09afc62 + .quad 0x3ffe750a633c8332 + .quad 0x3ffe77ae10667c5d + .quad 0x3ffe7a51f81dfa6b + .quad 0x3ffe7cf61a681052 + .quad 0x3ffe7f9a7749d178 + .quad 0x3ffe823f0ec851b6 + .quad 0x3ffe84e3e0e8a554 + .quad 0x3ffe8788edafe10a + .quad 0x3ffe8a2e35231a01 + .quad 0x3ffe8cd3b74765d6 + .quad 0x3ffe8f797421da93 + .quad 0x3ffe921f6bb78eb2 + .quad 0x3ffe94c59e0d9924 + .quad 0x3ffe976c0b291144 + .quad 0x3ffe9a12b30f0ee0 + .quad 0x3ffe9cb995c4aa3b + .quad 0x3ffe9f60b34efc02 + .quad 
0x3ffea2080bb31d5a + .quad 0x3ffea4af9ef627d4 + .quad 0x3ffea7576d1d3575 + .quad 0x3ffea9ff762d60b2 + .quad 0x3ffeaca7ba2bc471 + .quad 0x3ffeaf50391d7c09 + .quad 0x3ffeb1f8f307a346 + .quad 0x3ffeb4a1e7ef5660 + .quad 0x3ffeb74b17d9b203 + .quad 0x3ffeb9f482cbd34b + .quad 0x3ffebc9e28cad7ca + .quad 0x3ffebf4809dbdd7c + .quad 0x3ffec1f2260402d5 + .quad 0x3ffec49c7d4866b8 + .quad 0x3ffec7470fae2879 + .quad 0x3ffec9f1dd3a67df + .quad 0x3ffecc9ce5f24521 + .quad 0x3ffecf4829dae0eb + .quad 0x3ffed1f3a8f95c56 + .quad 0x3ffed49f6352d8ef + .quad 0x3ffed74b58ec78b7 + .quad 0x3ffed9f789cb5e20 + .quad 0x3ffedca3f5f4ac0a + .quad 0x3ffedf509d6d85cb + .quad 0x3ffee1fd803b0f2a + .quad 0x3ffee4aa9e626c5f + .quad 0x3ffee757f7e8c217 + .quad 0x3ffeea058cd3356e + .quad 0x3ffeecb35d26ebf2 + .quad 0x3ffeef6168e90ba5 + .quad 0x3ffef20fb01ebafb + .quad 0x3ffef4be32cd20da + .quad 0x3ffef76cf0f9649a + .quad 0x3ffefa1beaa8ae04 + .quad 0x3ffefccb1fe02556 + .quad 0x3ffeff7a90a4f33f + .quad 0x3fff022a3cfc40e1 + .quad 0x3fff04da24eb37d0 + .quad 0x3fff078a48770213 + .quad 0x3fff0a3aa7a4ca23 + .quad 0x3fff0ceb4279baea + .quad 0x3fff0f9c18faffca + .quad 0x3fff124d2b2dc491 + .quad 0x3fff14fe79173584 + .quad 0x3fff17b002bc7f5a + .quad 0x3fff1a61c822cf3c + .quad 0x3fff1d13c94f52c7 + .quad 0x3fff1fc606473809 + .quad 0x3fff22787f0fad85 + .quad 0x3fff252b33ade22f + .quad 0x3fff27de24270571 + .quad 0x3fff2a9150804723 + .quad 0x3fff2d44b8bed796 + .quad 0x3fff2ff85ce7e78a + .quad 0x3fff32ac3d00a832 + .quad 0x3fff3560590e4b38 + .quad 0x3fff3814b11602b5 + .quad 0x3fff3ac9451d0138 + .quad 0x3fff3d7e152879c2 + .quad 0x3fff4033213d9fc8 + .quad 0x3fff42e86961a731 + .quad 0x3fff459ded99c45a + .quad 0x3fff4853adeb2c11 + .quad 0x3fff4b09aa5b1398 + .quad 0x3fff4dbfe2eeb0a6 + .quad 0x3fff507657ab3963 + .quad 0x3fff532d0895e46e + .quad 0x3fff55e3f5b3e8d8 + .quad 0x3fff589b1f0a7e23 + .quad 0x3fff5b52849edc4a + .quad 0x3fff5e0a26763bb8 + .quad 0x3fff60c20495d54d + .quad 0x3fff637a1f02e25c + .quad 0x3fff663275c29cab + .quad 0x3fff68eb08da3e7a + .quad 0x3fff6ba3d84f0275 + .quad 0x3fff6e5ce42623c1 + .quad 0x3fff71162c64ddf3 + .quad 0x3fff73cfb1106d1b + .quad 0x3fff7689722e0db5 + .quad 0x3fff79436fc2fcb6 + .quad 0x3fff7bfda9d47787 + .quad 0x3fff7eb82067bc04 + .quad 0x3fff8172d382087c + .quad 0x3fff842dc3289bb5 + .quad 0x3fff86e8ef60b4ea + .quad 0x3fff89a4582f93c7 + .quad 0x3fff8c5ffd9a786e + .quad 0x3fff8f1bdfa6a377 + .quad 0x3fff91d7fe5955eb + .quad 0x3fff949459b7d14b + .quad 0x3fff9750f1c7578c + .quad 0x3fff9a0dc68d2b16 + .quad 0x3fff9ccad80e8ec8 + .quad 0x3fff9f882650c5f2 + .quad 0x3fffa245b159145c + .quad 0x3fffa503792cbe42 + .quad 0x3fffa7c17dd10856 + .quad 0x3fffaa7fbf4b37bd + .quad 0x3fffad3e3da09211 + .quad 0x3fffaffcf8d65d61 + .quad 0x3fffb2bbf0f1e031 + .quad 0x3fffb57b25f8617d + .quad 0x3fffb83a97ef28b2 + .quad 0x3fffbafa46db7db4 + .quad 0x3fffbdba32c2a8db + .quad 0x3fffc07a5ba9f2f6 + .quad 0x3fffc33ac196a548 + .quad 0x3fffc5fb648e098a + .quad 0x3fffc8bc449569e9 + .quad 0x3fffcb7d61b21108 + .quad 0x3fffce3ebbe94a01 + .quad 0x3fffd10053406061 + .quad 0x3fffd3c227bca02c + .quad 0x3fffd684396355da + .quad 0x3fffd9468839ce5a + .quad 0x3fffdc0914455712 + .quad 0x3fffdecbdd8b3dd8 + .quad 0x3fffe18ee410d0ff + .quad 0x3fffe45227db5f4b + .quad 0x3fffe715a8f037f6 + .quad 0x3fffe9d96754aab1 + .quad 0x3fffec9d630e07a4 + .quad 0x3fffef619c219f69 + .quad 0x3ffff2261294c314 + .quad 0x3ffff4eac66cc42c + .quad 0x3ffff7afb7aef4b0 + .quad 0x3ffffa74e660a715 + .quad 0x3ffffd3a52872e44 + .quad 0x3ffffffffc27dd9e + .rept 56 + .byte 0 + .endr + +/* Other general 
purpose constants: + * _dbInvLn2 */ +double_vector __dbInvLn2 0x40a71547652b82fe + +/* _dbShifter */ +double_vector __dbShifter 0x4338000000000000 + +/* _dbHALF */ +double_vector __dbHALF 0x3fe0000000000000 + +/* _dbC1 = 2^(1/2^K)-1 */ +double_vector __dbC1 0x3f362f3904051fa1 + +/* _lbLOWKBITS = 2^K-1 */ +double_vector __lbLOWKBITS 0x00000000000007ff + +/* _iAbsMask */ +float_vector __iAbsMask 0x7fffffff + +/* _iDomainRange */ +float_vector __iDomainRange 0x4059fe36 + .type __svml_spow_data,@object + .size __svml_spow_data,.-__svml_spow_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.h new file mode 100644 index 0000000000..016dbf7fce --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.h @@ -0,0 +1,76 @@ +/* Offsets for data table for function powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef S_POWF_DATA_H +#define S_POWF_DATA_H + +#define _Log2Rcp_lookup -4218496 +#define _NMINNORM 0 +#define _NMAXVAL 64 +#define _INF 128 +#define _ABSMASK 192 +#define _DOMAINRANGE 256 +#define _Log_HA_table 320 +#define _Log_LA_table 8576 +#define _poly_coeff_1 12736 +#define _poly_coeff_2 12800 +#define _poly_coeff_3 12864 +#define _poly_coeff_4 12928 +#define _ExpMask 12992 +#define _Two10 13056 +#define _MinNorm 13120 +#define _MaxNorm 13184 +#define _HalfMask 13248 +#define _One 13312 +#define _L2H 13376 +#define _L2L 13440 +#define _Threshold 13504 +#define _Bias 13568 +#define _Bias1 13632 +#define _L2 13696 +#define _dInfs 13760 +#define _dOnes 13824 +#define _dZeros 13888 +#define __dbT 13952 +#define __dbInvLn2 30400 +#define __dbShifter 30464 +#define __dbHALF 30528 +#define __dbC1 30592 +#define __lbLOWKBITS 30656 +#define __iAbsMask 30720 +#define __iDomainRange 30784 + +.macro double_vector offset value +.if .-__svml_spow_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +.macro float_vector offset value +.if .-__svml_spow_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S new file mode 100644 index 0000000000..d86c91380e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S @@ -0,0 +1,358 @@ +/* Function sincosf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
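[Editor's note on the `double_vector`/`float_vector` macros defined in svml_s_powf_data.h above: each invocation first checks that the assembler's location counter, measured from the table base `__svml_spow_data`, equals the offset constant from this header, and raises a hard `.err` if the .S data table and the header offsets ever drift apart; it then replicates the scalar across a full 64-byte vector lane (8 quads, or 16 longs for the float form). A minimal standalone sketch of the same idiom, using a hypothetical table `__tbl` and made-up values that are not part of the patch:

	.data
	.globl	__tbl
__tbl:
.macro guarded_quad offset value
.if .-__tbl != \offset		/* location counter must match the declared offset */
.err				/* fail at assembly time on layout drift */
.endif
	.quad	\value
.endm
	guarded_quad 0, 0x3ff0000000000000	/* 1.0 at offset 0 */
	guarded_quad 8, 0x4000000000000000	/* 2.0 at offset 8 */

Because the check runs at assembly time, any mismatch between the data layout and the offsets the vector code loads from breaks the build, rather than surfacing as a silent wrong-constant load at run time.]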
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16vl4l4_sincosf) +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf +END (_ZGVeN16vl4l4_sincosf) + +/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX512_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + /* Encoding for vmovups %zmm0, 384(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x44 + .byte 0x24 + .byte 0x06 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm1, 128(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4f + .byte 0x02 + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + /* Encoding for vmovups %zmm3, 256(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x5f + .byte 0x04 + /* Encoding for vmovups %zmm4, 320(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x67 + .byte 0x05 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 416(%rsp), %ymm0 + lea 32(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax 
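/* At this point the two AVX2 calls above have left the sixteen sin
   results at 0(%rsp)..63 and the sixteen cos results at 64(%rsp)..127,
   while the thirty-two destination pointers that arrived in
   %zmm1..%zmm4 were spilled to 128(%rsp) onward.  The surrounding block
   simply pairs them up: each movq loads one saved pointer, each movl
   stores one 32-bit result through it.  */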
+ movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -112(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -176(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $344, %esp + /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x8d + .byte 0x10 + .byte 0xff + .byte 0xff + .byte 0xff + /* Encoding for vmovdqa64 %zmm2, -304(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x95 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + /* Encoding for vmovaps %zmm0, -368(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x29 + .byte 0x85 + .byte 0x90 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + leal 32(%r12), %esi + vmovups -336(%ebp), %ymm0 + leal 32(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + 
vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $344, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN16vvv_sincosf) +WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf +END (_ZGVeN16vvv_sincosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S new file mode 100644 index 0000000000..2ab33b59a7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S @@ -0,0 +1,152 @@ +/* Function sincosf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4vl4l4_sincosf) +WRAPPER_IMPL_SSE2_fFF sincosf +END (_ZGVbN4vl4l4_sincosf) +libmvec_hidden_def (_ZGVbN4vl4l4_sincosf) + +/* SSE2 ISA version as wrapper to scalar (for vector + function declared with #pragma omp declare simd notinbranch). 
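Unlike the fFF variant above, the results here do not go through two
array pointers: the destinations arrive as vectors of pointers, with
%xmm1/%xmm2 holding the four 64-bit sin addresses and %xmm3/%xmm4 the
four cos addresses (on x32 the four 32-bit sin and cos addresses fit in
%xmm1 and %xmm2 alone). The wrapper spills them, runs scalar sincosf
once per lane into two scratch slots, and stores each pair through the
corresponding saved pointer; conceptually (a sketch only, not the
generated code):

	for (i = 0; i < 4; i++)
	  sincosf (x[i], sin_ptr[i], cos_ptr[i]);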
*/ +.macro WRAPPER_IMPL_SSE2_fFF_vvv callee +#ifndef __ILP32__ + subq $120, %rsp + cfi_adjust_cfa_offset(120) + movaps %xmm0, 96(%rsp) + lea (%rsp), %rdi + movdqa %xmm1, 32(%rdi) + lea 16(%rsp), %rsi + movdqa %xmm2, 32(%rsi) + movdqa %xmm3, 48(%rsi) + movdqa %xmm4, 64(%rsi) + call JUMPTARGET(\callee) + movss 100(%rsp), %xmm0 + lea 4(%rsp), %rdi + lea 20(%rsp), %rsi + call JUMPTARGET(\callee) + movss 104(%rsp), %xmm0 + lea 8(%rsp), %rdi + lea 24(%rsp), %rsi + call JUMPTARGET(\callee) + movss 108(%rsp), %xmm0 + lea 12(%rsp), %rdi + lea 28(%rsp), %rsi + call JUMPTARGET(\callee) + movq 32(%rsp), %rdx + movq 40(%rsp), %rsi + movq 48(%rsp), %r8 + movq 56(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 64(%rsp), %rax + movq 72(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 80(%rsp), %rdi + movq 88(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movl %r8d, (%r9) + addq $120, %rsp + cfi_adjust_cfa_offset(-120) + ret +#else + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + pushq %rbx + .cfi_def_cfa_offset 24 + .cfi_offset 3, -24 + subl $88, %esp + .cfi_def_cfa_offset 112 + leal 64(%rsp), %esi + movaps %xmm1, (%esp) + leal 48(%rsp), %edi + movaps %xmm2, 16(%esp) + movq %rsi, %rbp + movq %rdi, %rbx + movaps %xmm0, 32(%esp) + call JUMPTARGET(\callee) + movups 36(%esp), %xmm0 + leal 4(%rbp), %esi + leal 4(%rbx), %edi + call JUMPTARGET(\callee) + movups 40(%esp), %xmm0 + leal 8(%rbp), %esi + leal 8(%rbx), %edi + call JUMPTARGET(\callee) + movups 44(%esp), %xmm0 + leal 12(%rbp), %esi + leal 12(%rbx), %edi + call JUMPTARGET(\callee) + movq (%esp), %rax + movss 48(%esp), %xmm0 + movdqa (%esp), %xmm4 + movdqa 16(%esp), %xmm7 + movss %xmm0, (%eax) + movss 52(%esp), %xmm0 + pextrd $1, %xmm4, %eax + movss %xmm0, (%eax) + movq 8(%esp), %rax + movss 56(%esp), %xmm0 + movss %xmm0, (%eax) + movss 60(%esp), %xmm0 + pextrd $3, %xmm4, %eax + movss %xmm0, (%eax) + movq 16(%esp), %rax + movss 64(%esp), %xmm0 + movss %xmm0, (%eax) + movss 68(%esp), %xmm0 + pextrd $1, %xmm7, %eax + movss %xmm0, (%eax) + movq 24(%esp), %rax + movss 72(%esp), %xmm0 + movss %xmm0, (%eax) + movss 76(%esp), %xmm0 + pextrd $3, %xmm7, %eax + movss %xmm0, (%eax) + addl $88, %esp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + ret +#endif +.endm + +ENTRY (_ZGVbN4vvv_sincosf) +WRAPPER_IMPL_SSE2_fFF_vvv sincosf +END (_ZGVbN4vvv_sincosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S new file mode 100644 index 0000000000..757d39c522 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S @@ -0,0 +1,200 @@ +/* Function sincosf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8vl4l4_sincosf) +WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf +END (_ZGVdN8vl4l4_sincosf) +libmvec_hidden_def (_ZGVdN8vl4l4_sincosf) + +/* AVX2 ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX2_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $224, %rsp + vmovups %ymm0, 192(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovups 208(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $184, %esp + vmovdqa %ymm1, -144(%ebp) + vmovdqa %ymm2, -176(%ebp) + vmovaps %ymm0, -208(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovups -192(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -144(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -140(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -136(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -132(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -128(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -124(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -120(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -116(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -176(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl 
-172(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -168(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -164(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -160(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -156(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -152(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -148(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $184, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVdN8vvv_sincosf) +WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN4vl4l4_sincosf +END (_ZGVdN8vvv_sincosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S new file mode 100644 index 0000000000..0955924cdd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S @@ -0,0 +1,198 @@ +/* Function sincosf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVcN8vl4l4_sincosf) +WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf +END (_ZGVcN8vl4l4_sincosf) + +/* AVX ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). 
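Eight lanes need sixteen destination addresses; in the 64-bit ABI
fourteen of them arrive packed two per register in %xmm1..%xmm7 and the
final two are passed on the stack, which is why the macro below reads
16(%rbp) and 24(%rbp) near the end. The SSE version is invoked twice,
once per 128-bit half of the input, and the eight sin and eight cos
results are then stored through the saved pointers one by one.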
*/ +.macro WRAPPER_IMPL_AVX_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + movq %rsp, %rbp + andq $-32, %rsp + subq $224, %rsp + vmovups %ymm0, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %xmm1, 96(%rdi) + vmovdqu %xmm2, 112(%rdi) + vmovdqu %xmm3, 128(%rdi) + vmovdqu %xmm4, 144(%rdi) + vmovdqu %xmm5, 160(%rdi) + lea 32(%rsp), %rsi + vmovdqu %xmm6, 144(%rsi) + vmovdqu %xmm7, 160(%rsi) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 80(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 96(%rsp), %rdx + movq 104(%rsp), %rsi + movq 112(%rsp), %r8 + movq 120(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 128(%rsp), %rax + movq 136(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 144(%rsp), %rdi + movq 152(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 160(%rsp), %r11 + movq 168(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 176(%rsp), %rsi + movq 184(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 192(%rsp), %r10 + movq 200(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 16(%rbp), %rcx + movq 24(%rbp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + popq %rbp + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $184, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovaps %xmm3, -160(%ebp) + vmovaps %xmm4, -176(%ebp) + vmovaps %ymm0, -208(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovups -192(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovss -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm7 + vmovdqa -144(%ebp), %xmm3 + vmovss %xmm0, (%eax) + vmovss -108(%ebp), %xmm0 + vpextrd $1, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -120(%ebp), %rax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -100(%ebp), %xmm0 + vpextrd $3, %xmm7, %eax + vmovdqa -160(%ebp), %xmm7 + vmovss %xmm0, (%eax) + movq -144(%ebp), %rax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -92(%ebp), %xmm0 + vpextrd $1, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -136(%ebp), %rax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -84(%ebp), %xmm0 + vpextrd $3, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -160(%ebp), %rax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -76(%ebp), %xmm0 + vpextrd $1, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -152(%ebp), %rax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -68(%ebp), %xmm0 + vpextrd $3, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -176(%ebp), %rax + vmovss -64(%ebp), %xmm0 + vmovdqa -176(%ebp), %xmm3 + vmovss %xmm0, (%eax) + vmovss -60(%ebp), %xmm0 + vpextrd $1, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -168(%ebp), %rax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -52(%ebp), %xmm0 + 
vpextrd $3, %xmm3, %eax + vmovss %xmm0, (%eax) + addl $184, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVcN8vvv_sincosf) +WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN4vl4l4_sincosf +END (_ZGVcN8vvv_sincosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf16_core.S new file mode 100644 index 0000000000..14473da427 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf16_core.S @@ -0,0 +1,25 @@ +/* Function sinf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_sinf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +END (_ZGVeN16v_sinf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf4_core.S new file mode 100644 index 0000000000..910f39c7f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf4_core.S @@ -0,0 +1,30 @@ +/* Function sinf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_sinf) +WRAPPER_IMPL_SSE2 sinf +END (_ZGVbN4v_sinf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_sinf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core.S new file mode 100644 index 0000000000..568c978a22 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core.S @@ -0,0 +1,29 @@ +/* Function sinf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_sinf) +WRAPPER_IMPL_AVX _ZGVbN4v_sinf +END (_ZGVdN8v_sinf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_sinf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S new file mode 100644 index 0000000000..603f59ed1b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function sinf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8v_sinf) +WRAPPER_IMPL_AVX _ZGVbN4v_sinf +END(_ZGVcN8v_sinf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.S new file mode 100644 index 0000000000..19a569118f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.S @@ -0,0 +1,111 @@ +/* Data for function cosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_trig_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function cosf. + The table may contain polynomial, reduction, lookup coefficients + and other macro_names obtained through different methods + of research and experimental work. 
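Each entry below is expanded by float_vector into a full 64-byte line,
i.e. sixteen copies of one 32-bit constant, so the same table serves
SSE, AVX, and AVX-512 loads at their natural widths. Accordingly all
offsets in svml_s_trig_data.h are multiples of 64, and the float_vector
macro defined there stops the build with .err if the layout of this
table ever drifts out of step with those offsets.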
*/ + + .globl __svml_s_trig_data +__svml_s_trig_data: + +/* General purpose constants: + absolute value mask */ +float_vector __sAbsMask 0x7fffffff + +/* threshold for out-of-range values */ +float_vector __sRangeReductionVal 0x461c4000 + +/* +INF */ +float_vector __sRangeVal 0x7f800000 + +/* High Accuracy version polynomial coefficients: + S1 = -1.66666666664728165763e-01 */ +float_vector __sS1 0xbe2aaaab + +/* S2 = 8.33329173045453069014e-03 */ +float_vector __sS2 0x3c08885c + +/* C1 = -5.00000000000000000000e-01 */ +float_vector __sC1 0xbf000000 + +/* C2 = 4.16638942914469202550e-02 */ +float_vector __sC2 0x3d2aaa7c + +/* Range reduction PI-based constants: + PI high part */ +float_vector __sPI1 0x40490000 + +/* PI mid part 1 */ +float_vector __sPI2 0x3a7da000 + +/* PI mid part 2 */ +float_vector __sPI3 0x34222000 + +/* PI low part */ +float_vector __sPI4 0x2cb4611a + +/* PI1, PI2, and PI3 when FMA is available + PI high part (when FMA available) */ +float_vector __sPI1_FMA 0x40490fdb + +/* PI mid part (when FMA available) */ +float_vector __sPI2_FMA 0xb3bbbd2e + +/* PI low part (when FMA available) */ +float_vector __sPI3_FMA 0xa7772ced + +/* Polynomial constants for work w/o FMA, relative error ~ 2^(-26.625) */ +float_vector __sA3 0xbe2aaaa6 +float_vector __sA5 0x3c08876a +float_vector __sA7 0xb94fb7ff +float_vector __sA9 0x362edef8 + +/* Polynomial constants, work with FMA, relative error ~ 2^(-26.417) */ +float_vector __sA5_FMA 0x3c088768 +float_vector __sA7_FMA 0xb94fb6cf +float_vector __sA9_FMA 0x362ec335 + +/* 1/PI */ +float_vector __sInvPI 0x3ea2f983 + +/* right-shifter constant */ +float_vector __sRShifter 0x4b400000 + +/* PI/2 */ +float_vector __sHalfPI 0x3fc90fdb + +/* 1/2 */ +float_vector __sOneHalf 0x3f000000 + +/* high accuracy table index mask */ +float_vector __iIndexMask 0x000000ff + +/* 2^(k-1) */ +float_vector __i2pK_1 0x00000040 + +/* sign field mask */ +float_vector __sSignMask 0x80000000 + + .type __svml_s_trig_data,@object + .size __svml_s_trig_data,.-__svml_s_trig_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.h new file mode 100644 index 0000000000..04f4f7b1ed --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.h @@ -0,0 +1,62 @@ +/* Offsets for data table for vectorized sinf, cosf, sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#ifndef S_TRIG_DATA_H +#define S_TRIG_DATA_H + +.macro float_vector offset value +.if .-__svml_s_trig_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#define __sAbsMask 0 +#define __sRangeReductionVal 64 +#define __sRangeVal 64*2 +#define __sS1 64*3 +#define __sS2 64*4 +#define __sC1 64*5 +#define __sC2 64*6 +#define __sPI1 64*7 +#define __sPI2 64*8 +#define __sPI3 64*9 +#define __sPI4 64*10 +#define __sPI1_FMA 64*11 +#define __sPI2_FMA 64*12 +#define __sPI3_FMA 64*13 +#define __sA3 64*14 +#define __sA5 64*15 +#define __sA7 64*16 +#define __sA9 64*17 +#define __sA5_FMA 64*18 +#define __sA7_FMA 64*19 +#define __sA9_FMA 64*20 +#define __sInvPI 64*21 +#define __sRShifter 64*22 +#define __sHalfPI 64*23 +#define __sOneHalf 64*24 +#define __iIndexMask 64*25 +#define __i2pK_1 64*26 +#define __sSignMask 64*27 +#define __dT_cosf 64*28 +#define __dT 64*92 + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h new file mode 100644 index 0000000000..cd6d58361c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h @@ -0,0 +1,371 @@ +/* Wrapper implementations of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2 callee + subq $40, %rsp + cfi_adjust_cfa_offset(40) + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + movss %xmm0, 16(%rsp) + movss 4(%rsp), %xmm0 + call JUMPTARGET(\callee) + movss %xmm0, 20(%rsp) + movss 8(%rsp), %xmm0 + call JUMPTARGET(\callee) + movss %xmm0, 24(%rsp) + movss 12(%rsp), %xmm0 + call JUMPTARGET(\callee) + movss 16(%rsp), %xmm3 + movss 20(%rsp), %xmm2 + movss 24(%rsp), %xmm1 + movss %xmm0, 28(%rsp) + unpcklps %xmm1, %xmm3 + unpcklps %xmm0, %xmm2 + unpcklps %xmm2, %xmm3 + movaps %xmm3, %xmm0 + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + ret +.endm + +/* 2 argument SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2_ff callee + subq $56, %rsp + cfi_adjust_cfa_offset(56) + movaps %xmm0, (%rsp) + movaps %xmm1, 16(%rsp) + call JUMPTARGET(\callee) + movss %xmm0, 32(%rsp) + movss 4(%rsp), %xmm0 + movss 20(%rsp), %xmm1 + call JUMPTARGET(\callee) + movss %xmm0, 36(%rsp) + movss 8(%rsp), %xmm0 + movss 24(%rsp), %xmm1 + call JUMPTARGET(\callee) + movss %xmm0, 40(%rsp) + movss 12(%rsp), %xmm0 + movss 28(%rsp), %xmm1 + call JUMPTARGET(\callee) + movss 32(%rsp), %xmm3 + movss 36(%rsp), %xmm2 + movss 40(%rsp), %xmm1 + movss %xmm0, 44(%rsp) + unpcklps %xmm1, %xmm3 + unpcklps %xmm0, %xmm2 + unpcklps %xmm2, %xmm3 + movaps %xmm3, %xmm0 + addq $56, %rsp + cfi_adjust_cfa_offset(-56) + ret +.endm + +/* 3 argument SSE2 ISA version as wrapper to scalar. 
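This is the sincosf shape: one vector input plus two output arrays
passed by pointer. The macro keeps the incoming %rdi/%rsi array
pointers in %rbp/%rbx, calls the scalar routine four times against a
pair of scratch slots at 24(%rsp)/28(%rsp), copies each scalar pair out
to the next element of both arrays, and rotates the next input lane
into %xmm0 with shufps/unpckhps between the calls.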
*/ +.macro WRAPPER_IMPL_SSE2_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + pushq %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + movq %rdi, %rbp + movq %rsi, %rbx + subq $40, %rsp + cfi_adjust_cfa_offset(40) + leaq 24(%rsp), %rsi + leaq 28(%rsp), %rdi + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + leaq 24(%rsp), %rsi + leaq 28(%rsp), %rdi + movss 28(%rsp), %xmm0 + movss %xmm0, 0(%rbp) + movaps (%rsp), %xmm1 + movss 24(%rsp), %xmm0 + movss %xmm0, (%rbx) + movaps %xmm1, %xmm0 + shufps $85, %xmm1, %xmm0 + call JUMPTARGET(\callee) + movss 28(%rsp), %xmm0 + leaq 24(%rsp), %rsi + movss %xmm0, 4(%rbp) + leaq 28(%rsp), %rdi + movaps (%rsp), %xmm1 + movss 24(%rsp), %xmm0 + movss %xmm0, 4(%rbx) + movaps %xmm1, %xmm0 + unpckhps %xmm1, %xmm0 + call JUMPTARGET(\callee) + movaps (%rsp), %xmm1 + leaq 24(%rsp), %rsi + leaq 28(%rsp), %rdi + movss 28(%rsp), %xmm0 + shufps $255, %xmm1, %xmm1 + movss %xmm0, 8(%rbp) + movss 24(%rsp), %xmm0 + movss %xmm0, 8(%rbx) + movaps %xmm1, %xmm0 + call JUMPTARGET(\callee) + movss 28(%rsp), %xmm0 + movss %xmm0, 12(%rbp) + movss 24(%rsp), %xmm0 + movss %xmm0, 12(%rbx) + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + popq %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $32, %rsp + vextractf128 $1, %ymm0, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, 16(%rsp) + vmovaps (%rsp), %xmm0 + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, %xmm1 + vmovaps 16(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $64, %rsp + vextractf128 $1, %ymm0, 16(%rsp) + vextractf128 $1, %ymm1, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, 32(%rsp) + vmovaps 16(%rsp), %xmm0 + vmovaps (%rsp), %xmm1 + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, %xmm1 + vmovaps 32(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. 
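The 256-bit input is handled as two 128-bit halves: the first SSE call
still sees the caller's own %rdi/%rsi and writes the low four results
straight into the destination arrays, while the second call targets
stack scratch whose contents are then copied to offset 16 of each array
through the pointers saved in %r13/%r14.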
*/ +.macro WRAPPER_IMPL_AVX_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) + pushq %r14 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r14, 0) + subq $48, %rsp + movq %rsi, %r14 + vmovaps %ymm0, (%rsp) + movq %rdi, %r13 + vmovaps 16(%rsp), %xmm1 + vmovaps %xmm1, 32(%rsp) + vzeroupper + vmovaps (%rsp), %xmm0 + call HIDDEN_JUMPTARGET(\callee) + vmovaps 32(%rsp), %xmm0 + lea (%rsp), %rdi + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovaps (%rsp), %xmm0 + vmovaps 16(%rsp), %xmm1 + vmovaps %xmm0, 16(%r13) + vmovaps %xmm1, 16(%r14) + addq $48, %rsp + popq %r14 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r14) + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512 callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $128, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 64(%rsp) + vmovupd 32(%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 96(%rsp) +/* Below is encoding for vmovups 64(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x01 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $192, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovups %zmm1, 64(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x01 + vmovups (%rsp), %ymm0 + vmovups 64(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovups %ymm0, 128(%rsp) + vmovups 32(%rsp), %ymm0 + vmovups 96(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovups %ymm0, 160(%rsp) +/* Below is encoding for vmovups 128(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x02 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + pushq %r12 + pushq %r13 + subq $176, %rsp + movq %rsi, %r13 +/* Below is encoding for vmovaps %zmm0, (%rsp). 
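These AVX-512 moves are spelled out as raw bytes, presumably so the
files still assemble with binutils too old to know the mnemonics. They
are ordinary EVEX encodings: 0x62 starts the EVEX prefix, the next
three bytes select the 0F opcode map and the 512-bit vector length,
0x29 is the vmovaps store opcode, and 0x04 0x24 is the usual ModRM/SIB
pair for (%rsp). Note that an EVEX disp8 is scaled by the vector
length, so in the vmovups encodings above a trailing 0x01 or 0x02 means
a displacement of 64 or 128 bytes, not 1 or 2.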
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x29 + .byte 0x04 + .byte 0x24 + movq %rdi, %r12 + vmovaps (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovaps 32(%rsp), %ymm0 + lea 64(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovaps 64(%rsp), %ymm0 + vmovaps 96(%rsp), %ymm1 + vmovaps %ymm0, 32(%r12) + vmovaps %ymm1, 32(%r13) + addq $176, %rsp + popq %r13 + popq %r12 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c @@ -0,0 +1 @@ 
+#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c new file mode 100644 index 0000000000..d549c3ec19 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c @@ -0,0 +1,25 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <stdlib.h> +#include <math-tests-arch.h> + +#include "test-double.h" +#include "test-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias.c new file mode 100644 index 0000000000..c7048d346f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias.c @@ -0,0 +1,29 @@ +/* Part of test to ensure link against *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +extern int +test_finite_alias (void); + +static int +do_test (void) +{ + return test_finite_alias (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c new file mode 100644 index 0000000000..c33436dc0f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c @@ -0,0 +1,43 @@ +/* Test for vector sincos ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <math.h> + +#define N 1000 +double x[N], s[N], c[N]; +double* s_ptrs[N]; +double* c_ptrs[N]; + +int +test_sincos_abi (void) +{ + int i; + + for(i = 0; i < N; i++) + { + x[i] = i / 3; + s_ptrs[i] = &s[i]; + c_ptrs[i] = &c[i]; + } + +#pragma omp simd + for(i = 0; i < N; i++) + sincos (x[i], s_ptrs[i], c_ptrs[i]); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c new file mode 100644 index 0000000000..9be71edd93 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c @@ -0,0 +1,44 @@ +/* Test for vector sincos ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math-tests-arch.h> + +extern int test_sincos_abi (void); + +int arch_check = 1; + +static void +check_arch (void) +{ + CHECK_ARCH_EXT; + arch_check = 0; +} + +static int +do_test (void) +{ + check_arch (); + + if (arch_check) + return 77; + + return test_sincos_abi (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c new file mode 100644 index 0000000000..b4457f700a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c @@ -0,0 +1,33 @@ +/* Wrapper part of tests for SSE ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "test-double-vlen2.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m128d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow) + +#define VEC_INT_TYPE __m128i + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c new file mode 100644 index 0000000000..e6b991ceaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c @@ -0,0 +1,40 @@ +/* Wrapper part of tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-double-vlen4.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen4_avx2 + +#define VEC_TYPE __m256d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVdN4v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow) + +#ifndef __ILP32__ +# define VEC_INT_TYPE __m256i +#else +# define VEC_INT_TYPE __m128i +#endif + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVdN4vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h new file mode 100644 index 0000000000..a15d4be31f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h @@ -0,0 +1,25 @@ +/* Tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <test-double-vlen4.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen4_avx2 + +#undef REQUIRE_AVX +#define REQUIRE_AVX2 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c new file mode 100644 index 0000000000..3606b6f55f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-double-vlen4.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m256d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVcN4v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4.h b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4.h new file mode 100644 index 0000000000..1698e621d6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4.h @@ -0,0 +1,21 @@ +/* Tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include_next <test-double-vlen4.h> + +#define REQUIRE_AVX diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c new file mode 100644 index 0000000000..d77b43046d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX-512 versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-double-vlen8.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m512d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVeN8v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow) + +#ifndef __ILP32__ +# define VEC_INT_TYPE __m512i +#else +# define VEC_INT_TYPE __m256i +#endif + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVeN8vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8.h b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8.h new file mode 100644 index 0000000000..5802abc121 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8.h @@ -0,0 +1,21 @@ +/* Tests for AVX-512 versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include_next <test-double-vlen8.h> + +#define REQUIRE_AVX512F diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c new 
file mode 100644 index 0000000000..109307f997 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c @@ -0,0 +1,25 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <stdlib.h> +#include <math-tests-arch.h> + +#include "test-float.h" +#include "test-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias.c new file mode 100644 index 0000000000..c7048d346f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias.c @@ -0,0 +1,29 @@ +/* Part of test to ensure link against *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +extern int +test_finite_alias (void); + +static int +do_test (void) +{ + return test_finite_alias (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c new file mode 100644 index 0000000000..5dd1efa8f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c @@ -0,0 +1,42 @@ +/* Test for vector sincosf ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <math.h> + +#define N 1000 +float x[N], s[N], c[N]; +float *s_ptrs[N]; +float *c_ptrs[N]; + +int +test_sincosf_abi (void) +{ + int i; + for(i = 0; i < N; i++) + { + x[i] = i / 3; + s_ptrs[i] = &s[i]; + c_ptrs[i] = &c[i]; + } + +#pragma omp simd + for(i = 0; i < N; i++) + sincosf (x[i], s_ptrs[i], c_ptrs[i]); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c new file mode 100644 index 0000000000..79543f5cb0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c @@ -0,0 +1,44 @@ +/* Test for vector sincosf ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math-tests-arch.h> + +extern int test_sincosf_abi (void); + +int arch_check = 1; + +static void +check_arch (void) +{ + CHECK_ARCH_EXT; + arch_check = 0; +} + +static int +do_test (void) +{ + check_arch (); + + if (arch_check) + return 77; + + return test_sincosf_abi (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c new file mode 100644 index 0000000000..2e729e2770 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX-512 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
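The exit status 77 used by do_test when CHECK_ARCH_EXT trips is the test harness's "unsupported" code, so these ABI tests report skipped rather than failed on CPUs without the required extension. A sketch of the same guard, with __builtin_cpu_supports standing in (as an assumption) for the real CHECK_ARCH_EXT machinery:

extern int test_sincosf_abi (void);

static int
do_test_guarded (void)
{
  /* Hypothetical stand-in for CHECK_ARCH_EXT.  */
  if (!__builtin_cpu_supports ("avx2"))
    return 77;                  /* harness reads 77 as "unsupported" */
  return test_sincosf_abi ();
}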
*/ + +#include "test-float-vlen16.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m512 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVeN16v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf) + +#define VEC_INT_TYPE __m512i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16.h b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16.h new file mode 100644 index 0000000000..b2bfbf5371 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16.h @@ -0,0 +1,21 @@ +/* Tests for AVX-512 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include_next <test-float-vlen16.h> + +#define REQUIRE_AVX512F diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c new file mode 100644 index 0000000000..a332a65236 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for SSE ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "test-float-vlen4.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m128 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c new file mode 100644 index 0000000000..511f9342a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c @@ -0,0 +1,43 @@ +/* Wrapper part of tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-float-vlen8.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen8_avx2 + +#define VEC_TYPE __m256 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVdN8v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf) + +/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */ +#undef VECTOR_WRAPPER_fFF + +#define VEC_INT_TYPE __m256i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h new file mode 100644 index 0000000000..4967f9d19b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h @@ -0,0 +1,25 @@ +/* Tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <test-float-vlen8.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen8_avx2 + +#undef REQUIRE_AVX +#define REQUIRE_AVX2 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c new file mode 100644 index 0000000000..5a3581b0c8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-float-vlen8.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m256 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVcN8v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_4 (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8.h b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8.h new file mode 100644 index 0000000000..23ef71c6c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8.h @@ -0,0 +1,21 @@ +/* Tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include_next <test-float-vlen8.h> + +#define REQUIRE_AVX diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c new file mode 100644 index 0000000000..9746b0ae1c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c @@ -0,0 +1,66 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. 
+ Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define N 4000 +FLOAT log_arg[N]; +FLOAT exp_arg[N]; +FLOAT log_res[N]; +FLOAT exp_res[N]; +FLOAT pow_res[N]; +int arch_check = 1; + +static void +init_arg (void) +{ + int i; + + CHECK_ARCH_EXT; + + arch_check = 0; + + for (i = 0; i < N; i += 1) + { + log_arg[i] = 1.0; + exp_arg[i] = 0.0; + } +} + +int +test_finite_alias (void) +{ + int i; + + init_arg (); + + if (arch_check) return 77; + +#pragma omp simd + for (i = 0; i < N; i += 1) + { + log_res[i] = FUNC (log) (log_arg[i]); + exp_res[i] = FUNC (exp) (exp_arg[i]); + pow_res[i] = FUNC (pow) (log_arg[i], log_arg[i]); + } + + if (log_res[0] != 0.0) return 1; + if (exp_res[0] != 1.0) return 1; + if (pow_res[0] != 1.0) return 1; + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/x86_64-math-asm.h b/REORG.TODO/sysdeps/x86_64/fpu/x86_64-math-asm.h new file mode 100644 index 0000000000..4b4e40c3e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/x86_64-math-asm.h @@ -0,0 +1,74 @@ +/* Helper macros for x86_64 libm functions. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86_64_MATH_ASM_H +#define _X86_64_MATH_ASM_H 1 + +/* Define constants for the minimum value of a floating-point + type. */ +#define DEFINE_LDBL_MIN \ + .section .rodata.cst16,"aM",@progbits,16; \ + .p2align 4; \ + .type ldbl_min,@object; \ +ldbl_min: \ + .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0; \ + .byte 0, 0, 0, 0, 0, 0; \ + .size ldbl_min, .-ldbl_min; + +/* Force an underflow exception if the given value (nonnegative or + NaN) is subnormal. The relevant constant for the minimum of the + type must have been defined, the MO macro must have been defined + for access to memory operands, and, if PIC, the PIC register must + have been loaded. */ +#define LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fucomip %st(1), %st(0); \ + fstp %st(0); \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +/* Likewise, but the argument is not a NaN. 
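The identities test_finite_alias checks are exact in IEEE 754 arithmetic (log 1 = 0, exp 0 = 1, 1 to the power 1 = 1), so its != comparisons are safe; the point of the test is that the simd loop, presumably built with -ffast-math, links against the *_finite aliases that libmvec must export in vector form. A scalar restatement of the check, as a sketch:

#include <math.h>
#include <assert.h>

static void
check_exact_identities (void)
{
  /* All three results are exact, so equality comparison is safe.  */
  assert (log (1.0) == 0.0);
  assert (exp (0.0) == 1.0);
  assert (pow (1.0, 1.0) == 1.0);
}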
*/ +#define LDBL_CHECK_FORCE_UFLOW_NONNAN \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fabs; \ + fcomip %st(1), %st(0); \ + fstp %st(0); \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +/* Likewise, but the argument is nonnegative and not a NaN. */ +#define LDBL_CHECK_FORCE_UFLOW_NONNEG \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fcomip %st(1), %st(0); \ + fstp %st(0); \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +#endif /* x86_64-math-asm.h. */ diff --git a/REORG.TODO/sysdeps/x86_64/hp-timing.h b/REORG.TODO/sysdeps/x86_64/hp-timing.h new file mode 100644 index 0000000000..1b2d2cde33 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/hp-timing.h @@ -0,0 +1,40 @@ +/* High precision, low overhead timing functions. x86-64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _HP_TIMING_H +#define _HP_TIMING_H 1 + +/* We always assume having the timestamp register. */ +#define HP_TIMING_AVAIL (1) +#define HP_SMALL_TIMING_AVAIL (1) + +/* We indeed have inlined functions. */ +#define HP_TIMING_INLINE (1) + +/* We use 64bit values for the times. */ +typedef unsigned long long int hp_timing_t; + +/* The "=A" constraint used in 32-bit mode does not work in 64-bit mode. */ +#define HP_TIMING_NOW(Var) \ + ({ unsigned int _hi, _lo; \ + asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \ + (Var) = ((unsigned long long int) _hi << 32) | _lo; }) + +#include <hp-timing-common.h> + +#endif /* hp-timing.h */ diff --git a/REORG.TODO/sysdeps/x86_64/htonl.S b/REORG.TODO/sysdeps/x86_64/htonl.S new file mode 100644 index 0000000000..dcc6bca592 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/htonl.S @@ -0,0 +1,34 @@ +/* Change byte order in word. For AMD x86-64. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
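HP_TIMING_NOW above assembles the rdtsc result from %edx:%eax by hand because, as its comment notes, the 32-bit "=A" constraint does not describe a 64-bit register pair on x86-64. The same read as a free-standing helper (a sketch; on modern parts the TSC ticks at a constant reference rate, so deltas measure elapsed time rather than core cycles):

#include <stdint.h>

static inline uint64_t
read_tsc (void)
{
  unsigned int hi, lo;
  __asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
  return ((uint64_t) hi << 32) | lo;
}

/* Usage: uint64_t t0 = read_tsc (); ...; uint64_t dt = read_tsc () - t0; */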
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + word %rdi +*/ + + .text +ENTRY (htonl) + movl %edi, %eax + bswap %eax + ret +END (htonl) + +weak_alias (htonl, ntohl) diff --git a/REORG.TODO/sysdeps/x86_64/ifuncmain8.c b/REORG.TODO/sysdeps/x86_64/ifuncmain8.c new file mode 100644 index 0000000000..c97cad0af4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ifuncmain8.c @@ -0,0 +1,32 @@ +/* Test IFUNC selector with floating-point parameters. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdlib.h> + +extern float foo (float); + +static int +do_test (void) +{ + if (foo (2) != 3) + abort (); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/ifuncmod8.c b/REORG.TODO/sysdeps/x86_64/ifuncmod8.c new file mode 100644 index 0000000000..037158b2b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ifuncmod8.c @@ -0,0 +1,37 @@ +/* Test IFUNC selector with floating-point parameters. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <emmintrin.h> + +void * foo_ifunc (void) __asm__ ("foo"); +__asm__(".type foo, %gnu_indirect_function"); + +static float +foo_impl (float x) +{ + return x + 1; +} + +void * +inhibit_stack_protector +foo_ifunc (void) +{ + __m128i xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + return foo_impl; +} diff --git a/REORG.TODO/sysdeps/x86_64/jmpbuf-offsets.h b/REORG.TODO/sysdeps/x86_64/jmpbuf-offsets.h new file mode 100644 index 0000000000..7471deaae8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/jmpbuf-offsets.h @@ -0,0 +1,29 @@ +/* Private macros for accessing __jmp_buf contents. x86-64 version. + Copyright (C) 2006-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
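The resolver in ifuncmod8.c above clobbers %xmm0 deliberately: foo takes its float argument in that register, so the test only passes if the dynamic linker preserves vector argument registers around IFUNC resolution. For contrast, a sketch of the same selector in the more common GCC attribute spelling (which would not let the test inject the clobber):

static float
foo_plus_one (float x)
{
  return x + 1;
}

/* The resolver runs once, at relocation time, and picks an
   implementation.  */
static void *
foo_resolver (void)
{
  return foo_plus_one;
}

float foo (float) __attribute__ ((ifunc ("foo_resolver")));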
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* We only need to save callee-saved registers plus the stack pointer and + program counter. */ +#define JB_RBX 0 +#define JB_RBP 1 +#define JB_R12 2 +#define JB_R13 3 +#define JB_R14 4 +#define JB_R15 5 +#define JB_RSP 6 +#define JB_PC 7 +#define JB_SIZE (8*8) diff --git a/REORG.TODO/sysdeps/x86_64/jmpbuf-unwind.h b/REORG.TODO/sysdeps/x86_64/jmpbuf-unwind.h new file mode 100644 index 0000000000..a22c77af05 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/jmpbuf-unwind.h @@ -0,0 +1,49 @@ +/* Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <setjmp.h> +#include <jmpbuf-offsets.h> +#include <stdint.h> +#include <unwind.h> +#include <sysdep.h> + +/* Test if longjmp to JMPBUF would unwind the frame + containing a local variable at ADDRESS. */ +#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \ + ((void *) (address) < (void *) demangle ((jmpbuf)[JB_RSP])) + +#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) \ + _JMPBUF_UNWINDS_ADJ (_jmpbuf, \ + (void *) (_Unwind_Ptr) _Unwind_GetCFA (_context), \ + _adj) + +static inline uintptr_t __attribute__ ((unused)) +_jmpbuf_sp (__jmp_buf regs) +{ + uintptr_t sp = regs[JB_RSP]; +#ifdef PTR_DEMANGLE + PTR_DEMANGLE (sp); +#endif + return sp; +} + +#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \ + ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj)) + +/* We use the normal longjmp for unwinding.
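_JMPBUF_UNWINDS above reduces to a single comparison: on x86-64 the stack grows downward, so a local whose address lies below the (demangled) saved %rsp lives in a frame the longjmp would discard. Spelled out in C (a sketch; PTR_DEMANGLE, the inverse of glibc's pointer-guard mangling, is elided):

#include <stdint.h>

static int
frame_would_unwind (uintptr_t demangled_saved_rsp, const void *local)
{
  /* Stack grows down: below the saved %rsp means already popped.  */
  return (uintptr_t) local < demangled_saved_rsp;
}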
*/ +#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val) diff --git a/REORG.TODO/sysdeps/x86_64/l10nflist.c b/REORG.TODO/sysdeps/x86_64/l10nflist.c new file mode 100644 index 0000000000..2e08372338 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/l10nflist.c @@ -0,0 +1,13 @@ +#ifdef __POPCNT__ +# include <popcntintrin.h> + +static inline unsigned int +pop (unsigned int x) +{ + return _mm_popcnt_u32 (x); +} +# define ARCH_POP 1 + +#endif + +#include <intl/l10nflist.c> diff --git a/REORG.TODO/sysdeps/x86_64/ldbl2mpn.c b/REORG.TODO/sysdeps/x86_64/ldbl2mpn.c new file mode 100644 index 0000000000..641b789cd4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ldbl2mpn.c @@ -0,0 +1 @@ +#include "../i386/ldbl2mpn.c" diff --git a/REORG.TODO/sysdeps/x86_64/ldsodefs.h b/REORG.TODO/sysdeps/x86_64/ldsodefs.h new file mode 100644 index 0000000000..19ff8c8209 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ldsodefs.h @@ -0,0 +1,56 @@ +/* Run-time dynamic linker data structures for loaded ELF shared objects. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
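l10nflist.c above only swaps in _mm_popcnt_u32 when the unit is built with -mpopcnt (so that __POPCNT__ is defined); otherwise the generic intl code supplies pop. For reference, a portable fallback with the same result (Kernighan's clear-lowest-bit loop; a sketch, not the actual generic implementation):

static inline unsigned int
pop_fallback (unsigned int x)
{
  unsigned int n = 0;
  while (x != 0)
    {
      x &= x - 1;               /* clear the lowest set bit */
      ++n;
    }
  return n;
}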
*/ + +#ifndef _X86_64_LDSODEFS_H +#define _X86_64_LDSODEFS_H 1 + +#include <elf.h> +#include <cpu-features.h> + +struct La_x86_64_regs; +struct La_x86_64_retval; +struct La_x32_regs; +struct La_x32_retval; + +#define ARCH_PLTENTER_MEMBERS \ + Elf64_Addr (*x86_64_gnu_pltenter) (Elf64_Sym *, unsigned int, \ + uintptr_t *, \ + uintptr_t *, struct La_x86_64_regs *, \ + unsigned int *, const char *name, \ + long int *framesizep); \ + Elf32_Addr (*x32_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \ + uintptr_t *, struct La_x32_regs *, \ + unsigned int *, const char *name, \ + long int *framesizep) + +#define ARCH_PLTEXIT_MEMBERS \ + unsigned int (*x86_64_gnu_pltexit) (Elf64_Sym *, unsigned int, \ + uintptr_t *, \ + uintptr_t *, \ + const struct La_x86_64_regs *, \ + struct La_x86_64_retval *, \ + const char *); \ + unsigned int (*x32_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \ + uintptr_t *, \ + const struct La_x32_regs *, \ + struct La_x86_64_retval *, \ + const char *) + +#include_next <ldsodefs.h> + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/link-defines.sym b/REORG.TODO/sysdeps/x86_64/link-defines.sym new file mode 100644 index 0000000000..963c69b320 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/link-defines.sym @@ -0,0 +1,38 @@ +#include "link.h" +#include <stddef.h> + +-- +VECTOR_SIZE sizeof (La_x86_64_vector) +XMM_SIZE sizeof (La_x86_64_xmm) +YMM_SIZE sizeof (La_x86_64_ymm) +ZMM_SIZE sizeof (La_x86_64_zmm) +BND_SIZE sizeof (__int128_t) + +LR_SIZE sizeof (struct La_x86_64_regs) +LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx) +LR_R8_OFFSET offsetof (struct La_x86_64_regs, lr_r8) +LR_R9_OFFSET offsetof (struct La_x86_64_regs, lr_r9) +LR_RCX_OFFSET offsetof (struct La_x86_64_regs, lr_rcx) +LR_RSI_OFFSET offsetof (struct La_x86_64_regs, lr_rsi) +LR_RDI_OFFSET offsetof (struct La_x86_64_regs, lr_rdi) +LR_RBP_OFFSET offsetof (struct La_x86_64_regs, lr_rbp) +LR_RSP_OFFSET offsetof (struct La_x86_64_regs, lr_rsp) +LR_XMM_OFFSET offsetof (struct La_x86_64_regs, lr_xmm) +LR_VECTOR_OFFSET offsetof (struct La_x86_64_regs, lr_vector) +#ifndef __ILP32__ +LR_BND_OFFSET offsetof (struct La_x86_64_regs, lr_bnd) +#endif + +LRV_SIZE sizeof (struct La_x86_64_retval) +LRV_RAX_OFFSET offsetof (struct La_x86_64_retval, lrv_rax) +LRV_RDX_OFFSET offsetof (struct La_x86_64_retval, lrv_rdx) +LRV_XMM0_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm0) +LRV_XMM1_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm1) +LRV_ST0_OFFSET offsetof (struct La_x86_64_retval, lrv_st0) +LRV_ST1_OFFSET offsetof (struct La_x86_64_retval, lrv_st1) +LRV_VECTOR0_OFFSET offsetof (struct La_x86_64_retval, lrv_vector0) +LRV_VECTOR1_OFFSET offsetof (struct La_x86_64_retval, lrv_vector1) +#ifndef __ILP32__ +LRV_BND0_OFFSET offsetof (struct La_x86_64_retval, lrv_bnd0) +LRV_BND1_OFFSET offsetof (struct La_x86_64_retval, lrv_bnd1) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/locale-defines.sym b/REORG.TODO/sysdeps/x86_64/locale-defines.sym new file mode 100644 index 0000000000..aebff9a4f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/locale-defines.sym @@ -0,0 +1,11 @@ +#include <locale/localeinfo.h> +#include <langinfo.h> +#include <stddef.h> + +-- + +LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) +LC_CTYPE +_NL_CTYPE_NONASCII_CASE +LOCALE_DATA_VALUES offsetof (struct __locale_data, values) +SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0]) diff --git a/REORG.TODO/sysdeps/x86_64/localplt.data b/REORG.TODO/sysdeps/x86_64/localplt.data new file mode 100644 index 
0000000000..a1840cff31 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/localplt.data @@ -0,0 +1,20 @@ +# See scripts/check-localplt.awk for how this file is processed. +# PLT use is required for the malloc family and for matherr because +# users can define their own functions and have library internals call them. +# Linker in binutils 2.26 and newer consolidates R_X86_64_JUMP_SLOT +# relocation with R_X86_64_GLOB_DAT relocation against the same symbol. +libc.so: calloc + RELA R_X86_64_GLOB_DAT +libc.so: free + RELA R_X86_64_GLOB_DAT +libc.so: malloc + RELA R_X86_64_GLOB_DAT +libc.so: memalign + RELA R_X86_64_GLOB_DAT +libc.so: realloc + RELA R_X86_64_GLOB_DAT +libm.so: matherr + RELA R_X86_64_GLOB_DAT +# The main malloc is interposed into the dynamic linker, for +# allocations after the initial link (when dlopen is used). +ld.so: malloc + RELA R_X86_64_GLOB_DAT +ld.so: calloc + RELA R_X86_64_GLOB_DAT +ld.so: realloc + RELA R_X86_64_GLOB_DAT +ld.so: free + RELA R_X86_64_GLOB_DAT +# The TLS-enabled version of these functions is interposed from libc.so. +ld.so: _dl_signal_error + RELA R_X86_64_GLOB_DAT +ld.so: _dl_catch_error + RELA R_X86_64_GLOB_DAT diff --git a/REORG.TODO/sysdeps/x86_64/lshift.S b/REORG.TODO/sysdeps/x86_64/lshift.S new file mode 100644 index 0000000000..690f83555a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/lshift.S @@ -0,0 +1,116 @@ +/* x86-64 __mpn_lshift -- + Copyright (C) 2007-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl + + .text +ENTRY (__mpn_lshift) + lea -8(rp,n,8), rp + lea -8(up,n,8), up + + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov -8(up), %r11 + xor %eax, %eax + shld %cl, %r10, %rax + mov -16(up), %r8 + lea 24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shld %cl, %r9, %rax + sub $2, n + jb L(le1) + mov -8(up), %r10 + mov -16(up), %r11 + lea -8(up), up + lea 16(rp), rp + jmp L(01) +L(le1): shl %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov -8(up), %r9 + xor %eax, %eax + shld %cl, %r8, %rax + sub $3, n + jb L(le2) + mov -16(up), %r10 + lea -16(up), up + lea 8(rp), rp + jmp L(10) +L(le2): shld %cl, %r9, %r8 + mov %r8, (rp) + shl %cl, %r9 + mov %r9, -8(rp) + ret + + .p2align 4 /* performance critical! */ +L(b11): /* n = 3, 7, 11, ... 
*/ + mov (up), %r11 + mov -8(up), %r8 + xor %eax, %eax + shld %cl, %r11, %rax + mov -16(up), %r9 + lea -24(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shld %cl, %r8, %r11 + mov (up), %r10 + mov %r11, (rp) +L(10): shld %cl, %r9, %r8 + mov -8(up), %r11 + mov %r8, -8(rp) +L(01): shld %cl, %r10, %r9 + mov -16(up), %r8 + mov %r9, -16(rp) +L(00): shld %cl, %r11, %r10 + mov -24(up), %r9 + mov %r10, -24(rp) + add $-32, up + lea -32(rp), rp + sub $4, n + jnc L(top) + +L(end): shld %cl, %r8, %r11 + mov %r11, (rp) + shld %cl, %r9, %r8 + mov %r8, -8(rp) + shl %cl, %r9 + mov %r9, -16(rp) + ret +END (__mpn_lshift) diff --git a/REORG.TODO/sysdeps/x86_64/machine-gmon.h b/REORG.TODO/sysdeps/x86_64/machine-gmon.h new file mode 100644 index 0000000000..4fdccf8d17 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/machine-gmon.h @@ -0,0 +1,38 @@ +/* x86-64-specific implementation of profiling support. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* We need a special version of the `mcount' function for x86-64, so + that it does not use __builtin_return_address (N) and avoids + clobbering registers. */ + + +/* We must not pollute the global namespace. */ +#define mcount_internal __mcount_internal + +void mcount_internal (u_long frompc, u_long selfpc); + +#define _MCOUNT_DECL(frompc, selfpc) \ +void mcount_internal (u_long frompc, u_long selfpc) + + +/* Define MCOUNT as empty since we have the implementation in another + file. */ +#define MCOUNT diff --git a/REORG.TODO/sysdeps/x86_64/memchr.S b/REORG.TODO/sysdeps/x86_64/memchr.S new file mode 100644 index 0000000000..d3be012424 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memchr.S @@ -0,0 +1,315 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>.
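The shld chains in __mpn_lshift above are a four-way unrolling of one simple contract: shift an n-limb number left by cnt bits (0 < cnt < 64) and return the bits pushed out of the top limb. A plain C reference, as a sketch (like the asm, it walks high-to-low, so rp may equal up):

#include <stddef.h>
#include <stdint.h>

static uint64_t
mpn_lshift_ref (uint64_t *rp, const uint64_t *up, size_t n,
                unsigned int cnt)
{
  uint64_t ret = up[n - 1] >> (64 - cnt);   /* bits shifted out */
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
  rp[0] = up[0] << cnt;
  return ret;
}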
*/ + +#include <sysdep.h> + +/* Fast SSE2 version using pmaxub and a 64-byte loop. */ + + .text +ENTRY(memchr) + movd %esi, %xmm1 + mov %edi, %ecx + + punpcklbw %xmm1, %xmm1 + test %rdx, %rdx + jz L(return_null) + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $16, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi + add %rcx, %rdx + sub $64, %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid + possible addition overflow. */ + neg %rcx + add $16, %rcx + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi + sub $64, %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $32, %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $16, %edx +
jle L(return_null) + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + add $32, %edx + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $16, %edx + jbe L(return_null) + + pcmpeqb 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret +END(memchr) + +strong_alias (memchr, __memchr) + +libc_hidden_builtin_def(memchr) diff --git a/REORG.TODO/sysdeps/x86_64/memcmp.S b/REORG.TODO/sysdeps/x86_64/memcmp.S new file mode 100644 index 0000000000..0828a22534 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memcmp.S @@ -0,0 +1,358 @@ +/* memcmp with SSE2 + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (memcmp) + test %rdx, %rdx + jz L(finz) + cmpq $1, %rdx + jle L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 + jge L(gt32) + /* Handle small chunks and last block of less than 32 bytes. 
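memchr above and memcmp below are both built from one SSE2 idiom: pcmpeqb compares 16 bytes at once, pmovmskb condenses the result into a 16-bit mask, and bsf locates the first hit. The same step as intrinsics (a sketch):

#include <emmintrin.h>

/* Bit i of the result is set iff p[i] == c, for i in 0..15.  */
static inline unsigned int
byte_eq_mask (const unsigned char *p, unsigned char c)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
  __m128i needle = _mm_set1_epi8 ((char) c);
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
}

/* First match in a 16-byte block (16 when there is none):
     unsigned int m = byte_eq_mask (p, c);
     size_t i = m ? __builtin_ctz (m) : 16;  */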
*/
+L(small):
+	testq	$1, %r10
+	jz	L(s2b)
+	movzbl	(%rdi),	%eax
+	movzbl	(%rdi, %rsi), %edx
+	subq	$1, %r10
+	je	L(finz1)
+	addq	$1, %rdi
+	subl	%edx, %eax
+	jnz	L(exit)
+L(s2b):
+	testq	$2, %r10
+	jz	L(s4b)
+	movzwl	(%rdi),	%eax
+	movzwl	(%rdi, %rsi), %edx
+	subq	$2, %r10
+	je	L(fin2_7)
+	addq	$2, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s4b):
+	testq	$4, %r10
+	jz	L(s8b)
+	movl	(%rdi),	%eax
+	movl	(%rdi, %rsi), %edx
+	subq	$4, %r10
+	je	L(fin2_7)
+	addq	$4, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s8b):
+	testq	$8, %r10
+	jz	L(s16b)
+	movq	(%rdi),	%rax
+	movq	(%rdi, %rsi), %rdx
+	subq	$8, %r10
+	je	L(fin2_7)
+	addq	$8, %rdi
+	cmpq	%rdx, %rax
+	jnz	L(fin2_7)
+L(s16b):
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	xorl	%eax, %eax
+	subl	$0xffff, %edx
+	jz	L(finz)
+	bsfl	%edx, %ecx
+	leaq	(%rdi, %rcx), %rcx
+	movzbl	(%rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	jmp	L(finz1)
+
+	.p2align 4,, 4
+L(finr1b):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %edx
+L(finz1):
+	subl	%edx, %eax
+L(exit):
+	ret
+
+	.p2align 4,, 4
+L(fin2_7):
+	cmpq	%rdx, %rax
+	jz	L(finz)
+	movq	%rax, %r11
+	subq	%rdx, %r11
+	bsfq	%r11, %rcx
+	sarq	$3, %rcx
+	salq	$3, %rcx
+	sarq	%cl, %rax
+	movzbl	%al, %eax
+	sarq	%cl, %rdx
+	movzbl	%dl, %edx
+	subl	%edx, %eax
+	ret
+
+	.p2align 4,, 4
+L(finz):
+	xorl	%eax, %eax
+	ret
+
+	/* For blocks bigger than 32 bytes
+	   1. Advance one of the address pointers to be 16B aligned.
+	   2. Treat the case of both address pointers aligned to 16B
+	      separately to avoid movdqu.
+	   3. Handle any blocks of greater than 64 consecutive bytes with
+	      unrolling to reduce branches.
+	   4. At least one address pointer is 16B aligned; use the
+	      memory-operand form of pcmpeqb.
+	*/
+	.p2align 4,, 4
+L(gt32):
+	movq	%rdx, %r11
+	addq	%rdi, %r11
+	movq	%rdi, %r8
+
+	andq	$15, %r8
+	jz	L(16am)
+	/* Both pointers may be misaligned.  */
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	subl	$0xffff, %edx
+	jnz	L(neq)
+	neg	%r8
+	leaq	16(%rdi, %r8), %rdi
+L(16am):
+	/* Handle two 16B aligned pointers separately.  */
+	testq	$15, %rsi
+	jz	L(ATR)
+	testq	$16, %rdi
+	jz	L(A32)
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+	pmovmskb %xmm0, %edx
+	subl	$0xffff, %edx
+	jnz	L(neq)
+	addq	$16, %rdi
+L(A32):
+	movq	%r11, %r10
+	andq	$-32, %r10
+	cmpq	%r10, %rdi
+	jge	L(mt16)
+	/* Pre-unroll to be ready for unrolled 64B loop.
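+
+	   Each 16-byte probe in this function follows one pattern; in
+	   intrinsics terms it is roughly the following (hypothetical
+	   helper shown only to document the idiom, assumes
+	   <emmintrin.h>):
+
+	     static inline int
+	     chunk_differs_16 (const void *a, const void *b)
+	     {
+	       __m128i va = _mm_loadu_si128 ((const __m128i *) a);
+	       __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
+	       // Mask is 0xffff iff all 16 byte pairs compare equal.
+	       return _mm_movemask_epi8 (_mm_cmpeq_epi8 (va, vb)) != 0xffff;
+	     }
+
+	   The assembly tests the mask with "subl $0xffff, %edx", so a
+	   nonzero result doubles as the branch condition for L(neq).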
*/ + testq $32, %rdi + jz L(A64) + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi + jge L(mt32) + +L(A64main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A64main) + +L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(A32main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A32main) +L(mt16): + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + + .p2align 4,, 4 +L(neq): + bsfl %edx, %ecx + movzbl (%rdi, %rcx), %eax + addq %rdi, %rsi + movzbl (%rsi,%rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + testq $16, %rdi + jz L(ATR32) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + je L(mt16) + +L(ATR32): + movq %r11, %r10 + andq $-64, %r10 + testq $32, %rdi + jz L(ATR64) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(ATR64): + cmpq %rdi, %r10 + je L(mt32) + +L(ATR64main): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + jne L(ATR64main) + + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %r10, %rdi + jne L(ATR32res) + + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + /* Align to 16byte to improve instruction fetch. 
*/
+	.p2align 4,, 4
+END(memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/REORG.TODO/sysdeps/x86_64/memcopy.h b/REORG.TODO/sysdeps/x86_64/memcopy.h
new file mode 100644
index 0000000000..590b6cb16b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memcopy.h
@@ -0,0 +1 @@
+/* X86-64 doesn't use memory copy functions.  */
diff --git a/REORG.TODO/sysdeps/x86_64/memcpy.S b/REORG.TODO/sysdeps/x86_64/memcpy.S
new file mode 100644
index 0000000000..d98500a78a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memcpy.S
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/REORG.TODO/sysdeps/x86_64/memcpy_chk.S b/REORG.TODO/sysdeps/x86_64/memcpy_chk.S
new file mode 100644
index 0000000000..23e9e1ade5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memcpy_chk.S
@@ -0,0 +1,33 @@
+/* Checking memcpy for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in memcpy.S.
+	   For libc.a, this is a separate source to avoid
+	   memcpy bringing in __chk_fail and all routines
+	   it calls.  */
+	.text
+ENTRY (__memcpy_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	memcpy
+END (__memcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/memmove.S b/REORG.TODO/sysdeps/x86_64/memmove.S
new file mode 100644
index 0000000000..5bbae9904f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memmove.S
@@ -0,0 +1,71 @@
+/* Optimized memmove for x86-64.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define PREFETCHNT	prefetchnta
+#define VMOVNT		movntdq
+/* Use movups and movaps for smaller code sizes.
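+   For example, for a plain vector load the SSE forms are one byte
+   shorter than their integer-domain equivalents (standard x86-64
+   encodings, listed here for illustration):
+
+     movups (%rsi), %xmm0   # 0f 10 06      3 bytes, no prefix
+     movdqu (%rsi), %xmm0   # f3 0f 6f 06   4 bytes, F3 prefix
+     movaps (%rsi), %xmm0   # 0f 28 06      3 bytes
+     movdqa (%rsi), %xmm0   # 66 0f 6f 06   4 bytes, 66 prefix
+
+   For bare loads and stores the floating-point/integer domain
+   distinction generally costs nothing, so the shorter encodings are
+   preferred here.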
*/
+#define VMOVU		movups
+#define VMOVA		movaps
+
+#define SECTION(p)		p
+
+#ifdef USE_MULTIARCH
+# if !defined SHARED || !IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)	memcpy
+# endif
+#else
+# if defined SHARED && IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)	__memcpy
+# else
+#  define MEMCPY_SYMBOL(p,s)	memcpy
+# endif
+#endif
+#if !defined SHARED || !defined USE_MULTIARCH || !IS_IN (libc)
+# define MEMPCPY_SYMBOL(p,s)	__mempcpy
+#endif
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s)	p
+# define MEMMOVE_SYMBOL(p,s)	memmove
+#endif
+
+#include "multiarch/memmove-vec-unaligned-erms.S"
+
+#ifndef USE_MULTIARCH
+libc_hidden_builtin_def (memmove)
+# if defined SHARED && IS_IN (libc)
+strong_alias (memmove, __memcpy)
+libc_hidden_ver (memmove, memcpy)
+# endif
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
+
+# if defined SHARED && IS_IN (libc)
+#  undef memcpy
+#  include <shlib-compat.h>
+versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
+
+#  if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+#  endif
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/memmove_chk.S b/REORG.TODO/sysdeps/x86_64/memmove_chk.S
new file mode 100644
index 0000000000..a87345800b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memmove_chk.S
@@ -0,0 +1,33 @@
+/* Checking memmove for x86-64.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in memmove.S.
+	   For libc.a, this is a separate source to avoid
+	   memmove bringing in __chk_fail and all routines
+	   it calls.  */
+	.text
+ENTRY (__memmove_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	memmove
+END (__memmove_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/mempcpy.S b/REORG.TODO/sysdeps/x86_64/mempcpy.S
new file mode 100644
index 0000000000..d98500a78a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/mempcpy.S
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/REORG.TODO/sysdeps/x86_64/mempcpy_chk.S b/REORG.TODO/sysdeps/x86_64/mempcpy_chk.S
new file mode 100644
index 0000000000..f912291576
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/mempcpy_chk.S
@@ -0,0 +1,33 @@
+/* Checking mempcpy for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in memcpy.S.
+	   For libc.a, this is a separate source to avoid
+	   mempcpy bringing in __chk_fail and all routines
+	   it calls.  */
+	.text
+ENTRY (__mempcpy_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	mempcpy
+END (__mempcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/memrchr.S b/REORG.TODO/sysdeps/x86_64/memrchr.S
new file mode 100644
index 0000000000..5fa0fe9c1c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memrchr.S
@@ -0,0 +1,380 @@
+/* Fast SSE2 memrchr, using a 64-byte loop and the pmaxub instruction.
+
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (__memrchr)
+	movd	%esi, %xmm1
+
+	sub	$16, %rdx
+	jbe	L(length_less16)
+
+	punpcklbw	%xmm1, %xmm1
+	punpcklbw	%xmm1, %xmm1
+
+	add	%rdx, %rdi
+	pshufd	$0, %xmm1, %xmm1
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.
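+   Scanning backwards, the interesting bit of the pcmpeqb mask is the
+   highest set bit rather than the lowest, so matches below are
+   extracted with bsr instead of bsf.  A rough C equivalent for one
+   nonzero mask covering bytes p[0..15] (illustration only):
+
+     // Most significant set bit == right-most matching byte.
+     return (void *) (p + (31 - __builtin_clz (mask)));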
*/ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %rdi + mov %edi, %ecx + and $15, %ecx + jz L(loop_prolog) + + add $16, %rdi + add $16, %rdx + and $-16, %rdi + sub %rcx, %rdx + + .p2align 4 +L(loop_prolog): + sub $64, %rdx + jbe L(exit_loop) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %rdi + sub $64, %rdx + jbe L(exit_loop) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %edi, %ecx + and $63, %ecx + jz L(align64_loop) + + add $64, %rdi + add $64, %rdx + and $-64, %rdi + sub %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdi + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%rdi), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %rdi, %rax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %rdx + add %rax, %rdx + jl L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %rdx + add %rax, %rdx + jl L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %rdx + add 
%rax, %rdx + jl L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %rdx + add %rax, %rdx + jl L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + test %edx, %edx + jz L(return_null) + + mov %dl, %cl + pcmpeqb (%rdi), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + pmovmskb %xmm1, %eax + + and %edx, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + add $16, %edx + + pshufd $0, %xmm1, %xmm1 + + mov %edi, %ecx + and $15, %ecx + jz L(length_less16_offset0) + + mov %cl, %dh + mov %ecx, %esi + add %dl, %dh + and $-16, %rdi + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + + sar %cl, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + add %rsi, %rax + ret + + .p2align 4 +L(length_less16_part2): + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %eax + + test %eax, %eax + jnz L(length_less16_part2_return) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + + mov %esi, %ecx + sar %cl, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + add %rsi, %rax + ret + + .p2align 4 +L(length_less16_part2_return): + bsr %eax, %eax + lea 16(%rax, %rdi), %rax + ret + +END (__memrchr) +weak_alias (__memrchr, memrchr) diff --git a/REORG.TODO/sysdeps/x86_64/memset.S b/REORG.TODO/sysdeps/x86_64/memset.S new file mode 100644 index 0000000000..41278787fe --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memset.S @@ -0,0 +1,67 @@ +/* memset/bzero -- set memory area to CH/0 + Optimized version for x86-64. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define VEC_SIZE 16 +#define VEC(i) xmm##i +/* Don't use movups and movaps since it will get larger nop paddings for + alignment. 
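+
+   (movdqu/movdqa are one byte longer than movups/movaps, so with
+   them the many .p2align targets below need fewer nop bytes of
+   padding.)
+
+   The MEMSET_VDUP_TO_VEC0_AND_SET_RETURN macro defined just below
+   broadcasts the fill byte into all 16 lanes.  A rough scalar model
+   of the widening steps (illustration only, <stdint.h> types):
+
+     uint32_t w = (unsigned char) c;
+     w |= w << 8;    // punpcklbw: byte -> low word
+     w |= w << 16;   // punpcklwd: word -> low dword
+     // pshufd $0 then replicates that dword across the vector.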
*/ +#define VMOVU movdqu +#define VMOVA movdqa + +#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + punpcklbw %xmm0, %xmm0; \ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + +#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + pshufd $0, %xmm0, %xmm0 + +#define SECTION(p) p + +#ifndef MEMSET_SYMBOL +# define MEMSET_CHK_SYMBOL(p,s) p +# define MEMSET_SYMBOL(p,s) memset +#endif + +#ifndef WMEMSET_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) p +# define WMEMSET_SYMBOL(p,s) __wmemset +#endif + +#include "multiarch/memset-vec-unaligned-erms.S" + +libc_hidden_builtin_def (memset) + +#if IS_IN (libc) +libc_hidden_def (__wmemset) +weak_alias (__wmemset, wmemset) +libc_hidden_weak (wmemset) +#endif + +#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/memset_chk.S b/REORG.TODO/sysdeps/x86_64/memset_chk.S new file mode 100644 index 0000000000..33d15c0c10 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memset_chk.S @@ -0,0 +1,33 @@ +/* Checking memset for x86-64. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef SHARED + /* For libc.so this is defined in memset.S. + For libc.a, this is a separate source to avoid + memset bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__memset_chk) + cmpq %rdx, %rcx + jb __chk_fail + jmp memset +END (__memset_chk) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/memusage.h b/REORG.TODO/sysdeps/x86_64/memusage.h new file mode 100644 index 0000000000..50f960b140 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memusage.h @@ -0,0 +1,21 @@ +/* Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#define GETSP() ({ register uintptr_t stack_ptr asm ("rsp"); stack_ptr; }) +#define GETTIME(low,high) asm ("rdtsc" : "=a" (low), "=d" (high)) + +#include <sysdeps/generic/memusage.h> diff --git a/REORG.TODO/sysdeps/x86_64/mp_clz_tab.c b/REORG.TODO/sysdeps/x86_64/mp_clz_tab.c new file mode 100644 index 0000000000..7b13a394da --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/mp_clz_tab.c @@ -0,0 +1 @@ +/* __clz_tab not needed on x86-64. */ diff --git a/REORG.TODO/sysdeps/x86_64/mul_1.S b/REORG.TODO/sysdeps/x86_64/mul_1.S new file mode 100644 index 0000000000..5c1c4335bf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/mul_1.S @@ -0,0 +1,128 @@ +/* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n_param %rdx +#define vl %rcx + +#define n %r11 + + .text +ENTRY (__mpn_mul_1) + push %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + xor %r10, %r10 + mov (up), %rax /* read first u limb early */ + mov n_param, %rbx /* move away n from rdx, mul uses it */ + mul vl + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz L(b0) + cmp $2, %ebx + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + mov %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + .p2align 4 +L(top): mov %r10, (rp,n,8) + add %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, %r10d +L(L1): mul vl + mov %r9, 8(rp,n,8) + add %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + mov %r8, 16(rp,n,8) + add %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + mov %rbx, 24(rp,n,8) + mov $0, %r8d # zero + mov %r8, %rbx # zero + add %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 # zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + mov %r10, (rp,n,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(rp,n,8) + add %r8, %rdx +L(ret): mov %rdx, %rax + + pop %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) + ret +END (__mpn_mul_1) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/Makefile b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile new file mode 100644 index 0000000000..310a3a4b72 --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile @@ -0,0 +1,42 @@ +ifeq ($(subdir),csu) +tests += test-multiarch +endif + +ifeq ($(subdir),string) + +sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ + strcmp-sse2-unaligned strncmp-ssse3 \ + memcmp-avx2-movbe \ + memcmp-sse4 memcpy-ssse3 \ + memmove-ssse3 \ + memcpy-ssse3-back \ + memmove-ssse3-back \ + memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \ + strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ + strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ + strcpy-sse2-unaligned strncpy-sse2-unaligned \ + stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ + strcat-sse2-unaligned strncat-sse2-unaligned \ + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ + strcspn-c strpbrk-c strspn-c varshift \ + memset-avx512-no-vzeroupper \ + memmove-avx-unaligned-erms \ + memmove-avx512-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms +CFLAGS-varshift.c += -msse4 +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif + +ifeq ($(subdir),wcsmbs) +sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wmemcmp-avx2-movbe \ + wcscpy-ssse3 wcscpy-c \ + wcsnlen-sse4_1 wcsnlen-c +endif + +ifeq ($(subdir),debug) +sysdep_routines += wmemset_chk-nonshared +endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S new file mode 100644 index 0000000000..639f02bde3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S @@ -0,0 +1,7 @@ +#include <sysdep.h> + + .text +ENTRY(bcopy) + xchg %rdi, %rsi + jmp __libc_memmove /* Branch to IFUNC memmove. */ +END(bcopy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..5627183aca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -0,0 +1,460 @@ +/* Enumerate available IFUNC implementations of a function. x86-64 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ifunc-impl-list.h> +#include <sysdep.h> +#include "init-arch.h" + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 5 + +/* Fill ARRAY of MAX elements with IFUNC implementations for function + NAME supported on target machine and return the number of valid + entries. */ + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + /* Support sysdeps/x86_64/multiarch/memcmp.S. 
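+
+     Each IFUNC_IMPL_ADD entry pairs one candidate implementation
+     with the feature check that gates it.  This table only feeds
+     introspection (the glibc tests and benchmarks iterate over it);
+     the runtime ifunc resolvers make the equivalent decision
+     directly.  A simplified, illustrative sketch of a memcmp
+     resolver mirroring the order of the entries below:
+
+       if (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE))
+         return __memcmp_avx2_movbe;
+       if (HAS_CPU_FEATURE (SSE4_1))
+         return __memcmp_sse4_1;
+       if (HAS_CPU_FEATURE (SSSE3))
+         return __memcmp_ssse3;
+       return __memcmp_sse2;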
*/ + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), + __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */ + IFUNC_IMPL (i, name, __memmove_chk, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memmove.S. */ + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3_back) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ + IFUNC_IMPL (i, name, __memset_chk, + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_no_vzeroupper) + ) + + /* Support sysdeps/x86_64/multiarch/memset.S. 
*/ + IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_no_vzeroupper) + ) + + /* Support sysdeps/x86_64/multiarch/stpncpy.S. */ + IFUNC_IMPL (i, name, stpncpy, + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, + __stpncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpcpy.S. */ + IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_ARCH_FEATURE (AVX_Usable), + __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp_l, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), + __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, + __strcasecmp_l_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcat.S. */ + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) + + /* Support sysdeps/x86_64/multiarch/strchr.S. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcmp.S. */ + IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), + __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), + __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcpy.S. */ + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcspn.S. 
*/ + IFUNC_IMPL (i, name, strcspn, + IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), + __strcspn_sse42) + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_ARCH_FEATURE (AVX_Usable), + __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, + __strncasecmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp_l, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), + __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, + __strncasecmp_l_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncat.S. */ + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, 1, + __strncat_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncpy.S. */ + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, 1, + __strncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strpbrk.S. */ + IFUNC_IMPL (i, name, strpbrk, + IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), + __strpbrk_sse42) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) + + + /* Support sysdeps/x86_64/multiarch/strspn.S. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), + __strspn_sse42) + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2)) + + /* Support sysdeps/x86_64/multiarch/strstr.c. */ + IFUNC_IMPL (i, name, strstr, + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscpy.S. */ + IFUNC_IMPL (i, name, wcscpy, + IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), + __wcscpy_ssse3) + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ + IFUNC_IMPL (i, name, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcsnlen, + HAS_CPU_FEATURE (SSE4_1), + __wcsnlen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */ + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), + __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset.c. 
*/ + IFUNC_IMPL (i, name, wmemset, + IFUNC_IMPL_ADD (array, i, wmemset, 1, + __wmemset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_avx512_unaligned)) + +#ifdef SHARED + /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */ + IFUNC_IMPL (i, name, __memcpy_chk, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memcpy.S. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, + __memcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)) + + /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */ + IFUNC_IMPL (i, name, __mempcpy_chk, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/mempcpy.S. 
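+
+     (mempcpy differs from memcpy only in its return value: it
+     returns the end of the destination.  The generic definition is
+     equivalent to
+
+       void *
+       mempcpy (void *dest, const void *src, size_t n)
+       {
+         return (char *) memcpy (dest, src, n) + n;
+       }
+
+     which is why every memcpy variant above has a mempcpy twin.)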
*/ + IFUNC_IMPL (i, name, mempcpy, + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)) + + /* Support sysdeps/x86_64/multiarch/strncmp.S. */ + IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), + __strncmp_sse42) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), + __strncmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */ + IFUNC_IMPL (i, name, __wmemset_chk, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1, + __wmemset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_chk_avx512_unaligned)) +#endif + + return i; +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h new file mode 100644 index 0000000000..d761985a47 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -0,0 +1,42 @@ +/* Common definition for wmemset/wmemset_chk ifunc selections. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	return OPTIMIZE (avx512_unaligned);
+      else
+	return OPTIMIZE (avx2_unaligned);
+    }
+
+  return OPTIMIZE (sse2_unaligned);
+}
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
new file mode 100644
index 0000000000..47630dd97b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -0,0 +1,425 @@
+/* memcmp/wmemcmp optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+      bytes for wmemcmp.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_avx2_movbe
+# endif
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+# define VEC_MASK ((1 << VEC_SIZE) - 1)
+
+/* Warning!
+   wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.avx,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+# endif
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.
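+
+	   For VEC_SIZE <= n <= 2 * VEC_SIZE the first and the last
+	   vector overlap but together cover all n bytes, so no scalar
+	   tail loop is needed; bytes in the overlap are simply
+	   compared twice.  Rough C model, where cmp_vec/report_vec
+	   are hypothetical helpers for one VEC_SIZE (V) chunk:
+
+	     if (cmp_vec (a, b))                    // bytes 0 .. V-1
+	       return report_vec (a, b);
+	     if (cmp_vec (a + n - V, b + n - V))    // bytes n-V .. n-1
+	       return report_vec (a + n - V, b + n - V);
+	     return 0;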
*/ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx), %edx + cmpl (%rsi, %rcx), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian with overlapping loads and bswap to avoid + branches. */ + movzwl -2(%rdi, %rdx), %eax + movzwl -2(%rsi, %rdx), %ecx + shll $16, %eax + shll $16, %ecx + movzwl (%rdi), %edi + movzwl (%rsi), %esi + orl %edi, %eax + orl %esi, %ecx + bswap %eax + bswap %ecx + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_2x_vec): + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. 
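+
+	   The four compares below are merged with vpand so that a
+	   single vpmovmskb answers "do all 128 bytes match?".  In
+	   intrinsics terms, roughly (illustration only, assumes
+	   <immintrin.h>):
+
+	     __m256i c0 = _mm256_cmpeq_epi8 (a0, b0);
+	     __m256i c1 = _mm256_cmpeq_epi8 (a1, b1);
+	     __m256i c2 = _mm256_cmpeq_epi8 (a2, b2);
+	     __m256i c3 = _mm256_cmpeq_epi8 (a3, b3);
+	     __m256i all = _mm256_and_si256 (_mm256_and_si256 (c0, c1),
+	                                     _mm256_and_si256 (c2, c3));
+	     int equal = _mm256_movemask_epi8 (all) == -1;
+
+	   When equal is false, L(4x_vec_end) re-tests the four masks
+	   individually to locate the first differing vector.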
*/ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + VZEROUPPER + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. 
*/
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	vpmovmskb %ymm1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	vpmovmskb %ymm3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	vpmovmskb %ymm4, %eax
+	subl	$VEC_MASK, %eax
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..771639f662
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -0,0 +1,1776 @@
+/* memcmp with SSE4.1, wmemcmp with SSE4.1
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_1
+# endif
+
+# define JMPTBL(I, B)	(I - B)
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), %rcx;			\
+  add		%r11, %rcx;					\
+  jmp		*%rcx;						\
+  ud2
+
+/* Warning!
+	   wmemcmp has to use SIGNED comparison for elements.
+	   memcmp has to use UNSIGNED comparison for elements.
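+
+	   Example of why this matters (wchar_t is a signed 32-bit int
+	   on x86-64; little endian):
+
+	     wchar_t a[] = { -1 };	// bytes: ff ff ff ff
+	     wchar_t b[] = { 1 };	// bytes: 01 00 00 00
+	     // wmemcmp (a, b, 1) < 0 : -1 < 1 as signed elements.
+	     // memcmp (a, b, 4) > 0  : 0xff > 0x01 on the first byte.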
+*/ + + .section .text.sse4.1,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif + pxor %xmm0, %xmm0 + cmp $79, %rdx + ja L(79bytesormore) +# ifndef USE_AS_WMEMCMP + cmp $1, %rdx + je L(firstbyte) +# endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(firstbyte): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax + ret +# endif + + .p2align 4 +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) + cmp $256, %rdx + ja L(less512bytes) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 
96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(512bytesormore): +# ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP +# else + mov __x86_data_cache_size_half(%rip), %R8_LP +# endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) + sub $64, %rdx + .p2align 4 +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(L2_L3_cache_unaglined): + sub $64, %rdx + .p2align 4 +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +/* + * This case is for machines which are sensitive for unaligned instructions. 
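   Both pointers are 16-byte aligned on this path, so the movdqa forms below
   are safe.  (The threshold computed just above is data_cache_size_half plus
   half of itself, i.e. three quarters of the per-thread data cache; beyond
   it the prefetchnta loop flavor is used.)  The 64-byte block test itself is
   the same in every loop flavor; a C sketch, using SSE2 cmpeq/movemask as a
   stand-in for the file's SSE4.1 ptest idiom (the helper name is
   illustrative):

     #include <emmintrin.h>

     // OR together the XORs of four 16-byte pairs; the 64-byte blocks
     // match iff the accumulator is all-zero.
     static int block64_differs (const unsigned char *a,
                                 const unsigned char *b)
     {
       __m128i acc = _mm_setzero_si128 ();
       for (int i = 0; i < 64; i += 16)
         {
           __m128i x = _mm_loadu_si128 ((const __m128i *) (a + i));
           __m128i y = _mm_loadu_si128 ((const __m128i *) (b + i));
           acc = _mm_or_si128 (acc, _mm_xor_si128 (x, y));
         }
       // 0xffff here means "no differing byte anywhere in the block".
       return _mm_movemask_epi8 (_mm_cmpeq_epi8 (acc, _mm_setzero_si128 ()))
              != 0xffff;
     }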
+ */ + .p2align 4 +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 
192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(512bytesormorein2aligned): +# ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP +# else + mov __x86_data_cache_size_half(%rip), %R8_LP +# endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + .p2align 4 +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +L(L2_L3_cache_aglined): + sub $64, %rdx + + .p2align 4 +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + + .p2align 4 +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + add $80, %rdi + add $80, %rsi + jmp 
L(16bytes) +L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal case for wmemcmp */ + .p2align 4 +L(65bytes): + movdqu -65(%rdi), %xmm1 + movdqu -65(%rsi), %xmm2 + mov $-65, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(49bytes): + movdqu -49(%rdi), %xmm1 + movdqu -49(%rsi), %xmm2 + mov $-49, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%rdi), %xmm1 + movdqu -33(%rsi), %xmm2 + mov $-33, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%rdi), %rax + mov -17(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(9bytes): + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(13bytes): + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(5bytes): + mov -5(%rdi), %eax + mov -5(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(67bytes): + movdqu -67(%rdi), %xmm2 + movdqu -67(%rsi), %xmm1 + mov $-67, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(51bytes): + movdqu -51(%rdi), %xmm2 + movdqu -51(%rsi), %xmm1 + mov $-51, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + movdqu -35(%rsi), %xmm1 + movdqu -35(%rdi), %xmm2 + mov $-35, %dl + pxor %xmm1, %xmm2 + ptest 
%xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + mov -19(%rdi), %rax + mov -19(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(11bytes): + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + .p2align 4 +L(15bytes): + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(7bytes): + mov -7(%rdi), %eax + mov -7(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + .p2align 4 +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin2bytes) +L(1bytes): + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret +# endif + + .p2align 4 +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rsi), %ecx + +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(69bytes): + movdqu -69(%rsi), %xmm1 + movdqu -69(%rdi), %xmm2 + mov $-69, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(53bytes): + movdqu -53(%rsi), %xmm1 + movdqu -53(%rdi), %xmm2 + mov $-53, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + movdqu -37(%rsi), %xmm1 + movdqu -37(%rdi), %xmm2 + mov $-37, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + movdqu -21(%rsi), %xmm1 + movdqu -21(%rdi), %xmm2 + mov $-21, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov $-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(71bytes): + movdqu -71(%rsi), %xmm1 + movdqu -71(%rdi), %xmm2 + mov $-71, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(55bytes): + movdqu -55(%rdi), %xmm2 + movdqu -55(%rsi), %xmm1 + mov $-55, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + movdqu -39(%rdi), %xmm2 + movdqu -39(%rsi), %xmm1 + mov $-39, %dl + pxor %xmm1, 
%xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + movdqu -23(%rdi), %xmm2 + movdqu -23(%rsi), %xmm1 + mov $-23, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +# endif + + .p2align 4 +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%rsi), %rcx + mov -8(%rdi), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(73bytes): + movdqu -73(%rsi), %xmm1 + movdqu -73(%rdi), %xmm2 + mov $-73, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(57bytes): + movdqu -57(%rdi), %xmm2 + movdqu -57(%rsi), %xmm1 + mov $-57, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + movdqu -41(%rdi), %xmm2 + movdqu -41(%rsi), %xmm1 + mov $-41, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + movdqu -25(%rdi), %xmm2 + movdqu -25(%rsi), %xmm1 + mov $-25, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(diffin2bytes) + + .p2align 4 +L(75bytes): + movdqu -75(%rsi), %xmm1 + movdqu -75(%rdi), %xmm2 + mov $-75, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(59bytes): + movdqu -59(%rdi), %xmm2 + movdqu -59(%rsi), %xmm1 + mov $-59, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + movdqu -43(%rdi), %xmm2 + movdqu -43(%rsi), %xmm1 + mov $-43, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + movdqu -27(%rdi), %xmm2 + movdqu -27(%rsi), %xmm1 + mov $-27, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + .p2align 4 +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + 
pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(77bytes): + movdqu -77(%rsi), %xmm1 + movdqu -77(%rdi), %xmm2 + mov $-77, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(61bytes): + movdqu -61(%rdi), %xmm2 + movdqu -61(%rsi), %xmm1 + mov $-61, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + movdqu -45(%rdi), %xmm2 + movdqu -45(%rsi), %xmm1 + mov $-45, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + movdqu -29(%rdi), %xmm2 + movdqu -29(%rsi), %xmm1 + mov $-29, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(79bytes): + movdqu -79(%rsi), %xmm1 + movdqu -79(%rdi), %xmm2 + mov $-79, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(63bytes): + movdqu -63(%rdi), %xmm2 + movdqu -63(%rsi), %xmm1 + mov $-63, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + movdqu -47(%rdi), %xmm2 + movdqu -47(%rsi), %xmm1 + mov $-47, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + movdqu -31(%rdi), %xmm2 + movdqu -31(%rsi), %xmm1 + mov $-31, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +# endif + .p2align 4 +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), 
%rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. + */ + .p2align 3 +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax + +# ifdef USE_AS_WMEMCMP +/* for wmemcmp */ + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + +L(diffin4bytes): +# ifndef USE_AS_WMEMCMP + cmp %cx, %ax + jne L(diffin2bytes) + shr $16, %ecx + shr $16, %eax +L(diffin2bytes): + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(end): + and $0xff, %eax + and $0xff, %ecx + sub %ecx, %eax + ret +# else + +/* for wmemcmp */ + mov $1, %eax + jl L(nequal_bigger) + neg %eax + ret + + .p2align 4 +L(nequal_bigger): + ret + +L(unreal_case): + xor %eax, %eax + ret +# endif + +END (MEMCMP) + + .section .rodata.sse4.1,"a",@progbits + .p2align 3 +# ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), 
L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(65bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(67bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(69bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(71bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(73bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(75bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(77bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) + .int JMPTBL (L(79bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..8d7d2fe67b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -0,0 +1,1990 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + atom_text_section +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx + test %rdx, %rdx + jz L(equal) +# endif + mov %rdx, %rcx + mov %rdi, %rdx + cmp $48, %rcx; + jae L(48bytesormore) /* LEN => 48 */ + + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +/* ECX >= 32. 
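   (LEN is at least 48 at this point; the entry code compared against $48.)
   After the first 16 bytes are handled, %rdi is rounded down to 16-byte
   alignment and the residual misalignment of %rsi selects one of the
   L(shr_N) paths below, which rebuild the unaligned source data from two
   aligned loads glued together with palignr.  A C sketch of that
   reconstruction, assuming SSSE3 intrinsics (the macro name is
   illustrative):

     #include <tmmintrin.h>

     // Equivalent of an unaligned 16-byte load at base+N (0 < N < 16) built
     // from two aligned loads: palignr shifts the 32-byte concatenation
     // right by N bytes.  N must be a compile-time constant.
     #define LOAD_AT_OFFSET(base, N)                                   \
       _mm_alignr_epi8 (_mm_load_si128 ((const __m128i *) (base) + 1), \
                        _mm_load_si128 ((const __m128i *) (base)), (N))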
*/ +L(48bytesormore): + movdqu (%rdi), %xmm3 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%rdi), %rdi + lea 16(%rsi), %rsi + sub $0xffff, %edx + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %rdx, %rdi + sub %rdx, %rsi + add %rdx, %rcx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %rdx, %rsi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + .p2align 2 +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + .p2align 4 +L(shr_0): + cmp $80, %rcx + lea -48(%rcx), %rcx + jae L(shr_0_gobble) + xor %eax, %eax + movdqa (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_0_gobble): + movdqa (%rsi), %xmm0 + xor %eax, %eax + pcmpeqb (%rdi), %xmm0 + sub $32, %rcx + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %rcx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%rsi), %xmm0 + movdqa 48(%rsi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%rdi), %xmm0 + pcmpeqb 48(%rdi), %xmm2 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %rcx + jge L(next) + inc %edx + add $32, %rcx +L(next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_1): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $1, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $1, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_1_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $1, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $1, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $1, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $1, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_1_gobble_next) + inc %edx + add $32, %rcx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 1(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi 
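/* The L(shr_N_gobble) loops fuse the length check into the equality test:
   `sub $32, %rcx` leaves CF set once the count drops below zero, and the
   later `sbb $0xffff, %edx` (SSE moves and compares in between do not touch
   EFLAGS) therefore yields a nonzero result either on a byte mismatch
   (edx != 0xffff) or on exhaustion (borrow in), so a single `jz` continues
   the loop only in the all-clear case.  A hedged C restatement of one
   iteration (names are illustrative, not glibc API):

     // mask is the pmovmskb result: 0xffff means 32 bytes compared equal.
     for (;;)
       {
         int borrow = remaining < 32;      // CF from `sub $32, %rcx`
         remaining -= 32;
         int d = mask - 0xffff - borrow;   // `sbb $0xffff, %edx`
         if (d != 0)                       // `jz L(shr_N_gobble_loop)` fails
           break;                          // mismatch, or count used up
         mask = compare_next_32 ();        // pcmpeqb/pand/pmovmskb
       }

   The post-loop `cmp $0, %rcx; jge` then distinguishes the two exit
   reasons.  */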
+ jmp L(less48bytes) + + + .p2align 4 +L(shr_2): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $2, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $2, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_2_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $2, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $2, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $2, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $2, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_2_gobble_next) + inc %edx + add $32, %rcx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 2(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_3): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $3, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $3, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_3_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $3, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $3, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $3, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $3, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_3_gobble_next) + inc %edx + add $32, %rcx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 3(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_4): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $4, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $4, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_4_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $4, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $4, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, 
%rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $4, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $4, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_4_gobble_next) + inc %edx + add $32, %rcx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 4(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_5): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $5, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $5, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_5_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $5, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $5, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $5, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $5, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_5_gobble_next) + inc %edx + add $32, %rcx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 5(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_6): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $6, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $6, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_6_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $6, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $6, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $6, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $6, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_6_gobble_next) + inc %edx + add $32, %rcx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 6(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_7): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae 
L(shr_7_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $7, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $7, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_7_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $7, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $7, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $7, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $7, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_7_gobble_next) + inc %edx + add $32, %rcx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 7(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_8): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $8, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $8, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_8_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $8, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $8, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $8, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $8, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_8_gobble_next) + inc %edx + add $32, %rcx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 8(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_9): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $9, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $9, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_9_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $9, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $9, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + 
palignr $9, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $9, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_9_gobble_next) + inc %edx + add $32, %rcx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 9(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_10): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $10, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $10, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_10_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $10, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $10, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $10, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $10, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_10_gobble_next) + inc %edx + add $32, %rcx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 10(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_11): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $11, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_11_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $11, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $11, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $11, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $11, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_11_gobble_next) + inc %edx + add $32, %rcx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 11(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_12): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + 
palignr $12, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $12, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_12_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $12, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $12, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $12, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $12, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_12_gobble_next) + inc %edx + add $32, %rcx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 12(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_13): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $13, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_13_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $13, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $13, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $13, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $13, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_13_gobble_next) + inc %edx + add $32, %rcx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 13(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_14): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $14, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_14_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $14, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $14, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $14, 48(%rsi), %xmm3 + sbb $0xffff, 
%edx + movdqa 48(%rsi), %xmm0 + palignr $14, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_14_gobble_next) + inc %edx + add $32, %rcx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 14(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_15): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $15, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_15_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $15, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $15, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $15, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $15, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_15_gobble_next) + inc %edx + add $32, %rcx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 15(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) +# endif + .p2align 4 +L(exit): + pmovmskb %xmm1, %r8d + sub $0xffff, %r8d + jz L(first16bytes) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + mov %r8d, %edx +L(first16bytes): + add %rax, %rsi +L(less16bytes): +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte16): + movzbl -16(%rdi), %eax + movzbl -16(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte17): + movzbl -15(%rdi), %eax + movzbl -15(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte18): + movzbl -14(%rdi), %eax + movzbl -14(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte19): + movzbl -13(%rdi), %eax + movzbl -13(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte20): + movzbl -12(%rdi), %eax + movzbl -12(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte21): + movzbl -11(%rdi), %eax + movzbl -11(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte22): + movzbl -10(%rdi), %eax + movzbl -10(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(next_24_bytes): + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz 
L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret +# else +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(second_double_word): + mov -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(fourth_double_word): + mov -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) + ret +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $0, %ecx + je L(0bytes) +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + je L(1bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + .p2align 4 +L(44bytes): + movl -44(%rdi), %eax + movl -44(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + movl -40(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + movl -36(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + movl -32(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + movl -28(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + movl -24(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + movl -20(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + movl -16(%rsi), %ecx + cmp %ecx, %eax + jne 
L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + movl -12(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + movl -8(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + movl -4(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# else + .p2align 4 +L(44bytes): + movl -44(%rdi), %eax + cmp -44(%rsi), %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + cmp -40(%rsi), %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + cmp -36(%rsi), %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + cmp -32(%rsi), %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + cmp -28(%rsi), %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + cmp -24(%rsi), %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + cmp -20(%rsi), %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(45bytes): + movl -45(%rdi), %eax + movl -45(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(41bytes): + movl -41(%rdi), %eax + movl -41(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(37bytes): + movl -37(%rdi), %eax + movl -37(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(33bytes): + movl -33(%rdi), %eax + movl -33(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(29bytes): + movl -29(%rdi), %eax + movl -29(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%rdi), %eax + movl -25(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(21bytes): + movl -21(%rdi), %eax + movl -21(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%rdi), %eax + movl -17(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%rdi), %eax + movl -13(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%rdi), %eax + movl -9(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%rdi), %eax + movl -5(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + .p2align 4 +L(46bytes): + movl -46(%rdi), %eax + movl -46(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(42bytes): + movl -42(%rdi), %eax + movl -42(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(38bytes): + movl -38(%rdi), %eax + movl -38(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(34bytes): + movl -34(%rdi), %eax + movl -34(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(30bytes): + movl -30(%rdi), %eax + movl -30(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%rdi), %eax + movl -26(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(22bytes): + movl -22(%rdi), %eax + movl -22(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%rdi), %eax + movl -18(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%rdi), %eax + movl -14(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%rdi), %eax + movl -10(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%rdi), %eax + movl -6(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmpb %cl, %al + jne L(set) + 
+	cmp	%ecx, %eax
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(47bytes):
+	movl	-47(%rdi), %eax
+	movl	-47(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%rdi), %eax
+	movl	-43(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%rdi), %eax
+	movl	-39(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%rdi), %eax
+	movl	-35(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%rdi), %eax
+	movl	-31(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%rdi), %eax
+	movl	-27(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%rdi), %eax
+	movl	-23(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%rdi), %eax
+	movl	-19(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%rdi), %eax
+	movl	-15(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%rdi), %eax
+	movl	-11(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%rdi), %eax
+	movl	-7(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmp	%ecx, %eax
+	jne	L(set)
+	movzbl	-1(%rdi), %eax
+	cmpb	-1(%rsi), %al
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(find_diff):
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpw	%cx, %ax
+	jne	L(set)
+	shr	$16, %eax
+	shr	$16, %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+
+/* We get here only if we already know there is a difference.  CMP sets
+   the carry flag iff EAX < ECX (unsigned); the first SBB below turns
+   that into -CF, and the second computes EAX + 1 - CF, so the result
+   is -1 or +1 as required.  */
+
+	cmp	%ecx, %eax
+L(set):
+	sbb	%eax, %eax
+	sbb	$-1, %eax
+	ret
+# else
+
+/* For wmemcmp.  */
+	.p2align 4
+L(find_diff):
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+# endif
+
+	.p2align 4
+L(equal):
+	xor	%eax, %eax
+	ret
+
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S
new file mode 100644
index 0000000000..0c9804b7e9
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_CPU_FEATURE (MOVBE)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__memcmp_avx2_movbe(%rip), %rax
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
+	jnz	2f
+	leaq	__memcmp_sse2(%rip), %rax
+	ret
+
+2:	HAS_CPU_FEATURE (SSE4_1)
+	jz	3f
+	leaq	__memcmp_sse4_1(%rip), %rax
+	ret
+
+3:	leaq	__memcmp_ssse3(%rip), %rax
+	ret
+
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_sse2, @function; \
+	.p2align 4; \
+	.globl __memcmp_sse2; \
+	.hidden __memcmp_sse2; \
+	__memcmp_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memcmp calls through a PLT.
+   The speedup we get from using SSE4 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
new file mode 100644
index 0000000000..4e060a27fd
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -0,0 +1,3180 @@
+/* memcpy with SSSE3 and REP string
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_back
+# define MEMCPY_CHK	__memcpy_chk_ssse3_back
+# define MEMPCPY	__mempcpy_ssse3_back
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back
+#endif
+
+#define JMPTBL(I, B)	I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.
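+
+   Storing TARGET - TABLE instead of an absolute address keeps each
+   entry position-independent, so the table needs no dynamic relocations
+   and costs only one RIP-relative lea at run time.  As a rough C model
+   of the dispatch (illustrative only; the names are hypothetical and
+   the computed goto is a GNU extension):
+
+       int32_t off = table[index];           // entry holds target - table
+       void *target = (char *) table + off;  // rebuild the absolute address
+       goto *target;                         // jmp *INDEX
+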
*/ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %rdi, %rax +#ifdef USE_AS_MEMPCPY + add %rdx, %rax +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +L(copy_forward): +#endif +L(start): + cmp $144, %rdx + jae L(144bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jbe L(bk_write) +#endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +#endif + + .p2align 4 +L(144bytesormore): + +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + .p2align 4 +L(copy_backward): +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + shl $1, %rcx + cmp %rcx, %rdx + ja L(gobble_mem_bwd) + + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + .p2align 4 +L(shl_0): + + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 +#ifdef DATA_CACHE_SIZE + cmp $DATA_CACHE_SIZE_HALF, %R9_LP +#else + cmp __x86_data_cache_size_half(%rip), %R9_LP +#endif + jae L(gobble_mem_fwd) + sub $0x80, %rdx + .p2align 4 +L(shl_0_loop): + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $0x80, %rdx +L(copy_backward_loop): + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, -0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), 
%xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) + + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_1): + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + movaps -0x01(%rsi), %xmm1 + + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_2): + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + movaps -0x02(%rsi), %xmm1 + + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x42(%rsi), %xmm5 + 
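The L(shl_N) and L(shl_N_bwd) loops in this file handle a source that sits N bytes past a 16-byte boundary: only aligned loads are issued, each output vector is assembled from two neighbouring loads with palignr, and every 16-byte store lands on an aligned address; each unrolled pass moves 0x80 bytes. A sketch of one forward pass in C, under the same modelling assumptions as earlier (hypothetical names, not part of this file):

    #include <stdint.h>
    #include <string.h>

    /* One unrolled L(shl_N) iteration: eight aligned 16-byte stores,
       each merged from two adjacent aligned loads via the palignr
       concatenate-and-extract trick.  N is the source misalignment.  */
    static void
    shl_n_pass (uint8_t *dst, const uint8_t *src_aligned, unsigned n)
    {
      for (int i = 0; i < 8; i++)
        {
          uint8_t concat[32];
          memcpy (concat, src_aligned + 16 * i, 32);  /* aligned loads  */
          memcpy (dst + 16 * i, concat + n, 16);      /* aligned store  */
        }
    }
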
palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_3): + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + movaps -0x03(%rsi), %xmm1 + + movaps -0x13(%rsi), %xmm2 + palignr $3, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x23(%rsi), %xmm3 + palignr $3, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x33(%rsi), %xmm4 + palignr $3, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x43(%rsi), %xmm5 + palignr $3, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x53(%rsi), %xmm6 + palignr $3, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x63(%rsi), %xmm7 + palignr $3, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x73(%rsi), %xmm8 + palignr $3, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x83(%rsi), %xmm9 + palignr $3, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_3_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_4): + sub $0x80, %rdx + movaps -0x04(%rsi), %xmm1 + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movaps 0x4c(%rsi), %xmm6 + movaps 0x5c(%rsi), %xmm7 + movaps 0x6c(%rsi), %xmm8 + movaps 0x7c(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $4, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $4, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $4, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $4, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $4, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $4, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $4, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + movaps -0x04(%rsi), %xmm1 + + movaps -0x14(%rsi), %xmm2 + palignr $4, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps 
-0x24(%rsi), %xmm3 + palignr $4, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x34(%rsi), %xmm4 + palignr $4, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x44(%rsi), %xmm5 + palignr $4, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x54(%rsi), %xmm6 + palignr $4, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x64(%rsi), %xmm7 + palignr $4, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x74(%rsi), %xmm8 + palignr $4, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x84(%rsi), %xmm9 + palignr $4, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_4_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_5): + sub $0x80, %rdx + movaps -0x05(%rsi), %xmm1 + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movaps 0x4b(%rsi), %xmm6 + movaps 0x5b(%rsi), %xmm7 + movaps 0x6b(%rsi), %xmm8 + movaps 0x7b(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $5, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $5, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $5, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $5, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $5, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $5, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $5, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_5) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + movaps -0x05(%rsi), %xmm1 + + movaps -0x15(%rsi), %xmm2 + palignr $5, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x25(%rsi), %xmm3 + palignr $5, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x35(%rsi), %xmm4 + palignr $5, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x45(%rsi), %xmm5 + palignr $5, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x55(%rsi), %xmm6 + palignr $5, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x65(%rsi), %xmm7 + palignr $5, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x75(%rsi), %xmm8 + palignr $5, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x85(%rsi), %xmm9 + palignr $5, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_5_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_6): + sub $0x80, %rdx + movaps -0x06(%rsi), %xmm1 + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movaps 0x4a(%rsi), %xmm6 + movaps 0x5a(%rsi), %xmm7 + movaps 0x6a(%rsi), %xmm8 + movaps 0x7a(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $6, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $6, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $6, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $6, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $6, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $6, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $6, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_6) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY 
(L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + movaps -0x06(%rsi), %xmm1 + + movaps -0x16(%rsi), %xmm2 + palignr $6, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x26(%rsi), %xmm3 + palignr $6, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x36(%rsi), %xmm4 + palignr $6, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x46(%rsi), %xmm5 + palignr $6, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x56(%rsi), %xmm6 + palignr $6, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x66(%rsi), %xmm7 + palignr $6, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x76(%rsi), %xmm8 + palignr $6, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x86(%rsi), %xmm9 + palignr $6, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_6_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_7): + sub $0x80, %rdx + movaps -0x07(%rsi), %xmm1 + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movaps 0x49(%rsi), %xmm6 + movaps 0x59(%rsi), %xmm7 + movaps 0x69(%rsi), %xmm8 + movaps 0x79(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $7, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $7, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $7, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $7, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $7, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $7, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $7, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_7) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + movaps -0x07(%rsi), %xmm1 + + movaps -0x17(%rsi), %xmm2 + palignr $7, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x27(%rsi), %xmm3 + palignr $7, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x37(%rsi), %xmm4 + palignr $7, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x47(%rsi), %xmm5 + palignr $7, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x57(%rsi), %xmm6 + palignr $7, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x67(%rsi), %xmm7 + palignr $7, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x77(%rsi), %xmm8 + palignr $7, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x87(%rsi), %xmm9 + palignr $7, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_7_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_8): + sub $0x80, %rdx + movaps -0x08(%rsi), %xmm1 + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movaps 0x48(%rsi), %xmm6 + movaps 0x58(%rsi), %xmm7 + movaps 0x68(%rsi), %xmm8 + movaps 0x78(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $8, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $8, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $8, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $8, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $8, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $8, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $8, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + 
palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_8) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + movaps -0x08(%rsi), %xmm1 + + movaps -0x18(%rsi), %xmm2 + palignr $8, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x28(%rsi), %xmm3 + palignr $8, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x38(%rsi), %xmm4 + palignr $8, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x48(%rsi), %xmm5 + palignr $8, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x58(%rsi), %xmm6 + palignr $8, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x68(%rsi), %xmm7 + palignr $8, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x78(%rsi), %xmm8 + palignr $8, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x88(%rsi), %xmm9 + palignr $8, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_8_bwd) +L(shl_8_end_bwd): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_9): + sub $0x80, %rdx + movaps -0x09(%rsi), %xmm1 + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movaps 0x47(%rsi), %xmm6 + movaps 0x57(%rsi), %xmm7 + movaps 0x67(%rsi), %xmm8 + movaps 0x77(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $9, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $9, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $9, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $9, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $9, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $9, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $9, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_9) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + movaps -0x09(%rsi), %xmm1 + + movaps -0x19(%rsi), %xmm2 + palignr $9, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x29(%rsi), %xmm3 + palignr $9, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x39(%rsi), %xmm4 + palignr $9, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x49(%rsi), %xmm5 + palignr $9, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x59(%rsi), %xmm6 + palignr $9, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x69(%rsi), %xmm7 + palignr $9, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x79(%rsi), %xmm8 + palignr $9, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x89(%rsi), %xmm9 + palignr $9, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_9_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_10): + sub $0x80, %rdx + movaps -0x0a(%rsi), %xmm1 + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movaps 0x46(%rsi), %xmm6 + movaps 0x56(%rsi), %xmm7 + movaps 0x66(%rsi), %xmm8 + movaps 0x76(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $10, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $10, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $10, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr 
$10, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $10, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $10, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $10, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_10) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + movaps -0x0a(%rsi), %xmm1 + + movaps -0x1a(%rsi), %xmm2 + palignr $10, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2a(%rsi), %xmm3 + palignr $10, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3a(%rsi), %xmm4 + palignr $10, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4a(%rsi), %xmm5 + palignr $10, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5a(%rsi), %xmm6 + palignr $10, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6a(%rsi), %xmm7 + palignr $10, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7a(%rsi), %xmm8 + palignr $10, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8a(%rsi), %xmm9 + palignr $10, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_10_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_11): + sub $0x80, %rdx + movaps -0x0b(%rsi), %xmm1 + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movaps 0x45(%rsi), %xmm6 + movaps 0x55(%rsi), %xmm7 + movaps 0x65(%rsi), %xmm8 + movaps 0x75(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $11, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $11, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $11, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $11, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $11, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $11, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $11, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_11) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + movaps -0x0b(%rsi), %xmm1 + + movaps -0x1b(%rsi), %xmm2 + palignr $11, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2b(%rsi), %xmm3 + palignr $11, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3b(%rsi), %xmm4 + palignr $11, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4b(%rsi), %xmm5 + palignr $11, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5b(%rsi), %xmm6 + palignr $11, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6b(%rsi), %xmm7 + palignr $11, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7b(%rsi), %xmm8 + palignr $11, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8b(%rsi), %xmm9 + palignr $11, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_11_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_12): + sub $0x80, %rdx + movdqa -0x0c(%rsi), %xmm1 + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movaps 0x44(%rsi), %xmm6 + movaps 0x54(%rsi), %xmm7 + movaps 0x64(%rsi), %xmm8 + 
movaps 0x74(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $12, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $12, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $12, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $12, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $12, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $12, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $12, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + + lea 0x80(%rdi), %rdi + jae L(shl_12) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + movaps -0x0c(%rsi), %xmm1 + + movaps -0x1c(%rsi), %xmm2 + palignr $12, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2c(%rsi), %xmm3 + palignr $12, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3c(%rsi), %xmm4 + palignr $12, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4c(%rsi), %xmm5 + palignr $12, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5c(%rsi), %xmm6 + palignr $12, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6c(%rsi), %xmm7 + palignr $12, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7c(%rsi), %xmm8 + palignr $12, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8c(%rsi), %xmm9 + palignr $12, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_12_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_13): + sub $0x80, %rdx + movaps -0x0d(%rsi), %xmm1 + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movaps 0x43(%rsi), %xmm6 + movaps 0x53(%rsi), %xmm7 + movaps 0x63(%rsi), %xmm8 + movaps 0x73(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $13, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $13, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $13, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $13, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $13, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $13, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $13, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_13) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + movaps -0x0d(%rsi), %xmm1 + + movaps -0x1d(%rsi), %xmm2 + palignr $13, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2d(%rsi), %xmm3 + palignr $13, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3d(%rsi), %xmm4 + palignr $13, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4d(%rsi), %xmm5 + palignr $13, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5d(%rsi), %xmm6 + palignr $13, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6d(%rsi), %xmm7 + palignr $13, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7d(%rsi), %xmm8 + palignr $13, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8d(%rsi), %xmm9 + palignr $13, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_13_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_14): + sub $0x80, 
%rdx + movaps -0x0e(%rsi), %xmm1 + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movaps 0x42(%rsi), %xmm6 + movaps 0x52(%rsi), %xmm7 + movaps 0x62(%rsi), %xmm8 + movaps 0x72(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $14, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $14, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $14, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $14, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $14, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $14, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $14, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_14) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + movaps -0x0e(%rsi), %xmm1 + + movaps -0x1e(%rsi), %xmm2 + palignr $14, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2e(%rsi), %xmm3 + palignr $14, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3e(%rsi), %xmm4 + palignr $14, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4e(%rsi), %xmm5 + palignr $14, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5e(%rsi), %xmm6 + palignr $14, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6e(%rsi), %xmm7 + palignr $14, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7e(%rsi), %xmm8 + palignr $14, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8e(%rsi), %xmm9 + palignr $14, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_14_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_15): + sub $0x80, %rdx + movaps -0x0f(%rsi), %xmm1 + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movaps 0x41(%rsi), %xmm6 + movaps 0x51(%rsi), %xmm7 + movaps 0x61(%rsi), %xmm8 + movaps 0x71(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $15, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $15, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $15, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $15, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $15, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $15, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $15, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_15) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + movaps -0x0f(%rsi), %xmm1 + + movaps -0x1f(%rsi), %xmm2 + palignr $15, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2f(%rsi), %xmm3 + palignr $15, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3f(%rsi), %xmm4 + palignr $15, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4f(%rsi), %xmm5 + palignr $15, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5f(%rsi), %xmm6 + palignr $15, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6f(%rsi), %xmm7 + palignr $15, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7f(%rsi), %xmm8 + palignr $15, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8f(%rsi), %xmm9 + palignr $15, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), 
%rdi + lea -0x80(%rsi), %rsi + jae L(shl_15_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_fwd): + movdqu (%rsi), %xmm1 + movdqu %xmm0, (%r8) + movdqa %xmm1, (%rdi) + sub $16, %rdx + add $16, %rsi + add $16, %rdi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif + cmp %rcx, %rdx + ja L(bigger_in_fwd) + mov %rdx, %rcx +L(bigger_in_fwd): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy_fwd) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy_fwd) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy_fwd): + sub $0x80, %rdx +L(gobble_mem_fwd_loop): + sub $0x80, %rdx + prefetcht0 0x200(%rsi) + prefetcht0 0x300(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lfence + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_mem_fwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_fwd_end) + add $0x80, %rdx +L(ll_cache_copy_fwd): + add %rcx, %rdx +L(ll_cache_copy_fwd_start): + sub $0x80, %rdx +L(gobble_ll_loop_fwd): + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + prefetchnta 0x1c0(%rdi) + prefetchnta 0x280(%rdi) + sub $0x80, %rdx + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_ll_loop_fwd) +L(gobble_mem_fwd_end): + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_bwd): + add %rdx, %rsi + add %rdx, %rdi + + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $-16, %rdi + sub %rdi, %r9 + sub %r9, %rsi + sub %r9, %rdx + + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif + cmp %rcx, %rdx + ja L(bigger) + mov %rdx, %rcx +L(bigger): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy): + sub $0x80, %rdx +L(gobble_mem_bwd_loop): + sub $0x80, %rdx + prefetcht0 -0x200(%rsi) + prefetcht0 -0x300(%rsi) + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu 
-0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + lfence + movntdq %xmm1, -0x10(%rdi) + movntdq %xmm2, -0x20(%rdi) + movntdq %xmm3, -0x30(%rdi) + movntdq %xmm4, -0x40(%rdi) + movntdq %xmm5, -0x50(%rdi) + movntdq %xmm6, -0x60(%rdi) + movntdq %xmm7, -0x70(%rdi) + movntdq %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_mem_bwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_bwd_end) + add $0x80, %rdx +L(ll_cache_copy): + add %rcx, %rdx +L(ll_cache_copy_bwd_start): + sub $0x80, %rdx +L(gobble_ll_loop): + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + prefetchnta -0x1c0(%rdi) + prefetchnta -0x280(%rdi) + sub $0x80, %rdx + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + movdqa %xmm1, -0x10(%rdi) + movdqa %xmm2, -0x20(%rdi) + movdqa %xmm3, -0x30(%rdi) + movdqa %xmm4, -0x40(%rdi) + movdqa %xmm5, -0x50(%rdi) + movdqa %xmm6, -0x60(%rdi) + movdqa %xmm7, -0x70(%rdi) + movdqa %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_ll_loop) +L(gobble_mem_bwd_end): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rsi + sub %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(fwd_write_128bytes): + lddqu -128(%rsi), %xmm0 + movdqu %xmm0, -128(%rdi) +L(fwd_write_112bytes): + lddqu -112(%rsi), %xmm0 + movdqu %xmm0, -112(%rdi) +L(fwd_write_96bytes): + lddqu -96(%rsi), %xmm0 + movdqu %xmm0, -96(%rdi) +L(fwd_write_80bytes): + lddqu -80(%rsi), %xmm0 + movdqu %xmm0, -80(%rdi) +L(fwd_write_64bytes): + lddqu -64(%rsi), %xmm0 + movdqu %xmm0, -64(%rdi) +L(fwd_write_48bytes): + lddqu -48(%rsi), %xmm0 + movdqu %xmm0, -48(%rdi) +L(fwd_write_32bytes): + lddqu -32(%rsi), %xmm0 + movdqu %xmm0, -32(%rdi) +L(fwd_write_16bytes): + lddqu -16(%rsi), %xmm0 + movdqu %xmm0, -16(%rdi) +L(fwd_write_0bytes): + ret + + + .p2align 4 +L(fwd_write_143bytes): + lddqu -143(%rsi), %xmm0 + movdqu %xmm0, -143(%rdi) +L(fwd_write_127bytes): + lddqu -127(%rsi), %xmm0 + movdqu %xmm0, -127(%rdi) +L(fwd_write_111bytes): + lddqu -111(%rsi), %xmm0 + movdqu %xmm0, -111(%rdi) +L(fwd_write_95bytes): + lddqu -95(%rsi), %xmm0 + movdqu %xmm0, -95(%rdi) +L(fwd_write_79bytes): + lddqu -79(%rsi), %xmm0 + movdqu %xmm0, -79(%rdi) +L(fwd_write_63bytes): + lddqu -63(%rsi), %xmm0 + movdqu %xmm0, -63(%rdi) +L(fwd_write_47bytes): + lddqu -47(%rsi), %xmm0 + movdqu %xmm0, -47(%rdi) +L(fwd_write_31bytes): + lddqu -31(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -31(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_15bytes): + mov -15(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -15(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_142bytes): + lddqu -142(%rsi), %xmm0 + movdqu %xmm0, -142(%rdi) +L(fwd_write_126bytes): + lddqu -126(%rsi), %xmm0 + movdqu %xmm0, -126(%rdi) +L(fwd_write_110bytes): + lddqu -110(%rsi), %xmm0 + movdqu %xmm0, -110(%rdi) +L(fwd_write_94bytes): + lddqu -94(%rsi), %xmm0 + movdqu %xmm0, -94(%rdi) +L(fwd_write_78bytes): + lddqu -78(%rsi), %xmm0 + movdqu %xmm0, -78(%rdi) +L(fwd_write_62bytes): + lddqu -62(%rsi), %xmm0 + movdqu %xmm0, -62(%rdi) +L(fwd_write_46bytes): + lddqu -46(%rsi), %xmm0 + movdqu %xmm0, -46(%rdi) +L(fwd_write_30bytes): + lddqu -30(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -30(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_14bytes): + mov -14(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, 
-14(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_141bytes): + lddqu -141(%rsi), %xmm0 + movdqu %xmm0, -141(%rdi) +L(fwd_write_125bytes): + lddqu -125(%rsi), %xmm0 + movdqu %xmm0, -125(%rdi) +L(fwd_write_109bytes): + lddqu -109(%rsi), %xmm0 + movdqu %xmm0, -109(%rdi) +L(fwd_write_93bytes): + lddqu -93(%rsi), %xmm0 + movdqu %xmm0, -93(%rdi) +L(fwd_write_77bytes): + lddqu -77(%rsi), %xmm0 + movdqu %xmm0, -77(%rdi) +L(fwd_write_61bytes): + lddqu -61(%rsi), %xmm0 + movdqu %xmm0, -61(%rdi) +L(fwd_write_45bytes): + lddqu -45(%rsi), %xmm0 + movdqu %xmm0, -45(%rdi) +L(fwd_write_29bytes): + lddqu -29(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -29(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_13bytes): + mov -13(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -13(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_140bytes): + lddqu -140(%rsi), %xmm0 + movdqu %xmm0, -140(%rdi) +L(fwd_write_124bytes): + lddqu -124(%rsi), %xmm0 + movdqu %xmm0, -124(%rdi) +L(fwd_write_108bytes): + lddqu -108(%rsi), %xmm0 + movdqu %xmm0, -108(%rdi) +L(fwd_write_92bytes): + lddqu -92(%rsi), %xmm0 + movdqu %xmm0, -92(%rdi) +L(fwd_write_76bytes): + lddqu -76(%rsi), %xmm0 + movdqu %xmm0, -76(%rdi) +L(fwd_write_60bytes): + lddqu -60(%rsi), %xmm0 + movdqu %xmm0, -60(%rdi) +L(fwd_write_44bytes): + lddqu -44(%rsi), %xmm0 + movdqu %xmm0, -44(%rdi) +L(fwd_write_28bytes): + lddqu -28(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -28(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_12bytes): + mov -12(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -12(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_139bytes): + lddqu -139(%rsi), %xmm0 + movdqu %xmm0, -139(%rdi) +L(fwd_write_123bytes): + lddqu -123(%rsi), %xmm0 + movdqu %xmm0, -123(%rdi) +L(fwd_write_107bytes): + lddqu -107(%rsi), %xmm0 + movdqu %xmm0, -107(%rdi) +L(fwd_write_91bytes): + lddqu -91(%rsi), %xmm0 + movdqu %xmm0, -91(%rdi) +L(fwd_write_75bytes): + lddqu -75(%rsi), %xmm0 + movdqu %xmm0, -75(%rdi) +L(fwd_write_59bytes): + lddqu -59(%rsi), %xmm0 + movdqu %xmm0, -59(%rdi) +L(fwd_write_43bytes): + lddqu -43(%rsi), %xmm0 + movdqu %xmm0, -43(%rdi) +L(fwd_write_27bytes): + lddqu -27(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -27(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_11bytes): + mov -11(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -11(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_138bytes): + lddqu -138(%rsi), %xmm0 + movdqu %xmm0, -138(%rdi) +L(fwd_write_122bytes): + lddqu -122(%rsi), %xmm0 + movdqu %xmm0, -122(%rdi) +L(fwd_write_106bytes): + lddqu -106(%rsi), %xmm0 + movdqu %xmm0, -106(%rdi) +L(fwd_write_90bytes): + lddqu -90(%rsi), %xmm0 + movdqu %xmm0, -90(%rdi) +L(fwd_write_74bytes): + lddqu -74(%rsi), %xmm0 + movdqu %xmm0, -74(%rdi) +L(fwd_write_58bytes): + lddqu -58(%rsi), %xmm0 + movdqu %xmm0, -58(%rdi) +L(fwd_write_42bytes): + lddqu -42(%rsi), %xmm0 + movdqu %xmm0, -42(%rdi) +L(fwd_write_26bytes): + lddqu -26(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -26(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_10bytes): + mov -10(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -10(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_137bytes): + lddqu -137(%rsi), %xmm0 + movdqu %xmm0, -137(%rdi) +L(fwd_write_121bytes): + lddqu -121(%rsi), %xmm0 + movdqu %xmm0, -121(%rdi) +L(fwd_write_105bytes): + lddqu -105(%rsi), %xmm0 + movdqu %xmm0, -105(%rdi) +L(fwd_write_89bytes): + lddqu -89(%rsi), %xmm0 + movdqu 
%xmm0, -89(%rdi) +L(fwd_write_73bytes): + lddqu -73(%rsi), %xmm0 + movdqu %xmm0, -73(%rdi) +L(fwd_write_57bytes): + lddqu -57(%rsi), %xmm0 + movdqu %xmm0, -57(%rdi) +L(fwd_write_41bytes): + lddqu -41(%rsi), %xmm0 + movdqu %xmm0, -41(%rdi) +L(fwd_write_25bytes): + lddqu -25(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -25(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_9bytes): + mov -9(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -9(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_136bytes): + lddqu -136(%rsi), %xmm0 + movdqu %xmm0, -136(%rdi) +L(fwd_write_120bytes): + lddqu -120(%rsi), %xmm0 + movdqu %xmm0, -120(%rdi) +L(fwd_write_104bytes): + lddqu -104(%rsi), %xmm0 + movdqu %xmm0, -104(%rdi) +L(fwd_write_88bytes): + lddqu -88(%rsi), %xmm0 + movdqu %xmm0, -88(%rdi) +L(fwd_write_72bytes): + lddqu -72(%rsi), %xmm0 + movdqu %xmm0, -72(%rdi) +L(fwd_write_56bytes): + lddqu -56(%rsi), %xmm0 + movdqu %xmm0, -56(%rdi) +L(fwd_write_40bytes): + lddqu -40(%rsi), %xmm0 + movdqu %xmm0, -40(%rdi) +L(fwd_write_24bytes): + lddqu -24(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -24(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_135bytes): + lddqu -135(%rsi), %xmm0 + movdqu %xmm0, -135(%rdi) +L(fwd_write_119bytes): + lddqu -119(%rsi), %xmm0 + movdqu %xmm0, -119(%rdi) +L(fwd_write_103bytes): + lddqu -103(%rsi), %xmm0 + movdqu %xmm0, -103(%rdi) +L(fwd_write_87bytes): + lddqu -87(%rsi), %xmm0 + movdqu %xmm0, -87(%rdi) +L(fwd_write_71bytes): + lddqu -71(%rsi), %xmm0 + movdqu %xmm0, -71(%rdi) +L(fwd_write_55bytes): + lddqu -55(%rsi), %xmm0 + movdqu %xmm0, -55(%rdi) +L(fwd_write_39bytes): + lddqu -39(%rsi), %xmm0 + movdqu %xmm0, -39(%rdi) +L(fwd_write_23bytes): + lddqu -23(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -23(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_134bytes): + lddqu -134(%rsi), %xmm0 + movdqu %xmm0, -134(%rdi) +L(fwd_write_118bytes): + lddqu -118(%rsi), %xmm0 + movdqu %xmm0, -118(%rdi) +L(fwd_write_102bytes): + lddqu -102(%rsi), %xmm0 + movdqu %xmm0, -102(%rdi) +L(fwd_write_86bytes): + lddqu -86(%rsi), %xmm0 + movdqu %xmm0, -86(%rdi) +L(fwd_write_70bytes): + lddqu -70(%rsi), %xmm0 + movdqu %xmm0, -70(%rdi) +L(fwd_write_54bytes): + lddqu -54(%rsi), %xmm0 + movdqu %xmm0, -54(%rdi) +L(fwd_write_38bytes): + lddqu -38(%rsi), %xmm0 + movdqu %xmm0, -38(%rdi) +L(fwd_write_22bytes): + lddqu -22(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -22(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_133bytes): + lddqu -133(%rsi), %xmm0 + movdqu %xmm0, -133(%rdi) +L(fwd_write_117bytes): + lddqu -117(%rsi), %xmm0 + movdqu %xmm0, -117(%rdi) +L(fwd_write_101bytes): + lddqu -101(%rsi), %xmm0 + movdqu %xmm0, -101(%rdi) +L(fwd_write_85bytes): + lddqu -85(%rsi), %xmm0 + movdqu %xmm0, -85(%rdi) +L(fwd_write_69bytes): + lddqu -69(%rsi), %xmm0 + movdqu %xmm0, -69(%rdi) +L(fwd_write_53bytes): + lddqu -53(%rsi), %xmm0 + movdqu %xmm0, -53(%rdi) +L(fwd_write_37bytes): + lddqu -37(%rsi), %xmm0 + movdqu %xmm0, -37(%rdi) +L(fwd_write_21bytes): + lddqu -21(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -21(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 
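
The L(fwd_write_*) and L(bwd_write_*) blocks are fall-through ladders: the jump tables dispatch into the middle of a ladder, each entry copies one 16-byte chunk at a fixed offset, and control falls through to the next smaller entry until the 0-byte case returns. Lengths that are not multiples of 16 work because the entry chunk overlaps the final 16-byte chunk; fwd_write_31bytes, for instance, loads bytes -31..-16 and -16..-1 and stores both, writing one byte twice. A minimal C sketch of that overlapping-chunk tail copy, using a hypothetical copy_tail helper in place of the file's jump-table dispatch:

#include <string.h>
#include <stddef.h>

/* Copy the last n bytes (17 <= n <= 32) with two possibly
   overlapping 16-byte chunks, mirroring L(fwd_write_17bytes)
   through L(fwd_write_31bytes).  dst_end/src_end point one past
   the end, as %rdi/%rsi do on the forward path.  */
static void
copy_tail (unsigned char *dst_end, const unsigned char *src_end, size_t n)
{
  unsigned char a[16], b[16];
  memcpy (a, src_end - n, 16);    /* like lddqu -n(%rsi)  */
  memcpy (b, src_end - 16, 16);   /* like lddqu -16(%rsi) */
  memcpy (dst_end - n, a, 16);    /* the stores overlap by 32 - n bytes */
  memcpy (dst_end - 16, b, 16);
}

int
main (void)
{
  unsigned char src[32] = "abcdefghijklmnopqrstuvwxyz01234";
  unsigned char dst[32] = { 0 };
  copy_tail (dst + 31, src + 31, 31);
  return memcmp (dst, src, 31) != 0;
}

As in the assembly, both loads happen before either store, so the sequence also stays correct when source and destination overlap memmove-style.
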
+L(fwd_write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_132bytes): + lddqu -132(%rsi), %xmm0 + movdqu %xmm0, -132(%rdi) +L(fwd_write_116bytes): + lddqu -116(%rsi), %xmm0 + movdqu %xmm0, -116(%rdi) +L(fwd_write_100bytes): + lddqu -100(%rsi), %xmm0 + movdqu %xmm0, -100(%rdi) +L(fwd_write_84bytes): + lddqu -84(%rsi), %xmm0 + movdqu %xmm0, -84(%rdi) +L(fwd_write_68bytes): + lddqu -68(%rsi), %xmm0 + movdqu %xmm0, -68(%rdi) +L(fwd_write_52bytes): + lddqu -52(%rsi), %xmm0 + movdqu %xmm0, -52(%rdi) +L(fwd_write_36bytes): + lddqu -36(%rsi), %xmm0 + movdqu %xmm0, -36(%rdi) +L(fwd_write_20bytes): + lddqu -20(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -20(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_131bytes): + lddqu -131(%rsi), %xmm0 + movdqu %xmm0, -131(%rdi) +L(fwd_write_115bytes): + lddqu -115(%rsi), %xmm0 + movdqu %xmm0, -115(%rdi) +L(fwd_write_99bytes): + lddqu -99(%rsi), %xmm0 + movdqu %xmm0, -99(%rdi) +L(fwd_write_83bytes): + lddqu -83(%rsi), %xmm0 + movdqu %xmm0, -83(%rdi) +L(fwd_write_67bytes): + lddqu -67(%rsi), %xmm0 + movdqu %xmm0, -67(%rdi) +L(fwd_write_51bytes): + lddqu -51(%rsi), %xmm0 + movdqu %xmm0, -51(%rdi) +L(fwd_write_35bytes): + lddqu -35(%rsi), %xmm0 + movdqu %xmm0, -35(%rdi) +L(fwd_write_19bytes): + lddqu -19(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -19(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_130bytes): + lddqu -130(%rsi), %xmm0 + movdqu %xmm0, -130(%rdi) +L(fwd_write_114bytes): + lddqu -114(%rsi), %xmm0 + movdqu %xmm0, -114(%rdi) +L(fwd_write_98bytes): + lddqu -98(%rsi), %xmm0 + movdqu %xmm0, -98(%rdi) +L(fwd_write_82bytes): + lddqu -82(%rsi), %xmm0 + movdqu %xmm0, -82(%rdi) +L(fwd_write_66bytes): + lddqu -66(%rsi), %xmm0 + movdqu %xmm0, -66(%rdi) +L(fwd_write_50bytes): + lddqu -50(%rsi), %xmm0 + movdqu %xmm0, -50(%rdi) +L(fwd_write_34bytes): + lddqu -34(%rsi), %xmm0 + movdqu %xmm0, -34(%rdi) +L(fwd_write_18bytes): + lddqu -18(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -18(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_2bytes): + movzwl -2(%rsi), %edx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_129bytes): + lddqu -129(%rsi), %xmm0 + movdqu %xmm0, -129(%rdi) +L(fwd_write_113bytes): + lddqu -113(%rsi), %xmm0 + movdqu %xmm0, -113(%rdi) +L(fwd_write_97bytes): + lddqu -97(%rsi), %xmm0 + movdqu %xmm0, -97(%rdi) +L(fwd_write_81bytes): + lddqu -81(%rsi), %xmm0 + movdqu %xmm0, -81(%rdi) +L(fwd_write_65bytes): + lddqu -65(%rsi), %xmm0 + movdqu %xmm0, -65(%rdi) +L(fwd_write_49bytes): + lddqu -49(%rsi), %xmm0 + movdqu %xmm0, -49(%rdi) +L(fwd_write_33bytes): + lddqu -33(%rsi), %xmm0 + movdqu %xmm0, -33(%rdi) +L(fwd_write_17bytes): + lddqu -17(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -17(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_1bytes): + movzbl -1(%rsi), %edx + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(bwd_write_128bytes): + lddqu 112(%rsi), %xmm0 + movdqu %xmm0, 112(%rdi) +L(bwd_write_112bytes): + lddqu 96(%rsi), %xmm0 + movdqu %xmm0, 96(%rdi) +L(bwd_write_96bytes): + lddqu 80(%rsi), %xmm0 + movdqu %xmm0, 80(%rdi) +L(bwd_write_80bytes): + lddqu 64(%rsi), %xmm0 + movdqu %xmm0, 64(%rdi) +L(bwd_write_64bytes): + lddqu 48(%rsi), %xmm0 + movdqu %xmm0, 
48(%rdi) +L(bwd_write_48bytes): + lddqu 32(%rsi), %xmm0 + movdqu %xmm0, 32(%rdi) +L(bwd_write_32bytes): + lddqu 16(%rsi), %xmm0 + movdqu %xmm0, 16(%rdi) +L(bwd_write_16bytes): + lddqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +L(bwd_write_0bytes): + ret + + .p2align 4 +L(bwd_write_143bytes): + lddqu 127(%rsi), %xmm0 + movdqu %xmm0, 127(%rdi) +L(bwd_write_127bytes): + lddqu 111(%rsi), %xmm0 + movdqu %xmm0, 111(%rdi) +L(bwd_write_111bytes): + lddqu 95(%rsi), %xmm0 + movdqu %xmm0, 95(%rdi) +L(bwd_write_95bytes): + lddqu 79(%rsi), %xmm0 + movdqu %xmm0, 79(%rdi) +L(bwd_write_79bytes): + lddqu 63(%rsi), %xmm0 + movdqu %xmm0, 63(%rdi) +L(bwd_write_63bytes): + lddqu 47(%rsi), %xmm0 + movdqu %xmm0, 47(%rdi) +L(bwd_write_47bytes): + lddqu 31(%rsi), %xmm0 + movdqu %xmm0, 31(%rdi) +L(bwd_write_31bytes): + lddqu 15(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 15(%rdi) + movdqu %xmm1, (%rdi) + ret + + + .p2align 4 +L(bwd_write_15bytes): + mov 7(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 7(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_142bytes): + lddqu 126(%rsi), %xmm0 + movdqu %xmm0, 126(%rdi) +L(bwd_write_126bytes): + lddqu 110(%rsi), %xmm0 + movdqu %xmm0, 110(%rdi) +L(bwd_write_110bytes): + lddqu 94(%rsi), %xmm0 + movdqu %xmm0, 94(%rdi) +L(bwd_write_94bytes): + lddqu 78(%rsi), %xmm0 + movdqu %xmm0, 78(%rdi) +L(bwd_write_78bytes): + lddqu 62(%rsi), %xmm0 + movdqu %xmm0, 62(%rdi) +L(bwd_write_62bytes): + lddqu 46(%rsi), %xmm0 + movdqu %xmm0, 46(%rdi) +L(bwd_write_46bytes): + lddqu 30(%rsi), %xmm0 + movdqu %xmm0, 30(%rdi) +L(bwd_write_30bytes): + lddqu 14(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 14(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_14bytes): + mov 6(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 6(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_141bytes): + lddqu 125(%rsi), %xmm0 + movdqu %xmm0, 125(%rdi) +L(bwd_write_125bytes): + lddqu 109(%rsi), %xmm0 + movdqu %xmm0, 109(%rdi) +L(bwd_write_109bytes): + lddqu 93(%rsi), %xmm0 + movdqu %xmm0, 93(%rdi) +L(bwd_write_93bytes): + lddqu 77(%rsi), %xmm0 + movdqu %xmm0, 77(%rdi) +L(bwd_write_77bytes): + lddqu 61(%rsi), %xmm0 + movdqu %xmm0, 61(%rdi) +L(bwd_write_61bytes): + lddqu 45(%rsi), %xmm0 + movdqu %xmm0, 45(%rdi) +L(bwd_write_45bytes): + lddqu 29(%rsi), %xmm0 + movdqu %xmm0, 29(%rdi) +L(bwd_write_29bytes): + lddqu 13(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 13(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_13bytes): + mov 5(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 5(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_140bytes): + lddqu 124(%rsi), %xmm0 + movdqu %xmm0, 124(%rdi) +L(bwd_write_124bytes): + lddqu 108(%rsi), %xmm0 + movdqu %xmm0, 108(%rdi) +L(bwd_write_108bytes): + lddqu 92(%rsi), %xmm0 + movdqu %xmm0, 92(%rdi) +L(bwd_write_92bytes): + lddqu 76(%rsi), %xmm0 + movdqu %xmm0, 76(%rdi) +L(bwd_write_76bytes): + lddqu 60(%rsi), %xmm0 + movdqu %xmm0, 60(%rdi) +L(bwd_write_60bytes): + lddqu 44(%rsi), %xmm0 + movdqu %xmm0, 44(%rdi) +L(bwd_write_44bytes): + lddqu 28(%rsi), %xmm0 + movdqu %xmm0, 28(%rdi) +L(bwd_write_28bytes): + lddqu 12(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 12(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_12bytes): + mov 4(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 4(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_139bytes): + lddqu 123(%rsi), %xmm0 + movdqu %xmm0, 123(%rdi) +L(bwd_write_123bytes): + lddqu 107(%rsi), %xmm0 + movdqu %xmm0, 107(%rdi) +L(bwd_write_107bytes): + lddqu 91(%rsi), %xmm0 + 
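
The dispatch tables further below (L(table_144_bytes_bwd) and L(table_144_bytes_fwd)) store JMPTBL (I, B) = I - B: each 32-bit entry is the offset of a ladder label relative to the table base, which keeps the tables position-independent and half the size of absolute 64-bit pointers. BRANCH_TO_JMPTBL_ENTRY rebuilds the target at run time: lea the table address, movslq the selected entry, add with lea, then an indirect jmp. A sketch of the same relative-offset dispatch in GNU C, using the documented computed-goto extension and hypothetical handler labels in place of the bwd_write entries:

/* Build with GCC or Clang; &&label and goto * are GNU extensions.  */
static int
dispatch (unsigned int idx)
{
  /* Offsets relative to the first label, like JMPTBL (I, B) = I - B.  */
  static const int table[] = {
    &&handle0 - &&handle0,   /* stand-in for L(bwd_write_0bytes) */
    &&handle1 - &&handle0,   /* stand-in for L(bwd_write_1bytes) */
    &&handle2 - &&handle0,
  };
  goto *(&&handle0 + table[idx]);  /* lea/movslq/lea/jmp in one step */
handle2: return 2;
handle1: return 1;
handle0: return 0;
}

int
main (void)
{
  return dispatch (2) == 2 ? 0 : 1;
}
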
movdqu %xmm0, 91(%rdi) +L(bwd_write_91bytes): + lddqu 75(%rsi), %xmm0 + movdqu %xmm0, 75(%rdi) +L(bwd_write_75bytes): + lddqu 59(%rsi), %xmm0 + movdqu %xmm0, 59(%rdi) +L(bwd_write_59bytes): + lddqu 43(%rsi), %xmm0 + movdqu %xmm0, 43(%rdi) +L(bwd_write_43bytes): + lddqu 27(%rsi), %xmm0 + movdqu %xmm0, 27(%rdi) +L(bwd_write_27bytes): + lddqu 11(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 11(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_11bytes): + mov 3(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 3(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_138bytes): + lddqu 122(%rsi), %xmm0 + movdqu %xmm0, 122(%rdi) +L(bwd_write_122bytes): + lddqu 106(%rsi), %xmm0 + movdqu %xmm0, 106(%rdi) +L(bwd_write_106bytes): + lddqu 90(%rsi), %xmm0 + movdqu %xmm0, 90(%rdi) +L(bwd_write_90bytes): + lddqu 74(%rsi), %xmm0 + movdqu %xmm0, 74(%rdi) +L(bwd_write_74bytes): + lddqu 58(%rsi), %xmm0 + movdqu %xmm0, 58(%rdi) +L(bwd_write_58bytes): + lddqu 42(%rsi), %xmm0 + movdqu %xmm0, 42(%rdi) +L(bwd_write_42bytes): + lddqu 26(%rsi), %xmm0 + movdqu %xmm0, 26(%rdi) +L(bwd_write_26bytes): + lddqu 10(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 10(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_10bytes): + mov 2(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 2(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_137bytes): + lddqu 121(%rsi), %xmm0 + movdqu %xmm0, 121(%rdi) +L(bwd_write_121bytes): + lddqu 105(%rsi), %xmm0 + movdqu %xmm0, 105(%rdi) +L(bwd_write_105bytes): + lddqu 89(%rsi), %xmm0 + movdqu %xmm0, 89(%rdi) +L(bwd_write_89bytes): + lddqu 73(%rsi), %xmm0 + movdqu %xmm0, 73(%rdi) +L(bwd_write_73bytes): + lddqu 57(%rsi), %xmm0 + movdqu %xmm0, 57(%rdi) +L(bwd_write_57bytes): + lddqu 41(%rsi), %xmm0 + movdqu %xmm0, 41(%rdi) +L(bwd_write_41bytes): + lddqu 25(%rsi), %xmm0 + movdqu %xmm0, 25(%rdi) +L(bwd_write_25bytes): + lddqu 9(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 9(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_9bytes): + mov 1(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 1(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_136bytes): + lddqu 120(%rsi), %xmm0 + movdqu %xmm0, 120(%rdi) +L(bwd_write_120bytes): + lddqu 104(%rsi), %xmm0 + movdqu %xmm0, 104(%rdi) +L(bwd_write_104bytes): + lddqu 88(%rsi), %xmm0 + movdqu %xmm0, 88(%rdi) +L(bwd_write_88bytes): + lddqu 72(%rsi), %xmm0 + movdqu %xmm0, 72(%rdi) +L(bwd_write_72bytes): + lddqu 56(%rsi), %xmm0 + movdqu %xmm0, 56(%rdi) +L(bwd_write_56bytes): + lddqu 40(%rsi), %xmm0 + movdqu %xmm0, 40(%rdi) +L(bwd_write_40bytes): + lddqu 24(%rsi), %xmm0 + movdqu %xmm0, 24(%rdi) +L(bwd_write_24bytes): + lddqu 8(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 8(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_8bytes): + mov (%rsi), %rdx + mov %rdx, (%rdi) + ret + + .p2align 4 +L(bwd_write_135bytes): + lddqu 119(%rsi), %xmm0 + movdqu %xmm0, 119(%rdi) +L(bwd_write_119bytes): + lddqu 103(%rsi), %xmm0 + movdqu %xmm0, 103(%rdi) +L(bwd_write_103bytes): + lddqu 87(%rsi), %xmm0 + movdqu %xmm0, 87(%rdi) +L(bwd_write_87bytes): + lddqu 71(%rsi), %xmm0 + movdqu %xmm0, 71(%rdi) +L(bwd_write_71bytes): + lddqu 55(%rsi), %xmm0 + movdqu %xmm0, 55(%rdi) +L(bwd_write_55bytes): + lddqu 39(%rsi), %xmm0 + movdqu %xmm0, 39(%rdi) +L(bwd_write_39bytes): + lddqu 23(%rsi), %xmm0 + movdqu %xmm0, 23(%rdi) +L(bwd_write_23bytes): + lddqu 7(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 7(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_7bytes): + mov 3(%rsi), %edx + mov (%rsi), %ecx + mov 
%edx, 3(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_134bytes): + lddqu 118(%rsi), %xmm0 + movdqu %xmm0, 118(%rdi) +L(bwd_write_118bytes): + lddqu 102(%rsi), %xmm0 + movdqu %xmm0, 102(%rdi) +L(bwd_write_102bytes): + lddqu 86(%rsi), %xmm0 + movdqu %xmm0, 86(%rdi) +L(bwd_write_86bytes): + lddqu 70(%rsi), %xmm0 + movdqu %xmm0, 70(%rdi) +L(bwd_write_70bytes): + lddqu 54(%rsi), %xmm0 + movdqu %xmm0, 54(%rdi) +L(bwd_write_54bytes): + lddqu 38(%rsi), %xmm0 + movdqu %xmm0, 38(%rdi) +L(bwd_write_38bytes): + lddqu 22(%rsi), %xmm0 + movdqu %xmm0, 22(%rdi) +L(bwd_write_22bytes): + lddqu 6(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 6(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_6bytes): + mov 2(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 2(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_133bytes): + lddqu 117(%rsi), %xmm0 + movdqu %xmm0, 117(%rdi) +L(bwd_write_117bytes): + lddqu 101(%rsi), %xmm0 + movdqu %xmm0, 101(%rdi) +L(bwd_write_101bytes): + lddqu 85(%rsi), %xmm0 + movdqu %xmm0, 85(%rdi) +L(bwd_write_85bytes): + lddqu 69(%rsi), %xmm0 + movdqu %xmm0, 69(%rdi) +L(bwd_write_69bytes): + lddqu 53(%rsi), %xmm0 + movdqu %xmm0, 53(%rdi) +L(bwd_write_53bytes): + lddqu 37(%rsi), %xmm0 + movdqu %xmm0, 37(%rdi) +L(bwd_write_37bytes): + lddqu 21(%rsi), %xmm0 + movdqu %xmm0, 21(%rdi) +L(bwd_write_21bytes): + lddqu 5(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 5(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_5bytes): + mov 1(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 1(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_132bytes): + lddqu 116(%rsi), %xmm0 + movdqu %xmm0, 116(%rdi) +L(bwd_write_116bytes): + lddqu 100(%rsi), %xmm0 + movdqu %xmm0, 100(%rdi) +L(bwd_write_100bytes): + lddqu 84(%rsi), %xmm0 + movdqu %xmm0, 84(%rdi) +L(bwd_write_84bytes): + lddqu 68(%rsi), %xmm0 + movdqu %xmm0, 68(%rdi) +L(bwd_write_68bytes): + lddqu 52(%rsi), %xmm0 + movdqu %xmm0, 52(%rdi) +L(bwd_write_52bytes): + lddqu 36(%rsi), %xmm0 + movdqu %xmm0, 36(%rdi) +L(bwd_write_36bytes): + lddqu 20(%rsi), %xmm0 + movdqu %xmm0, 20(%rdi) +L(bwd_write_20bytes): + lddqu 4(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 4(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_4bytes): + mov (%rsi), %edx + mov %edx, (%rdi) + ret + + .p2align 4 +L(bwd_write_131bytes): + lddqu 115(%rsi), %xmm0 + movdqu %xmm0, 115(%rdi) +L(bwd_write_115bytes): + lddqu 99(%rsi), %xmm0 + movdqu %xmm0, 99(%rdi) +L(bwd_write_99bytes): + lddqu 83(%rsi), %xmm0 + movdqu %xmm0, 83(%rdi) +L(bwd_write_83bytes): + lddqu 67(%rsi), %xmm0 + movdqu %xmm0, 67(%rdi) +L(bwd_write_67bytes): + lddqu 51(%rsi), %xmm0 + movdqu %xmm0, 51(%rdi) +L(bwd_write_51bytes): + lddqu 35(%rsi), %xmm0 + movdqu %xmm0, 35(%rdi) +L(bwd_write_35bytes): + lddqu 19(%rsi), %xmm0 + movdqu %xmm0, 19(%rdi) +L(bwd_write_19bytes): + lddqu 3(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 3(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_3bytes): + mov 1(%rsi), %dx + mov (%rsi), %cx + mov %dx, 1(%rdi) + mov %cx, (%rdi) + ret + + .p2align 4 +L(bwd_write_130bytes): + lddqu 114(%rsi), %xmm0 + movdqu %xmm0, 114(%rdi) +L(bwd_write_114bytes): + lddqu 98(%rsi), %xmm0 + movdqu %xmm0, 98(%rdi) +L(bwd_write_98bytes): + lddqu 82(%rsi), %xmm0 + movdqu %xmm0, 82(%rdi) +L(bwd_write_82bytes): + lddqu 66(%rsi), %xmm0 + movdqu %xmm0, 66(%rdi) +L(bwd_write_66bytes): + lddqu 50(%rsi), %xmm0 + movdqu %xmm0, 50(%rdi) +L(bwd_write_50bytes): + lddqu 34(%rsi), %xmm0 + movdqu %xmm0, 34(%rdi) +L(bwd_write_34bytes): + lddqu 
18(%rsi), %xmm0 + movdqu %xmm0, 18(%rdi) +L(bwd_write_18bytes): + lddqu 2(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 2(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_2bytes): + movzwl (%rsi), %edx + mov %dx, (%rdi) + ret + + .p2align 4 +L(bwd_write_129bytes): + lddqu 113(%rsi), %xmm0 + movdqu %xmm0, 113(%rdi) +L(bwd_write_113bytes): + lddqu 97(%rsi), %xmm0 + movdqu %xmm0, 97(%rdi) +L(bwd_write_97bytes): + lddqu 81(%rsi), %xmm0 + movdqu %xmm0, 81(%rdi) +L(bwd_write_81bytes): + lddqu 65(%rsi), %xmm0 + movdqu %xmm0, 65(%rdi) +L(bwd_write_65bytes): + lddqu 49(%rsi), %xmm0 + movdqu %xmm0, 49(%rdi) +L(bwd_write_49bytes): + lddqu 33(%rsi), %xmm0 + movdqu %xmm0, 33(%rdi) +L(bwd_write_33bytes): + lddqu 17(%rsi), %xmm0 + movdqu %xmm0, 17(%rdi) +L(bwd_write_17bytes): + lddqu 1(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 1(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_1bytes): + movzbl (%rsi), %edx + mov %dl, (%rdi) + ret + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_144_bytes_bwd): + .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_40bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_98bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) + + .p2align 3 +L(table_144_bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_69bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) + .int JMPTBL 
(L(fwd_write_127bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) + + .p2align 3 +L(shl_table_fwd): + .int JMPTBL (L(shl_0), L(shl_table_fwd)) + .int JMPTBL (L(shl_1), L(shl_table_fwd)) + .int JMPTBL (L(shl_2), L(shl_table_fwd)) + .int JMPTBL (L(shl_3), L(shl_table_fwd)) + .int JMPTBL (L(shl_4), L(shl_table_fwd)) + .int JMPTBL (L(shl_5), L(shl_table_fwd)) + .int JMPTBL (L(shl_6), L(shl_table_fwd)) + .int JMPTBL (L(shl_7), L(shl_table_fwd)) + .int JMPTBL (L(shl_8), L(shl_table_fwd)) + .int JMPTBL (L(shl_9), L(shl_table_fwd)) + .int JMPTBL (L(shl_10), L(shl_table_fwd)) + .int JMPTBL (L(shl_11), L(shl_table_fwd)) + .int JMPTBL (L(shl_12), L(shl_table_fwd)) + .int JMPTBL (L(shl_13), L(shl_table_fwd)) + .int JMPTBL (L(shl_14), L(shl_table_fwd)) + .int JMPTBL (L(shl_15), L(shl_table_fwd)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..f3ea52a46c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -0,0 +1,3150 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+#endif
+
+#define JMPTBL(I, B)	I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register that contains the index
+   into the jump table.  SCALE is the scale of INDEX.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+	lea	TABLE(%rip), %r11; \
+	movslq	(%r11, INDEX, SCALE), INDEX; \
+	lea	(%r11, INDEX), INDEX; \
+	jmp	*INDEX; \
+	ud2
+
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	mov	%rdi, %rax
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%rsi, %rdi
+	jb	L(copy_forward)
+	je	L(write_0bytes)
+	cmp	$79, %rdx
+	jbe	L(copy_forward)
+	jmp	L(copy_backward)
+L(copy_forward):
+#endif
+L(start):
+	cmp	$79, %rdx
+	lea	L(table_less_80bytes)(%rip), %r11
+	ja	L(80bytesormore)
+	movslq	(%r11, %rdx, 4), %r9
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+	.p2align 4
+L(80bytesormore):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dil, %sil
+	jle	L(copy_backward)
+#endif
+
+	movdqu	(%rsi), %xmm0
+	mov	%rdi, %rcx
+	and	$-16, %rdi
+	add	$16, %rdi
+	mov	%rcx, %r8
+	sub	%rdi, %rcx
+	add	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_fwd)
+	and	$0xf, %r9
+	jz	L(shl_0)
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_data_cache_size_half(%rip), %RCX_LP
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
+
+	.p2align 4
+L(copy_backward):
+	movdqu	-16(%rsi, %rdx), %xmm0
+	add	%rdx, %rsi
+	lea	-16(%rdi, %rdx), %r8
+	add	%rdx, %rdi
+
+	mov	%rdi, %rcx
+	and	$0xf, %rcx
+	xor	%rcx, %rdi
+	sub	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_bwd)
+	and	$0xf, %r9
+	jz	L(shl_0_bwd)
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_data_cache_size_half(%rip), %RCX_LP
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
+
+	.p2align 4
+L(shl_0):
+	sub	$16, %rdx
+	movdqa	(%rsi), %xmm1
+	add	$16, %rsi
+	movdqa	%xmm1, (%rdi)
+	add	$16, %rdi
+	cmp	$128, %rdx
+	movdqu	%xmm0, (%r8)
+	ja	L(shl_0_gobble)
+	cmp	$64, %rdx
+	jb	L(shl_0_less_64bytes)
+	movaps	(%rsi), %xmm4
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	%xmm4, (%rdi)
+	movaps	%xmm1, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm3, 48(%rdi)
+	sub	$64, %rdx
+	add	$64, %rsi
+	add	$64, %rdi
+L(shl_0_less_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY 
(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_loop) +L(shl_0_gobble_cache_loop): + movdqa (%rsi), %xmm4 + movaps 0x10(%rsi), %xmm1 + movaps 0x20(%rsi), %xmm2 + movaps 0x30(%rsi), %xmm3 + + movdqa %xmm4, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + + sub $128, %rdx + movaps 0x40(%rsi), %xmm4 + movaps 0x50(%rsi), %xmm5 + movaps 0x60(%rsi), %xmm6 + movaps 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_cache_less_64bytes) + + movdqa (%rsi), %xmm4 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm4, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm4 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm4, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_cache_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x280(%rsi) + + movdqa (%rsi), %xmm0 + movdqa 0x10(%rsi), %xmm1 + movdqa 0x20(%rsi), %xmm2 + movdqa 0x30(%rsi), %xmm3 + movdqa 0x40(%rsi), %xmm4 + movdqa 0x50(%rsi), %xmm5 + movdqa 0x60(%rsi), %xmm6 + movdqa 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_less_64bytes) + + movdqa (%rsi), %xmm0 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm0 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm0, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_mem_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_less_32bytes) + movdqa (%rsi), %xmm0 + sub $0x20, %rdx + movdqa 0x10(%rsi), %xmm1 + add $0x20, %rsi + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + add $0x20, %rdi +L(shl_0_mem_less_32bytes): + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $16, %rdx + movdqa -0x10(%rsi), %xmm1 + sub $16, %rsi + movdqa %xmm1, -0x10(%rdi) + sub $16, %rdi + cmp $0x80, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble_bwd) + cmp $64, %rdx + jb L(shl_0_less_64bytes_bwd) + movaps -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + sub $64, %rdx + sub $0x40, %rsi + sub $0x40, %rdi +L(shl_0_less_64bytes_bwd): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_bwd): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_bwd_loop) +L(shl_0_gobble_bwd_loop): + movdqa -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + + 
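/* shl_0 covers copies where source and destination share 16-byte
   alignment after the prologue, so no palignr splicing is needed:
   the gobble loops (this one runs backward) move 128 bytes per
   iteration with aligned movdqa/movaps.  The _mem_ variants, taken
   when the copy is at least half the data cache, add prefetcht0 to
   hide memory latency.  */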
movdqa %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + + sub $0x80, %rdx + movaps -0x50(%rsi), %xmm4 + movaps -0x60(%rsi), %xmm5 + movaps -0x70(%rsi), %xmm6 + movaps -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_gobble_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_gobble_bwd_less_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_bwd_loop): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x280(%rsi) + movdqa -0x10(%rsi), %xmm0 + movdqa -0x20(%rsi), %xmm1 + movdqa -0x30(%rsi), %xmm2 + movdqa -0x40(%rsi), %xmm3 + movdqa -0x50(%rsi), %xmm4 + movdqa -0x60(%rsi), %xmm5 + movdqa -0x70(%rsi), %xmm6 + movdqa -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + movdqa %xmm2, -0x30(%rdi) + movdqa %xmm3, -0x40(%rdi) + movdqa %xmm4, -0x50(%rdi) + movdqa %xmm5, -0x60(%rdi) + movdqa %xmm6, -0x70(%rdi) + movdqa %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_mem_bwd_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_bwd_less_32bytes) + movdqa -0x10(%rsi), %xmm0 + sub $0x20, %rdx + movdqa -0x20(%rsi), %xmm1 + sub $0x20, %rsi + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + sub $0x20, %rdi +L(shl_0_mem_bwd_less_32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1): + lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_fwd) + lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 +L(L1_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_1_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_1_loop_L1): + sub $64, %rdx + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $1, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $1, %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $1, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_1_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_1_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_bwd) + lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 +L(L1_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_1_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_1_bwd_loop_L1): + movaps 
-0x11(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x21(%rsi), %xmm3 + movaps -0x31(%rsi), %xmm4 + movaps -0x41(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $1, %xmm2, %xmm1 + palignr $1, %xmm3, %xmm2 + palignr $1, %xmm4, %xmm3 + palignr $1, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_1_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_1_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2): + lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_fwd) + lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 +L(L2_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_2_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_2_loop_L1): + sub $64, %rdx + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $2, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $2, %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $2, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_2_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_2_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_bwd) + lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 +L(L2_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_2_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_2_bwd_loop_L1): + movaps -0x12(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x22(%rsi), %xmm3 + movaps -0x32(%rsi), %xmm4 + movaps -0x42(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $2, %xmm2, %xmm1 + palignr $2, %xmm3, %xmm2 + palignr $2, %xmm4, %xmm3 + palignr $2, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_2_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_2_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3): + lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_fwd) + lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 +L(L3_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_3_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_3_loop_L1): + sub $64, %rdx + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $3, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $3, %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $3, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_3_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_3_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 + cmp %rcx, %rdx + 
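
Each L(shl_N) loop handles a source that sits N bytes past a 16-byte boundary: it issues only aligned 16-byte loads (at %rsi - N plus multiples of 16) and splices adjacent blocks with palignr $N to rebuild the unaligned byte stream, which is then written with aligned stores; the loop carries the previous last block in %xmm1, so it needs just four new loads per 64 bytes. The _L2 loop variants, chosen when the copy is at least half the data cache, add prefetchnta. A C intrinsics sketch of one such spliced 64-byte step for N = 3, as a hypothetical standalone helper (compile with -mssse3; it assumes src - 3 through src + 76 are readable, as they are in the interior of the copy):

#include <emmintrin.h>   /* SSE2: aligned loads/stores */
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */

/* Copy 64 bytes from src to 16-byte-aligned dst, where src is
   3 bytes past a 16-byte boundary.  */
static void
copy64_misalign3 (unsigned char *dst, const unsigned char *src)
{
  const __m128i *s = (const __m128i *) (src - 3);  /* aligned base */
  __m128i b0 = _mm_load_si128 (s);
  __m128i b1 = _mm_load_si128 (s + 1);
  __m128i b2 = _mm_load_si128 (s + 2);
  __m128i b3 = _mm_load_si128 (s + 3);
  __m128i b4 = _mm_load_si128 (s + 4);
  /* _mm_alignr_epi8 (hi, lo, 3) yields bytes 3..18 of lo##hi,
     i.e. palignr $3, lo, hi in AT&T operand order.  */
  _mm_store_si128 ((__m128i *) dst,     _mm_alignr_epi8 (b1, b0, 3));
  _mm_store_si128 ((__m128i *) dst + 1, _mm_alignr_epi8 (b2, b1, 3));
  _mm_store_si128 ((__m128i *) dst + 2, _mm_alignr_epi8 (b3, b2, 3));
  _mm_store_si128 ((__m128i *) dst + 3, _mm_alignr_epi8 (b4, b3, 3));
}

int
main (void)
{
  static unsigned char src[96] __attribute__ ((aligned (16)));
  static unsigned char dst[64] __attribute__ ((aligned (16)));
  for (int i = 0; i < 96; ++i)
    src[i] = (unsigned char) i;
  copy64_misalign3 (dst, src + 3);
  return (dst[0] == 3 && dst[63] == 66) ? 0 : 1;
}

The sketch spends five aligned loads to produce 64 output bytes; the assembly loop amortizes that to four by reusing the previous iteration's last block as the next iteration's first.
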
movaps -0x03(%rsi), %xmm1 + jb L(L3_bwd) + lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 +L(L3_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_3_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_3_bwd_loop_L1): + movaps -0x13(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x23(%rsi), %xmm3 + movaps -0x33(%rsi), %xmm4 + movaps -0x43(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $3, %xmm2, %xmm1 + palignr $3, %xmm3, %xmm2 + palignr $3, %xmm4, %xmm3 + palignr $3, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_3_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_3_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4): + lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_fwd) + lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 +L(L4_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_4_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_4_loop_L1): + sub $64, %rdx + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $4, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $4, %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $4, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_4_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_4_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_bwd) + lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 +L(L4_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_4_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_4_bwd_loop_L1): + movaps -0x14(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x24(%rsi), %xmm3 + movaps -0x34(%rsi), %xmm4 + movaps -0x44(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $4, %xmm2, %xmm1 + palignr $4, %xmm3, %xmm2 + palignr $4, %xmm4, %xmm3 + palignr $4, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_4_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_4_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5): + lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_fwd) + lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 +L(L5_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_5_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_5_loop_L1): + sub $64, %rdx + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $5, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $5, %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $5, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_5_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_5_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps 
%xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_bwd) + lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 +L(L5_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_5_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_5_bwd_loop_L1): + movaps -0x15(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x25(%rsi), %xmm3 + movaps -0x35(%rsi), %xmm4 + movaps -0x45(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $5, %xmm2, %xmm1 + palignr $5, %xmm3, %xmm2 + palignr $5, %xmm4, %xmm3 + palignr $5, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_5_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_5_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6): + lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_fwd) + lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 +L(L6_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_6_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_6_loop_L1): + sub $64, %rdx + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $6, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $6, %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $6, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_6_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_6_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_bwd) + lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 +L(L6_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_6_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_6_bwd_loop_L1): + movaps -0x16(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x26(%rsi), %xmm3 + movaps -0x36(%rsi), %xmm4 + movaps -0x46(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $6, %xmm2, %xmm1 + palignr $6, %xmm3, %xmm2 + palignr $6, %xmm4, %xmm3 + palignr $6, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_6_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_6_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7): + lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_fwd) + lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 +L(L7_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_7_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_7_loop_L1): + sub $64, %rdx + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $7, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $7, %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $7, %xmm1, %xmm2 + movdqa 
%xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_7_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_7_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_bwd) + lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 +L(L7_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_7_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_7_bwd_loop_L1): + movaps -0x17(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x27(%rsi), %xmm3 + movaps -0x37(%rsi), %xmm4 + movaps -0x47(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $7, %xmm2, %xmm1 + palignr $7, %xmm3, %xmm2 + palignr $7, %xmm4, %xmm3 + palignr $7, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_7_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_7_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8): + lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_fwd) + lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 +L(L8_fwd): + lea -64(%rdx), %rdx + jmp *%r9 +L(shl_8_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_8_loop_L1): + sub $64, %rdx + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $8, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $8, %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $8, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_8_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 + .p2align 4 +L(shl_8_end): + lea 64(%rdx), %rdx + movaps %xmm4, -0x20(%rdi) + add %rdx, %rsi + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_bwd) + lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 +L(L8_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_8_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_8_bwd_loop_L1): + movaps -0x18(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x28(%rsi), %xmm3 + movaps -0x38(%rsi), %xmm4 + movaps -0x48(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $8, %xmm2, %xmm1 + palignr $8, %xmm3, %xmm2 + palignr $8, %xmm4, %xmm3 + palignr $8, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_8_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_8_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9): + lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_fwd) + lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 +L(L9_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_9_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_9_loop_L1): + sub $64, %rdx + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), 
%xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $9, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $9, %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $9, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_9_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_9_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_bwd) + lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 +L(L9_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_9_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_9_bwd_loop_L1): + movaps -0x19(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x29(%rsi), %xmm3 + movaps -0x39(%rsi), %xmm4 + movaps -0x49(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $9, %xmm2, %xmm1 + palignr $9, %xmm3, %xmm2 + palignr $9, %xmm4, %xmm3 + palignr $9, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_9_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_9_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10): + lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_fwd) + lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 +L(L10_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_10_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_10_loop_L1): + sub $64, %rdx + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $10, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $10, %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $10, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_10_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_10_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_bwd) + lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 +L(L10_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_10_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_10_bwd_loop_L1): + movaps -0x1a(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2a(%rsi), %xmm3 + movaps -0x3a(%rsi), %xmm4 + movaps -0x4a(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $10, %xmm2, %xmm1 + palignr $10, %xmm3, %xmm2 + palignr $10, %xmm4, %xmm3 + palignr $10, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_10_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_10_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11): + lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 + cmp %rcx, %rdx + 
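+/* Every L(shl_N) entry dispatches through a computed jump: %r9 is + pointed at the copy loop, and when the length in %rdx is at least + %rcx the offset of the L2 variant is added, so large copies take + the loop that issues prefetchnta. The ud2 after each jmp *%r9 + keeps the decoder from running past the indirect branch. */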
movaps -0x0b(%rsi), %xmm1 + jb L(L11_fwd) + lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 +L(L11_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_11_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_11_loop_L1): + sub $64, %rdx + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $11, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $11, %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $11, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_11_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_11_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_bwd) + lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 +L(L11_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_11_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_11_bwd_loop_L1): + movaps -0x1b(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2b(%rsi), %xmm3 + movaps -0x3b(%rsi), %xmm4 + movaps -0x4b(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $11, %xmm2, %xmm1 + palignr $11, %xmm3, %xmm2 + palignr $11, %xmm4, %xmm3 + palignr $11, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_11_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_11_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12): + lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_fwd) + lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 +L(L12_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_12_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_12_loop_L1): + sub $64, %rdx + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $12, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $12, %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $12, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_12_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_12_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_bwd) + lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 +L(L12_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_12_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_12_bwd_loop_L1): + movaps -0x1c(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2c(%rsi), %xmm3 + movaps -0x3c(%rsi), %xmm4 + movaps -0x4c(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $12, %xmm2, %xmm1 + palignr $12, %xmm3, %xmm2 + palignr $12, %xmm4, %xmm3 + palignr $12, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb 
L(shl_12_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_12_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13): + lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_fwd) + lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 +L(L13_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_13_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_13_loop_L1): + sub $64, %rdx + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $13, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $13, %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $13, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_13_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_13_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_bwd) + lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 +L(L13_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_13_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_13_bwd_loop_L1): + movaps -0x1d(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2d(%rsi), %xmm3 + movaps -0x3d(%rsi), %xmm4 + movaps -0x4d(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $13, %xmm2, %xmm1 + palignr $13, %xmm3, %xmm2 + palignr $13, %xmm4, %xmm3 + palignr $13, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_13_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_13_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14): + lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_fwd) + lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 +L(L14_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_14_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_14_loop_L1): + sub $64, %rdx + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $14, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $14, %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $14, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_14_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_14_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_bwd) + lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 +L(L14_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_14_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_14_bwd_loop_L1): + movaps -0x1e(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2e(%rsi), %xmm3 + movaps -0x3e(%rsi), %xmm4 + movaps 
-0x4e(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $14, %xmm2, %xmm1 + palignr $14, %xmm3, %xmm2 + palignr $14, %xmm4, %xmm3 + palignr $14, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_14_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_14_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15): + lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_fwd) + lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 +L(L15_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_15_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_15_loop_L1): + sub $64, %rdx + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $15, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $15, %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $15, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_15_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_15_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_bwd) + lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 +L(L15_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_15_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_15_bwd_loop_L1): + movaps -0x1f(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2f(%rsi), %xmm3 + movaps -0x3f(%rsi), %xmm4 + movaps -0x4f(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $15, %xmm2, %xmm1 + palignr $15, %xmm3, %xmm2 + palignr $15, %xmm4, %xmm3 + palignr $15, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_15_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_15_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(write_72bytes): + movdqu -72(%rsi), %xmm0 + movdqu -56(%rsi), %xmm1 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -72(%rdi) + movdqu %xmm1, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_64bytes): + movdqu -64(%rsi), %xmm0 + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + movdqu %xmm0, -64(%rdi) + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_56bytes): + movdqu -56(%rsi), %xmm0 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_48bytes): + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 
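+/* The L(write_N) entries are reached through BRANCH_TO_JMPTBL_ENTRY + with %rsi and %rdi pointing one past the end of the copy, so each + entry moves the final N bytes at negative offsets. All loads are + issued before any store, which keeps the entries safe when source + and destination overlap. */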
+ mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_40bytes): + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_32bytes): + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_24bytes): + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_16bytes): + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) +L(write_0bytes): + ret + + .p2align 4 +L(write_73bytes): + movdqu -73(%rsi), %xmm0 + movdqu -57(%rsi), %xmm1 + mov -41(%rsi), %rcx + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %r8 + mov -4(%rsi), %edx + movdqu %xmm0, -73(%rdi) + movdqu %xmm1, -57(%rdi) + mov %rcx, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %r8, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_65bytes): + movdqu -65(%rsi), %xmm0 + movdqu -49(%rsi), %xmm1 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -65(%rdi) + movdqu %xmm1, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_57bytes): + movdqu -57(%rsi), %xmm0 + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -57(%rdi) + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_49bytes): + movdqu -49(%rsi), %xmm0 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_41bytes): + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_33bytes): + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_25bytes): + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_17bytes): + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_9bytes): + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_1bytes): + mov -1(%rsi), %dl + 
mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_74bytes): + movdqu -74(%rsi), %xmm0 + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -74(%rdi) + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_66bytes): + movdqu -66(%rsi), %xmm0 + movdqu -50(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -66(%rdi) + movdqu %xmm1, -50(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_58bytes): + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_50bytes): + movdqu -50(%rsi), %xmm0 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -50(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_42bytes): + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_34bytes): + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_26bytes): + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_18bytes): + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_10bytes): + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_2bytes): + mov -2(%rsi), %dx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(write_75bytes): + movdqu -75(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -75(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_67bytes): + movdqu -67(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -67(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_59bytes): + movdqu -59(%rsi), %xmm0 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), 
%r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_51bytes): + movdqu -51(%rsi), %xmm0 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -51(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_43bytes): + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_35bytes): + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_27bytes): + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_19bytes): + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_11bytes): + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(write_76bytes): + movdqu -76(%rsi), %xmm0 + movdqu -60(%rsi), %xmm1 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -76(%rdi) + movdqu %xmm1, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_68bytes): + movdqu -68(%rsi), %xmm0 + movdqu -52(%rsi), %xmm1 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -68(%rdi) + movdqu %xmm1, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_60bytes): + movdqu -60(%rsi), %xmm0 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_52bytes): + movdqu -52(%rsi), %xmm0 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_44bytes): + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_36bytes): + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov 
-12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_28bytes): + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_20bytes): + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_12bytes): + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_77bytes): + movdqu -77(%rsi), %xmm0 + movdqu -61(%rsi), %xmm1 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -77(%rdi) + movdqu %xmm1, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_69bytes): + movdqu -69(%rsi), %xmm0 + movdqu -53(%rsi), %xmm1 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -69(%rdi) + movdqu %xmm1, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_61bytes): + movdqu -61(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_53bytes): + movdqu -53(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_45bytes): + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_37bytes): + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_29bytes): + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_21bytes): + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_13bytes): + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_78bytes): + movdqu -78(%rsi), %xmm0 + movdqu -62(%rsi), %xmm1 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov 
-14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -78(%rdi) + movdqu %xmm1, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_70bytes): + movdqu -70(%rsi), %xmm0 + movdqu -54(%rsi), %xmm1 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -70(%rdi) + movdqu %xmm1, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_62bytes): + movdqu -62(%rsi), %xmm0 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_54bytes): + movdqu -54(%rsi), %xmm0 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_46bytes): + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_38bytes): + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_30bytes): + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_22bytes): + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_14bytes): + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_79bytes): + movdqu -79(%rsi), %xmm0 + movdqu -63(%rsi), %xmm1 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -79(%rdi) + movdqu %xmm1, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_71bytes): + movdqu -71(%rsi), %xmm0 + movdqu -55(%rsi), %xmm1 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -71(%rdi) + movdqu %xmm1, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_63bytes): + movdqu -63(%rsi), %xmm0 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 
4 +L(write_55bytes): + movdqu -55(%rsi), %xmm0 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_47bytes): + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_39bytes): + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_31bytes): + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_23bytes): + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_15bytes): + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(large_page_fwd): + movdqu (%rsi), %xmm1 + lea 16(%rsi), %rsi + movdqu %xmm0, (%r8) + movntdq %xmm1, (%rdi) + lea 16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + shl $2, %rcx + cmp %rcx, %rdx + jb L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif +L(large_page_loop): + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(large_page_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_fwd_start): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x200(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(ll_cache_copy_fwd_start) + cmp $-0x40, %rdx + lea 
0x80(%rdx), %rdx + jl L(large_page_ll_less_fwd_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_fwd_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#endif + .p2align 4 +L(large_page_bwd): + movdqu -0x10(%rsi), %xmm1 + lea -16(%rsi), %rsi + movdqu %xmm0, (%r8) + movdqa %xmm1, -0x10(%rdi) + lea -16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jb L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif +L(large_page_bwd_loop): + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + movntdq %xmm4, -0x50(%rdi) + movntdq %xmm5, -0x60(%rdi) + movntdq %xmm6, -0x70(%rdi) + movntdq %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(large_page_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_bwd_64bytes): + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_bwd_start): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x200(%rsi) + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(ll_cache_copy_bwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_bwd_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) +#endif + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_less_80bytes): + .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_7bytes), 
L(table_less_80bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) + 
.int JMPTBL (L(write_71bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) + + .p2align 3 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S new file mode 100644 index 0000000000..af2770397c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S @@ -0,0 +1,75 @@ +/* Multiple versions of memcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memcpy before the initialization + happened. 
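+ The choice is made once, at load time, by the IFUNC resolver + below, which tests the feature bits the dynamic linker has already + collected. A minimal C sketch of the mechanism (cpu_has_avx is a + hypothetical feature test, not the logic used here): + + extern __typeof (memcpy) __memcpy_sse2_unaligned, __memcpy_avx_unaligned; + static __typeof (memcpy) *memcpy_resolver (void) + { + return cpu_has_avx () ? __memcpy_avx_unaligned : __memcpy_sse2_unaligned; + } + void *memcpy (void *, const void *, size_t) + __attribute__ ((ifunc ("memcpy_resolver")));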
*/ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__new_memcpy) + .type __new_memcpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memcpy_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memcpy_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memcpy_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memcpy_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memcpy_ssse3(%rip), %RAX_LP +2: ret +END(__new_memcpy) + +# undef memcpy +# include <shlib-compat.h> +versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S new file mode 100644 index 0000000000..8737fb9755 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -0,0 +1,72 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. 
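+ The resolver below mirrors the memcpy one, minus the Prefer_ERMS + shortcut; static binaries simply use the generic ../memcpy_chk.S.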
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memcpy_chk) + .type __memcpy_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memcpy_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memcpy_chk_ssse3(%rip), %RAX_LP +2: ret +END(__memcpy_chk) +# else +# include "../memcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S new file mode 100644 index 0000000000..e195e93f15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..f3ef10577c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S @@ -0,0 +1,420 @@ +/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#if IS_IN (libc) + +# include "asm-syntax.h" + + .section .text.avx512,"ax",@progbits +# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (__mempcpy_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_avx512_no_vzeroupper) + +ENTRY (__mempcpy_avx512_no_vzeroupper) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (__mempcpy_avx512_no_vzeroupper) +# endif + +# ifdef SHARED +ENTRY (__memmove_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_avx512_no_vzeroupper) +# endif + +ENTRY (__memmove_avx512_no_vzeroupper) + mov %rdi, %rax +# ifdef USE_AS_MEMPCPY + add %rdx, %rax +# endif +L(start): + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) + +L(check): + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret + +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret + +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): +# ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r8 +# else + mov __x86_shared_cache_size_half(%rip), %r8 +# endif + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), 
%zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret + +L(1024bytesormore): + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + +/* Loop with unaligned memory access. */ +L(gobble_512bytes_loop): + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret + +L(1024bytesormore_bkw): + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + +/* Backward loop with unaligned memory access. 
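(Editorial note: zmm8 through zmm15 were filled with the first 512 source bytes before entering this loop and are flushed to the first 512 destination bytes only after it; deferring those stores both disposes of the sub-512-byte remainder without a separate path and keeps a closely overlapping backward copy from reading source bytes the loop has already overwritten.)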
*/ +L(gobble_512bytes_loop_bkw): + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret + +L(preloop_large): + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 + +/* Align destination for access with non-temporal stores in the loop. */ + mov %rdi, %r8 + and $-0x80, %rdi + add $0x80, %rdi + sub %rdi, %r8 + sub %r8, %rsi + add %r8, %rdx +L(gobble_256bytes_nt_loop): + prefetcht1 0x200(%rsi) + prefetcht1 0x240(%rsi) + prefetcht1 0x280(%rsi) + prefetcht1 0x2C0(%rsi) + prefetcht1 0x300(%rsi) + prefetcht1 0x340(%rsi) + prefetcht1 0x380(%rsi) + prefetcht1 0x3C0(%rsi) + vmovdqu64 (%rsi), %zmm0 + vmovdqu64 0x40(%rsi), %zmm1 + vmovdqu64 0x80(%rsi), %zmm2 + vmovdqu64 0xC0(%rsi), %zmm3 + vmovntdq %zmm0, (%rdi) + vmovntdq %zmm1, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm3, 0xC0(%rdi) + sub $256, %rdx + add $256, %rsi + add $256, %rdi + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop) + sfence + vmovups %zmm4, (%rax) + vmovups %zmm5, 0x40(%rax) + jmp L(check) + +L(preloop_large_bkw): + vmovups -0x80(%rcx), %zmm4 + vmovups -0x40(%rcx), %zmm5 + +/* Align end of destination for access with non-temporal stores. 
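(Editorial note: the sequence below rounds r9, the exclusive end of the destination, down to a 128-byte boundary and adjusts the source cursor and the length to match; roughly, in C, assuming the same variable roles:

  size_t tail = (uintptr_t) dst_end & 0x7f;  // bytes past the last 128-byte boundary
  dst_end -= tail;  src_end -= tail;  len -= tail;

No bytes are lost: zmm4 and zmm5 captured the last 128 source bytes just above, and r8 is rebuilt to the original end so they can be stored there after the loop.)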
*/ + mov %r9, %r8 + and $-0x80, %r9 + sub %r9, %r8 + sub %r8, %rcx + sub %r8, %rdx + add %r9, %r8 +L(gobble_256bytes_nt_loop_bkw): + prefetcht1 -0x400(%rcx) + prefetcht1 -0x3C0(%rcx) + prefetcht1 -0x380(%rcx) + prefetcht1 -0x340(%rcx) + prefetcht1 -0x300(%rcx) + prefetcht1 -0x2C0(%rcx) + prefetcht1 -0x280(%rcx) + prefetcht1 -0x240(%rcx) + vmovdqu64 -0x100(%rcx), %zmm0 + vmovdqu64 -0xC0(%rcx), %zmm1 + vmovdqu64 -0x80(%rcx), %zmm2 + vmovdqu64 -0x40(%rcx), %zmm3 + vmovntdq %zmm0, -0x100(%r9) + vmovntdq %zmm1, -0xC0(%r9) + vmovntdq %zmm2, -0x80(%r9) + vmovntdq %zmm3, -0x40(%r9) + sub $256, %rdx + sub $256, %rcx + sub $256, %r9 + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop_bkw) + sfence + vmovups %zmm4, -0x80(%r8) + vmovups %zmm5, -0x40(%r8) + jmp L(check) +END (__memmove_avx512_no_vzeroupper) + +# ifdef SHARED +strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper) +strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S new file mode 100644 index 0000000000..aac1515cf6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define SECTION(p) p##.avx512 +# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S new file mode 100644 index 0000000000..f9a4e9aff9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_back +#define MEMCPY_CHK __memmove_chk_ssse3_back +#include "memcpy-ssse3-back.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S new file mode 100644 index 0000000000..295430b1ef --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S new file mode 100644 index 0000000000..dee3ec529c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -0,0 +1,553 @@ +/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
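(Editorial note: this file is a template rather than a standalone implementation. The thin wrappers shown earlier, memmove-avx-unaligned-erms.S and memmove-avx512-unaligned-erms.S, pre-define VEC_SIZE, VEC(i), VMOVU, VMOVA, VMOVNT, SECTION and MEMMOVE_SYMBOL and then include this source, so one body assembles per ISA; for instance the AVX wrapper's definitions expand

  VEC(0)                                 to  ymm0
  VMOVU                                  to  vmovdqu
  MEMMOVE_SYMBOL (__memmove, unaligned)  to  __memmove_avx_unaligned

while the SSE2 instance comes from memmove.S further below, which defines MEMMOVE_SYMBOL(p,s) as p##_sse2_##s before pulling in the base ../memmove.S.)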
*/ + +/* memmove/memcpy/mempcpy is implemented as: + 1. Use overlapping load and store to avoid branch. + 2. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 3. If size is 8 * VEC_SIZE or less, load all sources into registers + and store them together. + 4. If address of destination > address of source, backward copy + 4 * VEC_SIZE at a time with unaligned load and aligned store. + Load the first 4 * VEC and last VEC before the loop and store + them after the loop to support overlapping addresses. + 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned + load and aligned store. Load the last 4 * VEC and first VEC + before the loop and store them after the loop to support + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store + instead of aligned store. */ + +#include <sysdep.h> + +#ifndef MEMCPY_SYMBOL +# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMPCPY_SYMBOL +# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMMOVE_CHK_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set + up REP MOVSB operation, REP MOVSB isn't faster on short data. The + memcpy micro benchmark in glibc shows that 2KB is the approximate + value above which REP MOVSB becomes faster than SSE2 optimization + on processors with Enhanced REP MOVSB. Since larger register size + can move more data with a single load and store, the threshold is + higher with larger register size. */ +#ifndef REP_MOVSB_THRESHOLD +# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) +#endif + +#ifndef PREFETCH +# define PREFETCH(addr) prefetcht0 addr +#endif + +/* Assume 64-byte prefetch size. */ +#ifndef PREFETCH_SIZE +# define PREFETCH_SIZE 64 +#endif + +#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) + +#if PREFETCH_SIZE == 64 +# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base) +# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base) +# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) +# else +# error Unsupported PREFETCHED_LOAD_SIZE! +# endif +#else +# error Unsupported PREFETCH_SIZE! +#endif + +#ifndef SECTION +# error SECTION is not defined!
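(Editorial note: with the scaling above, the REP MOVSB cut-over works out to 2048 * (16/16) = 2 KiB for the SSE2 instance, 2048 * (32/16) = 4 KiB for AVX, and 2048 * (64/16) = 8 KiB for AVX-512, matching the stated rationale that wider registers move more per iteration and so deserve a higher threshold.)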
+#endif + + .section SECTION(.text),"ax",@progbits +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +#endif + +#if VEC_SIZE == 16 || defined SHARED +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +#endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax +L(start): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(last_2x_vec): +#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(nop): +#endif + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +# if VEC_SIZE == 16 +# if defined SHARED +/* Only used to measure performance of REP MOVSB. */ +ENTRY (__mempcpy_erms) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_movsb) +END (__mempcpy_erms) +# endif + +ENTRY (__memmove_erms) + movq %rdi, %rax +L(start_movsb): + movq %rdx, %rcx + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je 2f + leaq (%rsi,%rcx), %rdx + cmpq %rdx, %rdi + jb L(movsb_backward) +1: + rep movsb +2: + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +# if defined SHARED +strong_alias (__memmove_erms, __memcpy_erms) +# endif +# endif + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_erms) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax +L(start_erms): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(movsb_more_2x_vec) +L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(movsb): + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ +# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8) +# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE! +# endif + jb L(more_8x_vec_backward) +1: + movq %rdx, %rcx + rep movsb +L(nop): + ret +#endif + +L(less_vec): + /* Less than 1 VEC. */ +#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! 
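(Editorial sketch of the size ladder that follows, shown for the 8 to 15 byte case: both ends of the source are loaded before either store, which is exactly what makes one code path valid for overlapping memmove. Illustration in C, not the glibc source; the assembly does the same through integer registers:

  uint64_t head, tail;
  memcpy (&head, src, 8);           // may alias dst
  memcpy (&tail, src + n - 8, 8);   // overlaps head when n < 16
  memcpy (dst, &head, 8);
  memcpy (dst + n - 8, &tail, 8);

Every rung of the ladder, 32-63, 16-31, 8-15, 4-7 and 2-3, uses the same two-overlapping-accesses shape at its own width.)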
+#endif +#if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +#endif +#if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +#endif + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) +1: + ret +#if VEC_SIZE > 32 +L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret +#endif +#if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret +#endif +L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret + +#if defined USE_MULTIARCH && IS_IN (libc) +L(movsb_more_2x_vec): + cmpq $REP_MOVSB_THRESHOLD, %rdx + ja L(movsb) +#endif +L(more_2x_vec): + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret +L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret + +L(more_8x_vec): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. 
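(Editorial note: at this point r8 holds (dst & (VEC_SIZE - 1)) - VEC_SIZE, a value in [-VEC_SIZE, -1], so the two subq above advance src and dst to the next VEC_SIZE boundary and the addq below shrinks the length to match; the unaligned head bytes skipped this way were already captured in VEC(4) and are stored at the saved start, r11, after the loop.)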
*/ + addq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_forward) +#endif +L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_backward) +#endif +L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret + +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +L(large_forward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) +L(loop_large_forward): + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. 
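(Editorial note: the sfence above is essential, not decorative: vmovntdq stores are weakly ordered, and the fence makes them globally visible before the ordinary trailing stores below and before the function returns.)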
*/ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(large_backward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) +L(loop_large_backward): + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret +#endif +END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +#ifdef SHARED +# if IS_IN (libc) +# ifdef USE_MULTIARCH +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) +strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) +# endif +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) +# endif +#endif +#if VEC_SIZE == 16 || defined SHARED +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), + MEMCPY_SYMBOL (__memcpy, unaligned)) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S new file mode 100644 index 0000000000..8c534e83e0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S @@ -0,0 +1,101 @@ +/* Multiple versions of memmove + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. 
*/ +#if IS_IN (libc) + .text +ENTRY(__libc_memmove) + .type __libc_memmove, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memmove_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memmove_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_ssse3(%rip), %RAX_LP +2: ret +END(__libc_memmove) +#endif + +#if IS_IN (libc) +# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +libc_hidden_ver (__memmove_sse2_unaligned, memmove) +libc_hidden_ver (__memcpy_sse2_unaligned, memcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy) + +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memmove calls through a PLT. + The speedup we get from using SSE2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def +# endif +strong_alias (__libc_memmove, memmove) +#endif + +#if !defined SHARED || !IS_IN (libc) +weak_alias (__mempcpy, mempcpy) +#endif + +#include "../memmove.S" + +#if defined SHARED && IS_IN (libc) +# include <shlib-compat.h> +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +/* Use __memmove_sse2_unaligned to support overlapping addresses. */ +compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5); +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S new file mode 100644 index 0000000000..7870dd0247 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S @@ -0,0 +1,71 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memmove functions for static binaries. 
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memmove_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_chk_ssse3(%rip), %RAX_LP +2: ret +END(__memmove_chk) +# else +# include "../memmove_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S new file mode 100644 index 0000000000..b8b2b28094 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S @@ -0,0 +1,73 @@ +/* Multiple versions of mempcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before the initialization + happened. 
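(Editorial note: this is why the guard below reads 'defined SHARED && IS_IN (libc)' where memmove.S used IS_IN (libc) alone; in a static binary, early startup may call mempcpy before CPU feature detection has run, so static libc keeps a direct, non-IFUNC definition.)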
*/ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __mempcpy_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __mempcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __mempcpy_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __mempcpy_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_ssse3(%rip), %RAX_LP +2: ret +END(__mempcpy) + +weak_alias (__mempcpy, mempcpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S new file mode 100644 index 0000000000..072b22c49f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -0,0 +1,72 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. 
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__mempcpy_chk) + .type __mempcpy_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_chk_ssse3(%rip), %RAX_LP +2: ret +END(__mempcpy_chk) +# else +# include "../mempcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S new file mode 100644 index 0000000000..7ab3d89849 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -0,0 +1,22 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %ymm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + +# define SECTION(p) p##.avx +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..1f66602398 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S @@ -0,0 +1,194 @@ +/* memset optimized with AVX512 for KNL hardware. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
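(Editorial note on the AVX2 memset wrapper just above: MEMSET_VDUP_TO_VEC0_AND_SET_RETURN splats the fill byte across ymm0 and establishes memset's return value in one macro. An intrinsics sketch of the equivalent effect, names hypothetical:

  __m256i v = _mm256_set1_epi8 ((char) c);  // the vpbroadcastb step
  void *ret = dst;                          // movq rdi into rax

and WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN differs only in using vpbroadcastd, i.e. _mm256_set1_epi32, to splat a whole wchar_t.)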
*/ + +#include <sysdep.h> + +#if IS_IN (libc) + +#include "asm-syntax.h" +#ifndef MEMSET +# define MEMSET __memset_avx512_no_vzeroupper +# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper +#endif + + .section .text.avx512,"ax",@progbits +#if defined PIC +ENTRY (MEMSET_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMSET_CHK) +#endif + +ENTRY (MEMSET) + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi + mov %rdi, %rax + vpshufb %xmm0, %xmm1, %xmm0 + cmp $16, %rdx + jb L(less_16bytes) + cmp $512, %rdx + vbroadcastss %xmm0, %zmm2 + ja L(512bytesormore) + cmp $256, %rdx + jb L(less_256bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm2, -0x20(%rsi) + ret + +L(less_32bytes): + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm0, -0x10(%rsi) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + vmovq %xmm0, (%rdi) + vmovq %xmm0, -0x08(%rsi) + ret + +L(less_8bytes): + vmovd %xmm0, %ecx + cmp $4, %dl + jb L(less_4bytes) + mov %ecx, (%rdi) + mov %ecx, -0x04(%rsi) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov %cx, (%rdi) + mov %cx, -0x02(%rsi) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): + mov __x86_shared_cache_size_half(%rip), %rcx + cmp %rcx, %rdx + ja L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, 0x100(%rdi) + vmovups %zmm2, 0x140(%rdi) + vmovups %zmm2, 0x180(%rdi) + vmovups %zmm2, 0x1C0(%rdi) + vmovups %zmm2, -0x200(%rsi) + vmovups %zmm2, -0x1C0(%rsi) + vmovups %zmm2, -0x180(%rsi) + vmovups %zmm2, -0x140(%rsi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +/* Align on 64 and loop with aligned stores. */ +L(1024bytesormore): + sub $0x100, %rsi + vmovups %zmm2, (%rax) + and $-0x40, %rdi + add $0x40, %rdi + +L(gobble_256bytes_loop): + vmovaps %zmm2, (%rdi) + vmovaps %zmm2, 0x40(%rdi) + vmovaps %zmm2, 0x80(%rdi) + vmovaps %zmm2, 0xC0(%rdi) + add $0x100, %rdi + cmp %rsi, %rdi + jb L(gobble_256bytes_loop) + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + ret + +/* Align on 128 and loop with non-temporal stores. 
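(Editorial note: the and/add pair at the top of L(preloop_large) below advances rdi to the next 128-byte boundary, a skip of 0x80 - (dst & 0x7f), at most 128 bytes, and the two unaligned zmm stores through rax cover exactly those skipped head bytes, so the aligned non-temporal loop starts on a clean boundary with nothing missed.)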
*/ +L(preloop_large): + and $-0x80, %rdi + add $0x80, %rdi + vmovups %zmm2, (%rax) + vmovups %zmm2, 0x40(%rax) + sub $0x200, %rsi + +L(gobble_512bytes_nt_loop): + vmovntdq %zmm2, (%rdi) + vmovntdq %zmm2, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm2, 0xC0(%rdi) + vmovntdq %zmm2, 0x100(%rdi) + vmovntdq %zmm2, 0x140(%rdi) + vmovntdq %zmm2, 0x180(%rdi) + vmovntdq %zmm2, 0x1C0(%rdi) + add $0x200, %rdi + cmp %rsi, %rdi + jb L(gobble_512bytes_nt_loop) + sfence + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + vmovups %zmm2, 0x100(%rsi) + vmovups %zmm2, 0x140(%rsi) + vmovups %zmm2, 0x180(%rsi) + vmovups %zmm2, 0x1C0(%rsi) + ret +END (MEMSET) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S new file mode 100644 index 0000000000..0783979ca5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -0,0 +1,24 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define SECTION(p) p##.avx512 +# define MEMSET_SYMBOL(p,s) p##_avx512_##s +# define WMEMSET_SYMBOL(p,s) p##_avx512_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S new file mode 100644 index 0000000000..2eb9e3744e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -0,0 +1,263 @@ +/* memset/bzero with unaligned store and rep stosb + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memset is implemented as: + 1. Use overlapping store to avoid branch. + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. + 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. 
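(Editorial sketch of the branch-free overlapping store named in step 3, using 16-byte vectors for illustration; assumption: VEC_SIZE <= n <= 2 * VEC_SIZE:

  #include <emmintrin.h>
  static void set_between_16_and_32 (char *d, __m128i v, size_t n)
  {
    _mm_storeu_si128 ((__m128i *) d, v);              // first 16 bytes
    _mm_storeu_si128 ((__m128i *) (d + n - 16), v);   // last 16, may overlap
  }

For n == 16 the two stores land on the same bytes and for n == 32 they abut; every size in between is covered without ever branching on n.)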
*/ + +#include <sysdep.h> + +#ifndef MEMSET_CHK_SYMBOL +# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) +#endif + +#ifndef WMEMSET_CHK_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +#ifndef VZEROUPPER_SHORT_RETURN +# if VEC_SIZE > 16 +# define VZEROUPPER_SHORT_RETURN vzeroupper +# else +# define VZEROUPPER_SHORT_RETURN rep +# endif +#endif + +#ifndef MOVQ +# if VEC_SIZE > 16 +# define MOVQ vmovq +# else +# define MOVQ movq +# endif +#endif + +/* Threshold to use Enhanced REP STOSB. Since there is overhead to set + up REP STOSB operation, REP STOSB isn't faster on short data. The + memset micro benchmark in glibc shows that 2KB is the approximate + value above which REP STOSB becomes faster on processors with + Enhanced REP STOSB. Since the stored value is fixed, larger register + size has minimal impact on threshold. */ +#ifndef REP_STOSB_THRESHOLD +# define REP_STOSB_THRESHOLD 2048 +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if VEC_SIZE == 16 && IS_IN (libc) +ENTRY (__bzero) + movq %rdi, %rax /* Set return value. */ + movq %rsi, %rdx /* Set n. */ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) +END (__bzero) +weak_alias (__bzero, bzero) +#endif + +#if IS_IN (libc) +# if defined SHARED +ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +# endif + +ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shlq $2, %rdx + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) +END (WMEMSET_SYMBOL (__wmemset, unaligned)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) +#endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMSET_SYMBOL (__memset, unaligned)) + +# if VEC_SIZE == 16 +/* Only used to measure performance of REP STOSB. */ +ENTRY (__memset_erms) +# else +/* Provide a symbol to debugger. */ +ENTRY (MEMSET_SYMBOL (__memset, erms)) +# endif +L(stosb): + /* Issue vzeroupper before rep stosb. */ + VZEROUPPER + movq %rdx, %rcx + movzbl %sil, %eax + movq %rdi, %rdx + rep stosb + movq %rdx, %rax + ret +# if VEC_SIZE == 16 +END (__memset_erms) +# else +END (MEMSET_SYMBOL (__memset, erms)) +# endif + +# if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) +# endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
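(Editorial note: this is the idiom sketched in the header comment: the two VMOVU stores below cover bytes [0, VEC_SIZE) and [n - VEC_SIZE, n), which overlap for any n under 2 * VEC_SIZE and coincide exactly when n == VEC_SIZE.)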
*/ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret + +L(stosb_more_2x_vec): + cmpq $REP_STOSB_THRESHOLD, %rdx + ja L(stosb) +#endif +L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_start) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx + VMOVU %VEC(0), (%rdi) + andq $-(VEC_SIZE * 4), %rcx + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) + addq %rdi, %rdx + andq $-(VEC_SIZE * 4), %rdx + cmpq %rdx, %rcx + je L(return) +L(loop): + VMOVA %VEC(0), (%rcx) + VMOVA %VEC(0), VEC_SIZE(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) + addq $(VEC_SIZE * 4), %rcx + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN + ret +L(less_vec): + /* Less than 1 VEC. */ +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +# endif +# if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +# endif +# if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +# endif + MOVQ %xmm0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movb %cl, (%rdi) +1: + VZEROUPPER + ret +# if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ +L(between_32_63): + vmovdqu %ymm0, -32(%rdi,%rdx) + vmovdqu %ymm0, (%rdi) + VZEROUPPER + ret +# endif +# if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu %xmm0, -16(%rdi,%rdx) + vmovdqu %xmm0, (%rdi) + VZEROUPPER + ret +# endif + /* From 8 to 15. No branch when size == 8. */ +L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) + VZEROUPPER + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) + VZEROUPPER + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) + VZEROUPPER + ret +END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S new file mode 100644 index 0000000000..11f27378b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S @@ -0,0 +1,82 @@ +/* Multiple versions of memset + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
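(Editorial note on L(loop_start) above: the eight unconditional VMOVU stores blanket up to 4 * VEC_SIZE bytes at each end of the buffer, rcx becomes dst + 4 * VEC_SIZE rounded down to a 4 * VEC_SIZE boundary and rdx the similarly rounded end, so the aligned VMOVA loop touches only the interior and is skipped outright, je L(return), when the two ends already covered everything.)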
*/ + +#include <sysdep.h> +#include <shlib-compat.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) +ENTRY(memset) + .type memset, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memset_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + lea __memset_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_sse2_unaligned(%rip), %RAX_LP +1: + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + lea __memset_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memset_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_avx512_unaligned(%rip), %RAX_LP +2: ret +END(memset) +#endif + +#if IS_IN (libc) +# define MEMSET_SYMBOL(p,s) p##_sse2_##s +# define WMEMSET_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memset calls through a PLT. + The speedup we get from using SSE2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \ + .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \ + .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned +# endif + +# undef weak_alias +# define weak_alias(original, alias) \ + .weak bzero; bzero = __bzero + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S new file mode 100644 index 0000000000..7e08311cdf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S @@ -0,0 +1,61 @@ +/* Multiple versions of memset_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
*/ +#if IS_IN (libc) +# ifdef SHARED +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memset_chk_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_chk_sse2_unaligned(%rip), %RAX_LP +1: + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + lea __memset_chk_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_chk_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + lea __memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memset_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_chk_avx512_unaligned(%rip), %RAX_LP +2: ret +END(__memset_chk) + +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else +# include "../memset_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c new file mode 100644 index 0000000000..453f183747 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c @@ -0,0 +1,36 @@ +/* Count bits in CPU set. x86-64 multi-arch version. + This file is part of the GNU C Library. + Copyright (C) 2008-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sched.h> +#include "init-arch.h" + +#define __sched_cpucount static generic_cpucount +#include <posix/sched_cpucount.c> +#undef __sched_cpucount + +#define POPCNT(l) \ + ({ __cpu_mask r; \ + asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\ + r; }) +#define __sched_cpucount static popcount_cpucount +#include <posix/sched_cpucount.c> +#undef __sched_cpucount + +libc_ifunc (__sched_cpucount, + HAS_CPU_FEATURE (POPCOUNT) ? 
popcount_cpucount : generic_cpucount); diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S new file mode 100644 index 0000000000..34231f8b46 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000000..d971c2da38 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..ee81ab6ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,9 @@ +/* Multiple versions of stpcpy + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S new file mode 100644 index 0000000000..658520f78f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000000..14ed16f6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..2698ca6a8c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,8 @@ +/* Multiple versions of stpncpy + All versions must be listed in ifunc-impl-list.c. 
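(Editorial note: the stubs above show how one strcpy body is recombined by pre-defines before inclusion:

  strcpy.S alone                   defines  strcpy
  USE_AS_STPCPY                    defines  stpcpy   (returns the end pointer)
  USE_AS_STRNCPY                   defines  strncpy  (length-limited)
  USE_AS_STPCPY + USE_AS_STRNCPY   defines  stpncpy

with STRCPY naming the concrete symbol each instance emits, for example __stpncpy_sse2_unaligned or __stpncpy_ssse3.)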
*/ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S new file mode 100644 index 0000000000..fb2f9ae14a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S @@ -0,0 +1,6 @@ +#define USE_SSSE3 1 +#define USE_AS_STRCASECMP_L +#define NO_NOLOCALE_ALIAS +#define STRCMP __strcasecmp_l_ssse3 +#define __strcasecmp __strcasecmp_ssse3 +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S new file mode 100644 index 0000000000..49f5b9fd95 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S @@ -0,0 +1,8 @@ +/* Multiple versions of strcasecmp and strcasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) +libc_hidden_def (strcasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S new file mode 100644 index 0000000000..d0a8a1518a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -0,0 +1,279 @@ +/* strcat with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_sse2_unaligned +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. 
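+
+   The scan below locates the terminating NUL of the destination
+   string: an unaligned 16-byte load (or a masked aligned probe when
+   the load would cross a cache line) handles the head, then 16-byte
+   pcmpeqb probes and a 64-byte pminub loop take over, leaving the
+   NUL's offset in %rax so that L(StartStrcpyPart) can append at
+   %r9 + %rax (%r9 holds the saved destination pointer).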
*/ + + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) + +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + jmp 
L(StartStrcpyPart) + + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-sse2-unaligned.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..edd683d778 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -0,0 +1,867 @@ +/* strcat with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. 
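+
+   This variant checks the first 16 bytes with plain byte compares,
+   then 16-byte pcmpeqb probes and a 64-byte aligned loop locate the
+   NUL, again leaving its offset in %rax for L(StartStrcpyPart).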
*/ + + xor %eax, %eax + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64): + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) + +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(exit) + +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(exit) + +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(exit) + +L(aligned_64_exit_16): + lea -48(%rax), %rax + +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + jmp 
L(StartStrcpyPart) + + .p2align 4 +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail1): + add $1, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail2): + add $2, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail3): + add $3, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail4): + add $4, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail5): + add $5, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail6): + add $6, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail7): + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail8): + add $8, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail9): + add $9, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail10): + add $10, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail11): + add $11, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail12): + add $12, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail13): + add $13, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail14): + add $14, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail15): + add $15, %eax + + .p2align 4 +L(StartStrcpyPart): + mov %rsi, %rcx + lea (%rdi, %rax), %rdx +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(StrncatExit0) + cmp $8, %r8 + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) + cmpb $0, 8(%rcx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) + cmpb $0, 15(%rcx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + je L(StrncatExit16) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-ssse3.S" + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit1): + xor %ah, %ah + movb %ah, 1(%rdx) +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit2): + xor %ah, %ah + movb %ah, 2(%rdx) +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit3): + xor %ah, %ah + movb %ah, 3(%rdx) +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + 
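+	/* copy the remaining byte at offset 2 */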
movb 2(%rcx), %al + movb %al, 2(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit4): + xor %ah, %ah + movb %ah, 4(%rdx) +L(Exit4): + mov (%rcx), %eax + mov %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit5): + xor %ah, %ah + movb %ah, 5(%rdx) +L(Exit5): + mov (%rcx), %eax + mov %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit6): + xor %ah, %ah + movb %ah, 6(%rdx) +L(Exit6): + mov (%rcx), %eax + mov %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit7): + xor %ah, %ah + movb %ah, 7(%rdx) +L(Exit7): + mov (%rcx), %eax + mov %eax, (%rdx) + mov 3(%rcx), %eax + mov %eax, 3(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8): + xor %ah, %ah + movb %ah, 8(%rdx) +L(Exit8): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit9): + xor %ah, %ah + movb %ah, 9(%rdx) +L(Exit9): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movb 8(%rcx), %al + movb %al, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit10): + xor %ah, %ah + movb %ah, 10(%rdx) +L(Exit10): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movw 8(%rcx), %ax + movw %ax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit11): + xor %ah, %ah + movb %ah, 11(%rdx) +L(Exit11): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit12): + xor %ah, %ah + movb %ah, 12(%rdx) +L(Exit12): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit13): + xor %ah, %ah + movb %ah, 13(%rdx) +L(Exit13): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 5(%rcx), %xmm1 + movlpd %xmm1, 5(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit14): + xor %ah, %ah + movb %ah, 14(%rdx) +L(Exit14): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 6(%rcx), %xmm1 + movlpd %xmm1, 6(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15): + xor %ah, %ah + movb %ah, 15(%rdx) +L(Exit15): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit16): + xor %ah, %ah + movb %ah, 16(%rdx) +L(Exit16): + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz L(Exit9) + cmp $9, %r8 + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %r8 + je 
L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $8, %r8 + ja L(ExitHighCase3) + cmp $1, %r8 + je L(StrncatExit1) + cmp $2, %r8 + je L(StrncatExit2) + cmp $3, %r8 + je L(StrncatExit3) + cmp $4, %r8 + je L(StrncatExit4) + cmp $5, %r8 + je L(StrncatExit5) + cmp $6, %r8 + je L(StrncatExit6) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + xor %ah, %ah + movb %ah, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase3): + cmp $9, %r8 + je L(StrncatExit9) + cmp $10, %r8 + je L(StrncatExit10) + cmp $11, %r8 + je L(StrncatExit11) + cmp $12, %r8 + je L(StrncatExit12) + cmp $13, %r8 + je L(StrncatExit13) + cmp $14, %r8 + je L(StrncatExit14) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + xor %ah, %ah + movb %ah, 16(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit0): + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15Bytes): + cmp $9, %r8 + je L(StrncatExit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $11, %r8 + je L(StrncatExit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%rcx) + jz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + +# endif +END (STRCAT) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S new file mode 100644 index 0000000000..0e0e5dda9c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S @@ -0,0 +1,85 @@ +/* Multiple versions of strcat + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef USE_AS_STRNCAT +# ifndef STRCAT +# define STRCAT strcat +# endif +#endif + +#ifdef USE_AS_STRNCAT +# define STRCAT_SSSE3 __strncat_ssse3 +# define STRCAT_SSE2 __strncat_sse2 +# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned +# define __GI_STRCAT __GI_strncat +# define __GI___STRCAT __GI___strncat +#else +# define STRCAT_SSSE3 __strcat_ssse3 +# define STRCAT_SSE2 __strcat_sse2 +# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned +# define __GI_STRCAT __GI_strcat +# define __GI___STRCAT __GI___strcat +#endif + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCAT_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + leaq STRCAT_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jz 2f + leaq STRCAT_SSSE3(%rip), %rax +2: ret +END(STRCAT) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCAT_SSE2, @function; \ + .align 16; \ + .globl STRCAT_SSE2; \ + .hidden STRCAT_SSE2; \ + STRCAT_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcat calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2 +#endif + +#ifndef USE_AS_STRNCAT +# include "../strcat.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S new file mode 100644 index 0000000000..cbbd0b33d3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S @@ -0,0 +1,280 @@ +/* strchr with SSE2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> +# include "asm-syntax.h" + + atom_text_section +ENTRY (__strchr_sse2_no_bsf) + movd %esi, %xmm1 + movq %rdi, %rcx + punpcklbw %xmm1, %xmm1 + andq $~15, %rdi + pxor %xmm2, %xmm2 + punpcklbw %xmm1, %xmm1 + orl $0xffffffff, %esi + movdqa (%rdi), %xmm0 + pshufd $0, %xmm1, %xmm1 + subq %rdi, %rcx + movdqa %xmm0, %xmm3 + leaq 16(%rdi), %rdi + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + shl %cl, %esi + pmovmskb %xmm0, %eax + pmovmskb %xmm3, %edx + andl %esi, %eax + andl %esi, %edx + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + +L(loop): + movdqa (%rdi), %xmm0 + leaq 16(%rdi), %rdi + movdqa %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm0, %eax + pmovmskb %xmm3, %edx + or %eax, %edx + jz L(loop) + + pmovmskb %xmm3, %edx + test %eax, %eax + jnz L(matches) + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +L(matches): + /* There is a match. First find where NULL is. */ + leaq -16(%rdi), %rdi + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_high_case2) + + mov %al, %cl + and $15, %cl + jnz L(match_case2_4) + + mov %dl, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %al + jnz L(Exit5) + test $0x10, %dl + jnz L(return_null) + test $0x20, %al + jnz L(Exit6) + test $0x20, %dl + jnz L(return_null) + test $0x40, %al + jnz L(Exit7) + test $0x40, %dl + jnz L(return_null) + lea 7(%rdi), %rax + ret + + .p2align 4 +L(match_case2_4): + test $0x01, %al + jnz L(Exit1) + test $0x01, %dl + jnz L(return_null) + test $0x02, %al + jnz L(Exit2) + test $0x02, %dl + jnz L(return_null) + test $0x04, %al + jnz L(Exit3) + test $0x04, %dl + jnz L(return_null) + lea 3(%rdi), %rax + ret + + .p2align 4 +L(match_high_case2): + test %dl, %dl + jnz L(return_null) + + mov %ah, %cl + and $15, %cl + jnz L(match_case2_12) + + mov %dh, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %ah + jnz L(Exit13) + test $0x10, %dh + jnz L(return_null) + test $0x20, %ah + jnz L(Exit14) + test $0x20, %dh + jnz L(return_null) + test $0x40, %ah + jnz L(Exit15) + test $0x40, %dh + jnz L(return_null) + lea 15(%rdi), %rax + ret + + .p2align 4 +L(match_case2_12): + test $0x01, %ah + jnz L(Exit9) + test $0x01, %dh + jnz L(return_null) + test $0x02, %ah + jnz L(Exit10) + test $0x02, %dh + jnz L(return_null) + test $0x04, %ah + jnz L(Exit11) + test $0x04, %dh + jnz L(return_null) + lea 11(%rdi), %rax + ret + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_high_case1) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + lea 7(%rdi), %rax + ret + + .p2align 4 +L(match_high_case1): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + lea 15(%rdi), %rax + ret + + .p2align 4 +L(Exit1): + lea (%rdi), %rax + ret + + .p2align 4 +L(Exit2): + lea 1(%rdi), %rax + ret + + .p2align 4 +L(Exit3): + lea 2(%rdi), %rax + ret + + .p2align 4 +L(Exit4): + lea 3(%rdi), %rax + ret + + .p2align 4 +L(Exit5): + lea 4(%rdi), %rax + ret + + .p2align 4 +L(Exit6): + lea 5(%rdi), %rax + ret + + .p2align 4 +L(Exit7): + lea 6(%rdi), %rax + ret + + .p2align 4 +L(Exit9): + lea 8(%rdi), %rax + ret + + .p2align 4 +L(Exit10): + 
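+	/* The matching byte is at offset 9 of the current 16-byte chunk;
+	   the neighbouring L(ExitN) stubs return the analogous offsets.  */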
lea 9(%rdi), %rax + ret + + .p2align 4 +L(Exit11): + lea 10(%rdi), %rax + ret + + .p2align 4 +L(Exit12): + lea 11(%rdi), %rax + ret + + .p2align 4 +L(Exit13): + lea 12(%rdi), %rax + ret + + .p2align 4 +L(Exit14): + lea 13(%rdi), %rax + ret + + .p2align 4 +L(Exit15): + lea 14(%rdi), %rax + ret + +END (__strchr_sse2_no_bsf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S new file mode 100644 index 0000000000..c9f54ca2e2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strchr + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(strchr) + .type strchr, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strchr_sse2(%rip), %rax +2: HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + leaq __strchr_sse2_no_bsf(%rip), %rax +3: ret +END(strchr) + + + +# undef ENTRY +# define ENTRY(name) \ + .type __strchr_sse2, @function; \ + .align 16; \ + .globl __strchr_sse2; \ + .hidden __strchr_sse2; \ + __strchr_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strchr_sse2, .-__strchr_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strchr calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strchr; __GI_strchr = __strchr_sse2 +#endif + +#include "../strchr.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S new file mode 100644 index 0000000000..b0992dce39 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -0,0 +1,213 @@ +/* strcmp with unaligned loads + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +#include "sysdep.h" + +ENTRY ( __strcmp_sse2_unaligned) + movl %edi, %eax + xorl %edx, %edx + pxor %xmm7, %xmm7 + orl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pminub %xmm1, %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + testq %rax, %rax + je L(next_48_bytes) +L(return): + bsfq %rax, %rdx + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(next_48_bytes): + movdqu 16(%rdi), %xmm6 + movdqu 16(%rsi), %xmm3 + movdqu 32(%rdi), %xmm5 + pcmpeqb %xmm6, %xmm3 + movdqu 32(%rsi), %xmm2 + pminub %xmm6, %xmm3 + pcmpeqb %xmm1, %xmm3 + movdqu 48(%rdi), %xmm4 + pcmpeqb %xmm5, %xmm2 + pmovmskb %xmm3, %edx + movdqu 48(%rsi), %xmm0 + pminub %xmm5, %xmm2 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm2, %eax + salq $16, %rdx + pminub %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %rax + orq %rdx, %rax + pmovmskb %xmm0, %ecx + movq %rcx, %rdx + salq $48, %rdx + orq %rdx, %rax + jne L(return) +L(main_loop_header): + leaq 64(%rdi), %rdx + movl $4096, %ecx + pxor %xmm9, %xmm9 + andq $-64, %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax + addq %rsi, %rdx + movq %rdx, %rsi + andl $4095, %esi + subq %rsi, %rcx + shrq $6, %rcx + movq %rcx, %rsi + jmp L(loop_start) + + .p2align 4 +L(loop): + addq $64, %rax + addq $64, %rdx +L(loop_start): + testq %rsi, %rsi + leaq -1(%rsi), %rsi + je L(loop_cross_page) +L(back_to_loop): + movdqu (%rdx), %xmm0 + movdqu 16(%rdx), %xmm1 + movdqa (%rax), %xmm2 + movdqa 16(%rax), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqu 32(%rdx), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqu 48(%rdx), %xmm6 + pminub %xmm3, %xmm1 + movdqa 32(%rax), %xmm2 + pminub %xmm1, %xmm0 + movdqa 48(%rax), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + pminub %xmm5, %xmm0 + pminub %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %ecx + testl %ecx, %ecx + je L(loop) + pcmpeqb %xmm7, %xmm5 + movdqu (%rdx), %xmm0 + pcmpeqb %xmm7, %xmm1 + movdqa (%rax), %xmm2 + pcmpeqb %xmm2, %xmm0 + pminub %xmm2, %xmm0 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rcx + orq %rdi, %rcx + salq $48, %rsi + orq %rsi, %rcx + bsfq %rcx, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(loop_cross_page): + xor %r10, %r10 + movq %rdx, %r9 + and $63, %r9 + subq %r9, %r10 + + movdqa (%rdx, %r10), %xmm0 + movdqa 16(%rdx, %r10), %xmm1 + movdqu (%rax, %r10), %xmm2 + movdqu 16(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqa 32(%rdx, %r10), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqa 48(%rdx, %r10), %xmm6 + pminub %xmm3, %xmm1 + movdqu 32(%rax, %r10), %xmm2 + movdqu 48(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + + pcmpeqb %xmm7, %xmm0 + pcmpeqb %xmm7, %xmm1 + pcmpeqb %xmm7, %xmm5 + pcmpeqb %xmm7, %xmm6 + + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rdi + orq %rcx, %rdi + salq $48, %rsi + orq %rsi, %rdi + movq %r9, %rcx + movq $63, %rsi + shrq %cl, %rdi + test %rdi, %rdi + je L(back_to_loop) + bsfq %rdi, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 
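+/* Byte-by-byte comparison path, entered when the andl $4095/cmpl $4032
+   check at entry finds that a 64-byte read from either string might
+   cross a page boundary: compare one byte at a time until 64 bytes
+   have matched, then switch to the aligned main loop.  */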
+L(cross_page_loop): + cmpb %cl, %al + jne L(different) + addq $1, %rdx + cmpq $64, %rdx + je L(main_loop_header) +L(cross_page): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + testb %al, %al + jne L(cross_page_loop) + xorl %eax, %eax +L(different): + subl %ecx, %eax + ret +END (__strcmp_sse2_unaligned) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S new file mode 100644 index 0000000000..ed26d4a8fb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -0,0 +1,1792 @@ +/* strcmp with SSE4.2 + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +/* We use 0x1a: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to find out if two 16byte data elements are the same + and the offset of the first different byte. There are 4 cases: + + 1. Both 16byte data elements are valid and identical. + 2. Both 16byte data elements have EOS and identical. + 3. Both 16byte data elements are valid and they differ at offset X. + 4. At least one 16byte data element has EOS at offset X. Two 16byte + data elements must differ at or before offset X. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: + + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2 16 0 1 1 + 3 X 1 0 0 + 4 0 <= X 1 0/1 0/1 + + We exit from the loop for cases 2, 3 and 4 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for + case 2. */ + + /* Put all SSE 4.2 functions together. */ + .section .text.SECTION,"ax",@progbits + .align 16 + .type STRCMP_SSE42, @function + .globl STRCMP_SSE42 + .hidden STRCMP_SSE42 +#ifdef USE_AS_STRCASECMP_L +ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ +#endif +#ifdef USE_AS_STRNCASECMP_L +ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. */ +#endif + + +#ifdef USE_AVX +# define movdqa vmovdqa +# define movdqu vmovdqu +# define pmovmskb vpmovmskb +# define pcmpistri vpcmpistri +# define psubb vpsubb +# define pcmpeqb vpcmpeqb +# define psrldq vpsrldq +# define pslldq vpslldq +# define palignr vpalignr +# define pxor vpxor +# define D(arg) arg, arg +#else +# define D(arg) arg +#endif + +STRCMP_SSE42: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. 
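+ *
+ * For illustration only (not part of this file): the 0x1a comparison
+ * described in the table above maps onto the SSE4.2 intrinsics.  A
+ * minimal sketch, assuming -msse4.2 and a hypothetical helper name:
+ *
+ *   #include <nmmintrin.h>
+ *   // ECX result: offset of the first differing byte, or 16 when
+ *   // the two 16-byte blocks compare equal (cases 1 and 2 above).
+ *   static inline int
+ *   first_diff_16 (__m128i a, __m128i b)
+ *   {
+ *     return _mm_cmpistri (a, b, 0x1a);
+ *   }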
+ */ +#ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +#endif +#ifdef USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +LABEL(belowupper): + .quad 0x4040404040404040 + .quad 0x4040404040404040 +LABEL(topupper): +# ifdef USE_AVX + .quad 0x5a5a5a5a5a5a5a5a + .quad 0x5a5a5a5a5a5a5a5a +# else + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +# endif +LABEL(touppermask): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa LABEL(belowupper)(%rip), %xmm4 +# define UCLOW_reg %xmm4 + movdqa LABEL(topupper)(%rip), %xmm5 +# define UCHIGH_reg %xmm5 + movdqa LABEL(touppermask)(%rip), %xmm6 +# define LCQWORD_reg %xmm6 +#endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef USE_AVX +# define TOLOWER(reg1, reg2) \ + vpcmpgtb UCLOW_reg, reg1, %xmm7; \ + vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ + vpcmpgtb UCLOW_reg, reg2, %xmm9; \ + vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ + vpandn %xmm7, %xmm8, %xmm8; \ + vpandn %xmm9, %xmm10, %xmm10; \ + vpand LCQWORD_reg, %xmm8, %xmm8; \ + vpand LCQWORD_reg, %xmm10, %xmm10; \ + vpor reg1, %xmm8, reg1; \ + vpor reg2, %xmm10, reg2 +# else +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm7; \ + movdqa UCHIGH_reg, %xmm8; \ + movdqa reg2, %xmm9; \ + movdqa UCHIGH_reg, %xmm10; \ + pcmpgtb UCLOW_reg, %xmm7; \ + pcmpgtb reg1, %xmm8; \ + pcmpgtb UCLOW_reg, %xmm9; \ + pcmpgtb reg2, %xmm10; \ + pand %xmm8, %xmm7; \ + pand %xmm10, %xmm9; \ + pand LCQWORD_reg, %xmm7; \ + pand LCQWORD_reg, %xmm9; \ + por %xmm7, reg1; \ + por %xmm9, reg2 +# endif + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? 
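+					   (%xmm0 was zeroed above, so each
+					   NUL byte of %xmm1 yields 0xff)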
*/ + pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ + psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz)/* finish comparison */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte + * alignment. Use relative offset difference between the two to + * determine which case below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ +#else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ +#endif + psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. 
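+	 * Each iteration issues two 16-byte pcmpistri probes; in the
+	 * strncmp/strncasecmp variants the remaining-length counter in
+	 * %r11 is reduced by 16 after each probe.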
+ */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + .p2align 4 +LABEL(ashr_0_use): + movdqa (%rdi,%rdx), %xmm0 +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + movdqa (%rdi,%rdx), %xmm0 +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + jmp LABEL(ashr_0_use) + + + .p2align 4 +LABEL(ashr_0_exit_use): + jnc LABEL(strcmp_exitz) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + lea -16(%rdx, %rcx), %rcx + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rax,4), %eax + movl (%rcx,%rdx,4), %edx +#endif + sub %edx, %eax + ret + + + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pslldq $15, D(%xmm2) /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ + psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
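+	 *
+	 * Worked example: if %rdi+1 sits at page offset 0xff8, %r10
+	 * starts out as 0xff8 - 0x1000 = -8; after one "add $16" it is
+	 * positive (8), i.e. the next 16-byte window would run past the
+	 * end of the page, so the nibble path runs first.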
+ */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_1_use): + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + +LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_1_use) + + .p2align 4 +LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $1, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $14, %ecx + ja LABEL(nibble_ashr_1_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pslldq $14, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_2_use): + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + +LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_2_use) + + .p2align 4 +LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $2, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $13, %ecx + ja LABEL(nibble_ashr_2_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pslldq $13, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + +LABEL(loop_ashr_3_use): + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + +LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_3_use) + + .p2align 4 +LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $3, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $12, %ecx + ja LABEL(nibble_ashr_3_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pslldq $12, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_4_use): + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + +LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_4_use) + + .p2align 4 +LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $4, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $11, %ecx + ja LABEL(nibble_ashr_4_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pslldq $11, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_5_use): + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + +LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $5, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $5, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_5_use) + + .p2align 4 +LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $5, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $10, %ecx + ja LABEL(nibble_ashr_5_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pslldq $10, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_6_use): + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + +LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_6_use) + + .p2align 4 +LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $6, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $9, %ecx + ja LABEL(nibble_ashr_6_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pslldq $9, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
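+ *
+ * The loop reconstructs the unaligned source with palignr; as a
+ * sketch, with prev and cur the previous and current aligned
+ * 16-byte blocks of %rdi,
+ *   xmm0 = low 16 bytes of ((cur:prev) >> (7 * 8)),
+ * i.e. the 16 consecutive bytes starting 7 bytes into prev, read
+ * without loading across an unchecked page boundary.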
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_7_use): + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + +LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_7_use) + + .p2align 4 +LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $7, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $8, %ecx + ja LABEL(nibble_ashr_7_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pslldq $8, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
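+ *
+ * In the strncmp/strncasecmp variants %r11 holds the number of bytes
+ * still allowed; each "sub $16, %r11; jbe strcmp_exitz" returns a
+ * zero result once the length limit is reached without finding a
+ * difference.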
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_8_use): + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + +LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_8_use) + + .p2align 4 +LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $8, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $7, %ecx + ja LABEL(nibble_ashr_8_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pslldq $7, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
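+ *
+ * The strncmp variants also leave the nibble path early when the
+ * index returned by pcmpistri in %rcx already reaches the remaining
+ * length limit in %r11 (the "cmp %r11, %rcx" guard below).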
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_9_use): + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + +LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + + palignr $9, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $9, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_9_use) + + .p2align 4 +LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $9, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $6, %ecx + ja LABEL(nibble_ashr_9_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pslldq $6, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
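+ *
+ * The pcmpistri $0x1a in the loop is an "equal each" byte compare
+ * with negative polarity; as a sketch of its effect:
+ *   ecx = first i in 0..15 with a[i] != b[i] (16 if none),
+ *   CF  = such an i exists, ZF = a null byte was seen,
+ * so jbe (CF or ZF set) leaves the loop on a difference or at the
+ * end of the string.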
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_10_use): + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + +LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_10_use) + + .p2align 4 +LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $10, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $5, %ecx + ja LABEL(nibble_ashr_10_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pslldq $5, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
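+ *
+ * In the case-insensitive variants the TOLOWER macro maps both
+ * 16-byte fragments to lowercase before the compare; roughly, in C:
+ *   for (i = 0; i < 16; i++)
+ *     { a[i] = tolower (a[i]); b[i] = tolower (b[i]); }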
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_11_use): + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + +LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_11_use) + + .p2align 4 +LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $11, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $4, %ecx + ja LABEL(nibble_ashr_11_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pslldq $4, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_12_use): + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + +LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_12_use) + + .p2align 4 +LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $12, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $3, %ecx + ja LABEL(nibble_ashr_12_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pslldq $3, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
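+ *
+ * In the nibble path pcmpistri $0x3a compares the shifted remainder
+ * with itself under masked-negative polarity, so %ecx ends up as the
+ * number of bytes left before the terminating null (a sketch of the
+ * effect, not the exact flag semantics); the loop restarts only if
+ * all 16 - 13 = 3 remaining bytes are non-null.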
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_13_use): + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + +LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_13_use) + + .p2align 4 +LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $13, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $2, %ecx + ja LABEL(nibble_ashr_13_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pslldq $2, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_14_use): + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + +LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_14_use) + + .p2align 4 +LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $14, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $1, %ecx + ja LABEL(nibble_ashr_14_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pslldq $1, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
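+ *
+ * The bound checked in the nibble path shrinks with the shift:
+ * psrldq $15 below leaves 16 - 15 = 1 valid byte, hence cmp $0
+ * there, versus cmp $11 in the ashr_4 case, where 16 - 4 = 12 bytes
+ * remain.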
+ */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_15_use): + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + +LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_15_use) + + .p2align 4 +LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $15, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $0, %ecx + ja LABEL(nibble_ashr_15_restart_use) + +LABEL(nibble_ashr_exit_use): +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + .p2align 4 +LABEL(exit_use): + jnc LABEL(strcmp_exitz) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + add %rcx, %rdx + lea -16(%rdi, %r9), %rdi + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + test %r8d, %r8d + jz LABEL(ret_use) + xchg %eax, %edx +LABEL(ret_use): +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rdx,4), %edx + movl (%rcx,%rax,4), %eax +#endif + + sub %edx, %eax + ret + +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 + // XXX Same as code above +LABEL(Byte0): + movzx (%rsi), %ecx + movzx (%rdi), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + cfi_endproc + .size STRCMP_SSE42, .-STRCMP_SSE42 + +#undef UCLOW_reg +#undef UCHIGH_reg +#undef 
LCQWORD_reg +#undef TOLOWER + + /* Put all SSE 4.2 functions together. */ + .section .rodata.SECTION,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) + +#undef LABEL +#undef GLABEL +#undef SECTION +#undef movdqa +#undef movdqu +#undef pmovmskb +#undef pcmpistri +#undef psubb +#undef pcmpeqb +#undef psrldq +#undef pslldq +#undef palignr +#undef pxor +#undef D diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..1b7fa33c91 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define USE_SSSE3 1 +# define STRCMP __strcmp_ssse3 +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S new file mode 100644 index 0000000000..54f8f7dd44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S @@ -0,0 +1,209 @@ +/* Multiple versions of strcmp + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. 
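+   In C terms (a sketch; rem is %r11, off is %rcx):
+     new = rem + off - 16;   -- may wrap around, since rem is unsigned
+     if (new > rem || new == 0)
+       return 0;             -- count exhausted before any difference
+     rem = new;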
*/ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# define STRCMP_SSE42 __strncmp_sse42 +# define STRCMP_SSSE3 __strncmp_ssse3 +# define STRCMP_SSE2 __strncmp_sse2 +# define __GI_STRCMP __GI_strncmp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER + +# define STRCMP_AVX __strcasecmp_l_avx +# define STRCMP_SSE42 __strcasecmp_l_sse42 +# define STRCMP_SSSE3 __strcasecmp_l_ssse3 +# define STRCMP_SSE2 __strcasecmp_l_sse2 +# define __GI_STRCMP __GI___strcasecmp_l +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" + +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# define STRCMP_AVX __strncasecmp_l_avx +# define STRCMP_SSE42 __strncasecmp_l_sse42 +# define STRCMP_SSSE3 __strncasecmp_l_ssse3 +# define STRCMP_SSE2 __strncasecmp_l_sse2 +# define __GI_STRCMP __GI___strncasecmp_l +#else +# define USE_AS_STRCMP +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# define STRCMP_SSE42 __strcmp_sse42 +# define STRCMP_SSSE3 __strcmp_ssse3 +# define STRCMP_SSE2 __strcmp_sse2 +# define __GI_STRCMP __GI_strcmp +# endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc) + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +#ifdef USE_AS_STRCMP + leaq __strcmp_sse2_unaligned(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 3f +#else + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq STRCMP_SSE42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +#endif +2: leaq STRCMP_SSSE3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq STRCMP_SSE2(%rip), %rax +3: ret +END(STRCMP) + +# ifdef USE_AS_STRCASECMP_L +ENTRY(__strcasecmp) + .type __strcasecmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strcasecmp_avx(%rip), %rax + HAS_ARCH_FEATURE (AVX_Usable) + jnz 3f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq __strcasecmp_sse42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +2: leaq __strcasecmp_ssse3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq __strcasecmp_sse2(%rip), %rax +3: ret +END(__strcasecmp) +weak_alias (__strcasecmp, strcasecmp) +# endif +# ifdef USE_AS_STRNCASECMP_L +ENTRY(__strncasecmp) + .type __strncasecmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strncasecmp_avx(%rip), %rax + HAS_ARCH_FEATURE (AVX_Usable) + jnz 3f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq __strncasecmp_sse42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +2: leaq __strncasecmp_ssse3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq __strncasecmp_sse2(%rip), %rax +3: ret +END(__strncasecmp) +weak_alias (__strncasecmp, strncasecmp) +# endif + +# undef LABEL +# define LABEL(l) .L##l##_sse42 +# define GLABEL(l) l##_sse42 +# define SECTION sse4.2 +# include "strcmp-sse42.S" + + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define LABEL(l) .L##l##_avx +# define GLABEL(l) l##_avx 
+# define USE_AVX 1 +# undef STRCMP_SSE42 +# define STRCMP_SSE42 STRCMP_AVX +# define SECTION avx +# include "strcmp-sse42.S" +# endif + + +# undef ENTRY +# define ENTRY(name) \ + .type STRCMP_SSE2, @function; \ + .align 16; \ + .globl STRCMP_SSE2; \ + .hidden STRCMP_SSE2; \ + STRCMP_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2 + +# ifdef USE_AS_STRCASECMP_L +# define ENTRY2(name) \ + .type __strcasecmp_sse2, @function; \ + .align 16; \ + .globl __strcasecmp_sse2; \ + .hidden __strcasecmp_sse2; \ + __strcasecmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# define END2(name) \ + cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2 +# endif + +# ifdef USE_AS_STRNCASECMP_L +# define ENTRY2(name) \ + .type __strncasecmp_sse2, @function; \ + .align 16; \ + .globl __strncasecmp_sse2; \ + .hidden __strncasecmp_sse2; \ + __strncasecmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# define END2(name) \ + cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2 +# endif + +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcmp calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 +#endif + +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S new file mode 100644 index 0000000000..6a5ab7ab26 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -0,0 +1,1889 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_sse2_unaligned +# endif + +# endif + +# define JMPTBL(I, B) I - B +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + +# ifndef USE_AS_STRCAT + +.text +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +# endif + + and $63, %rcx + cmp $32, %rcx + jbe L(SourceStringAlignmentLess32) + + and $-16, %rsi + and $15, %rcx + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 + pmovmskb %xmm1, %rdx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $16, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $17, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY + add $16, %r10 + cmp %r10, %r8 + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(Unalign16Both): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $48, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm4 + movdqu %xmm3, (%rdi, %rcx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm1 + movdqu %xmm4, (%rdi, %rcx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm1) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb 
%xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movdqu %xmm3, (%rdi, %rcx) + mov %rsi, %rdx + lea 16(%rsi, %rcx), %rsi + and $-0x40, %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea 128(%r8, %rdx), %r8 +# endif +L(Unaligned64Loop): + movaps (%rsi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rsi), %xmm5 + movaps 32(%rsi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rsi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %rdi + add $64, %rsi + movdqu %xmm4, -64(%rdi) + movaps (%rsi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%rdi) + movaps 16(%rsi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%rsi), %xmm3 + movdqu %xmm6, -32(%rdi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%rdi) + movaps 48(%rsi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %rcx, %rcx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 48(%rdi, %rdx), %rax +# endif + movdqu %xmm7, 48(%rdi) + add $15, %r8 + sub %rdx, %r8 + lea 49(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $48, %rsi + add $48, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLess32): + pxor %xmm0, %xmm0 + movdqu (%rsi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb %xmm2, %xmm0 + movdqu %xmm1, (%rdi) + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $32, %r8 +# else + cmp $33, %r8 +# endif + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + + and $-16, %rsi + and $15, %rcx + jmp L(Unalign16Both) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + .p2align 4 +L(CopyFrom1To16BytesTail): + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi +# if defined USE_AS_STRNCPY && 
!defined USE_AS_STRCAT + sub $16, %r8 +# endif +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + movdqu %xmm4, (%rdi) + add $63, %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 16(%rdi, %rdx), %rax +# endif + movdqu %xmm5, 16(%rdi) + add $47, %r8 + sub %rdx, %r8 + lea 17(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $16, %rsi + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %rdx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 32(%rdi, %rdx), %rax +# endif + movdqu %xmm6, 32(%rdi) + add $31, %r8 + sub %rdx, %r8 + lea 33(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $32, %rsi + add $32, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) +# endif + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz 
L(CopyFrom1To16BytesTailCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +# endif + +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ + + .p2align 4 +L(Exit1): + mov %dh, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %cx + mov %cx, (%rdi) + mov %dh, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit5): + mov (%rsi), %ecx + mov %dh, 4(%rdi) + mov %ecx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $5, %r8 + lea 5(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $6, %r8 + lea 6(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $7, %r8 + lea 7(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $8, %r8 + lea 8(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rsi), %rcx + mov %dh, 8(%rdi) + mov %rcx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $9, %r8 + lea 9(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $10, %r8 + lea 10(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $11, %r8 + lea 11(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY 
+ lea 11(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $12, %r8 + lea 12(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $13, %r8 + lea 13(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $14, %r8 + lea 14(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $15, %r8 + lea 15(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 + lea 16(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + mov %dh, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $17, %r8 + lea 17(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $18, %r8 + lea 18(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $19, %r8 + lea 19(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $20, %r8 + lea 20(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dh, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $21, %r8 + lea 21(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $22, %r8 + lea 22(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $23, %r8 + lea 23(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu 
%xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $24, %r8 + lea 24(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + mov %dh, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $25, %r8 + lea 25(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $26, %r8 + lea 26(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $27, %r8 + lea 27(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $28, %r8 + lea 28(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $29, %r8 + lea 29(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $30, %r8 + lea 30(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $31, %r8 + lea 31(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $32, %r8 + lea 32(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit0): +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, (%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit1): + mov (%rsi), %dl + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) +# ifdef 
USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 3(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 4(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit5): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 5(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 6(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 7(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 8(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit9): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 9(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 10(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 11(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 12(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 13(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 14(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 15(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 16(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit17): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 17(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 
18(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 19(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 20(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 21(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 22(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 23(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 24(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 25(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 26(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 27(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 28(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 29(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 30(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 31(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit32): + movdqu (%rsi), %xmm0 + movdqu 
16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 32(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 32(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 33(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3): + mov %edx, -1(%rdi) + ret + + .p2align 4 +L(Fill4): + mov %edx, (%rdi) + ret + + .p2align 4 +L(Fill5): + mov %edx, (%rdi) + mov %dl, 4(%rdi) + ret + + .p2align 4 +L(Fill6): + mov %edx, (%rdi) + mov %dx, 4(%rdi) + ret + + .p2align 4 +L(Fill7): + mov %rdx, -1(%rdi) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rdi) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rdi) + mov %dl, 8(%rdi) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rdi) + mov %dx, 8(%rdi) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rdi) + mov %edx, 7(%rdi) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rdi) + mov %edx, 8(%rdi) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rdi) + mov %rdx, 5(%rdi) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rdi) + mov %rdx, 6(%rdi) + ret + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%rdi) + ret + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%rdi) + ret + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%rdi, %rcx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add $15, %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%rdi) + add $16, %rdi + + mov %rdi, %rsi + and $0xf, %rsi + sub %rsi, %rdi + add %rsi, %r8 + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + add $64, %rdi + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + add $32, %rdi + sub $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillExit): + add $16, %r8 + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%r8), %rcx + and $-16, %rcx + add $48, %r8 + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%rdi) +# ifdef USE_AS_STPCPY + lea 64(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 64(%rdi) +# endif + ret + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %rcx, %rcx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz 
L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm4, (%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm5) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm5, 16(%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm6) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm6, 32(%rdi) + lea 16(%rdi, %rcx), %rdi + lea 16(%rsi, %rcx), %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), 
L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) +# ifndef USE_AS_STRCAT + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# endif +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000000..47aaeae671 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -0,0 +1,3551 @@ +/* strcpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + + .section .text.ssse3,"ax",@progbits +ENTRY (STRCPY) + + mov %rsi, %rcx +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 +# endif + mov %rdi, %rdx +# ifdef USE_AS_STRNCPY + test %r8, %r8 + jz L(Exit0) + cmp $8, %r8 + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%rcx) + jz L(Exit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + je L(Exit16) +# endif + cmpb $0, 15(%rcx) + jz L(Exit16) +# endif + +# ifdef USE_AS_STRNCPY + mov %rcx, %rsi + sub $16, %r8 + and $0xf, %rsi + +/* add 16 bytes rcx_offset to r8 */ + + add %rsi, %r8 +# endif + lea 16(%rcx), %rsi + and $-16, %rsi + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) + pcmpeqb (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + +/* convert byte mask in xmm0 to bit mask */ + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx + and $-16, %rdx + sub %rdx, %rax + +# ifdef USE_AS_STRNCPY + add %rax, %rsi + lea -1(%rsi), %rsi + and $1<<31, %esi + test %rsi, %rsi + jnz L(ContinueCopy) + lea 16(%r8), %r8 + +L(ContinueCopy): +# endif + sub %rax, %rcx + mov %rcx, %rax + and $0xf, %rax + mov $0, %rsi + +/* case: rcx_offset == rdx_offset */ + + jz L(Align16Both) + + cmp $8, %rax + jae L(ShlHigh8) + cmp $1, %rax + je L(Shl1) + cmp $2, %rax + je L(Shl2) + cmp $3, %rax + je L(Shl3) + cmp $4, %rax + je L(Shl4) + cmp $5, %rax + je L(Shl5) + cmp $6, %rax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %rax + je L(Shl9) + cmp $10, %rax + je L(Shl10) + cmp $11, %rax + je L(Shl11) + cmp $12, %rax + je L(Shl12) + cmp $13, %rax + je L(Shl13) + cmp $14, %rax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqb 
%xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + lea 112(%r8, %rax), %r8 +# endif + mov $-0x40, %rsi + + .p2align 4 +L(Aligned64Loop): + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%r8), %r8 +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%rcx), %xmm1 + movaps 15(%rcx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 31(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -15(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -1(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl1LoopStart): + movaps 15(%rcx), %xmm2 + movaps 31(%rcx), %xmm3 + movaps %xmm3, %xmm6 + 
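/* Scan 64 bytes per iteration: pminub folds the four 16-byte
   blocks into one minimum vector, so the single pcmpeqb against
   zero below detects a null byte anywhere in the 64 bytes.  */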
movaps 47(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 63(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %rax, %rax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movdqu -1(%rcx), %xmm1 + mov $15, %rsi + movdqu %xmm1, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%rcx), %xmm1 + movaps 14(%rcx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 30(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -14(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -2(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl2LoopStart): + movaps 14(%rcx), %xmm2 + movaps 30(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %rax, %rax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movdqu -2(%rcx), %xmm1 + mov $14, %rsi + movdqu %xmm1, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%rcx), %xmm1 + movaps 13(%rcx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe 
L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 29(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -13(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -3(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl3LoopStart): + movaps 13(%rcx), %xmm2 + movaps 29(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %rax, %rax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movdqu -3(%rcx), %xmm1 + mov $13, %rsi + movdqu %xmm1, -3(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -4(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl4LoopStart): + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + 
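/* A copy of the last block is kept in %xmm7 for the next
   iteration's palignr; the palignr shifts are issued before the
   branch on the null test, which falls back to the
   16-bytes-at-a-time path at L(Shl4Start) when a null was seen.  */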
movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movdqu -4(%rcx), %xmm1 + mov $12, %rsi + movdqu %xmm1, -4(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%rcx), %xmm1 + movaps 11(%rcx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 27(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -11(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -5(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl5LoopStart): + movaps 11(%rcx), %xmm2 + movaps 27(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %rax, %rax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movdqu -5(%rcx), %xmm1 + mov $11, %rsi + movdqu %xmm1, -5(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%rcx), %xmm1 + movaps 10(%rcx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + 
pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 26(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -10(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -6(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl6LoopStart): + movaps 10(%rcx), %xmm2 + movaps 26(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %rax, %rax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + mov (%rcx), %r9 + mov 6(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 6(%rdx) + mov $10, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%rcx), %xmm1 + movaps 9(%rcx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 25(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -9(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -7(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl7LoopStart): + movaps 9(%rcx), %xmm2 + movaps 25(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %rax, %rax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave7) +# 
endif + palignr $7, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + mov (%rcx), %r9 + mov 5(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 5(%rdx) + mov $9, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -8(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl8LoopStart): + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%rcx), %xmm1 + movaps 7(%rcx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz 
L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 23(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -7(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -9(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl9LoopStart): + movaps 7(%rcx), %xmm2 + movaps 23(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %rax, %rax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + mov -1(%rcx), %r9 + mov $7, %rsi + mov %r9, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%rcx), %xmm1 + movaps 6(%rcx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 22(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -6(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -10(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl10LoopStart): + movaps 6(%rcx), %xmm2 + movaps 22(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + test %rax, %rax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps 
%xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + mov -2(%rcx), %r9 + mov $6, %rsi + mov %r9, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%rcx), %xmm1 + movaps 5(%rcx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 21(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -5(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -11(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl11LoopStart): + movaps 5(%rcx), %xmm2 + movaps 21(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %rax, %rax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + mov -3(%rcx), %r9 + mov $5, %rsi + mov %r9, -3(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx 
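+/* Another aligned 16-byte block has been loaded and tested; for
+   strncpy, first verify that 16 bytes of the length limit remain
+   before committing the next store.  */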
+# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -12(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%rcx), %xmm1 + movaps 3(%rcx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 19(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -3(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -13(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl13LoopStart): + movaps 3(%rcx), %xmm2 + movaps 19(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %rax, %rax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + mov -1(%rcx), %r9d + mov $3, %rsi + mov %r9d, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + 
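/* The ShlN blocks handle a source that still has offset N within
   its 16-byte block once the destination has been aligned: pairs
   of aligned loads are merged with palignr $N so that every store
   is an aligned movaps.  Shl14 below is the N == 14 case.  */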
.p2align 4 +L(Shl14): + movaps -14(%rcx), %xmm1 + movaps 2(%rcx), %xmm2 +L(Shl14Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 18(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -2(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -14(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl14LoopStart): + movaps 2(%rcx), %xmm2 + movaps 18(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + test %rax, %rax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + mov -2(%rcx), %r9d + mov $2, %rsi + mov %r9d, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%rcx), %xmm1 + movaps 1(%rcx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps 
%xmm2, (%rdx) + lea 17(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -1(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -15(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl15LoopStart): + movaps 1(%rcx), %xmm2 + movaps 17(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 33(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %rax, %rax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + mov -3(%rcx), %r9d + mov $1, %rsi + mov %r9d, -3(%rdx) +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %r8 +# endif + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %r8 + lea 8(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 15(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $16, %r8 + lea 16(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + cmp $1, %r8 + je L(Exit1) + test $0x01, %al + jnz L(Exit1) + cmp $2, %r8 + je L(Exit2) + test $0x02, %al + jnz L(Exit2) + cmp $3, %r8 + je L(Exit3) + test $0x04, %al + jnz L(Exit3) + cmp $4, %r8 + je L(Exit4) + test $0x08, %al + jnz L(Exit4) + cmp $5, %r8 + je L(Exit5) + test $0x10, %al + jnz L(Exit5) + cmp $6, %r8 + je L(Exit6) + test $0x20, %al + jnz L(Exit6) + cmp $7, %r8 + je L(Exit7) + test $0x40, %al + jnz L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $9, %r8 + je L(Exit9) + test $0x01, %ah + jnz L(Exit9) + cmp $10, %r8 + je L(Exit10) + test $0x02, %ah + jnz L(Exit10) + cmp $11, %r8 + je L(Exit11) + test $0x04, %ah + jnz L(Exit11) + cmp $12, %r8 + je L(Exit12) + test $0x8, %ah + jnz L(Exit12) + cmp $13, %r8 + je L(Exit13) + test $0x10, %ah + jnz L(Exit13) + cmp $14, %r8 + je L(Exit14) + test $0x20, %ah + jnz L(Exit14) + cmp 
$15, %r8 + je L(Exit15) + test $0x40, %ah + jnz L(Exit15) + jmp L(Exit16) + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $16, %r8 + je L(Exit16) + cmp $8, %r8 + je L(Exit8) + jg L(More8Case3) + cmp $4, %r8 + je L(Exit4) + jg L(More4Case3) + cmp $2, %r8 + jl L(Exit1) + je L(Exit2) + jg L(Exit3) +L(More8Case3): /* but less than 16 */ + cmp $12, %r8 + je L(Exit12) + jl L(Less12Case3) + cmp $14, %r8 + jl L(Exit13) + je L(Exit14) + jg L(Exit15) +L(More4Case3): /* but less than 8 */ + cmp $6, %r8 + jl L(Exit5) + je L(Exit6) + jg L(Exit7) +L(Less12Case3): /* but more than 8 */ + cmp $10, %r8 + jl L(Exit9) + je L(Exit10) + jg L(Exit11) +# endif + + .p2align 4 +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) +# ifdef USE_AS_STPCPY + lea (%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $1, %r8 + lea 1(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) +# ifdef USE_AS_STPCPY + lea 1(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $2, %r8 + lea 2(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) +# ifdef USE_AS_STPCPY + lea 2(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $3, %r8 + lea 3(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) +# ifdef USE_AS_STPCPY + lea 3(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $4, %r8 + lea 4(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit5): + movl (%rcx), %eax + movl %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 4(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $5, %r8 + lea 5(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit6): + movl (%rcx), %eax + movl %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 5(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $6, %r8 + lea 6(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit7): + movl (%rcx), %eax + movl %eax, (%rdx) + movl 3(%rcx), %eax + movl %eax, 3(%rdx) +# ifdef USE_AS_STPCPY + lea 6(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $7, %r8 + lea 7(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %eax + mov %eax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 8(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $9, %r8 + lea 9(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rcx), %rax 
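+/* Ten bytes are copied as two overlapping stores: the 8-byte word
+   just loaded covers bytes 0-7, and the 4-byte store below covers
+   bytes 6-9, so no byte-by-byte loop is needed.  */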
+ mov %rax, (%rdx) + mov 6(%rcx), %eax + mov %eax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 9(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $10, %r8 + lea 10(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 10(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $11, %r8 + lea 11(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 11(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $12, %r8 + lea 12(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %rax + mov %rax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 12(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $13, %r8 + lea 13(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %rax + mov %rax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 13(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $14, %r8 + lea 14(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %r8 + lea 15(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + movb %dl, (%rcx) + ret + + .p2align 4 +L(Fill2): + movw %dx, (%rcx) + ret + + .p2align 4 +L(Fill3): + movw %dx, (%rcx) + movb %dl, 2(%rcx) + ret + + .p2align 4 +L(Fill4): + movl %edx, (%rcx) + ret + + .p2align 4 +L(Fill5): + movl %edx, (%rcx) + movb %dl, 4(%rcx) + ret + + .p2align 4 +L(Fill6): + movl %edx, (%rcx) + movw %dx, 4(%rcx) + ret + + .p2align 4 +L(Fill7): + movl %edx, (%rcx) + movl %edx, 3(%rcx) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rcx) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rcx) + movb %dl, 8(%rcx) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rcx) + movw %dx, 8(%rcx) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rcx) + movl %edx, 7(%rcx) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rcx) + movl %edx, 8(%rcx) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rcx) + mov %rdx, 5(%rcx) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rcx) + mov %rdx, 6(%rcx) + ret + + .p2align 4 +L(Fill15): + mov %rdx, (%rcx) + mov %rdx, 7(%rcx) + ret + + .p2align 4 +L(Fill16): + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + ret + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%r8), %r8 +L(FillFrom1To16Bytes): + test %r8, %r8 + jz L(Fill0) + cmp $16, %r8 + je L(Fill16) + cmp $8, %r8 + je L(Fill8) + jg L(FillMore8) + cmp $4, %r8 + je L(Fill4) + jg L(FillMore4) + cmp $2, %r8 + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + 
cmp $12, %r8 + je L(Fill12) + jl L(FillLess12) + cmp $14, %r8 + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %r8 + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %r8 + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + .p2align 4 +L(StrncpyFillTailWithZero1): + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit1) + + pxor %xmm0, %xmm0 + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + + lea 16(%rcx), %rcx + + mov %rcx, %rdx + and $0xf, %rdx + sub %rdx, %rcx + add %rdx, %r8 + xor %rdx, %rdx + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + movdqa %xmm0, 32(%rcx) + movdqa %xmm0, 48(%rcx) + lea 64(%rcx), %rcx + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + lea 32(%rcx), %rcx + sub $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + + .p2align 4 +L(Exit0): + mov %rdx, %rax + ret + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $9, %r8 + je L(Exit9) + cmpb $0, 8(%rcx) + jz L(Exit9) + cmp $10, %r8 + je L(Exit10) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $11, %r8 + je L(Exit11) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $12, %r8 + je L(Exit12) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $13, %r8 + je L(Exit13) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $14, %r8 + je L(Exit14) + cmpb $0, 13(%rcx) + jz L(Exit14) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $1, %r8 + je L(Exit1) + cmpb $0, (%rcx) + jz L(Exit1) + cmp $2, %r8 + je L(Exit2) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $3, %r8 + je L(Exit3) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $4, %r8 + je L(Exit4) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $5, %r8 + je L(Exit5) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $6, %r8 + je L(Exit6) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $7, %r8 + je L(Exit7) + cmpb $0, 6(%rcx) + jz L(Exit7) + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + +# endif +# endif + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(StrncpyLeaveCase2OrCase3): + test %rax, %rax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + lea 64(%r8), %r8 + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz 
L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase2) +/*--------------------------------------------------*/ + .p2align 4 +L(StrncpyExit1Case2OrCase3): + movdqu -1(%rcx), %xmm0 + movdqu %xmm0, -1(%rdx) + mov $15, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit2Case2OrCase3): + movdqu -2(%rcx), %xmm0 + movdqu %xmm0, -2(%rdx) + mov $14, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit3Case2OrCase3): + movdqu -3(%rcx), %xmm0 + movdqu %xmm0, -3(%rdx) + mov $13, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit4Case2OrCase3): + movdqu -4(%rcx), %xmm0 + movdqu %xmm0, -4(%rdx) + mov $12, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit5Case2OrCase3): + movdqu -5(%rcx), %xmm0 + movdqu %xmm0, -5(%rdx) + mov $11, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit6Case2OrCase3): + mov (%rcx), %rsi + mov 6(%rcx), %r9d + mov %r9d, 6(%rdx) + mov %rsi, (%rdx) + test %rax, %rax + mov $10, %rsi + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit7Case2OrCase3): + mov (%rcx), %rsi + mov 5(%rcx), %r9d + mov %r9d, 5(%rdx) + mov %rsi, (%rdx) + test %rax, %rax + mov $9, %rsi + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit8Case2OrCase3): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit9Case2OrCase3): + mov -1(%rcx), %r9 + mov $7, %rsi + mov %r9, -1(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit10Case2OrCase3): + mov -2(%rcx), %r9 + mov $6, %rsi + mov %r9, -2(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit11Case2OrCase3): + mov -3(%rcx), %r9 + mov $5, %rsi + mov %r9, -3(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit12Case2OrCase3): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit13Case2OrCase3): + mov -1(%rcx), %r9d + mov $3, %rsi + mov %r9d, -1(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit14Case2OrCase3): + mov -2(%rcx), %r9d + mov $2, %rsi + mov %r9d, -2(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit15Case2OrCase3): + mov -3(%rcx), %r9d + mov $1, %rsi + mov %r9d, -3(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + palignr $1, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps 
%xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit1): + lea 15(%rdx, %rsi), %rdx + lea 15(%rcx, %rsi), %rcx + mov -15(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -15(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + palignr $2, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit2): + lea 14(%rdx, %rsi), %rdx + lea 14(%rcx, %rsi), %rcx + mov -14(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -14(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + palignr $3, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit3): + lea 13(%rdx, %rsi), %rdx + lea 13(%rcx, %rsi), %rcx + mov -13(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -13(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + palignr $4, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit4): + lea 12(%rdx, %rsi), %rdx + lea 12(%rcx, %rsi), %rcx + mov -12(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -12(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + palignr $5, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit5): + lea 11(%rdx, %rsi), %rdx + lea 11(%rcx, %rsi), %rcx + mov -11(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -11(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + palignr $6, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit6): + 
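	/* Each L(StrncpyExitN) tail points rdx and rcx one past the end
+	   of the pending tail block (rsi counts the bytes already
+	   handled), copies the last 16 - N bytes with overlapping loads
+	   and stores relative to that end, clears rsi, and branches to
+	   the common code that finishes the length-limited copy.  */
+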
lea 10(%rdx, %rsi), %rdx + lea 10(%rcx, %rsi), %rcx + mov -10(%rcx), %rsi + movw -2(%rcx), %ax + mov %rsi, -10(%rdx) + movw %ax, -2(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + palignr $7, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit7): + lea 9(%rdx, %rsi), %rdx + lea 9(%rcx, %rsi), %rcx + mov -9(%rcx), %rsi + movb -1(%rcx), %ah + mov %rsi, -9(%rdx) + movb %ah, -1(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + palignr $8, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit8): + lea 8(%rdx, %rsi), %rdx + lea 8(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + palignr $9, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit9): + lea 7(%rdx, %rsi), %rdx + lea 7(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + palignr $10, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit10): + lea 6(%rdx, %rsi), %rdx + lea 6(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + palignr $11, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit11): + lea 5(%rdx, %rsi), %rdx + lea 5(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add 
$48, %r8 + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + palignr $12, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit12): + lea 4(%rdx, %rsi), %rdx + lea 4(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + palignr $13, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit13): + lea 3(%rdx, %rsi), %rdx + lea 3(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + palignr $14, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit14): + lea 2(%rdx, %rsi), %rdx + lea 2(%rcx, %rsi), %rcx + movw -2(%rcx), %ax + xor %rsi, %rsi + movw %ax, -2(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + palignr $15, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit15): + lea 1(%rdx, %rsi), %rdx + lea 1(%rcx, %rsi), %rcx + movb -1(%rcx), %ah + xor %rsi, %rsi + movb %ah, -1(%rdx) + jmp L(CopyFrom1To16BytesCase3) + +# endif +# ifndef USE_AS_STRCAT +END (STRCPY) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..77819ddc50 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,99 @@ +/* Multiple versions of strcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned +# define __GI_STRCPY __GI_strcpy +# endif +#endif + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCPY_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + leaq STRCPY_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jz 2f + leaq STRCPY_SSSE3(%rip), %rax +2: ret +END(STRCPY) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_SSE2, @function; \ + .align 16; \ + .globl STRCPY_SSE2; \ + .hidden STRCPY_SSE2; \ + STRCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 0000000000..67991b5ca7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,173 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#include <nmmintrin.h>
+#include <string.h>
+#include "varshift.h"
+
+/* We use 0x2:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_POSITIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16byte data element has any byte A and
+   the offset of the first byte.  There are 3 cases:
+
+   1. The first 16byte data element has the byte A at the offset X.
+   2. The first 16byte data element has EOS and doesn't have the byte A.
+   3. The first 16byte data element is valid and doesn't have the byte A.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
+
+   case	 ECX	CFlag	ZFlag	SFlag
+    1	  X	  1	 0/1	  0
+    2	 16	  0	  1	  0
+    3	 16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2, i.e. whenever either CFlag
+   or ZFlag is set (the C code below tests them with _mm_cmpistrc and
+   _mm_cmpistrz).  If CFlag == 1, ECX has the offset X for case 1.  */
+
+#ifndef STRCSPN_SSE2
+# define STRCSPN_SSE2 __strcspn_sse2
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+  if (*a == 0)
+    RETURN (NULL, strlen (s));
+
+  const char *aligned;
+  __m128i mask;
+  int offset = (int) ((size_t) a & 15);
+  if (offset != 0)
+    {
+      /* Load masks.  */
+      aligned = (const char *) ((size_t) a & -16L);
+      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+      mask = __m128i_shift_right (mask0, offset);
+
+      /* Find where the NULL terminator is.  */
+      int length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16 - offset)
+	{
+	  /* There is no NULL terminator.  */
+	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
+	  length += index;
+
+	  /* Don't use SSE4.2 if the length of A > 16.  */
+	  if (length > 16)
+	    return STRCSPN_SSE2 (s, a);
+
+	  if (index != 0)
+	    {
+	      /* Combine mask0 and mask1.  We could play games with
+		 palignr, but frankly this data should be in L1 now
+		 so do the merge via an unaligned load.  */
+	      mask = _mm_loadu_si128 ((__m128i *) a);
+	    }
+	}
+    }
+  else
+    {
+      /* A is aligned.  */
+      mask = _mm_load_si128 ((__m128i *) a);
+
+      /* Find where the NULL terminator is.  */
+      int length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16)
+	{
+	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
+	     of A > 16.  */
+	  if (a[16] != 0)
+	    return STRCSPN_SSE2 (s, a);
+	}
+    }
+
+  offset = (int) ((size_t) s & 15);
+  if (offset != 0)
+    {
+      /* Check partial string.  */
+      aligned = (const char *) ((size_t) s & -16L);
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+      value = __m128i_shift_right (value, offset);
+
+      int length = _mm_cmpistri (mask, value, 0x2);
+      /* No need to check ZFlag since ZFlag is always 1.  */
+      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      if (cflag)
+	RETURN ((char *) (s + length), length);
+      /* Find where the NULL terminator is.
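+	 The self-compare with imm8 0x3a (_SIDD_SBYTE_OPS
+	 | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY) leaves
+	 set bits only at and past the terminating null, so the index it
+	 returns is the position of the first null byte in the chunk, or
+	 16 when the chunk has none.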
*/ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. */ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 0000000000..d102c7e80b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,69 @@ +/* Multiple versions of strcspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. 
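+   The ENTRY block that follows is strcspn's IFUNC resolver rather than
+   strcspn itself: the dynamic linker runs it once while resolving the
+   symbol and caches the returned address, picking the SSE4.2 variant
+   when HAS_CPU_FEATURE (SSE4_2) is true and the SSE2 fallback
+   otherwise, so later calls pay no selection cost.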
*/ +#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc) + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCSPN_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +#endif + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S new file mode 100644 index 0000000000..6728678688 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S @@ -0,0 +1,6 @@ +#define USE_SSSE3 1 +#define USE_AS_STRNCASECMP_L +#define NO_NOLOCALE_ALIAS +#define STRCMP __strncasecmp_l_ssse3 +#define __strncasecmp __strncasecmp_ssse3 +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S new file mode 100644 index 0000000000..9c0149788e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S @@ -0,0 +1,8 @@ +/* Multiple versions of strncasecmp and strncasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) +libc_hidden_def (strncasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c new file mode 100644 index 0000000000..a3cdbff689 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2); +#endif + +#include "string/strncat.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S new file mode 100644 index 0000000000..133e1d20b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_sse2_unaligned +#include "strcat-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S new file mode 100644 index 0000000000..6c45ff3ec7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_ssse3 +#include "strcat-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S new file mode 100644 index 0000000000..5c1bf41453 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncat + All versions must be listed in ifunc-impl-list.c. 
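+   This file holds no code of its own: defining STRCAT as strncat and
+   USE_AS_STRNCAT before including strcat.S reuses the strcat selector
+   and its SSE2/SSSE3 backends in their length-bounded form under the
+   strncat names.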
*/ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..96380a46be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S @@ -0,0 +1,6 @@ +#ifdef SHARED +# define USE_SSSE3 1 +# define STRCMP __strncmp_ssse3 +# define USE_AS_STRNCMP +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S new file mode 100644 index 0000000000..fd5eb1397c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncmp + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c new file mode 100644 index 0000000000..296c32cb5d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_sse2 +#ifdef SHARED +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); +#endif + +#include "strncpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S new file mode 100644 index 0000000000..fcc23a754a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000000..bf82ee447d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S new file mode 100644 index 0000000000..6d87a0ba35 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncpy + All versions must be listed in ifunc-impl-list.c. */ +#define STRCPY strncpy +#define USE_AS_STRNCPY +#include "strcpy.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..bbf5c49d89 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,8 @@ +/* Don't define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#ifdef SHARED +# define USE_AS_STRPBRK +# define STRCSPN_SSE2 __strpbrk_sse2 +# define STRCSPN_SSE42 __strpbrk_sse42 +# include "strcspn-c.c" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 0000000000..7201d6376f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,5 @@ +/* Multiple versions of strpbrk + All versions must be listed in ifunc-impl-list.c. 
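+   strpbrk is likewise built from the strcspn sources: with
+   USE_AS_STRPBRK defined, the RETURN (val1, val2) macro in strcspn-c.c
+   returns the matching pointer instead of the skipped length, so a
+   single scanner serves both interfaces.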
*/ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 0000000000..1704606b80 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,145 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <nmmintrin.h> +#include <string.h> +#include "varshift.h" + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + mask = __m128i_shift_right (mask0, offset); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. */ + mask = _mm_loadu_si128 ((__m128i *) a); + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. 
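+	 Instead of an unaligned 16-byte load at s, which could cross
+	 into an unmapped page, load from the enclosing aligned address
+	 and shift the chunk right by the misalignment with
+	 __m128i_shift_right, a pshufb against a sliding index table that
+	 zero-fills the vacated tail bytes.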
*/ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + + int length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 0000000000..adf7d9e533 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,50 @@ +/* Multiple versions of strspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strspn_sse2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jz 2f + leaq __strspn_sse42(%rip), %rax +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_sse2, @function; \ + .globl __strspn_sse2; \ + .align 16; \ + __strspn_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 +#endif + +#include "../strspn.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S new file mode 100644 index 0000000000..138979d10a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S @@ -0,0 +1,374 @@ +/* strstr with unaligned loads + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(__strstr_sse2_unaligned) + movzbl (%rsi), %eax + testb %al, %al + je L(empty) + movzbl 1(%rsi), %edx + testb %dl, %dl + je L(strchr) + movd %eax, %xmm1 + movd %edx, %xmm2 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4031, %rax + punpcklbw %xmm2, %xmm2 + punpcklwd %xmm1, %xmm1 + punpcklwd %xmm2, %xmm2 + pshufd $0, %xmm1, %xmm1 + pshufd $0, %xmm2, %xmm2 + ja L(cross_page) + movdqu (%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 1(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 16(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 17(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %r8d + pmovmskb %xmm0, %eax + salq $16, %rax + orq %rax, %r8 + je L(next_32_bytes) +L(next_pair_index): + bsf %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero1) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found1) + cmpb 2(%rax), %dl + jne L(next_pair) + xorl %edx, %edx + jmp L(pair_loop_start) + + .p2align 4 +L(strchr): + movzbl %al, %esi + jmp __strchr_sse2 + + .p2align 4 +L(pair_loop): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair) +L(pair_loop_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop) +L(found1): + ret +L(zero1): + xorl %eax, %eax + ret + + .p2align 4 +L(next_pair): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index) + + .p2align 4 +L(next_32_bytes): + movdqu 32(%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 33(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 49(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %eax + salq $32, %rax + pmovmskb %xmm0, %r8d + salq $48, %r8 + orq %rax, %r8 + je L(loop_header) +L(next_pair2_index): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero2) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found2) + cmpb 2(%rax), %dl + jne L(next_pair2) + xorl %edx, %edx + jmp L(pair_loop2_start) + + .p2align 4 +L(pair_loop2): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair2) +L(pair_loop2_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop2) +L(found2): + ret + L(zero2): + xorl %eax, %eax + ret +L(empty): + mov %rdi, %rax + ret + + .p2align 4 +L(next_pair2): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair2_index) +L(loop_header): + movq $-512, %r11 + movq %rdi, %r9 + + pxor %xmm7, %xmm7 + andq $-64, %rdi + + .p2align 4 +L(loop): + movdqa 64(%rdi), %xmm3 + movdqu 63(%rdi), %xmm6 + movdqa %xmm3, %xmm0 + pxor %xmm2, %xmm3 + pxor %xmm1, %xmm6 + movdqa 80(%rdi), %xmm10 + por %xmm3, %xmm6 + pminub %xmm10, %xmm0 + movdqu 79(%rdi), %xmm3 + pxor %xmm2, %xmm10 + pxor %xmm1, %xmm3 + movdqa 96(%rdi), %xmm9 + por %xmm10, %xmm3 + pminub %xmm9, %xmm0 + pxor %xmm2, %xmm9 + movdqa 112(%rdi), %xmm8 + addq $64, %rdi + pminub %xmm6, %xmm3 + movdqu 31(%rdi), %xmm4 + pminub %xmm8, %xmm0 + pxor %xmm2, %xmm8 + pxor %xmm1, %xmm4 + por %xmm9, %xmm4 + pminub %xmm4, %xmm3 + movdqu 47(%rdi), %xmm5 + pxor %xmm1, %xmm5 + por %xmm8, %xmm5 + pminub %xmm5, %xmm3 + pminub %xmm3, 
%xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + testl %eax, %eax + je L(loop) + pminub (%rdi), %xmm6 + pminub 32(%rdi),%xmm4 + pminub 48(%rdi),%xmm5 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm5 + pmovmskb %xmm6, %edx + movdqa 16(%rdi), %xmm8 + pcmpeqb %xmm7, %xmm4 + movdqu 15(%rdi), %xmm0 + pmovmskb %xmm5, %r8d + movdqa %xmm8, %xmm3 + pmovmskb %xmm4, %ecx + pcmpeqb %xmm1,%xmm0 + pcmpeqb %xmm2,%xmm3 + salq $32, %rcx + pcmpeqb %xmm7,%xmm8 + salq $48, %r8 + pminub %xmm0,%xmm3 + orq %rcx, %rdx + por %xmm3,%xmm8 + orq %rdx, %r8 + pmovmskb %xmm8, %eax + salq $16, %rax + orq %rax, %r8 + je L(loop) +L(next_pair_index3): + bsfq %r8, %rcx + addq %rdi, %rcx + cmpb $0, (%rcx) + je L(zero) + xorl %eax, %eax + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(success3) + cmpb 1(%rcx), %dl + jne L(next_pair3) + jmp L(pair_loop_start3) + + .p2align 4 +L(pair_loop3): + addq $1, %rax + cmpb 1(%rcx,%rax), %dl + jne L(next_pair3) +L(pair_loop_start3): + movzbl 3(%rsi,%rax), %edx + testb %dl, %dl + jne L(pair_loop3) +L(success3): + lea -1(%rcx), %rax + ret + + .p2align 4 +L(next_pair3): + addq %rax, %r11 + movq %rdi, %rax + subq %r9, %rax + cmpq %r11, %rax + jl L(switch_strstr) + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index3) + jmp L(loop) + + .p2align 4 +L(switch_strstr): + movq %rdi, %rdi + jmp __strstr_sse2 + + .p2align 4 +L(cross_page): + + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqa (%rax), %xmm3 + movdqu -1(%rax), %xmm4 + movdqa %xmm3, %xmm8 + movdqa 16(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm8 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm7 + pminub %xmm4, %xmm3 + movdqu 15(%rax), %xmm4 + pcmpeqb %xmm0, %xmm7 + por %xmm3, %xmm8 + movdqa %xmm5, %xmm3 + movdqa 32(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm6 + pmovmskb %xmm8, %ecx + pminub %xmm4, %xmm3 + movdqu 31(%rax), %xmm4 + por %xmm3, %xmm7 + movdqa %xmm5, %xmm3 + pcmpeqb %xmm0, %xmm6 + movdqa 48(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm7, %r8d + pcmpeqb %xmm2, %xmm3 + pcmpeqb %xmm5, %xmm0 + pminub %xmm4, %xmm3 + movdqu 47(%rax), %xmm4 + por %xmm3, %xmm6 + movdqa %xmm5, %xmm3 + salq $16, %r8 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm6, %r10d + pminub %xmm4, %xmm3 + por %xmm3, %xmm0 + salq $32, %r10 + orq %r10, %r8 + orq %rcx, %r8 + movl %edi, %ecx + pmovmskb %xmm0, %edx + subl %eax, %ecx + salq $48, %rdx + orq %rdx, %r8 + shrq %cl, %r8 + je L(loop_header) +L(next_pair_index4): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero) + + cmpq %rax,%rdi + je L(next_pair4) + + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found3) + cmpb 1(%rax), %dl + jne L(next_pair4) + xorl %edx, %edx + jmp L(pair_loop_start4) + + .p2align 4 +L(pair_loop4): + addq $1, %rdx + cmpb 1(%rax,%rdx), %cl + jne L(next_pair4) +L(pair_loop_start4): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop4) +L(found3): + subq $1, %rax + ret + + .p2align 4 +L(next_pair4): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index4) + jmp L(loop_header) + + .p2align 4 +L(found): + rep + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + +END(__strstr_sse2_unaligned) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c new file mode 100644 index 0000000000..a7d181d797 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c @@ -0,0 +1,50 @@ +/* Multiple versions of strstr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2012-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Redefine strstr so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +#undef strstr +#define strstr __redirect_strstr +#include <string.h> +#undef strstr + +#define STRSTR __strstr_sse2 +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2); +#endif + +#include "string/strstr.c" + +extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; + +#include "init-arch.h" + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_strstr) __libc_strstr; +libc_ifunc (__libc_strstr, + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + ? __strstr_sse2_unaligned + : __strstr_sse2) + +#undef strstr +strong_alias (__libc_strstr, strstr) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c new file mode 100644 index 0000000000..597d64e1e8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c @@ -0,0 +1,96 @@ +/* Test CPU feature data. + This file is part of the GNU C Library. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpu-features.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +static char *cpu_flags; + +/* Search for flags in /proc/cpuinfo and store line + in cpu_flags. 
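+   A typical entry looks like "flags : fpu vme ... sse4_1 sse4_2 avx";
+   check_proc below substring-matches each feature name against the
+   saved line and reports any disagreement with the corresponding glibc
+   feature macro.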
*/ +void +get_cpuinfo (void) +{ + FILE *f; + char *line = NULL; + size_t len = 0; + ssize_t read; + + f = fopen ("/proc/cpuinfo", "r"); + if (f == NULL) + { + printf ("cannot open /proc/cpuinfo\n"); + exit (1); + } + + while ((read = getline (&line, &len, f)) != -1) + { + if (strncmp (line, "flags", 5) == 0) + { + cpu_flags = strdup (line); + break; + } + } + fclose (f); + free (line); +} + +int +check_proc (const char *proc_name, int flag, const char *name) +{ + int found = 0; + + printf ("Checking %s:\n", name); + printf (" init-arch %d\n", flag); + if (strstr (cpu_flags, proc_name) != NULL) + found = 1; + printf (" cpuinfo (%s) %d\n", proc_name, found); + + if (found != flag) + printf (" *** failure ***\n"); + + return (found != flag); +} + +static int +do_test (int argc, char **argv) +{ + int fails; + + get_cpuinfo (); + fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable), + "HAS_ARCH_FEATURE (AVX_Usable)"); + fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable), + "HAS_ARCH_FEATURE (FMA4_Usable)"); + fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2), + "HAS_CPU_FEATURE (SSE4_2)"); + fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1) + , "HAS_CPU_FEATURE (SSE4_1)"); + fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3), + "HAS_CPU_FEATURE (SSSE3)"); + fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT), + "HAS_CPU_FEATURE (POPCOUNT)"); + + printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails); + + return (fails != 0); +} + +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c new file mode 100644 index 0000000000..1c3e34845d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c @@ -0,0 +1,25 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "varshift.h" + +const int8_t ___m128i_shift_right[31] attribute_hidden = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h new file mode 100644 index 0000000000..07bb76c4bf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h @@ -0,0 +1,30 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> +#include <tmmintrin.h> + +extern const int8_t ___m128i_shift_right[31] attribute_hidden; + +static __inline__ __m128i +__m128i_shift_right (__m128i value, unsigned long int offset) +{ + return _mm_shuffle_epi8 (value, + _mm_loadu_si128 ((__m128i *) (___m128i_shift_right + + offset))); +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c new file mode 100644 index 0000000000..a51a83a9be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcscpy __wcscpy_sse2 +#endif + +#include "wcsmbs/wcscpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..53857ce4f5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -0,0 +1,552 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) +# include <sysdep.h> + + .section .text.ssse3,"ax",@progbits +ENTRY (__wcscpy_ssse3) + + mov %rsi, %rcx + mov %rdi, %rdx + + cmpl $0, (%rcx) + jz L(Exit4) + cmpl $0, 4(%rcx) + jz L(Exit8) + cmpl $0, 8(%rcx) + jz L(Exit12) + cmpl $0, 12(%rcx) + jz L(Exit16) + + lea 16(%rcx), %rsi + and $-16, %rsi + + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) + + pcmpeqd (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx + and $-16, %rdx + sub %rdx, %rax + sub %rax, %rcx + mov %rcx, %rax + and $0xf, %rax + mov $0, %rsi + +/* case: rcx_offset == rdx_offset */ + + jz L(Align16Both) + + cmp $4, %rax + je L(Shl4) + cmp $8, %rax + je L(Shl8) + jmp L(Shl12) + +L(Align16Both): + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx + + mov $-0x40, %rsi + + .p2align 4 +L(Aligned64Loop): + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqd %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm5, %xmm0 + + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm6, %xmm0 + + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqd %xmm7, %xmm0 + + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov $-0x40, %rsi + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + + .p2align 4 +L(Shl4): + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + 
test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx + + movaps -4(%rcx), %xmm1 + + .p2align 4 +L(Shl4LoopStart): + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) + + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movdqu -4(%rcx), %xmm1 + mov $12, %rsi + movdqu %xmm1, -4(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx + + movaps -8(%rcx), %xmm1 + + .p2align 4 +L(Shl8LoopStart): + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) + + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), 
%xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx + + movaps -12(%rcx), %xmm1 + + .p2align 4 +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + + mov (%rcx), %rax + mov %rax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit12) + + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) + mov %rdi, %rax + ret + +END(__wcscpy_ssse3) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S new file mode 100644 index 0000000000..9150ab6d18 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S @@ -0,0 +1,40 @@ +/* Multiple versions of wcscpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + + .text +ENTRY(wcscpy) + .type wcscpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_CPU_FEATURE (SSSE3) + jnz 2f + leaq __wcscpy_sse2(%rip), %rax + ret + +2: leaq __wcscpy_ssse3(%rip), %rax + ret + +END(wcscpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c new file mode 100644 index 0000000000..e1ec7cfbb5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c @@ -0,0 +1,9 @@ +#if IS_IN (libc) +# include <wchar.h> + +# define WCSNLEN __wcsnlen_sse2 + +extern __typeof (wcsnlen) __wcsnlen_sse2; +#endif + +#include "wcsmbs/wcsnlen.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S new file mode 100644 index 0000000000..a8cab0cb00 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -0,0 +1,5 @@ +#define AS_WCSLEN +#define AS_STRNLEN +#define strlen __wcsnlen_sse4_1 + +#include "../strlen.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c new file mode 100644 index 0000000000..304f62eec3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c @@ -0,0 +1,45 @@ +/* Multiple versions of wcsnlen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. 
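+
+   The block below uses the C-level ifunc-redirect scheme: __wcsnlen is
+   temporarily #defined to __redirect_wcsnlen while <wchar.h> is
+   included so the header declares the redirect symbol, and
+   IFUNC_SELECTOR then picks an implementation once at relocation time.
+   Outside of glibc the same idea can be sketched with GCC's ifunc
+   attribute, roughly (hypothetical names):
+
+     extern void *wcsnlen_resolver (void);
+     size_t my_wcsnlen (const wchar_t *, size_t)
+       __attribute__ ((ifunc ("wcsnlen_resolver")));
+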
*/ +#if IS_IN (libc) +# define __wcsnlen __redirect_wcsnlen +# include <wchar.h> +# undef __wcsnlen + +# define SYMBOL_NAME wcsnlen +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); +weak_alias (__wcsnlen, wcsnlen); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S new file mode 100644 index 0000000000..bfa1a16a35 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c new file mode 100644 index 0000000000..46b6715e18 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c @@ -0,0 +1,9 @@ +#if IS_IN (libc) +# include <wchar.h> + +# define WMEMCMP __wmemcmp_sse2 + +extern __typeof (wmemcmp) __wmemcmp_sse2; +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000000..b07973a4f6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_1 + +#include "memcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000000..a41ef95fc1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S new file mode 100644 index 0000000000..94b25a214c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S @@ -0,0 +1,55 @@ +/* Multiple versions of wmemcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. 
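+
+   The selector below prefers __wmemcmp_avx2_movbe, but only when AVX2
+   and MOVBE are usable, unaligned AVX loads are fast, and the CPU is
+   not flagged as preferring to avoid VZEROUPPER; otherwise it falls
+   back to __wmemcmp_sse4_1 when SSE4.1 is available, then
+   __wmemcmp_ssse3, and finally the plain __wmemcmp_sse2 version.
+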
*/ +#if IS_IN (libc) + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 1f + HAS_ARCH_FEATURE (AVX2_Usable) + jz 1f + HAS_CPU_FEATURE (MOVBE) + jz 1f + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz 1f + leaq __wmemcmp_avx2_movbe(%rip), %rax + ret + +1: HAS_CPU_FEATURE (SSSE3) + jnz 2f + leaq __wmemcmp_sse2(%rip), %rax + ret + +2: HAS_CPU_FEATURE (SSE4_1) + jz 3f + leaq __wmemcmp_sse4_1(%rip), %rax + ret + +3: leaq __wmemcmp_ssse3(%rip), %rax + ret + +END(wmemcmp) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c new file mode 100644 index 0000000000..dd35be6e49 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c @@ -0,0 +1,33 @@ +/* Multiple versions of wmemset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wmemset __redirect_wmemset +# define __wmemset __redirect___wmemset +# include <wchar.h> +# undef wmemset +# undef __wmemset + +# define SYMBOL_NAME wmemset +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ()); +weak_alias (__wmemset, wmemset) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S new file mode 100644 index 0000000000..0a537fe272 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of wmemset_chk for x86-64. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include "../wmemset_chk.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c new file mode 100644 index 0000000000..d3ded5595b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of wmemset_chk. 
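+   (This is the ifunc selection used in libc.so; statically linked
+   binaries use wmemset_chk-nonshared.S above instead.)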
+ All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __wmemset_chk __redirect_wmemset_chk +# include <wchar.h> +# undef __wmemset_chk + +# define SYMBOL_NAME wmemset_chk +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk, + IFUNC_SELECTOR ()); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/nptl/Makefile b/REORG.TODO/sysdeps/x86_64/nptl/Makefile new file mode 100644 index 0000000000..bad3831869 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/Makefile @@ -0,0 +1,20 @@ +# Copyright (C) 2002-2017 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +ifeq ($(subdir),csu) +gen-as-const-headers += tcb-offsets.sym +endif diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_init.c b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_init.c new file mode 100644 index 0000000000..f249c6fef5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_init.c @@ -0,0 +1 @@ +#include <sysdeps/i386/nptl/pthread_spin_init.c> diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_lock.S b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_lock.S new file mode 100644 index 0000000000..36ba181d9b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_lock.S @@ -0,0 +1,34 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <lowlevellock.h> +#include <sysdep.h> + +ENTRY(pthread_spin_lock) +1: LOCK + decl 0(%rdi) + jne 2f + xor %eax, %eax + ret + + .align 16 +2: rep + nop + cmpl $0, 0(%rdi) + jg 1b + jmp 2b +END(pthread_spin_lock) diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_trylock.S new file mode 100644 index 0000000000..3419f1fec8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_trylock.S @@ -0,0 +1,37 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <pthread-errnos.h> +#include <sysdep.h> + + +#ifdef UP +# define LOCK +#else +# define LOCK lock +#endif + +ENTRY(pthread_spin_trylock) + movl $1, %eax + xorl %ecx, %ecx + LOCK + cmpxchgl %ecx, (%rdi) + movl $EBUSY, %eax + cmovel %ecx, %eax + retq +END(pthread_spin_trylock) diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_unlock.S b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_unlock.S new file mode 100644 index 0000000000..58f9388a36 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_unlock.S @@ -0,0 +1,29 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(pthread_spin_unlock) + movl $1, (%rdi) + xorl %eax, %eax + retq +END(pthread_spin_unlock) + + /* The implementation of pthread_spin_init is identical. */ + .globl pthread_spin_init +pthread_spin_init = pthread_spin_unlock diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthreaddef.h b/REORG.TODO/sysdeps/x86_64/nptl/pthreaddef.h new file mode 100644 index 0000000000..f248ecac80 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthreaddef.h @@ -0,0 +1,44 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Default stack size. */ +#define ARCH_STACK_DEFAULT_SIZE (2 * 1024 * 1024) + +/* Required stack pointer alignment at beginning. SSE requires 16 + bytes. */ +#define STACK_ALIGN 16 + +/* Minimal stack size after allocating thread descriptor and guard size. */ +#define MINIMAL_REST_STACK 2048 + +/* Alignment requirement for TCB. + + We need to store post-AVX vector registers in the TCB and we want the + storage to be aligned to at least 32 bytes. + + Some processors such as Intel Atom pay a big penalty on every + access using a segment override if that segment's base is not + aligned to the size of a cache line. (See Intel 64 and IA-32 + Architectures Optimization Reference Manual, section 13.3.3.3, + "Segment Base".) On such machines, a cache line is 64 bytes. */ +#define TCB_ALIGNMENT 64 + + +/* Location of current stack frame. The frame pointer is not usable. */ +#define CURRENT_STACK_FRAME \ + ({ register char *frame __asm__("rsp"); frame; }) diff --git a/REORG.TODO/sysdeps/x86_64/nptl/tcb-offsets.sym b/REORG.TODO/sysdeps/x86_64/nptl/tcb-offsets.sym new file mode 100644 index 0000000000..8a25c482cb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/tcb-offsets.sym @@ -0,0 +1,27 @@ +#include <sysdep.h> +#include <tls.h> +#include <kernel-features.h> + +RESULT offsetof (struct pthread, result) +TID offsetof (struct pthread, tid) +CANCELHANDLING offsetof (struct pthread, cancelhandling) +CLEANUP_JMP_BUF offsetof (struct pthread, cleanup_jmp_buf) +CLEANUP offsetof (struct pthread, cleanup) +CLEANUP_PREV offsetof (struct _pthread_cleanup_buffer, __prev) +MUTEX_FUTEX offsetof (pthread_mutex_t, __data.__lock) +MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads) +POINTER_GUARD offsetof (tcbhead_t, pointer_guard) +VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache) +#ifndef __ASSUME_PRIVATE_FUTEX +PRIVATE_FUTEX offsetof (tcbhead_t, private_futex) +#endif + +-- Not strictly offsets, but these values are also used in the TCB. +TCB_CANCELSTATE_BITMASK CANCELSTATE_BITMASK +TCB_CANCELTYPE_BITMASK CANCELTYPE_BITMASK +TCB_CANCELING_BITMASK CANCELING_BITMASK +TCB_CANCELED_BITMASK CANCELED_BITMASK +TCB_EXITING_BITMASK EXITING_BITMASK +TCB_CANCEL_RESTMASK CANCEL_RESTMASK +TCB_TERMINATED_BITMASK TERMINATED_BITMASK +TCB_PTHREAD_CANCELED PTHREAD_CANCELED diff --git a/REORG.TODO/sysdeps/x86_64/nptl/tls.h b/REORG.TODO/sysdeps/x86_64/nptl/tls.h new file mode 100644 index 0000000000..9b8ad82550 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/tls.h @@ -0,0 +1,367 @@ +/* Definition for thread-local data handling. nptl/x86_64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _TLS_H +#define _TLS_H 1 + +#ifndef __ASSEMBLER__ +# include <asm/prctl.h> /* For ARCH_SET_FS. */ +# include <stdbool.h> +# include <stddef.h> +# include <stdint.h> +# include <stdlib.h> +# include <sysdep.h> +# include <libc-pointer-arith.h> /* For cast_to_integer. */ +# include <kernel-features.h> +# include <dl-dtv.h> + +/* Replacement type for __m128 since this file is included by ld.so, + which is compiled with -mno-sse. It must not change the alignment + of rtld_savespace_sse. */ +typedef struct +{ + int i[4]; +} __128bits; + + +typedef struct +{ + void *tcb; /* Pointer to the TCB. Not necessarily the + thread descriptor used by libpthread. */ + dtv_t *dtv; + void *self; /* Pointer to the thread descriptor. */ + int multiple_threads; + int gscope_flag; + uintptr_t sysinfo; + uintptr_t stack_guard; + uintptr_t pointer_guard; + unsigned long int vgetcpu_cache[2]; +# ifndef __ASSUME_PRIVATE_FUTEX + int private_futex; +# else + int __glibc_reserved1; +# endif + int __glibc_unused1; + /* Reservation of some values for the TM ABI. */ + void *__private_tm[4]; + /* GCC split stack support. */ + void *__private_ss; + long int __glibc_reserved2; + /* Must be kept even if it is no longer used by glibc since programs, + like AddressSanitizer, depend on the size of tcbhead_t. */ + __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32))); + + void *__padding[8]; +} tcbhead_t; + +#else /* __ASSEMBLER__ */ +# include <tcb-offsets.h> +#endif + + +/* Alignment requirement for the stack. */ +#define STACK_ALIGN 16 + + +#ifndef __ASSEMBLER__ +/* Get system call information. */ +# include <sysdep.h> + +#ifndef LOCK_PREFIX +# ifdef UP +# define LOCK_PREFIX /* nothing */ +# else +# define LOCK_PREFIX "lock;" +# endif +#endif + +/* This is the size of the initial TCB. Can't be just sizeof (tcbhead_t), + because NPTL getpid, __libc_alloca_cutoff etc. need (almost) the whole + struct pthread even when not linked with -lpthread. */ +# define TLS_INIT_TCB_SIZE sizeof (struct pthread) + +/* Alignment requirements for the initial TCB. */ +# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread) + +/* This is the size of the TCB. */ +# define TLS_TCB_SIZE sizeof (struct pthread) + +/* Alignment requirements for the TCB. */ +# define TLS_TCB_ALIGN __alignof__ (struct pthread) + +/* The TCB can have any size and the memory following the address the + thread pointer points to is unspecified. Allocate the TCB there. */ +# define TLS_TCB_AT_TP 1 +# define TLS_DTV_AT_TP 0 + +/* Get the thread descriptor definition. */ +# include <nptl/descr.h> + + +/* Install the dtv pointer. The pointer passed is to the element with + index -1 which contain the length. */ +# define INSTALL_DTV(descr, dtvp) \ + ((tcbhead_t *) (descr))->dtv = (dtvp) + 1 + +/* Install new dtv for current thread. 
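+   Unlike INSTALL_DTV above, which stores into an explicitly given
+   descriptor, this writes through the running thread's %fs segment
+   via THREAD_SETMEM.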
*/ +# define INSTALL_NEW_DTV(dtvp) \ + ({ struct pthread *__pd; \ + THREAD_SETMEM (__pd, header.dtv, (dtvp)); }) + +/* Return dtv of given thread descriptor. */ +# define GET_DTV(descr) \ + (((tcbhead_t *) (descr))->dtv) + + +/* Code to initially initialize the thread pointer. This might need + special attention since 'errno' is not yet available and if the + operation can cause a failure 'errno' must not be touched. + + We have to make the syscall for both uses of the macro since the + address might be (and probably is) different. */ +# define TLS_INIT_TP(thrdescr) \ + ({ void *_thrdescr = (thrdescr); \ + tcbhead_t *_head = _thrdescr; \ + int _result; \ + \ + _head->tcb = _thrdescr; \ + /* For now the thread descriptor is at the same address. */ \ + _head->self = _thrdescr; \ + \ + /* It is a simple syscall to set the %fs value for the thread. */ \ + asm volatile ("syscall" \ + : "=a" (_result) \ + : "0" ((unsigned long int) __NR_arch_prctl), \ + "D" ((unsigned long int) ARCH_SET_FS), \ + "S" (_thrdescr) \ + : "memory", "cc", "r11", "cx"); \ + \ + _result ? "cannot set %fs base address for thread-local storage" : 0; \ + }) + +# define TLS_DEFINE_INIT_TP(tp, pd) void *tp = (pd) + + +/* Return the address of the dtv for the current thread. */ +# define THREAD_DTV() \ + ({ struct pthread *__pd; \ + THREAD_GETMEM (__pd, header.dtv); }) + + +/* Return the thread descriptor for the current thread. + + The contained asm must *not* be marked volatile since otherwise + assignments like + pthread_descr self = thread_self(); + do not get optimized away. */ +# define THREAD_SELF \ + ({ struct pthread *__self; \ + asm ("mov %%fs:%c1,%0" : "=r" (__self) \ + : "i" (offsetof (struct pthread, header.self))); \ + __self;}) + +/* Magic for libthread_db to know how to do THREAD_SELF. */ +# define DB_THREAD_SELF_INCLUDE <sys/reg.h> /* For the FS constant. */ +# define DB_THREAD_SELF CONST_THREAD_AREA (64, FS) + +/* Read member of the thread descriptor directly. */ +# define THREAD_GETMEM(descr, member) \ + ({ __typeof (descr->member) __value; \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%fs:%P2,%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct pthread, member))); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%fs:%P1,%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member))); \ + else \ + { \ + if (sizeof (__value) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %%fs:%P1,%q0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member))); \ + } \ + __value; }) + + +/* Same as THREAD_GETMEM, but the member offset can be non-constant. */ +# define THREAD_GETMEM_NC(descr, member, idx) \ + ({ __typeof (descr->member[0]) __value; \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%fs:%P2(%q3),%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%fs:%P1(,%q2,4),%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member[0])), "r" (idx));\ + else \ + { \ + if (sizeof (__value) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %%fs:%P1(,%q2,8),%q0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + } \ + __value; }) + + +/* Loading addresses of objects on x86-64 needs to be treated special + when generating PIC code. 
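+   Under PIC the address of a global symbol is not a link-time
+   constant, so the "i" (immediate) operand constraint cannot be used;
+   the value must arrive in a register or as a literal number, hence
+   "nr" below instead of "ir".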
*/ +#ifdef __pic__ +# define IMM_MODE "nr" +#else +# define IMM_MODE "ir" +#endif + + +/* Set member of the thread descriptor directly. */ +# define THREAD_SETMEM(descr, member, value) \ + ({ if (sizeof (descr->member) == 1) \ + asm volatile ("movb %b0,%%fs:%P1" : \ + : "iq" (value), \ + "i" (offsetof (struct pthread, member))); \ + else if (sizeof (descr->member) == 4) \ + asm volatile ("movl %0,%%fs:%P1" : \ + : IMM_MODE (value), \ + "i" (offsetof (struct pthread, member))); \ + else \ + { \ + if (sizeof (descr->member) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %q0,%%fs:%P1" : \ + : IMM_MODE ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct pthread, member))); \ + }}) + + +/* Same as THREAD_SETMEM, but the member offset can be non-constant. */ +# define THREAD_SETMEM_NC(descr, member, idx, value) \ + ({ if (sizeof (descr->member[0]) == 1) \ + asm volatile ("movb %b0,%%fs:%P1(%q2)" : \ + : "iq" (value), \ + "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else if (sizeof (descr->member[0]) == 4) \ + asm volatile ("movl %0,%%fs:%P1(,%q2,4)" : \ + : IMM_MODE (value), \ + "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else \ + { \ + if (sizeof (descr->member[0]) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %q0,%%fs:%P1(,%q2,8)" : \ + : IMM_MODE ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + }}) + + +/* Atomic compare and exchange on TLS, returning old value. */ +# define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ + ({ __typeof (descr->member) __ret; \ + __typeof (oldval) __old = (oldval); \ + if (sizeof (descr->member) == 4) \ + asm volatile (LOCK_PREFIX "cmpxchgl %2, %%fs:%P3" \ + : "=a" (__ret) \ + : "0" (__old), "r" (newval), \ + "i" (offsetof (struct pthread, member))); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); \ + __ret; }) + + +/* Atomic logical and. */ +# define THREAD_ATOMIC_AND(descr, member, val) \ + (void) ({ if (sizeof ((descr)->member) == 4) \ + asm volatile (LOCK_PREFIX "andl %1, %%fs:%P0" \ + :: "i" (offsetof (struct pthread, member)), \ + "ir" (val)); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); }) + + +/* Atomic set bit. */ +# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ + (void) ({ if (sizeof ((descr)->member) == 4) \ + asm volatile (LOCK_PREFIX "orl %1, %%fs:%P0" \ + :: "i" (offsetof (struct pthread, member)), \ + "ir" (1 << (bit))); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); }) + + +/* Set the stack guard field in TCB head. */ +# define THREAD_SET_STACK_GUARD(value) \ + THREAD_SETMEM (THREAD_SELF, header.stack_guard, value) +# define THREAD_COPY_STACK_GUARD(descr) \ + ((descr)->header.stack_guard \ + = THREAD_GETMEM (THREAD_SELF, header.stack_guard)) + + +/* Set the pointer guard field in the TCB head. */ +# define THREAD_SET_POINTER_GUARD(value) \ + THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value) +# define THREAD_COPY_POINTER_GUARD(descr) \ + ((descr)->header.pointer_guard \ + = THREAD_GETMEM (THREAD_SELF, header.pointer_guard)) + + +/* Get and set the global scope generation counter in the TCB head. 
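+   Broadly: the dynamic linker sets the flag to USED around lazy
+   symbol lookups, and on unload it marks stale flags WAIT and blocks
+   until every thread has reset its flag.  That is why
+   THREAD_GSCOPE_RESET_FLAG below uses an atomic xchg and wakes the
+   futex whenever a waiter had registered itself.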
*/ +# define THREAD_GSCOPE_FLAG_UNUSED 0 +# define THREAD_GSCOPE_FLAG_USED 1 +# define THREAD_GSCOPE_FLAG_WAIT 2 +# define THREAD_GSCOPE_RESET_FLAG() \ + do \ + { int __res; \ + asm volatile ("xchgl %0, %%fs:%P1" \ + : "=r" (__res) \ + : "i" (offsetof (struct pthread, header.gscope_flag)), \ + "0" (THREAD_GSCOPE_FLAG_UNUSED)); \ + if (__res == THREAD_GSCOPE_FLAG_WAIT) \ + lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ + } \ + while (0) +# define THREAD_GSCOPE_SET_FLAG() \ + THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED) +# define THREAD_GSCOPE_WAIT() \ + GL(dl_wait_lookup_done) () + +#endif /* __ASSEMBLER__ */ + +#endif /* tls.h */ diff --git a/REORG.TODO/sysdeps/x86_64/preconfigure b/REORG.TODO/sysdeps/x86_64/preconfigure new file mode 100644 index 0000000000..c8f1e0e132 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/preconfigure @@ -0,0 +1,42 @@ +# This file is generated from configure.ac by Autoconf. DO NOT EDIT! + # Local preconfigure fragment for sysdeps/x86_64 + +test -n "$base_machine" || case "$machine" in +x86_64) + base_machine=x86_64 + # Check if we are building for x32. + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC compiles in -mx32 mode by default" >&5 +$as_echo_n "checking whether $CC compiles in -mx32 mode by default... " >&6; } +if ${libc_cv_x32+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifndef __ILP32__ +# error not x32 +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + libc_cv_x32=yes +else + libc_cv_x32=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x32" >&5 +$as_echo "$libc_cv_x32" >&6; } + if test $libc_cv_x32 = yes; then + machine=x86_64/x32 + else + machine=x86_64/64 + fi + ;; +esac diff --git a/REORG.TODO/sysdeps/x86_64/preconfigure.ac b/REORG.TODO/sysdeps/x86_64/preconfigure.ac new file mode 100644 index 0000000000..600700ea1a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/preconfigure.ac @@ -0,0 +1,20 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local preconfigure fragment for sysdeps/x86_64 + +test -n "$base_machine" || case "$machine" in +x86_64) + base_machine=x86_64 + # Check if we are building for x32. + AC_CACHE_CHECK(whether $CC compiles in -mx32 mode by default, + libc_cv_x32, [dnl + AC_TRY_COMPILE(dnl +[#ifndef __ILP32__ +# error not x32 +#endif], [], libc_cv_x32=yes, libc_cv_x32=no)]) + if test $libc_cv_x32 = yes; then + machine=x86_64/x32 + else + machine=x86_64/64 + fi + ;; +esac diff --git a/REORG.TODO/sysdeps/x86_64/rawmemchr.S b/REORG.TODO/sysdeps/x86_64/rawmemchr.S new file mode 100644 index 0000000000..0405c7bb99 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/rawmemchr.S @@ -0,0 +1,202 @@ +/* fast SSE2 memchr with 64 byte loop and pmaxub instruction using + + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (__rawmemchr) + movd %rsi, %xmm1 + mov %rdi, %rcx + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %rcx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches) + add $16, %rdi + and $-16, %rdi + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + add $16, %rdi + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + and $-64, %rdi + + .p2align 4 +L(align64_loop): + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + +END (__rawmemchr) + +weak_alias (__rawmemchr, rawmemchr) +libc_hidden_builtin_def (__rawmemchr) diff --git a/REORG.TODO/sysdeps/x86_64/rshift.S b/REORG.TODO/sysdeps/x86_64/rshift.S new file mode 100644 index 0000000000..1686339e5c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/rshift.S @@ -0,0 +1,114 @@ +/* x86-64 __mpn_rshift -- + Copyright (C) 2007-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. 
+ + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl + + .text +ENTRY (__mpn_rshift) + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov 8(up), %r11 + xor %eax, %eax + shrd %cl, %r10, %rax + mov 16(up), %r8 + lea 8(up), up + lea -24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shrd %cl, %r9, %rax + sub $2, n + jb L(le1) + mov 8(up), %r10 + mov 16(up), %r11 + lea 16(up), up + lea -16(rp), rp + jmp L(01) +L(le1): shr %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov 8(up), %r9 + xor %eax, %eax + shrd %cl, %r8, %rax + sub $3, n + jb L(le2) + mov 16(up), %r10 + lea 24(up), up + lea -8(rp), rp + jmp L(10) +L(le2): shrd %cl, %r9, %r8 + mov %r8, (rp) + shr %cl, %r9 + mov %r9, 8(rp) + ret + + .p2align 4 +L(b11): /* n = 3, 7, 11, ... */ + mov (up), %r11 + mov 8(up), %r8 + xor %eax, %eax + shrd %cl, %r11, %rax + mov 16(up), %r9 + lea 32(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shrd %cl, %r8, %r11 + mov -8(up), %r10 + mov %r11, (rp) +L(10): shrd %cl, %r9, %r8 + mov (up), %r11 + mov %r8, 8(rp) +L(01): shrd %cl, %r10, %r9 + mov 8(up), %r8 + mov %r9, 16(rp) +L(00): shrd %cl, %r11, %r10 + mov 16(up), %r9 + mov %r10, 24(rp) + add $32, up + lea 32(rp), rp + sub $4, n + jnc L(top) + +L(end): shrd %cl, %r8, %r11 + mov %r11, (rp) + shrd %cl, %r9, %r8 + mov %r8, 8(rp) + shr %cl, %r9 + mov %r9, 16(rp) + ret +END (__mpn_rshift) diff --git a/REORG.TODO/sysdeps/x86_64/sched_cpucount.c b/REORG.TODO/sysdeps/x86_64/sched_cpucount.c new file mode 100644 index 0000000000..408ddc9d61 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/sched_cpucount.c @@ -0,0 +1,25 @@ +/* Copyright (C) 2007-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
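+
+   Below, when glibc itself is built for AMD Family 10h
+   (-march=amdfam10 predefines __amdfam10, where popcnt is always
+   available), the generic population-count fallback in
+   posix/sched_cpucount.c is overridden with the hardware popcntq
+   instruction.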
*/ + +#ifdef __amdfam10 +# define POPCNT(l) \ + ({ __cpu_mask r; \ + asm ("popcntq %1, %0" : "=r" (r) : "0" (l)); \ + r; }) +#endif + +#include <posix/sched_cpucount.c> diff --git a/REORG.TODO/sysdeps/x86_64/setjmp.S b/REORG.TODO/sysdeps/x86_64/setjmp.S new file mode 100644 index 0000000000..3a889033cd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/setjmp.S @@ -0,0 +1,66 @@ +/* setjmp for x86-64. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <asm-syntax.h> +#include <stap-probe.h> + +ENTRY (__sigsetjmp) + /* Save registers. */ + movq %rbx, (JB_RBX*8)(%rdi) +#ifdef PTR_MANGLE +# ifdef __ILP32__ + /* Save the high bits of %rbp first, since PTR_MANGLE will + only handle the low bits but we cannot presume %rbp is + being used as a pointer and truncate it. Here we write all + of %rbp, but the low bits will be overwritten below. */ + movq %rbp, (JB_RBP*8)(%rdi) +# endif + mov %RBP_LP, %RAX_LP + PTR_MANGLE (%RAX_LP) + mov %RAX_LP, (JB_RBP*8)(%rdi) +#else + movq %rbp, (JB_RBP*8)(%rdi) +#endif + movq %r12, (JB_R12*8)(%rdi) + movq %r13, (JB_R13*8)(%rdi) + movq %r14, (JB_R14*8)(%rdi) + movq %r15, (JB_R15*8)(%rdi) + lea 8(%rsp), %RDX_LP /* Save SP as it will be after we return. */ +#ifdef PTR_MANGLE + PTR_MANGLE (%RDX_LP) +#endif + movq %rdx, (JB_RSP*8)(%rdi) + mov (%rsp), %RAX_LP /* Save PC we are returning to now. */ + LIBC_PROBE (setjmp, 3, LP_SIZE@%RDI_LP, -4@%esi, LP_SIZE@%RAX_LP) +#ifdef PTR_MANGLE + PTR_MANGLE (%RAX_LP) +#endif + movq %rax, (JB_PC*8)(%rdi) + +#if IS_IN (rtld) + /* In ld.so we never save the signal mask. */ + xorl %eax, %eax + retq +#else + /* Make a tail call to __sigjmp_save; it takes the same args. 
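+     %rdi still holds the jmp_buf and %esi the savemask argument, so
+     no argument shuffling is needed before the jump.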
*/ + jmp __sigjmp_save +#endif +END (__sigsetjmp) +hidden_def (__sigsetjmp) diff --git a/REORG.TODO/sysdeps/x86_64/stack-aliasing.h b/REORG.TODO/sysdeps/x86_64/stack-aliasing.h new file mode 100644 index 0000000000..2efdacb3b4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stack-aliasing.h @@ -0,0 +1 @@ +#include <sysdeps/i386/i686/stack-aliasing.h> diff --git a/REORG.TODO/sysdeps/x86_64/stackguard-macros.h b/REORG.TODO/sysdeps/x86_64/stackguard-macros.h new file mode 100644 index 0000000000..1948800cd0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stackguard-macros.h @@ -0,0 +1,11 @@ +#include <stdint.h> + +#define STACK_CHK_GUARD \ + ({ uintptr_t x; \ + asm ("mov %%fs:%c1, %0" : "=r" (x) \ + : "i" (offsetof (tcbhead_t, stack_guard))); x; }) + +#define POINTER_CHK_GUARD \ + ({ uintptr_t x; \ + asm ("mov %%fs:%c1, %0" : "=r" (x) \ + : "i" (offsetof (tcbhead_t, pointer_guard))); x; }) diff --git a/REORG.TODO/sysdeps/x86_64/stackinfo.h b/REORG.TODO/sysdeps/x86_64/stackinfo.h new file mode 100644 index 0000000000..a1cbb43322 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stackinfo.h @@ -0,0 +1,43 @@ +/* Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains a bit of information about the stack allocation + of the processor. */ + +#ifndef _STACKINFO_H +#define _STACKINFO_H 1 + +#include <elf.h> + +/* On x86_64 the stack grows down. */ +#define _STACK_GROWS_DOWN 1 + +/* Default to an executable stack. PF_X can be overridden if PT_GNU_STACK is + * present, but it is presumed absent. */ +#define DEFAULT_STACK_PERMS (PF_R|PF_W|PF_X) + +/* Access to the stack pointer. The macros are used in alloca_account + for which they need to act as barriers as well, hence the additional + (unnecessary) parameters. */ +#define stackinfo_get_sp() \ + ({ void *p__; asm volatile ("mov %%" RSP_LP ", %0" : "=r" (p__)); p__; }) +#define stackinfo_sub_sp(ptr) \ + ({ ptrdiff_t d__; \ + asm volatile ("sub %%" RSP_LP " , %0" : "=r" (d__) : "0" (ptr)); \ + d__; }) + +#endif /* stackinfo.h */ diff --git a/REORG.TODO/sysdeps/x86_64/start.S b/REORG.TODO/sysdeps/x86_64/start.S new file mode 100644 index 0000000000..62a00eaeaa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/start.S @@ -0,0 +1,131 @@ +/* Startup code compliant to the ELF x86-64 ABI. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This is the canonical entry point, usually the first thing in the text + segment. The SVR4/i386 ABI (pages 3-31, 3-32) says that when the entry + point runs, most registers' values are unspecified, except for: + + %rdx Contains a function pointer to be registered with `atexit'. + This is how the dynamic linker arranges to have DT_FINI + functions called for shared libraries that have been loaded + before this code runs. + + %rsp The stack contains the arguments and environment: + 0(%rsp) argc + LP_SIZE(%rsp) argv[0] + ... + (LP_SIZE*argc)(%rsp) NULL + (LP_SIZE*(argc+1))(%rsp) envp[0] + ... + NULL +*/ + +#include <sysdep.h> + +ENTRY (_start) + /* Clearing the frame pointer is insufficient; use CFI. */ + cfi_undefined (rip) + /* Clear the frame pointer. The ABI suggests this be done, to mark + the outermost frame obviously. */ + xorl %ebp, %ebp + + /* Extract the arguments as encoded on the stack and set up + the arguments for __libc_start_main (int (*main) (int, char **, char **), + int argc, char **argv, + void (*init) (void), void (*fini) (void), + void (*rtld_fini) (void), void *stack_end). + The arguments are passed via registers and on the stack: + main: %rdi + argc: %rsi + argv: %rdx + init: %rcx + fini: %r8 + rtld_fini: %r9 + stack_end: stack. */ + + mov %RDX_LP, %R9_LP /* Address of the shared library termination + function. */ +#ifdef __ILP32__ + mov (%rsp), %esi /* Simulate popping 4-byte argument count. */ + add $4, %esp +#else + popq %rsi /* Pop the argument count. */ +#endif + /* argv starts just at the current stack top. */ + mov %RSP_LP, %RDX_LP + /* Align the stack to a 16-byte boundary to follow the ABI. */ + and $~15, %RSP_LP + + /* Push one word of garbage; together with the stack address pushed + below, this keeps the stack 16-byte aligned for the call. */ + pushq %rax + + /* Provide the highest stack address to the user code (for stacks + which grow downwards). */ + pushq %rsp + +#ifdef SHARED + /* Pass address of our own entry points to .fini and .init. */ + mov __libc_csu_fini@GOTPCREL(%rip), %R8_LP + mov __libc_csu_init@GOTPCREL(%rip), %RCX_LP + + mov main@GOTPCREL(%rip), %RDI_LP +#else + /* Pass address of our own entry points to .fini and .init.
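The register shuffling above is easier to follow against the C-level picture of the initial stack: argc sits at the stack pointer, argv starts one word above it, and the environment array begins one slot past argv's terminating NULL. A small portable model of that layout, using the same arrangement the ABI comment above describes:

    #include <stdio.h>

    int main (int argc, char **argv)
    {
      /* Initial stack:  sp[0] argc, sp[1..argc] argv[], sp[argc+1] NULL,
         then envp[] terminated by another NULL.  */
      char **envp = argv + argc + 1;   /* skip argv and its NULL slot */
      for (int i = 0; i < 3 && envp[i] != NULL; ++i)
        printf ("envp[%d] = %.40s\n", i, envp[i]);
      return 0;
    }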
*/ + mov $__libc_csu_fini, %R8_LP + mov $__libc_csu_init, %RCX_LP + + mov $main, %RDI_LP +#endif + + /* Call the user's main function, and exit with its value. + But let the libc call main. Since __libc_start_main in + libc.so is called very early, lazy binding isn't relevant + here. Use indirect branch via GOT to avoid extra branch + to PLT slot. In case of static executable, ld in binutils + 2.26 or above can convert indirect branch into direct + branch. */ + call *__libc_start_main@GOTPCREL(%rip) + + hlt /* Crash if somehow `exit' does return. */ +END (_start) + +/* Define a symbol for the first piece of initialized data. */ + .data + .globl __data_start +__data_start: + .long 0 + .weak data_start + data_start = __data_start diff --git a/REORG.TODO/sysdeps/x86_64/stpcpy.S b/REORG.TODO/sysdeps/x86_64/stpcpy.S new file mode 100644 index 0000000000..ec23de1416 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stpcpy.S @@ -0,0 +1,8 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy + +#include <sysdeps/x86_64/strcpy.S> + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/x86_64/strcasecmp.S b/REORG.TODO/sysdeps/x86_64/strcasecmp.S new file mode 100644 index 0000000000..fe49e820f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcasecmp.S @@ -0,0 +1 @@ +/* In strcasecmp_l.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/strcasecmp_l-nonascii.c b/REORG.TODO/sysdeps/x86_64/strcasecmp_l-nonascii.c new file mode 100644 index 0000000000..30e8969603 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcasecmp_l-nonascii.c @@ -0,0 +1,8 @@ +#include <string.h> + +extern int __strcasecmp_l_nonascii (const char *__s1, const char *__s2, + __locale_t __loc); + +#define __strcasecmp_l __strcasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strcasecmp.c> diff --git a/REORG.TODO/sysdeps/x86_64/strcasecmp_l.S b/REORG.TODO/sysdeps/x86_64/strcasecmp_l.S new file mode 100644 index 0000000000..5456b3a49e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcasecmp_l.S @@ -0,0 +1,6 @@ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) +libc_hidden_def (strcasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/strcat.S b/REORG.TODO/sysdeps/x86_64/strcat.S new file mode 100644 index 0000000000..44e6512339 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcat.S @@ -0,0 +1,258 @@ +/* strcat(dest, src) -- Append SRC on the end of DEST. + Optimized for x86-64. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* Will be removed when new strcpy implementation gets merged. */ + + .text +ENTRY (strcat) + movq %rdi, %rcx /* Dest. register. 
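stpcpy.S above is not a separate implementation: it assembles strcpy.S again with the entry point renamed, because the two routines differ only in their return value (the end of the copy rather than its start). Roughly, in C, with illustrative my_-prefixed helpers that are not glibc's code, the relationship and the way strcat builds on it look like:

    #include <stdio.h>

    /* stpcpy returns a pointer to the terminating NUL it wrote.  */
    static char *my_stpcpy (char *d, const char *s)
    {
      while ((*d = *s++) != '\0')
        d++;
      return d;
    }

    /* strcat is "find the end, copy there, return the start".  */
    static char *my_strcat (char *d, const char *s)
    {
      char *end = d;
      while (*end != '\0')            /* first step: find end of dest */
        end++;
      my_stpcpy (end, s);             /* second step: copy source */
      return d;                       /* strcat returns the destination */
    }

    int main (void)
    {
      char buf[32] = "foo";
      puts (my_strcat (buf, "bar"));  /* prints foobar */
      return 0;
    }

The assembly that follows implements exactly those two steps, only word by word instead of byte by byte.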
*/ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* Duplicate destination pointer. */ + movq $0xfefefefefefefeff,%r8 + + /* First step: Find end of destination. */ + jz 4f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => start copy */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + + + + /* Now the source is aligned. Scan for NUL byte. */ + .p2align 4 +4: + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? 
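The 0xfefefefefefefeff constant gives an exact word-at-a-time NUL test: adding it to an 8-byte word carries out of every byte position whose byte is nonzero, so the carry chain breaks precisely at a NUL byte. The jnc/xor/or/inc sequence above computes, in C terms, the predicate below (a sketch with uint64_t words; locating which byte was NUL is then the job of the byte tests around label 3):

    #include <stdint.h>
    #include <stdio.h>

    /* Nonzero iff some byte of W is 0, mirroring the asm: add the magic
       value, branch on the carry flag, then xor/or/inc.  */
    static int has_nul_byte (uint64_t w)
    {
      const uint64_t magic = 0xfefefefefefefeffull;
      uint64_t sum = w + magic;
      if (sum >= w)                 /* no carry out: the asm's jnc path */
        return 1;
      /* Bits 8,16,...,56 of (sum ^ w) | magic hold the per-byte
         carries; +1 overflows to 0 only if every carry was set.  */
      return (((sum ^ w) | magic) + 1) != 0;
    }

    int main (void)
    {
      printf ("%d %d\n",
              has_nul_byte (0x6162636465666768ull),    /* "abcdefgh": 0 */
              has_nul_byte (0x6162630065666768ull));   /* has a 0: 1 */
      return 0;
    }

The same predicate reappears in the copy loop below, where it decides whether a whole word can be stored.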
*/ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + +2: + /* Second step: Copy source to destination. */ + + movq %rsi, %rcx /* duplicate */ + andl $7,%ecx /* mask alignment bits */ + movq %rax, %rdx /* move around */ + jz 22f /* aligned => start loop */ + + neg %ecx /* align to 8 bytes. */ + addl $8, %ecx + /* Align the source pointer. */ +21: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 24f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 21b + + /* Now the source is aligned. Unfortunately we cannot force both + source and destination to be aligned, so ignore the alignment of + the destination. */ + .p2align 4 +22: + /* 1st unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0.
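Stated portably, the copy loop just shown tests each source word with that predicate, stores all 8 bytes when it is NUL-free, and otherwise falls through to the byte loop at label 23. A C sketch of the structure, without the alignment preamble and the fourfold unrolling; note the overread, as the word loads may read up to 7 bytes past the NUL, which the assembly makes safe by keeping the loads aligned (so they cannot cross a page boundary) while this sketch simply sizes the source buffer generously:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    static int has_nul_byte (uint64_t w)   /* as in the earlier sketch */
    {
      const uint64_t magic = 0xfefefefefefefeffull;
      uint64_t sum = w + magic;
      return sum >= w || (((sum ^ w) | magic) + 1) != 0;
    }

    static char *wordwise_strcpy (char *d, const char *s)
    {
      char *d0 = d;
      uint64_t w;
      for (;;)                       /* S is assumed 8-byte aligned */
        {
          memcpy (&w, s, 8);         /* one 8-byte load */
          if (has_nul_byte (w))
            break;                   /* NUL somewhere in this word */
          memcpy (d, &w, 8);         /* store the whole word */
          s += 8, d += 8;
        }
      while ((*d++ = *s++) != '\0')  /* finish byte by byte */
        ;
      return d0;
    }

    int main (void)
    {
      _Alignas (8) char src[32] = "hello, wordwise world";
      char dst[32];
      puts (wordwise_strcpy (dst, src));
      return 0;
    }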
*/ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 22b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +23: + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL? */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL? */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 23b /* and look at next two bytes in %rax. */ + + +24: + movq %rdi, %rax /* The original destination is the return value. */ + retq +END (strcat) +libc_hidden_builtin_def (strcat) diff --git a/REORG.TODO/sysdeps/x86_64/strchr.S b/REORG.TODO/sysdeps/x86_64/strchr.S new file mode 100644 index 0000000000..16c1726803 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strchr.S @@ -0,0 +1,187 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For AMD x86-64. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (strchr) + movd %esi, %xmm1 + movl %edi, %eax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpl $4032, %eax + punpcklwd %xmm1, %xmm1 + pshufd $0, %xmm1, %xmm1 + jg L(cross_page) + movdqu (%rdi), %xmm0 + pxor %xmm3, %xmm3 + movdqa %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + je L(next_48_bytes) + bsf %eax, %eax +#ifdef AS_STRCHRNUL + leaq (%rdi,%rax), %rax +#else + movl $0, %edx + leaq (%rdi,%rax), %rax + cmpb %sil, (%rax) + cmovne %rdx, %rax +#endif + ret + + .p2align 3 + L(next_48_bytes): + movdqu 16(%rdi), %xmm0 + movdqa %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %ecx + movdqu 32(%rdi), %xmm0 + movdqa %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm0 + salq $16, %rcx + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %eax + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm0, %xmm3 + salq $32, %rax + pcmpeqb %xmm1, %xmm0 + orq %rcx, %rax + por %xmm3, %xmm0 + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rax + testq %rax, %rax + jne L(return) +L(loop_start): + /* We use this alignment to force the loop to be aligned to 8 but + not 16 bytes. This gives better scheduling on AMD processors.
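Each 16-byte block above is probed for "matches the wanted character or is NUL" by OR-ing two pcmpeqb results and collapsing them to a bitmask with pmovmskb; bsf then yields the first hit. The same single-block step written with SSE2 intrinsics (a sketch of one unaligned probe, not the page-safe full routine):

    #include <emmintrin.h>   /* SSE2 */
    #include <string.h>
    #include <stdio.h>

    /* Bit i of the result is set iff p[i] == c or p[i] == '\0'.  */
    static int match_mask_16 (const char *p, char c)
    {
      __m128i data = _mm_loadu_si128 ((const __m128i *) p);
      __m128i hits = _mm_or_si128 (_mm_cmpeq_epi8 (data, _mm_set1_epi8 (c)),
                                   _mm_cmpeq_epi8 (data, _mm_setzero_si128 ()));
      return _mm_movemask_epi8 (hits);
    }

    int main (void)
    {
      char buf[16];
      memset (buf, 'x', sizeof buf);
      memcpy (buf, "find the q", 10);          /* 'q' at offset 9, no NUL */
      int m = match_mask_16 (buf, 'q');
      printf ("mask %#x, first hit at %d\n", m, __builtin_ctz (m));
      return 0;
    }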
*/ + .p2align 4 + pxor %xmm6, %xmm6 + andq $-64, %rdi + .p2align 3 +L(loop64): + addq $64, %rdi + movdqa (%rdi), %xmm5 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + pxor %xmm1, %xmm5 + movdqa 48(%rdi), %xmm4 + pxor %xmm1, %xmm2 + pxor %xmm1, %xmm3 + pminub (%rdi), %xmm5 + pxor %xmm1, %xmm4 + pminub 16(%rdi), %xmm2 + pminub 32(%rdi), %xmm3 + pminub %xmm2, %xmm5 + pminub 48(%rdi), %xmm4 + pminub %xmm3, %xmm5 + pminub %xmm4, %xmm5 + pcmpeqb %xmm6, %xmm5 + pmovmskb %xmm5, %eax + + testl %eax, %eax + je L(loop64) + + movdqa (%rdi), %xmm5 + movdqa %xmm5, %xmm0 + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm6, %xmm0 + por %xmm0, %xmm5 + pcmpeqb %xmm6, %xmm2 + pcmpeqb %xmm6, %xmm3 + pcmpeqb %xmm6, %xmm4 + + pmovmskb %xmm5, %ecx + pmovmskb %xmm2, %eax + salq $16, %rax + pmovmskb %xmm3, %r8d + pmovmskb %xmm4, %edx + salq $32, %r8 + orq %r8, %rax + orq %rcx, %rax + salq $48, %rdx + orq %rdx, %rax + .p2align 3 +L(return): + bsfq %rax, %rax +#ifdef AS_STRCHRNUL + leaq (%rdi,%rax), %rax +#else + movl $0, %edx + leaq (%rdi,%rax), %rax + cmpb %sil, (%rax) + cmovne %rdx, %rax +#endif + ret + .p2align 4 + +L(cross_page): + movq %rdi, %rdx + pxor %xmm2, %xmm2 + andq $-64, %rdx + movdqa %xmm1, %xmm0 + movdqa (%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %r8d + movdqa 16(%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %eax + movdqa 32(%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + salq $16, %rax + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %r9d + movdqa 48(%rdx), %xmm3 + pcmpeqb %xmm3, %xmm2 + salq $32, %r9 + pcmpeqb %xmm3, %xmm0 + orq %r9, %rax + orq %r8, %rax + por %xmm2, %xmm0 + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rax + movl %edi, %ecx + subb %dl, %cl + shrq %cl, %rax + testq %rax, %rax + jne L(return) + jmp L(loop_start) + +END (strchr) + +#ifndef AS_STRCHRNUL +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/strchrnul.S b/REORG.TODO/sysdeps/x86_64/strchrnul.S new file mode 100644 index 0000000000..841dfc2783 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strchrnul.S @@ -0,0 +1,27 @@ +/* strchrnul (str, ch) -- Return pointer to first occurrence of CH in STR + or terminating NUL byte. + For AMD x86-64. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define strchr __strchrnul +#define AS_STRCHRNUL +#include "strchr.S" + +weak_alias (__strchrnul, strchrnul) diff --git a/REORG.TODO/sysdeps/x86_64/strcmp.S b/REORG.TODO/sysdeps/x86_64/strcmp.S new file mode 100644 index 0000000000..076be04df5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcmp.S @@ -0,0 +1,2267 @@ +/* Highly optimized version for x86-64. 
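strchrnul.S, shown above, reuses the whole scan by defining AS_STRCHRNUL: the only difference from strchr is the final fixup that turns "pointer to the match or the NUL" into a NULL return when the character was absent. The relationship in C:

    #include <stdio.h>

    /* Like strchr, but returns a pointer to the terminating NUL
       instead of NULL when C does not occur.  */
    static char *my_strchrnul (const char *s, int c)
    {
      while (*s != (char) c && *s != '\0')
        s++;
      return (char *) s;
    }

    static char *my_strchr (const char *s, int c)
    {
      char *p = my_strchrnul (s, c);       /* shared scan */
      return *p == (char) c ? p : NULL;    /* the extra fixup */
    }

    int main (void)
    {
      const char *s = "abc";
      printf ("'%s' / '%s'\n", my_strchrnul (s, 'z'), my_strchr (s, 'b'));
      return 0;
    }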
+ Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Based on i686 version contributed by Ulrich Drepper + <drepper@cygnus.com>, 1999. + Updated with SSE2 support contributed by Intel Corporation. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#undef UPDATE_STRNCMP_COUNTER + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 +#else +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# endif +#endif + +#ifndef USE_SSSE3 + .text +#else + .section .text.ssse3,"ax",@progbits +#endif + +#ifdef USE_AS_STRCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END2 (__strcasecmp) +# ifndef NO_NOLOCALE_ALIAS +weak_alias (__strcasecmp, strcasecmp) +libc_hidden_def (__strcasecmp) +# endif + /* FALLTHROUGH to strcasecmp_l. */ +#elif defined USE_AS_STRNCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END2 (__strncasecmp) +# ifndef NO_NOLOCALE_ALIAS +weak_alias (__strncasecmp, strncasecmp) +libc_hidden_def (__strncasecmp) +# endif + /* FALLTHROUGH to strncasecmp_l. */ +#endif + +ENTRY (STRCMP) +#ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +#elif defined USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. 
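Both case-insensitive entry points test the locale's _NL_CTYPE_NONASCII_CASE flag, as above, and bail out to the C fallback when single-byte case conversion is not plain ASCII. On the fast path the vector code computes the equivalent of a byte-wise ASCII fold; a semantic sketch (the real fallback goes through the locale's tolower table rather than this hardcoded fold):

    #include <stdio.h>

    /* ASCII-only fold, matching the vector TOLOWER below: bytes in
       'A'..'Z' get 0x20 added, everything else passes through.  */
    static unsigned char fold (unsigned char c)
    {
      return (c >= 'A' && c <= 'Z') ? (unsigned char) (c + 0x20) : c;
    }

    static int ascii_strcasecmp (const char *a, const char *b)
    {
      const unsigned char *s1 = (const unsigned char *) a;
      const unsigned char *s2 = (const unsigned char *) b;
      while (fold (*s1) == fold (*s2) && *s1 != '\0')
        s1++, s2++;
      return fold (*s1) - fold (*s2);
    }

    int main (void)
    {
      printf ("%d %d\n",
              ascii_strcasecmp ("Hello", "hellO"),   /* 0 */
              ascii_strcasecmp ("abc", "abd"));      /* negative */
      return 0;
    }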
*/ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +#endif + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64-bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa .Lbelowupper(%rip), %xmm5 +# define UCLOW_reg %xmm5 + movdqa .Ltopupper(%rip), %xmm6 +# define UCHIGH_reg %xmm6 + movdqa .Ltouppermask(%rip), %xmm7 +# define LCQWORD_reg %xmm7 +#endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm8; \ + movdqa UCHIGH_reg, %xmm9; \ + movdqa reg2, %xmm10; \ + movdqa UCHIGH_reg, %xmm11; \ + pcmpgtb UCLOW_reg, %xmm8; \ + pcmpgtb reg1, %xmm9; \ + pcmpgtb UCLOW_reg, %xmm10; \ + pcmpgtb reg2, %xmm11; \ + pand %xmm9, %xmm8; \ + pand %xmm11, %xmm10; \ + pand LCQWORD_reg, %xmm8; \ + pand LCQWORD_reg, %xmm10; \ + por %xmm8, reg1; \ + por %xmm10, reg2 + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results */ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparison */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use.
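The TOLOWER macro above is a branchless 16-byte case fold: two signed pcmpgtb compares against the .Lbelowupper ('A' - 1 = 0x40) and .Ltopupper ('Z' + 1 = 0x5b) constants are ANDed into a mask, which gates OR-ing in the 0x20 case bit from .Ltouppermask. One register's worth of the same computation with intrinsics:

    #include <emmintrin.h>
    #include <stdio.h>

    static __m128i tolower_16 (__m128i v)
    {
      __m128i gt_low  = _mm_cmpgt_epi8 (v, _mm_set1_epi8 (0x40)); /* v > 'A'-1 */
      __m128i lt_high = _mm_cmpgt_epi8 (_mm_set1_epi8 (0x5b), v); /* v < 'Z'+1 */
      __m128i mask    = _mm_and_si128 (gt_low, lt_high);
      /* OR in the case bit only where both compares matched.  */
      return _mm_or_si128 (v, _mm_and_si128 (mask, _mm_set1_epi8 (0x20)));
    }

    int main (void)
    {
      char buf[17] = "MiXeD case 123!?";
      __m128i v = _mm_loadu_si128 ((const __m128i *) buf);
      _mm_storeu_si128 ((__m128i *) buf, tolower_16 (v));
      printf ("%.16s\n", buf);     /* mixed case 123!? */
      return 0;
    }

Since pcmpgtb is a signed compare, bytes at 0x80 and above test negative and pass through unchanged, which is fine for the ASCII-only fast path.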
+ */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ +#else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ +#endif + psubb %xmm0, %xmm1 /* packed sub of comparison results */ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx will equal r9d only if the remaining (16 - rcx) bytes compare + * equal and no null char was seen among them. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32 bytes per iteration. + */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + jmp LABEL(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars?
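The crosscache path above masks both pointers down to 16-byte alignment, swaps them if necessary so one in-block offset is the larger, and then jumps through unaligned_table into one of the ashr_N bodies, each hard-wired for a single relative byte shift. A C sketch of just the classification step (the chosen shift amount is what each ashr_N bakes into its psrldq/pslldq immediates):

    #include <stdint.h>
    #include <stdio.h>

    static void classify (const void *s1, const void *s2)
    {
      unsigned off1 = (uintptr_t) s1 & 15;   /* offset within 16B block */
      unsigned off2 = (uintptr_t) s2 & 15;
      if (off1 == off2)
        puts ("same offset: ashr_0, no byte shifting needed");
      else
        printf ("relative shift %u: merge each pair of blocks with a "
                "%u-byte shift\n", (off2 - off1) & 15, (off2 - off1) & 15);
    }

    int main (void)
    {
      char buf[64];
      classify (buf, buf + 16);   /* equal offsets mod 16 */
      classify (buf, buf + 3);    /* 3-byte relative shift */
      return 0;
    }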
*/ + pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + +#ifndef USE_SSSE3 + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + +#ifndef USE_SSSE3 + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz LABEL(ashr_1_exittail) /* find null char*/ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $15, %r11 + jbe LABEL(ashr_1_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* substract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) + + /* + * Once find null char, determine if there is a string mismatch + * before the null char. 
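Inside gobble_ashr_1, the misaligned 16-byte window is rebuilt from two aligned loads: psrldq/pslldq plus por on plain SSE2, or a single palignr on SSSE3. With intrinsics, merging the previous block and the current one at a one-byte shift looks like the sketch below; because the shift count is an instruction immediate, one copy of the loop is needed per shift, which is why this file carries more than a dozen nearly identical ashr_N blocks:

    #include <emmintrin.h>    /* SSE2 */
    #ifdef __SSSE3__
    # include <tmmintrin.h>   /* palignr */
    #endif
    #include <stdio.h>

    /* Bytes 1..15 of PREV followed by byte 0 of CURR: the 16-byte
       window starting one byte into PREV, i.e. the ashr_1 merge.  */
    static __m128i merge_ashr_1 (__m128i prev, __m128i curr)
    {
    #ifdef __SSSE3__
      return _mm_alignr_epi8 (curr, prev, 1);
    #else
      return _mm_or_si128 (_mm_srli_si128 (prev, 1),
                           _mm_slli_si128 (curr, 15));
    #endif
    }

    int main (void)
    {
      char a[32] = "0123456789abcdefGHIJKLMNOPQRSTU";
      __m128i prev = _mm_loadu_si128 ((const __m128i *) a);
      __m128i curr = _mm_loadu_si128 ((const __m128i *) (a + 16));
      char out[17] = { 0 };
      _mm_storeu_si128 ((__m128i *) out, merge_ashr_1 (prev, curr));
      puts (out);    /* 123456789abcdefG */
      return 0;
    }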
+ */ + .p2align 4 +LABEL(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz LABEL(ashr_2_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $14, %r11 + jbe LABEL(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_2) + + .p2align 4 +LABEL(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from 
less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_3): + add $16, %r10 + jg LABEL(nibble_ashr_3) + +LABEL(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_3) + + .p2align 4 +LABEL(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff8, %edx + jnz LABEL(ashr_3_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $13, %r11 + jbe LABEL(ashr_3_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_3) + + .p2align 4 +LABEL(ashr_3_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_4): + add $16, %r10 + jg LABEL(nibble_ashr_4) + +LABEL(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_4) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_4) + + .p2align 4 +LABEL(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff0, %edx + jnz LABEL(ashr_4_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $12, %r11 + jbe LABEL(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_4) + + .p2align 4 +LABEL(ashr_4_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_5): + add $16, %r10 + jg LABEL(nibble_ashr_5) + +LABEL(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_5) + + .p2align 4 +LABEL(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffe0, %edx + jnz LABEL(ashr_5_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $11, %r11 + jbe LABEL(ashr_5_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_5) + + .p2align 4 +LABEL(ashr_5_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_6): + add $16, %r10 + jg LABEL(nibble_ashr_6) + +LABEL(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_6) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_6) + + .p2align 4 +LABEL(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffc0, %edx + jnz LABEL(ashr_6_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $10, %r11 + jbe LABEL(ashr_6_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_6) + + .p2align 4 +LABEL(ashr_6_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_7): + add $16, %r10 + jg LABEL(nibble_ashr_7) + +LABEL(gobble_ashr_7): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_7) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_7) + + .p2align 4 +LABEL(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff80, %edx + jnz LABEL(ashr_7_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $9, %r11 + jbe LABEL(ashr_7_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_7) + + .p2align 4 +LABEL(ashr_7_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_8): + add $16, %r10 + jg LABEL(nibble_ashr_8) + +LABEL(gobble_ashr_8): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_8) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_8) + + .p2align 4 +LABEL(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff00, %edx + jnz LABEL(ashr_8_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, %r11 + jbe LABEL(ashr_8_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_8) + + .p2align 4 +LABEL(ashr_8_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_9): + add $16, %r10 + jg LABEL(nibble_ashr_9) + +LABEL(gobble_ashr_9): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_9) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 /* store for next cycle */ + jmp LABEL(loop_ashr_9) + + .p2align 4 +LABEL(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfe00, %edx + jnz LABEL(ashr_9_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, %r11 + jbe LABEL(ashr_9_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_9) + + .p2align 4 +LABEL(ashr_9_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_10): + add $16, %r10 + jg LABEL(nibble_ashr_10) + +LABEL(gobble_ashr_10): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_10) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_10) + + .p2align 4 +LABEL(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfc00, %edx + jnz LABEL(ashr_10_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, %r11 + jbe LABEL(ashr_10_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_10) + + .p2align 4 +LABEL(ashr_10_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_11): + add $16, %r10 + jg LABEL(nibble_ashr_11) + +LABEL(gobble_ashr_11): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_11) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_11) + + .p2align 4 +LABEL(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf800, %edx + jnz LABEL(ashr_11_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, %r11 + jbe LABEL(ashr_11_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_11) + + .p2align 4 +LABEL(ashr_11_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_12): + add $16, %r10 + jg LABEL(nibble_ashr_12) + +LABEL(gobble_ashr_12): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_12) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_12) + + .p2align 4 +LABEL(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf000, %edx + jnz LABEL(ashr_12_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, %r11 + jbe LABEL(ashr_12_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_12) + + .p2align 4 +LABEL(ashr_12_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
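+ *
+ * The nibble check only tests the bytes of the previous chunk that
+ * can still flow into the next merge; as a C sketch, with N the
+ * shift amount (13 in the block below):
+ *
+ *   if (nul_mask & (0xffff << N))   /* 0xe000 when N == 13 */
+ *     goto exittail;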
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_13): + add $16, %r10 + jg LABEL(nibble_ashr_13) + +LABEL(gobble_ashr_13): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_13) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_13) + + .p2align 4 +LABEL(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xe000, %edx + jnz LABEL(ashr_13_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, %r11 + jbe LABEL(ashr_13_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_13) + + .p2align 4 +LABEL(ashr_13_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_14):
+ add $16, %r10
+ jg LABEL(nibble_ashr_14)
+
+LABEL(gobble_ashr_14):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+#ifndef USE_SSSE3
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_14) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+#ifndef USE_SSSE3
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_14)
+
+ .p2align 4
+LABEL(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xc000, %edx
+ jnz LABEL(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, %r11
+ jbe LABEL(ashr_14_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_14)
+
+ .p2align 4
+LABEL(ashr_14_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $14, %xmm0
+ psrldq $14, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ */
+ .p2align 4
+LABEL(ashr_15):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
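+ *
+ * For the strncmp/strncasecmp_l builds, %r11 carries the remaining
+ * length budget; each 16-byte step amounts to this C sketch:
+ *
+ *   r11 -= 16;
+ *   if (r11 <= 0)
+ *     return 0;         /* length limit reached: treat as equal */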
+ */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_15): + add $16, %r10 + jg LABEL(nibble_ashr_15) + +LABEL(gobble_ashr_15): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_15) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_15) + + .p2align 4 +LABEL(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0x8000, %edx + jnz LABEL(ashr_15_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmpq $1, %r11 + jbe LABEL(ashr_15_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_15) + + .p2align 4 +LABEL(ashr_15_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $15, %xmm3 + psrldq $15, %xmm0 + + .p2align 4 +LABEL(aftertail): + TOLOWER (%xmm1, %xmm3) + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + not %edx + + .p2align 4 +LABEL(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 +LABEL(Byte0): + movzx (%rsi), %ecx + movzx (%rdi), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret +END (STRCMP) + + .section .rodata,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - 
LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+libc_hidden_builtin_def (STRCMP)
diff --git a/REORG.TODO/sysdeps/x86_64/strcpy.S b/REORG.TODO/sysdeps/x86_64/strcpy.S
new file mode 100644
index 0000000000..0351b0820d
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/strcpy.S
@@ -0,0 +1,156 @@
+/* strcpy/stpcpy implementation for x86-64.
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef USE_AS_STPCPY
+# define STRCPY strcpy
+#endif
+
+ .text
+ENTRY (STRCPY)
+ movq %rsi, %rcx /* Source register. */
+ andl $7, %ecx /* mask alignment bits */
+ movq %rdi, %rdx /* Duplicate destination pointer. */
+
+ jz 5f /* aligned => start loop */
+
+ neg %ecx /* We need to align to 8 bytes. */
+ addl $8,%ecx
+ /* Search the first bytes directly. */
+0:
+ movb (%rsi), %al /* Fetch a byte */
+ testb %al, %al /* Is it NUL? */
+ movb %al, (%rdx) /* Store it */
+ jz 4f /* If it was NUL, done! */
+ incq %rsi
+ incq %rdx
+ decl %ecx
+ jnz 0b
+
+5:
+ movq $0xfefefefefefefeff,%r8
+
+ /* Now the source is aligned. Unfortunately we cannot force
+ to have both source and destination aligned, so ignore the
+ alignment of the destination. */
+ .p2align 4
+1:
+ /* 1st unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word.
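+ (As a C sketch, the zero-byte test this builds up to is the
+ classic
+ uint64_t m = 0xfefefefefefefeffULL; /* -0x0101010101010101 */
+ int has_nul = ((((w + m) ^ w) | m) + 1) != 0;
+ because a 0x00 byte is the only value that breaks the carry
+ chain of w + m.)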
We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 1b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +3: + /* Note that stpcpy needs to return with the value of the NUL + byte. */ + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL?. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 3b /* and look at next two bytes in %rax. */ + +4: +#ifdef USE_AS_STPCPY + movq %rdx, %rax /* Destination is return value. */ +#else + movq %rdi, %rax /* Source is return value. */ +#endif + retq +END (STRCPY) +#ifndef USE_AS_STPCPY +libc_hidden_builtin_def (strcpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/strcspn.S b/REORG.TODO/sysdeps/x86_64/strcspn.S new file mode 100644 index 0000000000..a1d1f7dfba --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcspn.S @@ -0,0 +1,125 @@ +/* strcspn (str, ss) -- Return the length of the initial segment of STR + which contains no characters from SS. + For AMD x86-64. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>. + Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>. + Adopted for x86-64 by Andreas Jaeger <aj@suse.de>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */
+#define STRPBRK_P (defined strcspn)
+
+ .text
+ENTRY (strcspn)
+
+ movq %rdi, %rdx /* Save SRC. */
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. */
+ movq %rdi, %r8 /* Save value. */
+ subq $256, %rsp /* Make space for 256 bytes. */
+ cfi_adjust_cfa_offset(256)
+ movl $32, %ecx /* 32*8 bytes = 256 bytes. */
+ movq %rsp, %rdi
+ xorl %eax, %eax /* We store 0s. */
+ cld
+ rep
+ stosq
+
+ movq %rsi, %rax /* Setup skipset. */
+
+/* For understanding the following code remember that %rcx == 0 now.
+ Although all the following instructions only modify %cl we always
+ have a correct zero-extended 64-bit value in %rcx. */
+
+ .p2align 4
+L(2): movb (%rax), %cl /* get byte from skipset */
+ testb %cl, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+
+ movb 1(%rax), %cl /* get byte from skipset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+
+ movb 2(%rax), %cl /* get byte from skipset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+
+ movb 3(%rax), %cl /* get byte from skipset */
+ addq $4, %rax /* increment skipset pointer */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from skipset */
+
+L(1): leaq -4(%rdx), %rax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions
+ 1. a character in the skipset was found
+ and
+ 2. the end of the string was found
+ But as a sign that the character is in the skipset we store its
+ value in the table. But the value of NUL is NUL so the loop
+ terminates for NUL in every case. */
+
+ .p2align 4
+L(3): addq $4, %rax /* adjust pointer for full loop round */
+
+ movb (%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ je L(4) /* yes => return */
+
+ movb 1(%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ je L(5) /* yes => return */
+
+ movb 2(%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ jz L(6) /* yes => return */
+
+ movb 3(%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ jne L(3) /* no => start loop again */
+
+ incq %rax /* adjust pointer */
+L(6): incq %rax
+L(5): incq %rax
+
+L(4): addq $256, %rsp /* remove skipset */
+ cfi_adjust_cfa_offset(-256)
+#if STRPBRK_P
+ xorl %edx,%edx
+ orb %cl, %cl /* was last character NUL?
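+
+ (The whole function in C sketch form, with illustrative names:
+
+ unsigned char table[256] = { 0 };
+ for (const unsigned char *r = (const unsigned char *) ss; *r; ++r)
+ table[*r] = *r; /* table[c] == c iff c is in SS */
+ const unsigned char *p = (const unsigned char *) str;
+ while (table[*p] != *p) /* NUL stops too, since table[0] == 0 */
+ ++p;
+ return p - str; /* strcspn; strpbrk instead returns
+ *p ? (char *) p : NULL */
+
+ The NUL test here picks between those two strpbrk results.)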
*/ + cmovzq %rdx, %rax /* Yes: return NULL */ +#else + subq %rdx, %rax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ +#endif + ret +END (strcspn) +libc_hidden_builtin_def (strcspn) diff --git a/REORG.TODO/sysdeps/x86_64/strlen.S b/REORG.TODO/sysdeps/x86_64/strlen.S new file mode 100644 index 0000000000..b5ab117c79 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strlen.S @@ -0,0 +1,258 @@ +/* SSE2 version of strlen/wcslen. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + +/* Long lived register in strlen(s), strnlen(s, n) are: + + %xmm3 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + +.text +ENTRY(strlen) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +#define FIND_ZERO \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +#ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %rsi, %rsi + jne L(n_nonzero) + xor %rax, %rax + ret +L(n_nonzero): +# ifdef AS_WCSLEN + shlq $2, %rsi +# endif + +/* Initialize long lived registers. */ + + add %rdi, %rsi + mov %rsi, %r10 + and $-64, %r10 + mov %rsi, %r11 +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +#ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +#else +# define STRNLEN_PROLOG andq $-64, %rax; +#endif + +/* Ignore bits in mask that come before start of string. */ +#define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + SHIFT_RETURN; \ + ret + +#ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +#else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm4 + PCMPEQ %xmm0, %xmm4 + pmovmskb %xmm4, %edx + test %edx, %edx + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. 
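+ As a C sketch, FIND_ZERO combines the four 16-bit pmovmskb
+ results into one 64-bit mask:
+
+ mask = (uint64_t) mask0 /* bytes 0..15 */
+ | (uint64_t) mask16 << 16 /* bytes 16..31 */
+ | (uint64_t) mask32 << 32 /* bytes 32..47 */
+ | (uint64_t) mask48 << 48; /* bytes 48..63 */
+
+ so bsf (mask) is the index of the first NUL byte in the 64-byte
+ block.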
*/ + andq $-16, %rax + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +#endif + + /* When no zero byte is found xmm1-3 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +#ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax + SHIFT_RETURN + ret +#endif + .p2align 4 +L(loop_init): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 +#ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + + .p2align 4 +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. */ + pxor %xmm0, %xmm0 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + + .p2align 4 +L(exit): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + + .p2align 4 +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#endif + +END(strlen) +libc_hidden_builtin_def (strlen) diff --git a/REORG.TODO/sysdeps/x86_64/strncase.S b/REORG.TODO/sysdeps/x86_64/strncase.S new file mode 100644 index 0000000000..2de2ce4b96 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncase.S @@ -0,0 +1 @@ +/* In strncase_l.S. 
*/ diff --git a/REORG.TODO/sysdeps/x86_64/strncase_l-nonascii.c b/REORG.TODO/sysdeps/x86_64/strncase_l-nonascii.c new file mode 100644 index 0000000000..8664863778 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncase_l-nonascii.c @@ -0,0 +1,8 @@ +#include <string.h> + +extern int __strncasecmp_l_nonascii (const char *__s1, const char *__s2, + size_t __n, __locale_t __loc); + +#define __strncasecmp_l __strncasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strncase.c> diff --git a/REORG.TODO/sysdeps/x86_64/strncase_l.S b/REORG.TODO/sysdeps/x86_64/strncase_l.S new file mode 100644 index 0000000000..c725cd85b3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncase_l.S @@ -0,0 +1,6 @@ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) +libc_hidden_def (strncasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/strncmp.S b/REORG.TODO/sysdeps/x86_64/strncmp.S new file mode 100644 index 0000000000..0af34e7f15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncmp.S @@ -0,0 +1,3 @@ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/strnlen.S b/REORG.TODO/sysdeps/x86_64/strnlen.S new file mode 100644 index 0000000000..d3c43ac482 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strnlen.S @@ -0,0 +1,6 @@ +#define AS_STRNLEN +#define strlen __strnlen +#include "strlen.S" + +weak_alias (__strnlen, strnlen); +libc_hidden_builtin_def (strnlen) diff --git a/REORG.TODO/sysdeps/x86_64/strpbrk.S b/REORG.TODO/sysdeps/x86_64/strpbrk.S new file mode 100644 index 0000000000..9b97ada84e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strpbrk.S @@ -0,0 +1,2 @@ +#define strcspn strpbrk +#include <sysdeps/x86_64/strcspn.S> diff --git a/REORG.TODO/sysdeps/x86_64/strrchr.S b/REORG.TODO/sysdeps/x86_64/strrchr.S new file mode 100644 index 0000000000..e6a33bc599 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strrchr.S @@ -0,0 +1,228 @@ +/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + + +#include <sysdep.h> + + .text +ENTRY (strrchr) + movd %esi, %xmm1 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4032, %rax + punpcklwd %xmm1, %xmm1 + pshufd $0, %xmm1, %xmm1 + ja L(cross_page) + movdqu (%rdi), %xmm0 + pxor %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm0, %ecx + pmovmskb %xmm3, %edx + testq %rdx, %rdx + je L(next_48_bytes) + leaq -1(%rdx), %rax + xorq %rdx, %rax + andq %rcx, %rax + je L(exit) + bsrq %rax, %rax + addq %rdi, %rax + ret + + .p2align 4 +L(next_48_bytes): + movdqu 16(%rdi), %xmm4 + movdqa %xmm4, %xmm5 + movdqu 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm5 + movdqu 48(%rdi), %xmm0 + pmovmskb %xmm5, %edx + movdqa %xmm3, %xmm5 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm0, %xmm2 + salq $16, %rdx + pmovmskb %xmm3, %r8d + pmovmskb %xmm5, %eax + pmovmskb %xmm2, %esi + salq $32, %r8 + salq $32, %rax + pcmpeqb %xmm1, %xmm0 + orq %rdx, %rax + movq %rsi, %rdx + pmovmskb %xmm4, %esi + salq $48, %rdx + salq $16, %rsi + orq %r8, %rsi + orq %rcx, %rsi + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rsi + orq %rdx, %rax + je L(loop_header2) + leaq -1(%rax), %rcx + xorq %rax, %rcx + andq %rcx, %rsi + je L(exit) + bsrq %rsi, %rsi + leaq (%rdi,%rsi), %rax + ret + + .p2align 4 +L(loop_header2): + testq %rsi, %rsi + movq %rdi, %rcx + je L(no_c_found) +L(loop_header): + addq $64, %rdi + pxor %xmm7, %xmm7 + andq $-64, %rdi + jmp L(loop_entry) + + .p2align 4 +L(loop64): + testq %rdx, %rdx + cmovne %rdx, %rsi + cmovne %rdi, %rcx + addq $64, %rdi +L(loop_entry): + movdqa 32(%rdi), %xmm3 + pxor %xmm6, %xmm6 + movdqa 48(%rdi), %xmm2 + movdqa %xmm3, %xmm0 + movdqa 16(%rdi), %xmm4 + pminub %xmm2, %xmm0 + movdqa (%rdi), %xmm5 + pminub %xmm4, %xmm0 + pminub %xmm5, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + movdqa %xmm5, %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r9d + movdqa %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + movdqa %xmm3, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $16, %rdx + pmovmskb %xmm0, %r10d + movdqa %xmm2, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %r10 + orq %r10, %rdx + pmovmskb %xmm0, %r8d + orq %r9, %rdx + salq $48, %r8 + orq %r8, %rdx + testl %eax, %eax + je L(loop64) + pcmpeqb %xmm6, %xmm4 + pcmpeqb %xmm6, %xmm3 + pcmpeqb %xmm6, %xmm5 + pmovmskb %xmm4, %eax + pmovmskb %xmm3, %r10d + pcmpeqb %xmm6, %xmm2 + pmovmskb %xmm5, %r9d + salq $32, %r10 + salq $16, %rax + pmovmskb %xmm2, %r8d + orq %r10, %rax + orq %r9, %rax + salq $48, %r8 + orq %r8, %rax + leaq -1(%rax), %r8 + xorq %rax, %r8 + andq %r8, %rdx + cmovne %rdi, %rcx + cmovne %rdx, %rsi + bsrq %rsi, %rsi + leaq (%rcx,%rsi), %rax + ret + + .p2align 4 +L(no_c_found): + movl $1, %esi + xorl %ecx, %ecx + jmp L(loop_header) + + .p2align 4 +L(exit): + xorl %eax, %eax + ret + + .p2align 4 +L(cross_page): + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqu (%rax), %xmm5 + movdqa %xmm5, %xmm6 + movdqu 16(%rax), %xmm4 + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm0, %xmm6 + movdqu 32(%rax), %xmm3 + pmovmskb %xmm6, %esi + movdqa %xmm4, %xmm6 + movdqu 48(%rax), %xmm2 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm6 + pmovmskb %xmm6, %edx + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm0, %xmm6 + pcmpeqb %xmm2, %xmm0 + salq $16, %rdx + pmovmskb %xmm3, %r9d + pmovmskb %xmm6, %r8d + pmovmskb %xmm0, %ecx + salq $32, %r9 + salq $32, %r8 + pcmpeqb %xmm1, %xmm2 + orq %r8, %rdx + salq $48, %rcx + pmovmskb %xmm5, %r8d + orq %rsi, %rdx + pmovmskb %xmm4, %esi + orq %rcx, %rdx + pmovmskb 
%xmm2, %ecx
+ salq $16, %rsi
+ salq $48, %rcx
+ orq %r9, %rsi
+ orq %r8, %rsi
+ orq %rcx, %rsi
+ movl %edi, %ecx
+ subl %eax, %ecx
+ shrq %cl, %rdx
+ shrq %cl, %rsi
+ testq %rdx, %rdx
+ je L(loop_header2)
+ leaq -1(%rdx), %rax
+ xorq %rdx, %rax
+ andq %rax, %rsi
+ je L(exit)
+ bsrq %rsi, %rax
+ addq %rdi, %rax
+ ret
+END (strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/REORG.TODO/sysdeps/x86_64/strspn.S b/REORG.TODO/sysdeps/x86_64/strspn.S
new file mode 100644
index 0000000000..3da576f3d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/strspn.S
@@ -0,0 +1,115 @@
+/* strspn (str, ss) -- Return the length of the initial segment of STR
+ which contains only characters from SS.
+ For AMD x86-64.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
+ Adopted for x86-64 by Andreas Jaeger <aj@suse.de>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (strspn)
+
+ movq %rdi, %rdx /* Save SRC. */
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. */
+ movq %rdi, %r8 /* Save value. */
+ subq $256, %rsp /* Make space for 256 bytes. */
+ cfi_adjust_cfa_offset(256)
+ movl $32, %ecx /* 32*8 bytes = 256 bytes. */
+ movq %rsp, %rdi
+ xorl %eax, %eax /* We store 0s. */
+ cld
+ rep
+ stosq
+
+ movq %rsi, %rax /* Setup stopset. */
+
+/* For understanding the following code remember that %rcx == 0 now.
+ Although all the following instructions only modify %cl we always
+ have a correct zero-extended 64-bit value in %rcx. */
+
+ .p2align 4
+L(2): movb (%rax), %cl /* get byte from stopset */
+ testb %cl, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+
+ movb 1(%rax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+
+ movb 2(%rax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+
+ movb 3(%rax), %cl /* get byte from stopset */
+ addq $4, %rax /* increment stopset pointer */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from stopset */
+
+L(1): leaq -4(%rdx), %rax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions
+ 1. a character in the stopset was found
+ and
+ 2. the end of the string was found
+ But as a sign that the character is in the stopset we store its
+ value in the table. But the value of NUL is NUL so the loop
+ terminates for NUL in every case. */
+
+ .p2align 4
+L(3): addq $4, %rax /* adjust pointer for full loop round */
+
+ movb (%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jz L(4) /* no => return */
+
+ movb 1(%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jz L(5) /* no => return */
+
+ movb 2(%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jz L(6) /* no => return */
+
+ movb 3(%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jnz L(3) /* yes => start loop again */
+
+ incq %rax /* adjust pointer */
+L(6): incq %rax
+L(5): incq %rax
+
+L(4): addq $256, %rsp /* remove stopset */
+ cfi_adjust_cfa_offset(-256)
+ subq %rdx, %rax /* we have to return the number of valid
+ characters, so compute distance to first
+ non-valid character */
+ ret
+END (strspn)
+libc_hidden_builtin_def (strspn)
diff --git a/REORG.TODO/sysdeps/x86_64/sub_n.S b/REORG.TODO/sysdeps/x86_64/sub_n.S
new file mode 100644
index 0000000000..44c0d88c58
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/sub_n.S
@@ -0,0 +1,23 @@
+/* x86-64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+ and store difference in a third limb vector.
+ Copyright (C) 2006-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#define func __mpn_sub_n
+#define ADCSBB sbb
+
+#include "add_n.S"
diff --git a/REORG.TODO/sysdeps/x86_64/submul_1.S b/REORG.TODO/sysdeps/x86_64/submul_1.S
new file mode 100644
index 0000000000..77f772cb0b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/submul_1.S
@@ -0,0 +1,23 @@
+/* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+ Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB.
If not, + see <http://www.gnu.org/licenses/>. */ + +#define func __mpn_submul_1 +#define ADDSUB sub + +#include "addmul_1.S" diff --git a/REORG.TODO/sysdeps/x86_64/sysdep.h b/REORG.TODO/sysdeps/x86_64/sysdep.h new file mode 100644 index 0000000000..1c52544fa3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/sysdep.h @@ -0,0 +1,169 @@ +/* Assembler macros for x86-64. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86_64_SYSDEP_H +#define _X86_64_SYSDEP_H 1 + +#include <sysdeps/generic/sysdep.h> + +#ifdef __ASSEMBLER__ + +/* Syntactic details of assembler. */ + +/* This macro is for setting proper CFI with DW_CFA_expression describing + the register as saved relative to %rsp instead of relative to the CFA. + Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset + from %rsp. */ +#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ + 0x77, off & 0x7F | 0x80, off >> 7 + +/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */ +#define ALIGNARG(log2) 1<<log2 +#define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + +/* Define an entry point visible from C. */ +#define ENTRY(name) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(4); \ + C_LABEL(name) \ + cfi_startproc; \ + CALL_MCOUNT + +#undef END +#define END(name) \ + cfi_endproc; \ + ASM_SIZE_DIRECTIVE(name) + +#define ENTRY_CHK(name) ENTRY (name) +#define END_CHK(name) END (name) + +/* If compiled for profiling, call `mcount' at the start of each function. */ +#ifdef PROF +/* The mcount code relies on a normal frame pointer being on the stack + to locate our caller, so push one just for its benefit. */ +#define CALL_MCOUNT \ + pushq %rbp; \ + cfi_adjust_cfa_offset(8); \ + movq %rsp, %rbp; \ + cfi_def_cfa_register(%rbp); \ + call JUMPTARGET(mcount); \ + popq %rbp; \ + cfi_def_cfa(rsp,8); +#else +#define CALL_MCOUNT /* Do nothing. */ +#endif + +/* Since C identifiers are not normally prefixed with an underscore + on this system, the asm identifier `syscall_error' intrudes on the + C name space. Make sure we use an innocuous name. */ +#define syscall_error __syscall_error +#define mcount _mcount + +#define PSEUDO(name, syscall_name, args) \ +lose: \ + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ + jb lose + +#undef PSEUDO_END +#define PSEUDO_END(name) \ + END (name) + +#undef JUMPTARGET +#ifdef SHARED +# ifdef BIND_NOW +# define JUMPTARGET(name) *name##@GOTPCREL(%rip) +# else +# define JUMPTARGET(name) name##@PLT +# endif +#else +/* For static archives, branch to target directly. */ +# define JUMPTARGET(name) name +#endif + +/* Local label name for asm code. */ +#ifndef L +/* ELF-like local names start with `.L'. 
*/ +# define L(name) .L##name +#endif + +#define atom_text_section .section ".text.atom", "ax" + +/* Long and pointer size in bytes. */ +#define LP_SIZE 8 + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) insn##q + +/* Assembler address directive. */ +#define ASM_ADDR .quad + +/* Registers to hold long and pointer. */ +#define RAX_LP rax +#define RBP_LP rbp +#define RBX_LP rbx +#define RCX_LP rcx +#define RDI_LP rdi +#define RDX_LP rdx +#define RSI_LP rsi +#define RSP_LP rsp +#define R8_LP r8 +#define R9_LP r9 +#define R10_LP r10 +#define R11_LP r11 +#define R12_LP r12 +#define R13_LP r13 +#define R14_LP r14 +#define R15_LP r15 + +#else /* __ASSEMBLER__ */ + +/* Long and pointer size in bytes. */ +#define LP_SIZE "8" + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) #insn "q" + +/* Assembler address directive. */ +#define ASM_ADDR ".quad" + +/* Registers to hold long and pointer. */ +#define RAX_LP "rax" +#define RBP_LP "rbp" +#define RBX_LP "rbx" +#define RCX_LP "rcx" +#define RDI_LP "rdi" +#define RDX_LP "rdx" +#define RSI_LP "rsi" +#define RSP_LP "rsp" +#define R8_LP "r8" +#define R9_LP "r9" +#define R10_LP "r10" +#define R11_LP "r11" +#define R12_LP "r12" +#define R13_LP "r13" +#define R14_LP "r14" +#define R15_LP "r15" + +#endif /* __ASSEMBLER__ */ + +#endif /* _X86_64_SYSDEP_H */ diff --git a/REORG.TODO/sysdeps/x86_64/tls-macros.h b/REORG.TODO/sysdeps/x86_64/tls-macros.h new file mode 100644 index 0000000000..22d2a4b592 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tls-macros.h @@ -0,0 +1,39 @@ +#define TLS_LE(x) \ + ({ int *__l; \ + asm ("mov %%fs:0,%0\n\t" \ + "lea " #x "@tpoff(%0), %0" \ + : "=r" (__l)); \ + __l; }) + +#define TLS_IE(x) \ + ({ int *__l; \ + asm ("mov %%fs:0,%0\n\t" \ + "add " #x "@gottpoff(%%rip),%0" \ + : "=r" (__l)); \ + __l; }) + +#define TLS_LD(x) \ + ({ int *__l, __c, __d; \ + asm ("leaq " #x "@tlsld(%%rip),%%rdi\n\t" \ + "call __tls_get_addr@plt\n\t" \ + "leaq " #x "@dtpoff(%%rax), %%rax" \ + : "=a" (__l), "=&c" (__c), "=&d" (__d) \ + : : "rdi", "rsi", "r8", "r9", "r10", "r11"); \ + __l; }) + +#ifdef __ILP32__ +# define TLS_GD_PREFIX +#else +# define TLS_GD_PREFIX ".byte 0x66\n\t" +#endif + +#define TLS_GD(x) \ + ({ int *__l, __c, __d; \ + asm (TLS_GD_PREFIX \ + "leaq " #x "@tlsgd(%%rip),%%rdi\n\t" \ + ".word 0x6666\n\t" \ + "rex64\n\t" \ + "call __tls_get_addr@plt" \ + : "=a" (__l), "=&c" (__c), "=&d" (__d) \ + : : "rdi", "rsi", "r8", "r9", "r10", "r11"); \ + __l; }) diff --git a/REORG.TODO/sysdeps/x86_64/tlsdesc.c b/REORG.TODO/sysdeps/x86_64/tlsdesc.c new file mode 100644 index 0000000000..20d821ac66 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tlsdesc.c @@ -0,0 +1,150 @@ +/* Manage TLS descriptors. x86_64 version. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <link.h> +#include <ldsodefs.h> +#include <elf/dynamic-link.h> +#include <tls.h> +#include <dl-tlsdesc.h> +#include <dl-unmap-segments.h> +#include <tlsdeschtab.h> + +/* The following 2 functions take a caller argument, that contains the + address expected to be in the TLS descriptor. If it's changed, we + want to return immediately. */ + +/* This function is used to lazily resolve TLS_DESC RELA relocations. + The argument location is used to hold a pointer to the relocation. */ + +void +attribute_hidden +_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td, + struct link_map *l) +{ + const ElfW(Rela) *reloc = td->arg; + + if (_dl_tlsdesc_resolve_early_return_p + (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr))) + return; + + /* The code below was borrowed from _dl_fixup(). */ + const ElfW(Sym) *const symtab + = (const void *) D_PTR (l, l_info[DT_SYMTAB]); + const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]); + const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)]; + lookup_t result; + + /* Look up the target symbol. If the normal lookup rules are not + used don't look in the global scope. */ + if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL + && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0) + { + const struct r_found_version *version = NULL; + + if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW(Half) *vernum = + (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]); + ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff; + version = &l->l_versions[ndx]; + if (version->hash == 0) + version = NULL; + } + + result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, + l->l_scope, version, ELF_RTYPE_CLASS_PLT, + DL_LOOKUP_ADD_DEPENDENCY, NULL); + } + else + { + /* We already found the symbol. The module (and therefore its load + address) is also known. */ + result = l; + } + + if (! sym) + { + td->arg = (void*)reloc->r_addend; + td->entry = _dl_tlsdesc_undefweak; + } + else + { +# ifndef SHARED + CHECK_STATIC_TLS (l, result); +# else + if (!TRY_STATIC_TLS (l, result)) + { + td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value + + reloc->r_addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif + { + td->arg = (void*)(sym->st_value - result->l_tls_offset + + reloc->r_addend); + td->entry = _dl_tlsdesc_return; + } + } + + _dl_tlsdesc_wake_up_held_fixups (); +} + +/* This function is used to avoid busy waiting for other threads to + complete the lazy relocation. Once another thread wins the race to + relocate a TLS descriptor, it sets the descriptor up such that this + function is called to wait until the resolver releases the + lock. */ + +void +attribute_hidden +_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td, + void *caller) +{ + /* Maybe we're lucky and can return early. */ + if (caller != td->entry) + return; + + /* Locking here will stop execution until the running resolver runs + _dl_tlsdesc_wake_up_held_fixups(), releasing the lock. + + FIXME: We'd be better off waiting on a condition variable, such + that we didn't have to hold the lock throughout the relocation + processing. */ + __rtld_lock_lock_recursive (GL(dl_load_lock)); + __rtld_lock_unlock_recursive (GL(dl_load_lock)); +} + +/* Unmap the dynamic object, but also release its TLS descriptor table + if there is one. 
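+
+ (For reference, a descriptor is the pair the resolvers above fill
+ in -- as a C sketch:
+
+ struct tlsdesc { ptrdiff_t (*entry) (struct tlsdesc *); void *arg; };
+
+ and a TLSDESC-style access compiles to roughly
+
+ p = (char *) __builtin_thread_pointer () + td->entry (td);
+
+ where ENTRY initially points at the lazy trampoline that ends up in
+ _dl_tlsdesc_resolve_rela_fixup and is afterwards patched to
+ _dl_tlsdesc_return or _dl_tlsdesc_dynamic.)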
*/ + +void +internal_function +_dl_unmap (struct link_map *map) +{ + _dl_unmap_segments (map); + +#ifdef SHARED + /* _dl_unmap is only called for dlopen()ed libraries, for which + calling free() is safe, or before we've completed the initial + relocation, in which case calling free() is probably pointless, + but still safe. */ + if (map->l_mach.tlsdesc_table) + htab_delete (map->l_mach.tlsdesc_table); +#endif +} diff --git a/REORG.TODO/sysdeps/x86_64/tlsdesc.sym b/REORG.TODO/sysdeps/x86_64/tlsdesc.sym new file mode 100644 index 0000000000..33854975d0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tlsdesc.sym @@ -0,0 +1,17 @@ +#include <stddef.h> +#include <sysdep.h> +#include <tls.h> +#include <link.h> +#include <dl-tlsdesc.h> + +-- + +-- Abuse tls.h macros to derive offsets relative to the thread register. + +DTV_OFFSET offsetof(struct pthread, header.dtv) + +TLSDESC_ARG offsetof(struct tlsdesc, arg) + +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit.h b/REORG.TODO/sysdeps/x86_64/tst-audit.h new file mode 100644 index 0000000000..e3c780e42c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit.h @@ -0,0 +1,32 @@ +/* Definitions for testing PLT entry/exit auditing. x86_64 version. + + Copyright (C) 2012-2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef __ILP32__ +# define pltenter la_x86_64_gnu_pltenter +# define pltexit la_x86_64_gnu_pltexit +# define La_regs La_x86_64_regs +# define La_retval La_x86_64_retval +#else +# define pltenter la_x32_gnu_pltenter +# define pltexit la_x32_gnu_pltexit +# define La_regs La_x32_regs +# define La_retval La_x32_retval +#endif +#define int_retval lrv_rax diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit10-aux.c b/REORG.TODO/sysdeps/x86_64/tst-audit10-aux.c new file mode 100644 index 0000000000..4663136419 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit10-aux.c @@ -0,0 +1,41 @@ +/* Test case for preserved AVX512 registers in dynamic linker, -mavx512f part. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_audit10_aux (void) +{ +#ifdef __AVX512F__ + extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i, + __m512i, __m512i, __m512i, __m512i); + + __m512i zmm = _mm512_setzero_si512 (); + __m512i ret = audit_test (zmm, zmm, zmm, zmm, zmm, zmm, zmm, zmm); + + zmm = _mm512_set1_epi64 (0x12349876); + + if (memcmp (&zmm, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX512F__ */ + return 77; +#endif /* __AVX512F__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit10.c b/REORG.TODO/sysdeps/x86_64/tst-audit10.c new file mode 100644 index 0000000000..bda248ac7e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit10.c @@ -0,0 +1,57 @@ +/* Test case for preserved AVX512 registers in dynamic linker. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_audit10_aux (void); + +static int +avx512_enabled (void) +{ +#ifdef bit_AVX512F + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX512F)) + return 0; + + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + /* Verify that ZMM, YMM and XMM states are enabled. */ + return (eax & 0xe6) == 0xe6; +#else + return 0; +#endif +} + +static int +do_test (void) +{ + /* Run AVX512 test only if AVX512 is supported. */ + if (avx512_enabled ()) + return tst_audit10_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit3.c b/REORG.TODO/sysdeps/x86_64/tst-audit3.c new file mode 100644 index 0000000000..0602aa25db --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit3.c @@ -0,0 +1,23 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> + +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); +static int +do_test (void) +{ + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit4-aux.c b/REORG.TODO/sysdeps/x86_64/tst-audit4-aux.c new file mode 100644 index 0000000000..c78c51c747 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit4-aux.c @@ -0,0 +1,39 @@ +/* Test case for preserved AVX registers in dynamic linker, -mavx part. 
+ Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i, + __m256i, __m256i, __m256i, __m256i); + +int +tst_audit4_aux (void) +{ +#ifdef __AVX__ + __m256i ymm = _mm256_setzero_si256 (); + __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm); + ymm = _mm256_set1_epi32 (0x12349876); + if (memcmp (&ymm, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX__ */ + return 77; +#endif /* __AVX__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit4.c b/REORG.TODO/sysdeps/x86_64/tst-audit4.c new file mode 100644 index 0000000000..8178f2c6d2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit4.c @@ -0,0 +1,49 @@ +/* Test case for preserved AVX registers in dynamic linker. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_audit4_aux (void); + +static int +avx_enabled (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + /* Check the OS has AVX and SSE saving enabled. */ + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + return (eax & 6) == 6; +} + +static int +do_test (void) +{ + /* Run AVX test only if AVX is supported. */ + if (avx_enabled ()) + return tst_audit4_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit5.c b/REORG.TODO/sysdeps/x86_64/tst-audit5.c new file mode 100644 index 0000000000..225b4c86ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit5.c @@ -0,0 +1,24 @@ +/* Test case for x86-64 preserved registers in dynamic linker. 
*/ + +#include <stdlib.h> +#include <string.h> + +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); +static int +do_test (void) +{ + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + xmm = _mm_set1_epi32 (0x12349876); + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit6.c b/REORG.TODO/sysdeps/x86_64/tst-audit6.c new file mode 100644 index 0000000000..f2f6a487c4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit6.c @@ -0,0 +1,45 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <cpuid.h> +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); + + +static int +avx_enabled (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + /* Check the OS has AVX and SSE saving enabled. */ + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + return (eax & 6) == 6; +} + + +static int +do_test (void) +{ + /* Run AVX test only if AVX is supported. */ + if (avx_enabled ()) + { + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + xmm = _mm_set1_epi32 (0x98abcdef); + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + } + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit7.c b/REORG.TODO/sysdeps/x86_64/tst-audit7.c new file mode 100644 index 0000000000..1d2a7de439 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit7.c @@ -0,0 +1 @@ +#include "tst-audit6.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod10a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod10a.c new file mode 100644 index 0000000000..41c77e98a5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod10a.c @@ -0,0 +1,65 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Test case for x86-64 preserved registers in dynamic linker. 
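+
+   audit_test expects each incoming zmm argument to carry the pattern
+   the auditor installed: the pltenter hook in tst-auditmod10b
+   replaces the caller's all-zero arguments with
+   _mm512_set1_epi64 (1) through (8), so any mismatch below means a
+   register was damaged on its way through the PLT trampoline.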
*/ + +#ifdef __AVX512F__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m512i +audit_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3, + __m512i x4, __m512i x5, __m512i x6, __m512i x7) +{ + __m512i zmm; + + zmm = _mm512_set1_epi64 (1); + if (memcmp (&zmm, &x0, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (2); + if (memcmp (&zmm, &x1, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (3); + if (memcmp (&zmm, &x2, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (4); + if (memcmp (&zmm, &x3, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (5); + if (memcmp (&zmm, &x4, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (6); + if (memcmp (&zmm, &x5, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (7); + if (memcmp (&zmm, &x6, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (8); + if (memcmp (&zmm, &x7, sizeof (zmm))) + abort (); + + return _mm512_setzero_si512 (); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod10b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod10b.c new file mode 100644 index 0000000000..5b9a985ca2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod10b.c @@ -0,0 +1,231 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Verify that changing AVX512 registers in audit library won't affect + function parameter passing/return. 
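+
+   End-to-end flow: tst-audit10-aux passes eight zeroed zmm
+   registers; pltenter below checks them, rewrites them to 1..8 and
+   clobbers zmm0-zmm7; audit_test in tst-auditmod10a verifies 1..8
+   and returns zero; pltexit verifies the zero return value, rewrites
+   it to 0x12349876 and clobbers registers again; the caller finally
+   checks that 0x12349876 arrived intact.  Storing 1024 into
+   *framesizep in pltenter is what requests that ld.so copy the
+   argument frame and invoke pltexit at all.  The test is run roughly
+   as
+
+     LD_AUDIT=.../tst-auditmod10b.so ./tst-audit10
+
+   (a sketch; the Makefile sets the real path via an -ENV variable).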
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX512F__ +#include <immintrin.h> +#include <cpuid.h> + +static int +check_avx512 (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX512F)) + return 0; + + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + /* Verify that ZMM, YMM and XMM states are enabled. 
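+
+     The mask 0xe6 (binary 11100110) selects the XCR0 bits for SSE
+     (bit 1), AVX (bit 2), AVX-512 opmask (bit 5), the upper halves
+     of ZMM0-ZMM15 (bit 6) and ZMM16-ZMM31 (bit 7); the OS must have
+     enabled all of them before AVX-512 state can be used.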
*/ + return (eax & 0xe6) == 0xe6; +} + +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX512F__ + if (check_avx512 () && strcmp (symname, "audit_test") == 0) + { + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (®s->lr_vector[0], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[1], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[2], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[3], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[4], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[5], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[6], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_vector[i].zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); + asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" ); + asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" ); + asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" ); + asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" ); + asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" ); + asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX512F__ + if (check_avx512 () && strcmp (symname, "audit_test") == 0) + { + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + __m512i zmm = _mm512_set1_epi64 (i + 1); + if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0) + abort (); + } + + outregs->lrv_vector0.zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod3a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod3a.c new file mode 100644 index 0000000000..9514aba505 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod3a.c @@ -0,0 +1,24 @@ +/* Test case for x86-64 preserved registers in dynamic linker. 
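+
+   All eight xmm arguments must still hold the zeros the caller put
+   in them even though the audit module tst-auditmod3b deliberately
+   fills xmm0-xmm7 with -1 from both its pltenter and pltexit hooks:
+   the dynamic linker trampoline has to save and restore the
+   argument registers around the audit callbacks.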
*/ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm = _mm_setzero_si128 (); + + if (memcmp (&xmm, &x0, sizeof (xmm)) + || memcmp (&xmm, &x1, sizeof (xmm)) + || memcmp (&xmm, &x2, sizeof (xmm)) + || memcmp (&xmm, &x3, sizeof (xmm)) + || memcmp (&xmm, &x4, sizeof (xmm)) + || memcmp (&xmm, &x5, sizeof (xmm)) + || memcmp (&xmm, &x6, sizeof (xmm)) + || memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return xmm; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod3b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod3b.c new file mode 100644 index 0000000000..7aad92382e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod3b.c @@ -0,0 +1,153 @@ +/* Verify that changing xmm registers in audit library won't affect + function parameter passing/return. */ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> +#include <emmintrin.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i 
xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + + __m128i xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod4a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod4a.c new file mode 100644 index 0000000000..c9c24c04a8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod4a.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#ifdef __AVX__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m256i +audit_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3, + __m256i x4, __m256i x5, __m256i x6, __m256i x7) +{ + __m256i ymm; + + ymm = _mm256_set1_epi32 (1); + if (memcmp (&ymm, &x0, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (2); + if (memcmp (&ymm, &x1, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (3); + if (memcmp (&ymm, &x2, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (4); + if (memcmp (&ymm, &x3, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (5); + if (memcmp (&ymm, &x4, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (6); + if (memcmp (&ymm, &x5, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (7); + if (memcmp (&ymm, &x6, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (8); + if (memcmp (&ymm, &x7, sizeof (ymm))) + abort (); + + return _mm256_setzero_si256 (); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod4b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod4b.c new file mode 100644 index 0000000000..1153ea442c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod4b.c @@ -0,0 +1,213 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
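+
+   This is the AVX analogue of the AVX-512 module: zeros go in,
+   pltenter rewrites the eight ymm arguments to 1..8 through the
+   lr_vector view, audit_test in tst-auditmod4a checks them, and
+   pltexit turns the zero return value into 0x12349876.  check_avx
+   caches its CPUID probe in a static variable so the hooks take the
+   AVX paths only on capable hardware.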
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (®s->lr_vector[0], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[1], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[2], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[3], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[4], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[5], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[6], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int 
i = 0; i < 8; i++) + regs->lr_vector[i].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 1); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + __m256i ymm = _mm256_set1_epi32 (i + 1); + if (memcmp (&inregs->lr_vector[i], &ymm, sizeof (ymm)) != 0) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod5a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod5a.c new file mode 100644 index 0000000000..8511a70747 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod5a.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (1); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (2); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (3); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (4); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (5); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (6); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (7); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (8); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_setzero_si128 (); +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod5b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod5b.c new file mode 100644 index 0000000000..6a280fd61b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod5b.c @@ -0,0 +1,185 @@ +/* Verify that changing xmm registers in audit library won't affect + function parameter passing/return. 
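+
+   This is the SSE-only counterpart of the AVX modules: arguments
+   and return value travel through La_regs.lr_xmm and
+   La_retval.lrv_xmm0 rather than the lr_vector view, and no CPUID
+   gate is needed because x86-64 guarantees SSE2.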
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> +#include <emmintrin.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i minusone = _mm_set1_epi32 (-1); + + if (strcmp (symname, "audit_test") == 0) + { + __m128i zero = _mm_setzero_si128 (); + if (memcmp (®s->lr_xmm[0], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[1], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[2], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[3], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[4], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[5], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[6], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1); + + *framesizep = 1024; + } + + asm volatile ("movdqa %0, %%xmm0" : : "x" (minusone) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (minusone) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (minusone) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (minusone) 
: "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (minusone) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (minusone) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (minusone) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (minusone) : "xmm7" ); + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + + __m128i xmm; + + if (strcmp (symname, "audit_test") == 0) + { + __m128i zero = _mm_setzero_si128 (); + if (memcmp (&outregs->lrv_xmm0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + xmm = _mm_set1_epi32 (i + 1); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) != 0) + abort (); + } + + outregs->lrv_xmm0 = (La_x86_64_xmm) _mm_set1_epi32 (0x12349876); + } + + xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod6a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod6a.c new file mode 100644 index 0000000000..c3a850ce98 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod6a.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (0x100); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x101); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x102); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x103); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x104); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x105); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x106); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x107); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_setzero_si128 (); +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod6b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod6b.c new file mode 100644 index 0000000000..3533602c07 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod6b.c @@ -0,0 +1,227 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + for (i = 0; i < 8; i++) + if (memcmp (®s->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (®s->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + for (i = 0; i < 8; i += 2) + { + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1); + regs->lr_vector[i].xmm[0] = regs->lr_xmm[i]; + regs->lr_vector[i + 1].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 2); + regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0]; + } + + __m256i ymm = 
_mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm)) + || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm))) + abort (); + + __m256i ymm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 0x100); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + ymm = _mm256_set1_epi32 (i + 0x101); + if (memcmp (&inregs->lr_xmm[i + 1], + &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm))) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876); + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod6c.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod6c.c new file mode 100644 index 0000000000..8000e89224 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod6c.c @@ -0,0 +1,232 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
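+
+   This module is apparently meant to run after tst-auditmod6b in
+   the LD_AUDIT chain: its pltenter validates the patterns 6b
+   installed (i + 1 in even slots, i + 2 in odd ones) and replaces
+   them with i + 0x100 and i + 0x101, which is what audit_test in
+   tst-auditmod6a expects.  Stacked audit modules therefore each
+   observe the previous module's modifications.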
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + __m128i xmm; + __m256i ymm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 1); + if (memcmp (®s->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (®s->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100); + regs->lr_vector[i].xmm[0] = regs->lr_xmm[i]; + + ymm = _mm256_set1_epi32 (i + 2); + if (memcmp (®s->lr_xmm[i + 1], + ®s->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (®s->lr_vector[i + 1], &ymm, sizeof 
(ymm))) + abort (); + regs->lr_vector[i + 1].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101); + regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0]; + } + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m256i ymm = _mm256_set1_epi32 (0x12349876);; + if (memcmp (&outregs->lrv_vector0, &ymm, sizeof (ymm))) + abort (); + + __m128i xmm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 0x100); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + ymm = _mm256_set1_epi32 (i + 0x101); + if (memcmp (&inregs->lr_xmm[i + 1], + &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm))) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef); + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod7a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod7a.c new file mode 100644 index 0000000000..b379df75d6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod7a.c @@ -0,0 +1 @@ +#include "tst-auditmod6a.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod7b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod7b.c new file mode 100644 index 0000000000..5abe6d1bc9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod7b.c @@ -0,0 +1,225 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + for (i = 0; i < 8; i++) + if (memcmp (®s->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (®s->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + for (i = 0; i < 8; i += 2) + { + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100); + regs->lr_vector[i + 1].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101); + } + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile 
("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm)) + || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm))) + abort (); + + __m256i ymm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 0x100); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + ymm = _mm256_set1_epi32 (i + 0x101); + if (memcmp (&inregs->lr_xmm[i + 1], + &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm))) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef); + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx-aux.c b/REORG.TODO/sysdeps/x86_64/tst-avx-aux.c new file mode 100644 index 0000000000..e3807de7bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx-aux.c @@ -0,0 +1,47 @@ +/* Test case for preserved AVX registers in dynamic linker, -mavx part. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_avx_aux (void) +{ +#ifdef __AVX__ + extern __m256i avx_test (__m256i, __m256i, __m256i, __m256i, + __m256i, __m256i, __m256i, __m256i); + + __m256i ymm0 = _mm256_set1_epi32 (0); + __m256i ymm1 = _mm256_set1_epi32 (1); + __m256i ymm2 = _mm256_set1_epi32 (2); + __m256i ymm3 = _mm256_set1_epi32 (3); + __m256i ymm4 = _mm256_set1_epi32 (4); + __m256i ymm5 = _mm256_set1_epi32 (5); + __m256i ymm6 = _mm256_set1_epi32 (6); + __m256i ymm7 = _mm256_set1_epi32 (7); + __m256i ret = avx_test (ymm0, ymm1, ymm2, ymm3, + ymm4, ymm5, ymm6, ymm7); + ymm0 = _mm256_set1_epi32 (0x12349876); + if (memcmp (&ymm0, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX__ */ + return 77; +#endif /* __AVX__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx.c b/REORG.TODO/sysdeps/x86_64/tst-avx.c new file mode 100644 index 0000000000..ec2e3a79ff --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx.c @@ -0,0 +1,49 @@ +/* Test case for preserved AVX registers in dynamic linker. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_avx_aux (void); + +static int +avx_enabled (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + /* Check the OS has AVX and SSE saving enabled. */ + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + return (eax & 6) == 6; +} + +static int +do_test (void) +{ + /* Run AVX test only if AVX is supported. */ + if (avx_enabled ()) + return tst_avx_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx512-aux.c b/REORG.TODO/sysdeps/x86_64/tst-avx512-aux.c new file mode 100644 index 0000000000..6cebc523f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx512-aux.c @@ -0,0 +1,48 @@ +/* Test case for preserved AVX512 registers in dynamic linker, + -mavx512 part. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
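+
+   As with the AVX pair, the -mavx512f code is isolated in this file,
+   and the return value 77 is the exit status the glibc test skeleton
+   treats as "unsupported", so runs on machines without AVX-512 are
+   reported as skipped rather than failed.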
*/ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_avx512_aux (void) +{ +#ifdef __AVX512F__ + extern __m512i avx512_test (__m512i, __m512i, __m512i, __m512i, + __m512i, __m512i, __m512i, __m512i); + + __m512i zmm0 = _mm512_set1_epi32 (0); + __m512i zmm1 = _mm512_set1_epi32 (1); + __m512i zmm2 = _mm512_set1_epi32 (2); + __m512i zmm3 = _mm512_set1_epi32 (3); + __m512i zmm4 = _mm512_set1_epi32 (4); + __m512i zmm5 = _mm512_set1_epi32 (5); + __m512i zmm6 = _mm512_set1_epi32 (6); + __m512i zmm7 = _mm512_set1_epi32 (7); + __m512i ret = avx512_test (zmm0, zmm1, zmm2, zmm3, + zmm4, zmm5, zmm6, zmm7); + zmm0 = _mm512_set1_epi32 (0x12349876); + if (memcmp (&zmm0, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX512F__ */ + return 77; +#endif /* __AVX512F__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx512.c b/REORG.TODO/sysdeps/x86_64/tst-avx512.c new file mode 100644 index 0000000000..a8e42ef553 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx512.c @@ -0,0 +1,57 @@ +/* Test case for preserved AVX512 registers in dynamic linker. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_avx512_aux (void); + +static int +avx512_enabled (void) +{ +#ifdef bit_AVX512F + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX512F)) + return 0; + + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + /* Verify that ZMM, YMM and XMM states are enabled. */ + return (eax & 0xe6) == 0xe6; +#else + return 0; +#endif +} + +static int +do_test (void) +{ + /* Run AVX512 test only if AVX512 is supported. */ + if (avx512_enabled ()) + return tst_avx512_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx512mod.c b/REORG.TODO/sysdeps/x86_64/tst-avx512mod.c new file mode 100644 index 0000000000..4cfb3a2c3d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx512mod.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved AVX512 registers in dynamic linker. 
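tst-avx512.c above tightens the XCR0 test to (eax & 0xe6) == 0xe6: beyond the SSE and AVX bits it also requires the three AVX-512 state components. A small sketch reconstructing that constant from the individual XSAVE bits (bit names follow the Intel SDM; the listing is illustrative):

#include <stdio.h>

/* XCR0 state-component bits required by the test's 0xe6 mask.  */
static const struct
{
  unsigned int bit;
  const char *name;
} xcr0_bits[] = {
  { 1u << 1, "SSE (XMM registers)" },
  { 1u << 2, "AVX (YMM high halves)" },
  { 1u << 5, "AVX-512 opmask (k0-k7)" },
  { 1u << 6, "AVX-512 ZMM0-ZMM15 high halves" },
  { 1u << 7, "AVX-512 ZMM16-ZMM31" },
};

int
main (void)
{
  unsigned int mask = 0;
  for (unsigned int i = 0; i < sizeof xcr0_bits / sizeof xcr0_bits[0]; i++)
    {
      mask |= xcr0_bits[i].bit;
      printf ("%#4x  %s\n", xcr0_bits[i].bit, xcr0_bits[i].name);
    }
  printf ("combined mask: %#x\n", mask);	/* prints 0xe6 */
  return 0;
}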
*/ + +#ifdef __AVX512F__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m512i +avx512_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3, + __m512i x4, __m512i x5, __m512i x6, __m512i x7) +{ + __m512i zmm; + + zmm = _mm512_set1_epi32 (0); + if (memcmp (&zmm, &x0, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (1); + if (memcmp (&zmm, &x1, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (2); + if (memcmp (&zmm, &x2, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (3); + if (memcmp (&zmm, &x3, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (4); + if (memcmp (&zmm, &x4, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (5); + if (memcmp (&zmm, &x5, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (6); + if (memcmp (&zmm, &x6, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (7); + if (memcmp (&zmm, &x7, sizeof (zmm))) + abort (); + + return _mm512_set1_epi32 (0x12349876); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-avxmod.c b/REORG.TODO/sysdeps/x86_64/tst-avxmod.c new file mode 100644 index 0000000000..6e5b154997 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avxmod.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved AVX registers in dynamic linker. */ + +#ifdef __AVX__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m256i +avx_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3, + __m256i x4, __m256i x5, __m256i x6, __m256i x7) +{ + __m256i ymm; + + ymm = _mm256_set1_epi32 (0); + if (memcmp (&ymm, &x0, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (1); + if (memcmp (&ymm, &x1, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (2); + if (memcmp (&ymm, &x2, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (3); + if (memcmp (&ymm, &x3, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (4); + if (memcmp (&ymm, &x4, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (5); + if (memcmp (&ymm, &x5, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (6); + if (memcmp (&ymm, &x6, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (7); + if (memcmp (&ymm, &x7, sizeof (ymm))) + abort (); + + return _mm256_set1_epi32 (0x12349876); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-mallocalign1.c b/REORG.TODO/sysdeps/x86_64/tst-mallocalign1.c new file mode 100644 index 0000000000..1221829b44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-mallocalign1.c @@ -0,0 +1,72 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdlib.h> + +/* Specified by x86-64 psABI. 
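avx_test above compares whole __m256i values bytewise with memcmp, which works at any ISA level the file is compiled for. Purely as a contrast, the in-register idiom — note this variant needs AVX2 (-mavx2), which these AVX-only test files intentionally do not assume:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m256i a = _mm256_set1_epi32 (7);
  __m256i b = _mm256_set1_epi32 (7);

  /* Lane-wise equality, then collapse to a 32-bit byte mask; all
     bytes are equal iff the mask is all ones (-1 as a signed int).  */
  __m256i eq = _mm256_cmpeq_epi32 (a, b);
  int mask = _mm256_movemask_epi8 (eq);
  puts (mask == -1 ? "vectors equal" : "vectors differ");
  return 0;
}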
*/ +#define ALIGN_MASK (16 - 1) + +void * +test (size_t s) +{ + void *p = malloc (s); + + printf ("malloc: %lu, %p: %lu\n", (unsigned long) s, p, + ((unsigned long) p) & ALIGN_MASK); + return p; +} + +static int +do_test (void) +{ + void *p; + int ret = 0; + + p = test (2); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (8); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (13); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (16); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (23); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (43); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (123); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + return ret; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad1.c b/REORG.TODO/sysdeps/x86_64/tst-quad1.c new file mode 100644 index 0000000000..106bbac58b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad1.c @@ -0,0 +1,25 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern void foo (void); + +int +main (void) +{ + foo (); + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad1pie.c b/REORG.TODO/sysdeps/x86_64/tst-quad1pie.c new file mode 100644 index 0000000000..f5fd45f9b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad1pie.c @@ -0,0 +1 @@ +#include "tst-quad1.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad2.c b/REORG.TODO/sysdeps/x86_64/tst-quad2.c new file mode 100644 index 0000000000..f5fd45f9b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad2.c @@ -0,0 +1 @@ +#include "tst-quad1.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad2pie.c b/REORG.TODO/sysdeps/x86_64/tst-quad2pie.c new file mode 100644 index 0000000000..a15d8d36ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad2pie.c @@ -0,0 +1 @@ +#include "tst-quad2.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod1.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod1.S new file mode 100644 index 0000000000..a2d9af87f3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod1.S @@ -0,0 +1,44 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details.
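tst-mallocalign1.c above checks the floor the x86-64 psABI sets for malloc (16-byte alignment); code that needs more than that should request it explicitly. A minimal C11 sketch, assuming nothing beyond the standard library:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  /* C11 aligned_alloc: the size must be a multiple of the alignment.  */
  void *p = aligned_alloc (64, 128);
  if (p == NULL)
    return 1;
  printf ("%p is %s64-byte aligned\n", p,
	  ((uintptr_t) p & 63) == 0 ? "" : "NOT ");
  free (p);
  return 0;
}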
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef BIAS +# define BIAS 0x7fffffff +#endif + + .section .data.rel,"aw",@progbits + .align 8 +.Ljmp: + .quad func + BIAS + .text + .globl func + .type func, @function +func: + .cfi_startproc + xorl %edi, %edi + jmp exit@PLT + .cfi_endproc + .size func, .-func + .globl foo + .type foo, @function +foo: + .cfi_startproc + .cfi_def_cfa_register 6 + movq .Ljmp(%rip), %rax + subq $BIAS, %rax + jmp *%rax + .cfi_endproc + .size foo, .-foo diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod1pie.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod1pie.S new file mode 100644 index 0000000000..c671d0cda9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod1pie.S @@ -0,0 +1,2 @@ +#define BIAS 0x7fff0000 +#include "tst-quadmod1.S" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod2.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod2.S new file mode 100644 index 0000000000..78599cdeb0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod2.S @@ -0,0 +1,43 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef BIAS +# define BIAS 0x7fff0000 +#endif + + .section .data.rel.local,"aw",@progbits + .align 8 +.Ljmp: + .quad func + BIAS + .text + .type func, @function +func: + .cfi_startproc + xorl %edi, %edi + jmp exit@PLT + .cfi_endproc + .size func, .-func + .globl foo + .type foo, @function +foo: + .cfi_startproc + .cfi_def_cfa_register 6 + movq .Ljmp(%rip), %rax + subq $BIAS, %rax + jmp *%rax + .cfi_endproc + .size foo, .-foo diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod2pie.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod2pie.S new file mode 100644 index 0000000000..609183fe58 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod2pie.S @@ -0,0 +1 @@ +#include "tst-quadmod2.S" diff --git a/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.c b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.c new file mode 100644 index 0000000000..2f9e9b9477 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.c @@ -0,0 +1,28 @@ +/* This test will be used to create an executable with a specific + section layout in which .rela.dyn and .rela.plt are not contiguous. + For the x86 case, readelf will report something like: + + ... + [10] .rela.dyn RELA + [11] .bar PROGBITS + [12] .rela.plt RELA + ... + + This is important as this case was not correctly handled by the dynamic + linker in the bind-now case, and the second section was never + processed.
*/ + +#include <stdio.h> + +const int __attribute__ ((section(".bar"))) bar = 0x12345678; +static const char foo[] = "foo"; + +static int +do_test (void) +{ + printf ("%s %d\n", foo, bar); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.lds b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.lds new file mode 100644 index 0000000000..2229e698c9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.lds @@ -0,0 +1,5 @@ +SECTIONS +{ + .bar : { *(.bar) } +} +INSERT AFTER .rela.dyn; diff --git a/REORG.TODO/sysdeps/x86_64/tst-sse.c b/REORG.TODO/sysdeps/x86_64/tst-sse.c new file mode 100644 index 0000000000..dd1537cf27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-sse.c @@ -0,0 +1,46 @@ +/* Test case for preserved SSE registers in dynamic linker. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +extern __m128i sse_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); + +static int +do_test (void) +{ + __m128i xmm0 = _mm_set1_epi32 (0); + __m128i xmm1 = _mm_set1_epi32 (1); + __m128i xmm2 = _mm_set1_epi32 (2); + __m128i xmm3 = _mm_set1_epi32 (3); + __m128i xmm4 = _mm_set1_epi32 (4); + __m128i xmm5 = _mm_set1_epi32 (5); + __m128i xmm6 = _mm_set1_epi32 (6); + __m128i xmm7 = _mm_set1_epi32 (7); + __m128i ret = sse_test (xmm0, xmm1, xmm2, xmm3, + xmm4, xmm5, xmm6, xmm7); + xmm0 = _mm_set1_epi32 (0x12349876); + if (memcmp (&xmm0, &ret, sizeof (ret))) + abort (); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-ssemod.c b/REORG.TODO/sysdeps/x86_64/tst-ssemod.c new file mode 100644 index 0000000000..907a64c69e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-ssemod.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved SSE registers in dynamic linker. 
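tst-split-dynreloc.c above places bar in a custom .bar section only so the linker script can splice it between .rela.dyn and .rela.plt. The same section attribute is more commonly paired with GNU ld's automatic boundary symbols; a sketch (the boundary symbols are emitted only for section names that are valid C identifiers, which is why this uses mysec instead of a dotted name):

#include <stdio.h>

/* Two objects collected into one custom section; GNU ld then defines
   __start_mysec/__stop_mysec around it automatically.  */
static const int a __attribute__ ((section ("mysec"), used)) = 1;
static const int b __attribute__ ((section ("mysec"), used)) = 2;

extern const int __start_mysec[], __stop_mysec[];

int
main (void)
{
  for (const int *p = __start_mysec; p < __stop_mysec; p++)
    printf ("mysec entry: %d\n", *p);
  return 0;
}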
*/ + +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m128i +sse_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (0); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (1); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (2); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (3); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (4); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (5); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (6); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (7); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_set1_epi32 (0x12349876); +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-stack-align.h b/REORG.TODO/sysdeps/x86_64/tst-stack-align.h new file mode 100644 index 0000000000..abe14deb0f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-stack-align.h @@ -0,0 +1,46 @@ +/* Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdint.h> + +#define TEST_STACK_ALIGN() \ + ({ \ + /* AMD64 ABI mandates a 16-byte aligned stack. \ + Unfortunately, current GCC doesn't support __int128 or __float128 \ + types, so use aligned attribute instead. */ \ + struct _S \ + { \ + int _i __attribute__((aligned (16))); \ + int _pad[3]; \ + } _s = { ._i = 18 }; \ + double _d = 12.0; \ + long double _ld = 15.0; \ + int _ret = 0; \ + printf ("__int128: %d %p %zu\n", _s._i, &_s, __alignof (_s)); \ + if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ + if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ + if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ + _ret = 1; \ + _ret; \ + }) diff --git a/REORG.TODO/sysdeps/x86_64/wcschr.S b/REORG.TODO/sysdeps/x86_64/wcschr.S new file mode 100644 index 0000000000..a3e7d67dec --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcschr.S @@ -0,0 +1,156 @@ +/* wcschr with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version.
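TEST_STACK_ALIGN above dates from a GCC that lacked __int128, so it fakes a 16-byte type with an aligned attribute. With C11 the same property can be asserted directly; a sketch:

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* The psABI's 16-byte stack alignment makes this trivially
     satisfiable for locals on x86-64.  */
  alignas (16) char buf[16];
  int misaligned = ((uintptr_t) buf & 15) != 0;
  printf ("buf=%p (%s)\n", (void *) buf,
	  misaligned ? "misaligned" : "16-byte aligned");
  return misaligned;
}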
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (__wcschr) + + movd %rsi, %xmm1 + pxor %xmm2, %xmm2 + mov %rdi, %rcx + punpckldq %xmm1, %xmm1 + punpckldq %xmm1, %xmm1 + + and $63, %rcx + cmp $48, %rcx + ja L(cross_cache) + + movdqu (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + and $-16, %rdi + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + jmp L(loop) + +L(cross_cache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + + sar %cl, %rdx + sar %cl, %rax + test %rax, %rax + je L(unaligned_no_match) + + bsf %rax, %rax + test %rdx, %rdx + je L(unaligned_match) + bsf %rdx, %rdx + cmp %rdx, %rax + ja L(return_null) + +L(unaligned_match): + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + test %rdx, %rdx + jne L(return_null) + pxor %xmm2, %xmm2 + + add $16, %rdi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + jmp L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %rdx + test %rax, %rax + jz L(return_null) + bsf %rax, %rax + test %rdx, %rdx + je L(match) + bsf %rdx, %rcx + cmp %rcx, %rax + ja L(return_null) +L(match): + sub $16, %rdi + add %rdi, %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +END (__wcschr) + +libc_hidden_def(__wcschr) +weak_alias (__wcschr, wcschr) +libc_hidden_weak (wcschr) diff --git a/REORG.TODO/sysdeps/x86_64/wcscmp.S b/REORG.TODO/sysdeps/x86_64/wcscmp.S new file mode 100644 index 0000000000..3ef3341cd0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcscmp.S @@ -0,0 +1,950 @@ +/* Optimized wcscmp for x86-64 with SSE2. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. */ + + .text +ENTRY (__wcscmp) +/* + * This implementation uses SSE to compare up to 16 bytes at a time. +*/ + mov %esi, %eax + mov %edi, %edx + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + mov %al, %ch + mov %dl, %cl + and $63, %eax /* rsi alignment in cache line */ + and $63, %edx /* rdi alignment in cache line */ + and $15, %cl + jz L(continue_00) + cmp $16, %edx + jb L(continue_0) + cmp $32, %edx + jb L(continue_16) + cmp $48, %edx + jb L(continue_32) + +L(continue_48): + and $15, %ch + jz L(continue_48_00) + cmp $16, %eax + jb L(continue_0_48) + cmp $32, %eax + jb L(continue_16_48) + cmp $48, %eax + jb L(continue_32_48) + + .p2align 4 +L(continue_48_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rdi), %xmm1 + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_48_48) + +L(continue_0): + and $15, %ch + jz L(continue_0_00) + cmp $16, %eax + jb L(continue_0_0) + cmp $32, %eax + jb L(continue_0_16) + cmp $48, %eax + jb L(continue_0_32) + + .p2align 4 +L(continue_0_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word?
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + mov 48(%rsi), %ecx + cmp %ecx, 48(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 52(%rsi), %ecx + cmp %ecx, 52(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 56(%rsi), %ecx + cmp %ecx, 56(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 60(%rsi), %ecx + cmp %ecx, 60(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + add $64, %rsi + add $64, %rdi + jmp L(continue_0_48) + + .p2align 4 +L(continue_00): + and $15, %ch + jz L(continue_00_00) + cmp $16, %eax + jb L(continue_00_0) + cmp $32, %eax + jb L(continue_00_16) + cmp $48, %eax + jb L(continue_00_32) + + .p2align 4 +L(continue_00_48): + pcmpeqd (%rdi), %xmm0 + mov (%rdi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%rsi), %eax + jne L(nequal) + + mov 4(%rdi), %eax + cmp 4(%rsi), %eax + jne L(nequal) + + mov 8(%rdi), %eax + cmp 8(%rsi), %eax + jne L(nequal) + + mov 12(%rdi), %eax + cmp 12(%rsi), %eax + jne L(nequal) + + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_32): + and $15, %ch + jz L(continue_32_00) + cmp $16, %eax + jb L(continue_0_32) + cmp $32, %eax + jb L(continue_16_32) + cmp $48, %eax + jb L(continue_32_32) + + .p2align 4 +L(continue_32_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 16(%rsi), %ecx + cmp %ecx, 16(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 20(%rsi), %ecx + cmp %ecx, 20(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 24(%rsi), %ecx + cmp %ecx, 24(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 28(%rsi), %ecx + cmp %ecx, 28(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rdi), %xmm1 + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_32_48) + + .p2align 4 +L(continue_16): + and $15, %ch + jz L(continue_16_00) + cmp $16, %eax + jb L(continue_0_16) + cmp $32, %eax + jb L(continue_16_16) + cmp $48, %eax + jb L(continue_16_32) + + .p2align 4 +L(continue_16_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + mov 32(%rsi), %ecx + cmp %ecx, 32(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 36(%rsi), %ecx + cmp %ecx, 36(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 40(%rsi), %ecx + cmp %ecx, 40(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 44(%rsi), %ecx + cmp %ecx, 44(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 48(%rdi), %xmm1 + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_16_48) + + .p2align 4 +L(continue_00_00): + movdqa (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqa 16(%rdi), %xmm3 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqa 32(%rdi), %xmm5 + pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm5 /* packed sub of comparison results*/ + pmovmskb %xmm5, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqa 48(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_00_00) + + .p2align 4 +L(continue_00_32): + movdqu (%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_16): + movdqu (%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_0): + movdqu (%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %rsi + add $48, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_48_00): + pcmpeqd (%rsi), %xmm0 + mov (%rdi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%rsi), %eax + jne L(nequal) + + mov 4(%rdi), %eax + cmp 4(%rsi), %eax + jne L(nequal) + + mov 8(%rdi), %eax + cmp 8(%rsi), %eax + jne L(nequal) + + mov 12(%rdi), %eax + cmp 12(%rsi), %eax + jne L(nequal) + + movdqu 16(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_00): + movdqu (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_16_00): + movdqu (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_0_00): + movdqu (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %rsi + add $48, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_32): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_48_48) + + .p2align 4 +L(continue_16_16): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm3 + movdqu 16(%rsi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_0): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm3 + movdqu 16(%rsi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %rsi + add $48, %rdi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_16): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_32_48) + + .p2align 4 +L(continue_0_32): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_16_48) + + .p2align 4 +L(continue_16_32): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_32_48) + + .p2align 4 +L(less4_double_words1): + cmp (%rsi), %eax + jne L(nequal) + test %eax, %eax + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + xor %eax, %eax + ret + + .p2align 4 +L(less4_double_words): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov (%rdi), %eax + cmp (%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word): + mov 4(%rdi), %eax + cmp 4(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov 8(%rdi), %eax + cmp 8(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word): + mov 12(%rdi), %eax + cmp 12(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(less4_double_words_16): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_16) + and $15, %dl + jz L(second_double_word_16) + mov 16(%rdi), %eax + cmp 16(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word_16): + mov 20(%rdi), %eax + cmp 20(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words_16): + and $15, %dh + jz L(fourth_double_word_16) + mov 24(%rdi), %eax + cmp 24(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word_16): + mov 28(%rdi), %eax + cmp 28(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(less4_double_words_32): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_32) + and $15, %dl + jz L(second_double_word_32) + mov 32(%rdi), %eax + cmp 32(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word_32): + mov 36(%rdi), %eax + cmp 36(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words_32): + and $15, %dh + jz L(fourth_double_word_32) + mov 40(%rdi), %eax + cmp 40(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word_32): + mov 44(%rdi), %eax + cmp 44(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(less4_double_words_48): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_48) + and $15, %dl + jz L(second_double_word_48) + mov 48(%rdi), %eax + cmp 48(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word_48): + mov 52(%rdi), %eax + cmp 52(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words_48): + and $15, %dh + jz L(fourth_double_word_48) + mov 56(%rdi), %eax + cmp 56(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word_48): + mov 60(%rdi), %eax + cmp 60(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + +L(nequal_bigger): + ret + + .p2align 4 +L(equal): + xor %rax, %rax + ret + +END (__wcscmp) +libc_hidden_def (__wcscmp) +weak_alias (__wcscmp, wcscmp) diff --git a/REORG.TODO/sysdeps/x86_64/wcslen.S b/REORG.TODO/sysdeps/x86_64/wcslen.S new file mode 100644 index 0000000000..c6081a482f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcslen.S @@ -0,0 +1,238 @@ +/* Optimized wcslen for x86-64 with SSE2. + Copyright (C) 2011-2017 Free Software Foundation, Inc. 
+ Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (__wcslen) + cmpl $0, (%rdi) + jz L(exit_tail0) + cmpl $0, 4(%rdi) + jz L(exit_tail1) + cmpl $0, 8(%rdi) + jz L(exit_tail2) + cmpl $0, 12(%rdi) + jz L(exit_tail3) + cmpl $0, 16(%rdi) + jz L(exit_tail4) + cmpl $0, 20(%rdi) + jz L(exit_tail5) + cmpl $0, 24(%rdi) + jz L(exit_tail6) + cmpl $0, 28(%rdi) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax + lea 16(%rdi), %rcx + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64_loop): + movaps (%rax), %xmm0 + movaps 16(%rax), %xmm1 + movaps 32(%rax), %xmm2 + movaps 48(%rax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %rcx, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %rax + ret + + .p2align 4 +L(exit_1): + add $1, %rax + ret + + 
.p2align 4 +L(exit_3): + add $3, %rax + ret + + .p2align 4 +L(exit_tail0): + xor %rax, %rax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %rax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %rax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %rax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %rax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %rax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %rax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %rax + ret + +END (__wcslen) + +weak_alias(__wcslen, wcslen) diff --git a/REORG.TODO/sysdeps/x86_64/wcsrchr.S b/REORG.TODO/sysdeps/x86_64/wcsrchr.S new file mode 100644 index 0000000000..a6c385c511 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcsrchr.S @@ -0,0 +1,282 @@ +/* wcsrchr with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (wcsrchr) + + movd %rsi, %xmm1 + mov %rdi, %rcx + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + and $63, %rcx + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rcx + pmovmskb %xmm0, %rax + add $16, %rdi + + test %rax, %rax + jnz L(unaligned_match1) + + test %rcx, %rcx + jnz L(return_null) + + and $-16, %rdi + xor %r8, %r8 + jmp L(loop) + + .p2align 4 +L(unaligned_match1): + test %rcx, %rcx + jnz L(prolog_find_zero_1) + + mov %rax, %r8 + mov %rdi, %rsi + and $-16, %rdi + jmp L(loop) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + pxor %xmm3, %xmm3 + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm3 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm3, %rdx + pmovmskb %xmm0, %rax + shr %cl, %rdx + shr %cl, %rax + add $16, %rdi + + test %rax, %rax + jnz L(unaligned_match) + + test %rdx, %rdx + jnz L(return_null) + + xor %r8, %r8 + jmp L(loop) + + .p2align 4 +L(unaligned_match): + test %rdx, %rdx + jnz L(prolog_find_zero) + + mov %rax, %r8 + lea (%rdi, %rcx), %rsi + +/* Loop start on aligned string. 
*/ + .p2align 4 +L(loop): + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rcx + pmovmskb %xmm0, %rax + or %rax, %rcx + jnz L(matches) + + movdqa (%rdi), %xmm3 + pcmpeqd %xmm3, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm2, %rcx + pmovmskb %xmm3, %rax + or %rax, %rcx + jnz L(matches) + + movdqa (%rdi), %xmm4 + pcmpeqd %xmm4, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm4 + pmovmskb %xmm2, %rcx + pmovmskb %xmm4, %rax + or %rax, %rcx + jnz L(matches) + + movdqa (%rdi), %xmm5 + pcmpeqd %xmm5, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm5 + pmovmskb %xmm2, %rcx + pmovmskb %xmm5, %rax + or %rax, %rcx + jz L(loop) + + .p2align 4 +L(matches): + test %rax, %rax + jnz L(match) +L(return_value): + test %r8, %r8 + jz L(return_null) + mov %r8, %rax + mov %rsi, %rdi + + test $15 << 4, %ah + jnz L(match_fourth_wchar) + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(match): + pmovmskb %xmm2, %rcx + test %rcx, %rcx + jnz L(find_zero) + mov %rax, %r8 + mov %rdi, %rsi + jmp L(loop) + + .p2align 4 +L(find_zero): + test $15, %cl + jnz L(find_zero_in_first_wchar) + test %cl, %cl + jnz L(find_zero_in_second_wchar) + test $15, %ch + jnz L(find_zero_in_third_wchar) + + and $1 << 13 - 1, %rax + jz L(return_value) + + test $15 << 4, %ah + jnz L(match_fourth_wchar) + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(find_zero_in_first_wchar): + test $1, %rax + jz L(return_value) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(find_zero_in_second_wchar): + and $1 << 5 - 1, %rax + jz L(return_value) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(find_zero_in_third_wchar): + and $1 << 9 - 1, %rax + jz L(return_value) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero): + add %rcx, %rdi + mov %rdx, %rcx +L(prolog_find_zero_1): + test $15, %cl + jnz L(prolog_find_zero_in_first_wchar) + test %cl, %cl + jnz L(prolog_find_zero_in_second_wchar) + test $15, %ch + jnz L(prolog_find_zero_in_third_wchar) + + and $1 << 13 - 1, %rax + jz L(return_null) + + test $15 << 4, %ah + jnz L(match_fourth_wchar) + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero_in_first_wchar): + test $1, %rax + jz L(return_null) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero_in_second_wchar): + and $1 << 5 - 1, %rax + jz L(return_null) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero_in_third_wchar): + and $1 << 9 - 1, %rax + jz L(return_null) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(match_second_wchar): + lea -12(%rdi), %rax + ret + + .p2align 4 +L(match_third_wchar): + lea -8(%rdi), %rax + ret + + .p2align 4 +L(match_fourth_wchar): + lea -4(%rdi), %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +END (wcsrchr) diff --git a/REORG.TODO/sysdeps/x86_64/wmemset.S b/REORG.TODO/sysdeps/x86_64/wmemset.S new file mode 100644 index 0000000000..f96d567fd8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wmemset.S @@ -0,0 +1 @@ +/* Implemented in memset.S. 
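In wcsrchr above, pmovmskb turns each pcmpeqd result into a 16-bit byte mask, so every 4-byte wchar_t owns a nibble — that is what all the %al/%ah nibble tests decode, with %r8 carrying the most recent match mask until a terminator shows up. A naive C reference of the contract it must preserve (the terminating null is part of the string), handy for cross-checking:

#include <assert.h>
#include <stddef.h>
#include <wchar.h>

static wchar_t *
wcsrchr_ref (const wchar_t *ws, wchar_t wc)
{
  wchar_t *last = NULL;
  for (;; ws++)
    {
      if (*ws == wc)
	last = (wchar_t *) ws;
      if (*ws == L'\0')
	return last;
    }
}

int
main (void)
{
  const wchar_t *s = L"abcabc";
  assert (wcsrchr_ref (s, L'b') == s + 4);
  assert (wcsrchr_ref (s, L'x') == NULL);
  assert (wcsrchr_ref (s, L'\0') == s + 6);	/* the null is findable */
  return 0;
}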
*/ diff --git a/REORG.TODO/sysdeps/x86_64/wmemset_chk.S b/REORG.TODO/sysdeps/x86_64/wmemset_chk.S new file mode 100644 index 0000000000..64c277413f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wmemset_chk.S @@ -0,0 +1,33 @@ +/* Checking wmemset for x86-64. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef SHARED + /* For libc.so this is defined in wmemset.S. + For libc.a, this is a separate source to avoid + wmemset bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__wmemset_chk) + cmpq %rdx, %rcx + jb __chk_fail + jmp wmemset +END (__wmemset_chk) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/wordcopy.c b/REORG.TODO/sysdeps/x86_64/wordcopy.c new file mode 100644 index 0000000000..590b6cb16b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wordcopy.c @@ -0,0 +1 @@ +/* X86-64 doesn't use memory copy functions. */ diff --git a/REORG.TODO/sysdeps/x86_64/x32/Implies-after b/REORG.TODO/sysdeps/x86_64/x32/Implies-after new file mode 100644 index 0000000000..39a34c5f57 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/Implies-after @@ -0,0 +1 @@ +wordsize-32 diff --git a/REORG.TODO/sysdeps/x86_64/x32/Makefile b/REORG.TODO/sysdeps/x86_64/x32/Makefile new file mode 100644 index 0000000000..f2ebc24fb0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/Makefile @@ -0,0 +1,6 @@ +ifeq ($(subdir),math) +# Since x32 returns 32-bit long int and 64-bit long long int in the +# same 64-bit register, we make the 32-bit lround an alias of the +# 64-bit llround.  Add -fno-builtin-lround to silence the compiler. +CFLAGS-s_llround.c += -fno-builtin-lround +endif diff --git a/REORG.TODO/sysdeps/x86_64/x32/_itoa.h b/REORG.TODO/sysdeps/x86_64/x32/_itoa.h new file mode 100644 index 0000000000..0f9ed47726 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/_itoa.h @@ -0,0 +1,4 @@ +/* X32 uses 64-bit _itoa_word and _itoa is mapped to _itoa_word. */ +#define _ITOA_NEEDED 0 +#define _ITOA_WORD_TYPE unsigned long long int +#include_next <_itoa.h> diff --git a/REORG.TODO/sysdeps/x86_64/x32/divdi3.c b/REORG.TODO/sysdeps/x86_64/x32/divdi3.c new file mode 100644 index 0000000000..bc7b4c4441 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/divdi3.c @@ -0,0 +1 @@ +/* Fortunately nothing to do. */ diff --git a/REORG.TODO/sysdeps/x86_64/x32/dl-machine.h b/REORG.TODO/sysdeps/x86_64/x32/dl-machine.h new file mode 100644 index 0000000000..2c50688d94 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/dl-machine.h @@ -0,0 +1,86 @@ +/* Machine-dependent ELF dynamic relocation inline functions.  x32 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library.
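__wmemset_chk above is the _FORTIFY_SOURCE entry point: %rdx is the requested count and %rcx the destination's known size, both in wide characters, and an oversized request is diverted to __chk_fail before any store happens. A C model of that contract — the helper name here is made up, and glibc's real fallback is structured differently:

#include <stdlib.h>
#include <wchar.h>

/* Model only: DSTLEN plays the role of the object size the compiler
   derives via __builtin_object_size under _FORTIFY_SOURCE.  */
static wchar_t *
wmemset_chk_model (wchar_t *s, wchar_t c, size_t n, size_t dstlen)
{
  if (n > dstlen)
    abort ();			/* glibc calls __chk_fail here.  */
  return wmemset (s, c, n);
}

int
main (void)
{
  wchar_t buf[8];
  wmemset_chk_model (buf, L'x', 8, sizeof buf / sizeof buf[0]);
  return 0;
}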
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Must allow <sysdeps/x86_64/dl-machine.h> to be included more than once. + See #ifdef RESOLVE_MAP in sysdeps/x86_64/dl-machine.h. */ +#include <sysdeps/x86_64/dl-machine.h> + +#ifndef _X32_DL_MACHINE_H +#define _X32_DL_MACHINE_H + +#undef ARCH_LA_PLTENTER +#undef ARCH_LA_PLTEXIT +#undef RTLD_START + +/* Names of the architecture-specific auditing callback functions. */ +#define ARCH_LA_PLTENTER x32_gnu_pltenter +#define ARCH_LA_PLTEXIT x32_gnu_pltexit + +/* Initial entry point code for the dynamic linker. + The C function `_dl_start' is the real entry point; + its return value is the user program's entry point. */ +#define RTLD_START asm ("\n\ +.text\n\ + .p2align 4\n\ +.globl _start\n\ +.globl _dl_start_user\n\ +_start:\n\ + movl %esp, %edi\n\ + call _dl_start\n\ +_dl_start_user:\n\ + # Save the user entry point address in %r12.\n\ + movl %eax, %r12d\n\ + # See if we were run as a command with the executable file\n\ + # name as an extra leading argument.\n\ + movl _dl_skip_args(%rip), %eax\n\ + # Pop the original argument count.\n\ + movl (%rsp), %edx\n\ + # Adjust the stack pointer to skip _dl_skip_args words.\n\ + lea 4(%rsp,%rax,4), %esp\n\ + # Subtract _dl_skip_args from argc.\n\ + subl %eax, %edx\n\ + # Push argc back on the stack.\n\ + subl $4, %esp\n\ + movl %edx, (%rsp)\n\ + # Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\ + # argc -> rsi\n\ + movl %edx, %esi\n\ + # Save %rsp value in %r13.\n\ + movl %esp, %r13d\n\ + # And align stack for the _dl_init call.\n\ + and $-16, %esp\n\ + # _dl_loaded -> rdi\n\ + movl _rtld_local(%rip), %edi\n\ + # env -> rcx\n\ + lea 8(%r13,%rdx,4), %ecx\n\ + # argv -> rdx\n\ + lea 4(%r13), %edx\n\ + # Clear %rbp to mark outermost frame obviously even for constructors.\n\ + xorl %ebp, %ebp\n\ + # Call the function to run the initializers.\n\ + call _dl_init\n\ + # Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\ + lea _dl_fini(%rip), %edx\n\ + # And make sure %rsp points to argc stored on the stack.\n\ + movl %r13d, %esp\n\ + # Jump to the user's entry point.\n\ + jmp *%r12\n\ +.previous\n\ +"); + +#endif /* !_X32_DL_MACHINE_H */ diff --git a/REORG.TODO/sysdeps/x86_64/x32/ffs.c b/REORG.TODO/sysdeps/x86_64/x32/ffs.c new file mode 100644 index 0000000000..fa7de8b887 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/ffs.c @@ -0,0 +1,4 @@ +#define ffsl __something_else +#include <sysdeps/x86_64/ffs.c> +#undef ffsl +weak_alias (__ffs, ffsl) diff --git a/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrint.S b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrint.S new file mode 100644 index 0000000000..86d258c192 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrint.S @@ -0,0 +1,27 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2015-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__lrint) + cvtsd2si %xmm0,%eax + ret +END(__lrint) +weak_alias (__lrint, lrint) diff --git a/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintf.S b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintf.S new file mode 100644 index 0000000000..2e6f9aaf2b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintf.S @@ -0,0 +1,27 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__lrintf) + cvtss2si %xmm0,%eax + ret +END(__lrintf) +weak_alias (__lrintf, lrintf) diff --git a/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintl.S b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintl.S new file mode 100644 index 0000000000..623c6fcbc9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintl.S @@ -0,0 +1,30 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + .text +ENTRY(__lrintl) + fldt 8(%rsp) + fistpl -4(%rsp) + fwait + movl -4(%rsp),%eax + ret +END(__lrintl) +weak_alias (__lrintl, lrintl) diff --git a/REORG.TODO/sysdeps/x86_64/x32/gmp-mparam.h b/REORG.TODO/sysdeps/x86_64/x32/gmp-mparam.h new file mode 100644 index 0000000000..1915bfc67a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/gmp-mparam.h @@ -0,0 +1,33 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 2012-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, see +<http://www.gnu.org/licenses/>. */ + +#if defined __GMP_H__ && ! defined _LONG_LONG_LIMB +#error "Included too late for _LONG_LONG_LIMB to take effect" +#endif + +#define _LONG_LONG_LIMB +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#define IEEE_DOUBLE_BIG_ENDIAN 0 diff --git a/REORG.TODO/sysdeps/x86_64/x32/symbol-hacks.h b/REORG.TODO/sysdeps/x86_64/x32/symbol-hacks.h new file mode 100644 index 0000000000..22aad04437 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/symbol-hacks.h @@ -0,0 +1 @@ +#include <sysdeps/generic/symbol-hacks.h> diff --git a/REORG.TODO/sysdeps/x86_64/x32/sysdep.h b/REORG.TODO/sysdeps/x86_64/x32/sysdep.h new file mode 100644 index 0000000000..034a3f04e9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/sysdep.h @@ -0,0 +1,92 @@ +/* Assembler macros for x32. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdeps/x86_64/sysdep.h> + +#undef LP_SIZE +#undef LP_OP +#undef ASM_ADDR + +#undef RAX_LP +#undef RBP_LP +#undef RBX_LP +#undef RCX_LP +#undef RDI_LP +#undef RDX_LP +#undef RSP_LP +#undef RSI_LP +#undef R8_LP +#undef R9_LP +#undef R10_LP +#undef R11_LP +#undef R12_LP +#undef R13_LP +#undef R14_LP +#undef R15_LP + +#ifdef __ASSEMBLER__ + +# define LP_SIZE 4 + +# define LP_OP(insn) insn##l + +# define ASM_ADDR .long + +# define RAX_LP eax +# define RBP_LP ebp +# define RBX_LP ebx +# define RCX_LP ecx +# define RDI_LP edi +# define RDX_LP edx +# define RSI_LP esi +# define RSP_LP esp +# define R8_LP r8d +# define R9_LP r9d +# define R10_LP r10d +# define R11_LP r11d +# define R12_LP r12d +# define R13_LP r13d +# define R14_LP r14d +# define R15_LP r15d + +#else /* __ASSEMBLER__ */ + +# define LP_SIZE "4" + +# define LP_OP(insn) #insn "l" + +# define ASM_ADDR ".long" + +# define RAX_LP "eax" +# define RBP_LP "ebp" +# define RBX_LP "ebx" +# define RCX_LP "ecx" +# define RDI_LP "edi" +# define RDX_LP "edx" +# define RSI_LP "esi" +# define RSP_LP "esp" +# define R8_LP "r8d" +# define R9_LP "r9d" +# define R10_LP "r10d" +# define R11_LP "r11d" +# define R12_LP "r12d" +# define R13_LP "r13d" +# define R14_LP "r14d" +# define R15_LP "r15d" + +#endif /* __ASSEMBLER__ */
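
A note on the mask arithmetic in the wcsrchr.S loop earlier in this diff: pcmpeqd sets all four bytes of each matching wide character, and pmovmskb then packs one bit per byte, so wide character i of a 16-byte block owns bits 4*i through 4*i+3 of the 16-bit mask. That is why "test %ah, %ah" probes the third and fourth characters (bits 8-15), and why a mask such as $1 << 13 - 1 (bits 0-12) appears to keep only matches at or below the first byte of a zero found in the fourth character, which also covers a search for the terminator itself. A minimal C sketch of this bookkeeping; the helper name is hypothetical, not from glibc:

#include <stdint.h>

/* Index of the wide character holding the lowest set mask bit;
   MASK must be nonzero.  Four pmovmskb bits cover one wchar_t.  */
static inline int
lowest_match_wchar (uint16_t mask)
{
  return __builtin_ctz (mask) / 4;
}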
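
For reference, the three-instruction body of __wmemset_chk above implements the usual fortify contract: on entry %rdx holds the element count and %rcx the destination size, both counted in wide characters, so "jb __chk_fail" fires exactly when the buffer is too small. A hedged C equivalent of that logic (a sketch, not the glibc implementation):

#include <stddef.h>
#include <wchar.h>

extern void __chk_fail (void) __attribute__ ((__noreturn__));

/* Abort if the destination holds fewer than N wide characters,
   otherwise defer to the unchecked wmemset.  */
wchar_t *
wmemset_chk_sketch (wchar_t *s, wchar_t c, size_t n, size_t dstlen)
{
  if (dstlen < n)          /* cmpq %rdx, %rcx; jb __chk_fail */
    __chk_fail ();
  return wmemset (s, c, n);
}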
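
The x32 s_lrint.S and s_lrintf.S variants above are single register conversions because long int is 32 bits under the x32 ABI; the same fact lets the x32 Makefile alias the 32-bit lround to the 64-bit llround, since both results travel in the same 64-bit register. A small C sketch of the double case, assuming a GCC-style compiler targeting -mx32 (the function name is illustrative):

/* Round to long int using the current rounding mode; on x32 the
   destination register is 32 bits wide, mirroring ENTRY(__lrint).  */
long int
lrint_sketch (double x)
{
  long int r;
  __asm__ ("cvtsd2si %1, %0" : "=r" (r) : "x" (x));
  return r;
}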
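
Finally, the LP_* overrides in x32/sysdep.h above let assembly and inline asm shared with x86-64 stay pointer-width neutral. A hypothetical use, with the two relevant string macros reproduced so the sketch stands alone: under the x32 definitions this emits "movl %esp, %ebx", while the x86-64 sysdep.h would instead yield "movq %rsp, %rbx".

/* x32 branch of the string macros, copied from the header above.  */
#define LP_OP(insn) #insn "l"
#define RSP_LP "esp"
#define RBX_LP "ebx"

void
save_sp_sketch (void)
{
  /* String concatenation builds the template "movl %%esp, %%ebx".  */
  __asm__ volatile (LP_OP (mov) " %%" RSP_LP ", %%" RBX_LP
                    : /* no outputs */ : /* no inputs */ : "ebx");
}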