Diffstat (limited to 'REORG.TODO/sysdeps/x86_64'): 588 files changed, 79900 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/x86_64/64/Implies-after b/REORG.TODO/sysdeps/x86_64/64/Implies-after new file mode 100644 index 0000000000..a8cae95f9d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/64/Implies-after @@ -0,0 +1 @@ +wordsize-64 diff --git a/REORG.TODO/sysdeps/x86_64/Implies b/REORG.TODO/sysdeps/x86_64/Implies new file mode 100644 index 0000000000..811c19a8f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/Implies @@ -0,0 +1,5 @@ +x86 +ieee754/ldbl-96 +ieee754/dbl-64/wordsize-64 +ieee754/dbl-64 +ieee754/flt-32 diff --git a/REORG.TODO/sysdeps/x86_64/Makefile b/REORG.TODO/sysdeps/x86_64/Makefile new file mode 100644 index 0000000000..5075c91277 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/Makefile @@ -0,0 +1,124 @@ +# The i387 `long double' is a distinct type we support. +long-double-fcts = yes + +ifeq ($(subdir),csu) +gen-as-const-headers += link-defines.sym +endif + +ifeq ($(subdir),gmon) +sysdep_routines += _mcount +# We cannot compile _mcount.S with -pg because that would create +# recursive calls when ENTRY is used. Just copy the normal static +# object. +sysdep_noprof += _mcount +endif + +ifeq ($(subdir),malloc) +tests += tst-mallocalign1 +endif + +ifeq ($(subdir),string) +sysdep_routines += cacheinfo strcasecmp_l-nonascii strncase_l-nonascii +gen-as-const-headers += locale-defines.sym +endif + +ifeq ($(subdir),elf) +# There is no good reason to use MMX in x86-64 ld.so with GCC. +CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\ + -mno-mmx) + +sysdep-dl-routines += tlsdesc dl-tlsdesc + +tests += ifuncmain8 +modules-names += ifuncmod8 + +$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so + +tests += tst-quad1 tst-quad2 +modules-names += tst-quadmod1 tst-quadmod2 + +$(objpfx)tst-quad1: $(objpfx)tst-quadmod1.so +$(objpfx)tst-quad2: $(objpfx)tst-quadmod2.so + +quad-pie-test += tst-quad1pie tst-quad2pie +tests += $(quad-pie-test) +tests-pie += $(quad-pie-test) +test-extras += tst-quadmod1pie tst-quadmod2pie +extra-test-objs += tst-quadmod1pie.o tst-quadmod2pie.o + +$(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o +$(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o + +CFLAGS-tst-quad1pie.c = $(PIE-ccflag) +CFLAGS-tst-quad2pie.c = $(PIE-ccflag) + +tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 \ + tst-audit10 tst-sse tst-avx tst-avx512 +test-extras += tst-audit4-aux tst-audit10-aux \ + tst-avx-aux tst-avx512-aux +extra-test-objs += tst-audit4-aux.o tst-audit10-aux.o \ + tst-avx-aux.o tst-avx512-aux.o + +tests += tst-split-dynreloc +LDFLAGS-tst-split-dynreloc = -Wl,-T,$(..)sysdeps/x86_64/tst-split-dynreloc.lds +tst-split-dynreloc-ENV = LD_BIND_NOW=1 + +modules-names += tst-auditmod3a tst-auditmod3b \ + tst-auditmod4a tst-auditmod4b \ + tst-auditmod5a tst-auditmod5b \ + tst-auditmod6a tst-auditmod6b tst-auditmod6c \ + tst-auditmod7a tst-auditmod7b \ + tst-auditmod10a tst-auditmod10b \ + tst-ssemod tst-avxmod tst-avx512mod + +$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so +$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so +tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so + +$(objpfx)tst-audit4: $(objpfx)tst-audit4-aux.o $(objpfx)tst-auditmod4a.so +$(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so +tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so + +$(objpfx)tst-audit5: $(objpfx)tst-auditmod5a.so +$(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so +tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so + +$(objpfx)tst-audit6: $(objpfx)tst-auditmod6a.so +$(objpfx)tst-audit6.out: $(objpfx)tst-auditmod6b.so \ + $(objpfx)tst-auditmod6c.so 
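[Editorial illustration, not part of the commit: the tst-audit* rules here exercise the dynamic linker's LD_AUDIT interface. As a minimal sketch of what such a module looks like (this is not one of the test modules), an audit shared object only has to export la_version to be accepted by ld.so:]

#define _GNU_SOURCE
#include <link.h>

/* Minimal rtld-audit module: la_version is the only mandatory hook.
   Returning LAV_CURRENT tells the dynamic linker which audit ABI
   this module speaks.  */
unsigned int
la_version (unsigned int version)
{
  return LAV_CURRENT;
}

[Built with "gcc -shared -fPIC -o auditmod.so auditmod.c" and activated through the LD_AUDIT environment variable, exactly as the tst-audit*-ENV settings in this Makefile do for the test modules.]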
+tst-audit6-ENV = LD_AUDIT=$(objpfx)tst-auditmod6b.so:$(objpfx)tst-auditmod6c.so + +$(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so +$(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so +tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so + +$(objpfx)tst-audit10: $(objpfx)tst-audit10-aux.o $(objpfx)tst-auditmod10a.so +$(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so +tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so + +$(objpfx)tst-sse: $(objpfx)tst-ssemod.so +$(objpfx)tst-avx: $(objpfx)tst-avx-aux.o $(objpfx)tst-avxmod.so +$(objpfx)tst-avx512: $(objpfx)tst-avx512-aux.o $(objpfx)tst-avx512mod.so + +AVX-CFLAGS=-mavx -mno-vzeroupper +CFLAGS-tst-audit4-aux.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS) +CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS) +CFLAGS-tst-avx-aux.c += $(AVX-CFLAGS) +CFLAGS-tst-avxmod.c += $(AVX-CFLAGS) +ifeq (yes,$(config-cflags-avx512)) +AVX512-CFLAGS = -mavx512f +CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS) +CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS) +CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS) +CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS) +CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS) +endif +endif + +ifeq ($(subdir),csu) +gen-as-const-headers += tlsdesc.sym +endif diff --git a/REORG.TODO/sysdeps/x86_64/Versions b/REORG.TODO/sysdeps/x86_64/Versions new file mode 100644 index 0000000000..a437f85e6e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/Versions @@ -0,0 +1,12 @@ +libc { + GLIBC_2.14 { + memcpy; + } +} +libm { + GLIBC_2.1 { + # A generic bug got this omitted from other configurations' version + # sets, but we always had it. + exp2l; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/____longjmp_chk.S b/REORG.TODO/sysdeps/x86_64/____longjmp_chk.S new file mode 100644 index 0000000000..0910861a9d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/____longjmp_chk.S @@ -0,0 +1 @@ +#error "OS-specific version needed" diff --git a/REORG.TODO/sysdeps/x86_64/__longjmp.S b/REORG.TODO/sysdeps/x86_64/__longjmp.S new file mode 100644 index 0000000000..350b6b1bf6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/__longjmp.S @@ -0,0 +1,68 @@ +/* Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <asm-syntax.h> +#include <stap-probe.h> + +/* Jump to the position specified by ENV, causing the + setjmp call there to return VAL, or 1 if VAL is 0. + void __longjmp (__jmp_buf env, int val). */ + .text +ENTRY(__longjmp) + /* Restore registers. 
*/ + mov (JB_RSP*8)(%rdi),%R8_LP + mov (JB_RBP*8)(%rdi),%R9_LP + mov (JB_PC*8)(%rdi),%RDX_LP +#ifdef PTR_DEMANGLE + PTR_DEMANGLE (%R8_LP) + PTR_DEMANGLE (%R9_LP) + PTR_DEMANGLE (%RDX_LP) +# ifdef __ILP32__ + /* We ignored the high bits of the %rbp value because only the low + bits are mangled. But we cannot presume that %rbp is being used + as a pointer and truncate it, so recover the high bits. */ + movl (JB_RBP*8 + 4)(%rdi), %eax + shlq $32, %rax + orq %rax, %r9 +# endif +#endif + LIBC_PROBE (longjmp, 3, LP_SIZE@%RDI_LP, -4@%esi, LP_SIZE@%RDX_LP) + /* We add unwind information for the target here. */ + cfi_def_cfa(%rdi, 0) + cfi_register(%rsp,%r8) + cfi_register(%rbp,%r9) + cfi_register(%rip,%rdx) + cfi_offset(%rbx,JB_RBX*8) + cfi_offset(%r12,JB_R12*8) + cfi_offset(%r13,JB_R13*8) + cfi_offset(%r14,JB_R14*8) + cfi_offset(%r15,JB_R15*8) + movq (JB_RBX*8)(%rdi),%rbx + movq (JB_R12*8)(%rdi),%r12 + movq (JB_R13*8)(%rdi),%r13 + movq (JB_R14*8)(%rdi),%r14 + movq (JB_R15*8)(%rdi),%r15 + /* Set return value for setjmp. */ + mov %esi, %eax + mov %R8_LP,%RSP_LP + movq %r9,%rbp + LIBC_PROBE (longjmp_target, 3, + LP_SIZE@%RDI_LP, -4@%eax, LP_SIZE@%RDX_LP) + jmpq *%rdx +END (__longjmp) diff --git a/REORG.TODO/sysdeps/x86_64/_mcount.S b/REORG.TODO/sysdeps/x86_64/_mcount.S new file mode 100644 index 0000000000..bcf0957752 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/_mcount.S @@ -0,0 +1,125 @@ +/* Machine-specific calling sequence for `mcount' profiling function. x86-64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + Contributed by Andreas Jaeger <aj@suse.de>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Assembly stub to invoke _mcount(). Compiler generated code calls + this stub after executing a function's prologue and without saving any + registers. It is therefore necessary to preserve %rcx, %rdx, %rsi, %rdi, + %r8, %r9 as they may contain function arguments. */ + +#include <sysdep.h> + +ENTRY(_mcount) + /* Allocate space for 7 registers. */ + subq $56,%rsp + cfi_adjust_cfa_offset (56) + movq %rax,(%rsp) + cfi_rel_offset (rax, 0) + movq %rcx,8(%rsp) + cfi_rel_offset (rcx, 8) + movq %rdx,16(%rsp) + cfi_rel_offset (rdx, 16) + movq %rsi,24(%rsp) + cfi_rel_offset (rsi, 24) + movq %rdi,32(%rsp) + cfi_rel_offset (rdi, 32) + movq %r8,40(%rsp) + cfi_rel_offset (r8, 40) + movq %r9,48(%rsp) + cfi_rel_offset (r9, 48) + + /* Setup parameter for __mcount_internal. */ + /* selfpc is the return address on the stack. */ + movq 56(%rsp),%rsi + /* Get frompc via the frame pointer. */ + movq 8(%rbp),%rdi + call C_SYMBOL_NAME(__mcount_internal) + /* Pop the saved registers. Please note that `mcount' has no + return value. 
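[Editorial illustration: __longjmp above is the internal target of the public longjmp/siglongjmp entry points. A small user-level example of the contract it implements; note that a val of 0 is reported as 1 at the setjmp return site:]

#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

static void
fail (void)
{
  longjmp (env, 0);   /* setjmp reports 1, never 0, for val == 0 */
}

int
main (void)
{
  int r = setjmp (env);
  if (r == 0)
    {
      puts ("direct return from setjmp");
      fail ();
    }
  printf ("return via longjmp, val=%d\n", r);
  return 0;
}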
*/ + movq 48(%rsp),%r9 + cfi_restore (r9) + movq 40(%rsp),%r8 + cfi_restore (r8) + movq 32(%rsp),%rdi + cfi_restore (rdi) + movq 24(%rsp),%rsi + cfi_restore (rsi) + movq 16(%rsp),%rdx + cfi_restore (rdx) + movq 8(%rsp),%rcx + cfi_restore (rcx) + movq (%rsp),%rax + cfi_restore (rax) + addq $56,%rsp + cfi_adjust_cfa_offset (-56) + ret +END(_mcount) + +#undef mcount +weak_alias (_mcount, mcount) + +/* __fentry__ is different from _mcount in that it is called before + function prolog. This means (among other things) that it has non-standard + stack alignment on entry: (%RSP & 0xF) == 0. */ + +ENTRY(__fentry__) + /* Allocate space for 7 registers + (+8 bytes for proper stack alignment). */ + subq $64,%rsp + cfi_adjust_cfa_offset (64) + movq %rax,(%rsp) + cfi_rel_offset (rax, 0) + movq %rcx,8(%rsp) + cfi_rel_offset (rcx, 8) + movq %rdx,16(%rsp) + cfi_rel_offset (rdx, 16) + movq %rsi,24(%rsp) + cfi_rel_offset (rsi, 24) + movq %rdi,32(%rsp) + cfi_rel_offset (rdi, 32) + movq %r8,40(%rsp) + cfi_rel_offset (r8, 40) + movq %r9,48(%rsp) + cfi_rel_offset (r9, 48) + + /* Setup parameter for __mcount_internal. */ + /* selfpc is the return address on the stack. */ + movq 64(%rsp),%rsi + /* caller is the return address above it */ + movq 72(%rsp),%rdi + call C_SYMBOL_NAME(__mcount_internal) + /* Pop the saved registers. Please note that `__fentry__' has no + return value. */ + movq 48(%rsp),%r9 + cfi_restore (r9) + movq 40(%rsp),%r8 + cfi_restore (r8) + movq 32(%rsp),%rdi + cfi_restore (rdi) + movq 24(%rsp),%rsi + cfi_restore (rsi) + movq 16(%rsp),%rdx + cfi_restore (rdx) + movq 8(%rsp),%rcx + cfi_restore (rcx) + movq (%rsp),%rax + cfi_restore (rax) + addq $64,%rsp + cfi_adjust_cfa_offset (-64) + ret +END(__fentry__) diff --git a/REORG.TODO/sysdeps/x86_64/abort-instr.h b/REORG.TODO/sysdeps/x86_64/abort-instr.h new file mode 100644 index 0000000000..810f10379b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/abort-instr.h @@ -0,0 +1,2 @@ +/* An instruction which should crash any program is `hlt'. */ +#define ABORT_INSTRUCTION asm ("hlt") diff --git a/REORG.TODO/sysdeps/x86_64/add_n.S b/REORG.TODO/sysdeps/x86_64/add_n.S new file mode 100644 index 0000000000..4ba83c0bdb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/add_n.S @@ -0,0 +1,100 @@ +/* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + Copyright (C) 2006-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
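[Editorial illustration: neither _mcount nor __fentry__ is called by hand; the compiler inserts the calls when profiling is enabled. A sketch of how the stubs above get exercised; file and command names are illustrative:]

/* Compile as: gcc -pg demo.c && ./a.out && gprof ./a.out gmon.out
   With -pg, GCC emits a call to mcount after each function prologue
   (or to __fentry__ before the prologue when -mfentry is used).  */
#include <stddef.h>

static size_t
work (size_t n)
{
  size_t acc = 0;
  for (size_t i = 0; i < n; i++)
    acc += i * i;
  return acc;
}

int
main (void)
{
  return work (1 << 20) == 0;   /* nonzero result, so exit status 0 */
}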
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define vp %rdx +#define n %rcx +#define cy %r8 + +#ifndef func +# define func __mpn_add_n +# define ADCSBB adc +#endif + + .text +ENTRY (func) + xor %r8, %r8 + mov (up), %r10 + mov (vp), %r11 + + lea -8(up,n,8), up + lea -8(vp,n,8), vp + lea -16(rp,n,8), rp + mov %ecx, %eax + neg n + and $3, %eax + je L(b00) + add %rax, n /* clear low rcx bits for jrcxz */ + cmp $2, %eax + jl L(b01) + je L(b10) + +L(b11): shr %r8 /* set cy */ + jmp L(e11) + +L(b00): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + lea 4(n), n + jmp L(e00) + +L(b01): shr %r8 /* set cy */ + jmp L(e01) + +L(b10): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + jmp L(e10) + +L(end): ADCSBB %r11, %r10 + mov %r10, 8(rp) + mov %ecx, %eax /* clear eax, ecx contains 0 */ + adc %eax, %eax + ret + + .p2align 4 +L(top): + mov -24(up,n,8), %r8 + mov -24(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -24(rp,n,8) +L(e00): + mov -16(up,n,8), %r10 + mov -16(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -16(rp,n,8) +L(e11): + mov -8(up,n,8), %r8 + mov -8(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -8(rp,n,8) +L(e10): + mov (up,n,8), %r10 + mov (vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, (rp,n,8) +L(e01): + jrcxz L(end) + lea 4(n), n + jmp L(top) +END (func) diff --git a/REORG.TODO/sysdeps/x86_64/addmul_1.S b/REORG.TODO/sysdeps/x86_64/addmul_1.S new file mode 100644 index 0000000000..faccdfdbc4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/addmul_1.S @@ -0,0 +1,114 @@ +/* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
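[Editorial illustration: the unrolled, jrcxz-driven loop above implements a multi-limb add with carry propagation. A plain-C reference model of the same function contract (a sketch, assuming 64-bit limbs on x86-64; not glibc code) makes the semantics explicit:]

typedef unsigned long mp_limb_t;

/* Reference model of __mpn_add_n: rp[0..n-1] = up[] + vp[],
   returning the carry out of the most significant limb.  */
static mp_limb_t
mpn_add_n_ref (mp_limb_t *rp, const mp_limb_t *up,
               const mp_limb_t *vp, long n)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      mp_limb_t u = up[i];
      mp_limb_t s = u + vp[i] + cy;
      /* Carry out iff the sum wrapped (careful when cy == 1).  */
      cy = (s < u) || (cy && s == u);
      rp[i] = s;
    }
  return cy;
}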
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n %rdx +#define v0 %rcx + +#ifndef func +# define func __mpn_addmul_1 +# define ADDSUB add +#endif + + .text +ENTRY (func) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + + bt $0, %ebx + jc L(odd) + + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + mul %rcx + add $2, %rbx + jns L(n2) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + jmp L(mid) + +L(odd): add $1, %rbx + jns L(n1) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + mul %rcx + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + jmp L(e) + + .p2align 4 +L(top): mul %rcx + ADDSUB %r8, %r10 + lea (%rax), %r8 + mov (up,%rbx,8), %rax + adc %r9, %r11 + mov %r10, -8(rp,%rbx,8) + mov (rp,%rbx,8), %r10 + lea (%rdx), %r9 + adc $0, %rbp +L(mid): mul %rcx + ADDSUB %r11, %r10 + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + adc %rbp, %r8 + mov %r10, (rp,%rbx,8) + mov 8(rp,%rbx,8), %r10 + lea (%rdx), %rbp + adc $0, %r9 +L(e): add $2, %rbx + js L(top) + + mul %rcx + ADDSUB %r8, %r10 + adc %r9, %r11 + mov %r10, -8(rp) + adc $0, %rbp +L(n2): mov (rp), %r10 + ADDSUB %r11, %r10 + adc %rbp, %rax + mov %r10, (rp) + adc $0, %rdx +L(n1): mov 8(rp), %r10 + ADDSUB %rax, %r10 + mov %r10, 8(rp) + mov %ebx, %eax /* zero rax */ + adc %rdx, %rax + pop %rbp + pop %rbx + ret +END (func) diff --git a/REORG.TODO/sysdeps/x86_64/atomic-machine.h b/REORG.TODO/sysdeps/x86_64/atomic-machine.h new file mode 100644 index 0000000000..c454734001 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/atomic-machine.h @@ -0,0 +1,482 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86_64_ATOMIC_MACHINE_H +#define _X86_64_ATOMIC_MACHINE_H 1 + +#include <stdint.h> +#include <tls.h> /* For tcbhead_t. */ +#include <libc-pointer-arith.h> /* For cast_to_integer. 
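[Editorial illustration: likewise for __mpn_addmul_1, which multiplies an n-limb vector by a single limb and adds the product into rp, returning the final carry limb. A reference model using GCC's unsigned __int128 (a sketch, not the real implementation):]

typedef unsigned long mp_limb_t;

static mp_limb_t
mpn_addmul_1_ref (mp_limb_t *rp, const mp_limb_t *up,
                  long n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      /* up[i]*v0 + rp[i] + cy always fits in 128 bits.  */
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (mp_limb_t) t;
      cy = (mp_limb_t) (t >> 64);
    }
  return cy;
}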
*/ + +typedef int8_t atomic8_t; +typedef uint8_t uatomic8_t; +typedef int_fast8_t atomic_fast8_t; +typedef uint_fast8_t uatomic_fast8_t; + +typedef int16_t atomic16_t; +typedef uint16_t uatomic16_t; +typedef int_fast16_t atomic_fast16_t; +typedef uint_fast16_t uatomic_fast16_t; + +typedef int32_t atomic32_t; +typedef uint32_t uatomic32_t; +typedef int_fast32_t atomic_fast32_t; +typedef uint_fast32_t uatomic_fast32_t; + +typedef int64_t atomic64_t; +typedef uint64_t uatomic64_t; +typedef int_fast64_t atomic_fast64_t; +typedef uint_fast64_t uatomic_fast64_t; + +typedef intptr_t atomicptr_t; +typedef uintptr_t uatomicptr_t; +typedef intmax_t atomic_max_t; +typedef uintmax_t uatomic_max_t; + + +#ifndef LOCK_PREFIX +# ifdef UP +# define LOCK_PREFIX /* nothing */ +# else +# define LOCK_PREFIX "lock;" +# endif +#endif + +#define __HAVE_64B_ATOMICS 1 +#define USE_ATOMIC_COMPILER_BUILTINS 1 +#define ATOMIC_EXCHANGE_USES_CAS 0 + +#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \ + __sync_val_compare_and_swap (mem, oldval, newval) +#define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \ + (! __sync_bool_compare_and_swap (mem, oldval, newval)) + + +#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgb %b2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgw %w2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgl %2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgq %q2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" ((atomic64_t) cast_to_integer (newval)), \ + "m" (*mem), \ + "0" ((atomic64_t) cast_to_integer (oldval)), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + + +/* Note that we need no lock prefix. 
*/ +#define atomic_exchange_acq(mem, newvalue) \ + ({ __typeof (*mem) result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile ("xchgb %b0, %1" \ + : "=q" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile ("xchgw %w0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile ("xchgl %0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else \ + __asm __volatile ("xchgq %q0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" ((atomic64_t) cast_to_integer (newvalue)), \ + "m" (*mem)); \ + result; }) + + +#define __arch_exchange_and_add_body(lock, mem, value) \ + ({ __typeof (*mem) result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "xaddb %b0, %1" \ + : "=q" (result), "=m" (*mem) \ + : "0" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "xaddw %w0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "xaddl %0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "xaddq %q0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + result; }) + +#define atomic_exchange_and_add(mem, value) \ + __sync_fetch_and_add (mem, value) + +#define __arch_exchange_and_add_cprefix \ + "cmpl $0, %%fs:%P4\n\tje 0f\n\tlock\n0:\t" + +#define catomic_exchange_and_add(mem, value) \ + __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, mem, value) + + +#define __arch_add_body(lock, pfx, mem, value) \ + do { \ + if (__builtin_constant_p (value) && (value) == 1) \ + pfx##_increment (mem); \ + else if (__builtin_constant_p (value) && (value) == -1) \ + pfx##_decrement (mem); \ + else if (sizeof (*mem) == 1) \ + __asm __volatile (lock "addb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "addw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "addl %1, %0" \ + : "=m" (*mem) \ + : "ir" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "addq %q1, %0" \ + : "=m" (*mem) \ + : "ir" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_add(mem, value) \ + __arch_add_body (LOCK_PREFIX, atomic, mem, value) + +#define __arch_add_cprefix \ + "cmpl $0, %%fs:%P3\n\tje 0f\n\tlock\n0:\t" + +#define catomic_add(mem, value) \ + __arch_add_body (__arch_add_cprefix, catomic, mem, value) + + +#define atomic_add_negative(mem, value) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "iq" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1" \ + : "=m" (*mem), 
"=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "addq %q2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem)); \ + __result; }) + + +#define atomic_add_zero(mem, value) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "iq" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "addq %q2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" ((atomic64_t) cast_to_integer (value)), \ + "m" (*mem)); \ + __result; }) + + +#define __arch_increment_body(lock, mem) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "incb %b0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "incw %w0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "incl %0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "incq %q0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, mem) + +#define __arch_increment_cprefix \ + "cmpl $0, %%fs:%P2\n\tje 0f\n\tlock\n0:\t" + +#define catomic_increment(mem) \ + __arch_increment_body (__arch_increment_cprefix, mem) + + +#define atomic_increment_and_test(mem) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "incb %b0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "incw %w0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "incl %0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "incq %q0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + __result; }) + + +#define __arch_decrement_body(lock, mem) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "decb %b0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "decw %w0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "decl %0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "decq %q0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, mem) + +#define __arch_decrement_cprefix \ + "cmpl $0, %%fs:%P2\n\tje 0f\n\tlock\n0:\t" + +#define catomic_decrement(mem) \ + __arch_decrement_body (__arch_decrement_cprefix, mem) + + +#define atomic_decrement_and_test(mem) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + 
__asm __volatile (LOCK_PREFIX "decb %b0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "decw %w0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "decl %0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else \ + __asm __volatile (LOCK_PREFIX "decq %q0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + __result; }) + + +#define atomic_bit_set(mem, bit) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "orb %b2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "iq" (1L << (bit))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "orw %w2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "ir" (1L << (bit))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "orl %2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "ir" (1L << (bit))); \ + else if (__builtin_constant_p (bit) && (bit) < 32) \ + __asm __volatile (LOCK_PREFIX "orq %2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "i" (1L << (bit))); \ + else \ + __asm __volatile (LOCK_PREFIX "orq %q2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "r" (1UL << (bit))); \ + } while (0) + + +#define atomic_bit_test_set(mem, bit) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "iq" (bit)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + else \ + __asm __volatile (LOCK_PREFIX "btsq %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + __result; }) + + +#define atomic_spin_nop() asm ("rep; nop") + + +#define __arch_and_body(lock, mem, mask) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "andb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "andw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "andl %1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm __volatile (lock "andq %q1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define __arch_cprefix \ + "cmpl $0, %%fs:%P3\n\tje 0f\n\tlock\n0:\t" + +#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask) + +#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask) + + +#define __arch_or_body(lock, mem, mask) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "orb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "orw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "orl %1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + __asm 
__volatile (lock "orq %q1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + } while (0) + +#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask) + +#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask) + +/* We don't use mfence because it is supposedly slower due to having to + provide stronger guarantees (e.g., regarding self-modifying code). */ +#define atomic_full_barrier() \ + __asm __volatile (LOCK_PREFIX "orl $0, (%%rsp)" ::: "memory") +#define atomic_read_barrier() __asm ("" ::: "memory") +#define atomic_write_barrier() __asm ("" ::: "memory") + +#endif /* atomic-machine.h */ diff --git a/REORG.TODO/sysdeps/x86_64/backtrace.c b/REORG.TODO/sysdeps/x86_64/backtrace.c new file mode 100644 index 0000000000..15f425b410 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/backtrace.c @@ -0,0 +1,133 @@ +/* Return backtrace of current program state. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libc-lock.h> +#include <dlfcn.h> +#include <execinfo.h> +#include <stdlib.h> +#include <unwind.h> + +struct trace_arg +{ + void **array; + _Unwind_Word cfa; + int cnt; + int size; +}; + +#ifdef SHARED +static _Unwind_Reason_Code (*unwind_backtrace) (_Unwind_Trace_Fn, void *); +static _Unwind_Ptr (*unwind_getip) (struct _Unwind_Context *); +static _Unwind_Word (*unwind_getcfa) (struct _Unwind_Context *); +static void *libgcc_handle; + + +/* Dummy version in case libgcc_s does not contain the real code. */ +static _Unwind_Word +dummy_getcfa (struct _Unwind_Context *ctx __attribute__ ((unused))) +{ + return 0; +} + + +static void +init (void) +{ + libgcc_handle = __libc_dlopen ("libgcc_s.so.1"); + + if (libgcc_handle == NULL) + return; + + unwind_backtrace = __libc_dlsym (libgcc_handle, "_Unwind_Backtrace"); + unwind_getip = __libc_dlsym (libgcc_handle, "_Unwind_GetIP"); + if (unwind_getip == NULL) + unwind_backtrace = NULL; + unwind_getcfa = (__libc_dlsym (libgcc_handle, "_Unwind_GetCFA") + ?: dummy_getcfa); +} +#else +# define unwind_backtrace _Unwind_Backtrace +# define unwind_getip _Unwind_GetIP +# define unwind_getcfa _Unwind_GetCFA +#endif + +static _Unwind_Reason_Code +backtrace_helper (struct _Unwind_Context *ctx, void *a) +{ + struct trace_arg *arg = a; + + /* We are first called with address in the __backtrace function. + Skip it. */ + if (arg->cnt != -1) + { + arg->array[arg->cnt] = (void *) unwind_getip (ctx); + + /* Check whether we make any progress. 
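[Editorial illustration: the macros defined in atomic-machine.h above are glibc-internal, reached through the library's own <atomic.h> wrapper rather than any public API. A hedged sketch of a typical call site inside the library:]

#include <atomic.h>   /* glibc-internal wrapper pulling in atomic-machine.h */

static volatile int refcount;

void
ref_get (void)
{
  /* catomic_* variants skip the lock prefix in single-threaded
     processes by testing tcbhead_t.multiple_threads.  */
  catomic_increment (&refcount);
}

int
ref_put (void)
{
  /* LOCK decl; sete under the hood: true once the count hits zero.  */
  return atomic_decrement_and_test (&refcount);
}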
*/ + _Unwind_Word cfa = unwind_getcfa (ctx); + + if (arg->cnt > 0 && arg->array[arg->cnt - 1] == arg->array[arg->cnt] + && cfa == arg->cfa) + return _URC_END_OF_STACK; + arg->cfa = cfa; + } + if (++arg->cnt == arg->size) + return _URC_END_OF_STACK; + return _URC_NO_REASON; +} + +int +__backtrace (void **array, int size) +{ + struct trace_arg arg = { .array = array, .cfa = 0, .size = size, .cnt = -1 }; + + if (size <= 0) + return 0; + +#ifdef SHARED + __libc_once_define (static, once); + + __libc_once (once, init); + if (unwind_backtrace == NULL) + return 0; +#endif + + unwind_backtrace (backtrace_helper, &arg); + + /* _Unwind_Backtrace seems to put NULL address above + _start. Fix it up here. */ + if (arg.cnt > 1 && arg.array[arg.cnt - 1] == NULL) + --arg.cnt; + return arg.cnt != -1 ? arg.cnt : 0; +} +weak_alias (__backtrace, backtrace) +libc_hidden_def (__backtrace) + + +#ifdef SHARED +/* Free all resources if necessary. */ +libc_freeres_fn (free_mem) +{ + unwind_backtrace = NULL; + if (libgcc_handle != NULL) + { + __libc_dlclose (libgcc_handle); + libgcc_handle = NULL; + } +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/bsd-_setjmp.S b/REORG.TODO/sysdeps/x86_64/bsd-_setjmp.S new file mode 100644 index 0000000000..bc40a88938 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/bsd-_setjmp.S @@ -0,0 +1,37 @@ +/* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'. x86-64 version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This just does a tail-call to `__sigsetjmp (ARG, 0)'. + We cannot do it in C because it must be a tail-call, so frame-unwinding + in setjmp doesn't clobber the state restored by longjmp. */ + +#include <sysdep.h> +#define _ASM +#define _SETJMP_H +#include <bits/setjmp.h> + +ENTRY (_setjmp) + /* Set up arguments, we only need to set the second arg. */ + xorl %esi, %esi +#ifdef PIC + jmp HIDDEN_JUMPTARGET (__sigsetjmp) +#else + jmp __sigsetjmp +#endif +END (_setjmp) +libc_hidden_def (_setjmp) diff --git a/REORG.TODO/sysdeps/x86_64/bsd-setjmp.S b/REORG.TODO/sysdeps/x86_64/bsd-setjmp.S new file mode 100644 index 0000000000..45ee1234b9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/bsd-setjmp.S @@ -0,0 +1,36 @@ +/* BSD `setjmp' entry point to `sigsetjmp (..., 1)'. x86-64 version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
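[Editorial illustration: the __backtrace implementation above backs the public <execinfo.h> interface. A self-contained usage example:]

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  void *frames[64];
  int n = backtrace (frames, 64);            /* fill in return addresses */
  char **names = backtrace_symbols (frames, n);
  if (names != NULL)
    {
      for (int i = 0; i < n; i++)
        puts (names[i]);
      free (names);                          /* one malloc'd block */
    }
  return 0;
}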
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This just does a tail-call to `__sigsetjmp (ARG, 1)'. + We cannot do it in C because it must be a tail-call, so frame-unwinding + in setjmp doesn't clobber the state restored by longjmp. */ + +#include <sysdep.h> +#define _ASM +#define _SETJMP_H +#include <bits/setjmp.h> + +ENTRY (setjmp) + /* Set up arguments, we only need to set the 2nd arg. */ + movl $1, %esi +#ifdef PIC + jmp HIDDEN_JUMPTARGET (__sigsetjmp) +#else + jmp __sigsetjmp +#endif +END (setjmp) diff --git a/REORG.TODO/sysdeps/x86_64/bzero.S b/REORG.TODO/sysdeps/x86_64/bzero.S new file mode 100644 index 0000000000..f96d567fd8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/bzero.S @@ -0,0 +1 @@ +/* Implemented in memset.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/configure b/REORG.TODO/sysdeps/x86_64/configure new file mode 100644 index 0000000000..2d14c344df --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/configure @@ -0,0 +1,156 @@ +# This file is generated from configure.ac by Autoconf. DO NOT EDIT! + # Local configure fragment for sysdeps/x86_64. + +for ac_prog in $AS +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AS+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$AS"; then + ac_cv_prog_AS="$AS" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AS="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +AS=$ac_cv_prog_AS +if test -n "$AS"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AS" >&5 +$as_echo "$AS" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$AS" && break +done + +if test -z "$AS"; then + ac_verc_fail=yes +else + # Found it, now check the version. + { $as_echo "$as_me:${as_lineno-$LINENO}: checking version of $AS" >&5 +$as_echo_n "checking version of $AS... " >&6; } + ac_prog_version=`$AS --version 2>&1 | sed -n 's/^.*GNU assembler.* \([0-9]*\.[0-9.]*\).*$/\1/p'` + case $ac_prog_version in + '') ac_prog_version="v. ?.??, bad"; ac_verc_fail=yes;; + 2.2[4-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*) + ac_prog_version="$ac_prog_version, ok"; ac_verc_fail=no;; + *) ac_prog_version="$ac_prog_version, bad"; ac_verc_fail=yes;; + + esac + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_prog_version" >&5 +$as_echo "$ac_prog_version" >&6; } +fi +if test $ac_verc_fail = yes; then + critic_missing="$critic_missing The program AS is required in version >= 2.24 for target x86_64." +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX512DQ support in assembler" >&5 +$as_echo_n "checking for AVX512DQ support in assembler... 
" >&6; } +if ${libc_cv_asm_avx512dq+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <<\EOF + vandpd (%rax), %zmm6, %zmm1 +EOF +if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + libc_cv_asm_avx512dq=yes +else + libc_cv_asm_avx512dq=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_avx512dq" >&5 +$as_echo "$libc_cv_asm_avx512dq" >&6; } +if test $libc_cv_asm_avx512dq = yes; then + $as_echo "#define HAVE_AVX512DQ_ASM_SUPPORT 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX512 support" >&5 +$as_echo_n "checking for AVX512 support... " >&6; } +if ${libc_cv_cc_avx512+:} false; then : + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx512f -xc /dev/null -S -o /dev/null' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + libc_cv_cc_avx512=$libc_cv_asm_avx512dq +else + libc_cv_cc_avx512=no +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx512" >&5 +$as_echo "$libc_cv_cc_avx512" >&6; } +if test $libc_cv_cc_avx512 = yes; then + $as_echo "#define HAVE_AVX512_SUPPORT 1" >>confdefs.h + +fi +config_vars="$config_vars +config-cflags-avx512 = $libc_cv_cc_avx512" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5 +$as_echo_n "checking for Intel MPX support... " >&6; } +if ${libc_cv_asm_mpx+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <<\EOF + bndmov %bnd0,(%rsp) +EOF +if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + libc_cv_asm_mpx=yes +else + libc_cv_asm_mpx=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5 +$as_echo "$libc_cv_asm_mpx" >&6; } +if test $libc_cv_asm_mpx = yes; then + $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h + +fi + +if test x"$build_mathvec" = xnotset; then + build_mathvec=yes +fi + +$as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h + + +test -n "$critic_missing" && as_fn_error $? " +*** $critic_missing" "$LINENO" 5 diff --git a/REORG.TODO/sysdeps/x86_64/configure.ac b/REORG.TODO/sysdeps/x86_64/configure.ac new file mode 100644 index 0000000000..7d8aaafc0c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/configure.ac @@ -0,0 +1,58 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local configure fragment for sysdeps/x86_64. + +dnl Accept as 2.24 or newer for AVX512 load and store. +AC_CHECK_PROG_VER(AS, $AS, --version, + [GNU assembler.* \([0-9]*\.[0-9.]*\)], + [2.2[4-9]*|2.[3-9][0-9]*|[3-9].*|[1-9][0-9]*], + critic_missing="$critic_missing The program AS is required in version >= 2.24 for target x86_64.") + +dnl Check if asm supports AVX512DQ. 
+AC_CACHE_CHECK(for AVX512DQ support in assembler, libc_cv_asm_avx512dq, [dnl +cat > conftest.s <<\EOF + vandpd (%rax), %zmm6, %zmm1 +EOF +if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then + libc_cv_asm_avx512dq=yes +else + libc_cv_asm_avx512dq=no +fi +rm -f conftest*]) +if test $libc_cv_asm_avx512dq = yes; then + AC_DEFINE(HAVE_AVX512DQ_ASM_SUPPORT) +fi + +dnl Check if -mavx512f works. +AC_CACHE_CHECK(for AVX512 support, libc_cv_cc_avx512, [dnl +LIBC_TRY_CC_OPTION([-mavx512f], [libc_cv_cc_avx512=$libc_cv_asm_avx512dq], [libc_cv_cc_avx512=no]) +]) +if test $libc_cv_cc_avx512 = yes; then + AC_DEFINE(HAVE_AVX512_SUPPORT) +fi +LIBC_CONFIG_VAR([config-cflags-avx512], [$libc_cv_cc_avx512]) + +dnl Check whether asm supports Intel MPX +AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl +cat > conftest.s <<\EOF + bndmov %bnd0,(%rsp) +EOF +if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then + libc_cv_asm_mpx=yes +else + libc_cv_asm_mpx=no +fi +rm -f conftest*]) +if test $libc_cv_asm_mpx = yes; then + AC_DEFINE(HAVE_MPX_SUPPORT) +fi + +if test x"$build_mathvec" = xnotset; then + build_mathvec=yes +fi + +dnl It is always possible to access static and hidden symbols in an +dnl position independent way. +AC_DEFINE(PI_STATIC_AND_HIDDEN) + +test -n "$critic_missing" && AC_MSG_ERROR([ +*** $critic_missing]) diff --git a/REORG.TODO/sysdeps/x86_64/crti.S b/REORG.TODO/sysdeps/x86_64/crti.S new file mode 100644 index 0000000000..2687f35cb7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/crti.S @@ -0,0 +1,80 @@ +/* Special .init and .fini section support for x86-64. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* crti.S puts a function prologue at the beginning of the .init and + .fini sections and defines global symbols for those addresses, so + they can be called as functions. 
The symbols _init and _fini are + magic and cause the linker to emit DT_INIT and DT_FINI. */ + +#include <libc-symbols.h> +#include <sysdep.h> + +#ifndef PREINIT_FUNCTION +# define PREINIT_FUNCTION __gmon_start__ +#endif + +#ifndef PREINIT_FUNCTION_WEAK +# define PREINIT_FUNCTION_WEAK 1 +#endif + +#if PREINIT_FUNCTION_WEAK + weak_extern (PREINIT_FUNCTION) +#else + .hidden PREINIT_FUNCTION +#endif + + .section .init,"ax",@progbits + .p2align 2 + .globl _init + .type _init, @function +_init: + /* Maintain 16-byte stack alignment for called functions. */ + subq $8, %rsp +#if PREINIT_FUNCTION_WEAK + movq PREINIT_FUNCTION@GOTPCREL(%rip), %rax + testq %rax, %rax + je .Lno_weak_fn + call *%rax +.Lno_weak_fn: +#else + call PREINIT_FUNCTION +#endif + + .section .fini,"ax",@progbits + .p2align 2 + .globl _fini + .type _fini, @function +_fini: + subq $8, %rsp diff --git a/REORG.TODO/sysdeps/x86_64/crtn.S b/REORG.TODO/sysdeps/x86_64/crtn.S new file mode 100644 index 0000000000..29e3b85300 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/crtn.S @@ -0,0 +1,45 @@ +/* Special .init and .fini section support for x86-64. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* crtn.S puts function epilogues in the .init and .fini sections + corresponding to the prologues in crti.S. */ + + .section .init,"ax",@progbits + addq $8, %rsp + ret + + .section .fini,"ax",@progbits + addq $8, %rsp + ret diff --git a/REORG.TODO/sysdeps/x86_64/dl-irel.h b/REORG.TODO/sysdeps/x86_64/dl-irel.h new file mode 100644 index 0000000000..5f9967abe5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-irel.h @@ -0,0 +1,51 @@ +/* Machine-dependent ELF indirect relocation inline functions. + x86-64 version. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
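[Editorial illustration: user code never calls _init/_fini directly; ELF constructors and destructors reach the same initialization and finalization machinery portably. A small example (output strings are illustrative):]

#include <stdio.h>

__attribute__ ((constructor))
static void
setup (void)
{
  puts ("before main, via the DT_INIT/.init_array machinery");
}

__attribute__ ((destructor))
static void
teardown (void)
{
  puts ("after main returns or exit() is called");
}

int
main (void)
{
  puts ("main");
  return 0;
}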
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _DL_IREL_H +#define _DL_IREL_H + +#include <stdio.h> +#include <unistd.h> + +#define ELF_MACHINE_IRELA 1 + +static inline ElfW(Addr) +__attribute ((always_inline)) +elf_ifunc_invoke (ElfW(Addr) addr) +{ + return ((ElfW(Addr) (*) (void)) (addr)) (); +} + +static inline void +__attribute ((always_inline)) +elf_irela (const ElfW(Rela) *reloc) +{ + ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset; + const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); + + if (__glibc_likely (r_type == R_X86_64_IRELATIVE)) + { + ElfW(Addr) value = elf_ifunc_invoke(reloc->r_addend); + *reloc_addr = value; + } + else + __libc_fatal ("unexpected reloc type in static binary"); +} + +#endif /* dl-irel.h */ diff --git a/REORG.TODO/sysdeps/x86_64/dl-lookupcfg.h b/REORG.TODO/sysdeps/x86_64/dl-lookupcfg.h new file mode 100644 index 0000000000..47b534a059 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-lookupcfg.h @@ -0,0 +1,32 @@ +/* Configuration of lookup functions. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define DL_UNMAP_IS_SPECIAL + +#include_next <dl-lookupcfg.h> + +/* Address of protected data defined in the shared library may be + external due to copy relocation. */ +#define DL_EXTERN_PROTECTED_DATA + +struct link_map; + +extern void _dl_unmap (struct link_map *map) + internal_function attribute_hidden; + +#define DL_UNMAP(map) _dl_unmap (map) diff --git a/REORG.TODO/sysdeps/x86_64/dl-machine.h b/REORG.TODO/sysdeps/x86_64/dl-machine.h new file mode 100644 index 0000000000..0015db4d6a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-machine.h @@ -0,0 +1,601 @@ +/* Machine-dependent ELF dynamic relocation inline functions. x86-64 version. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
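[Editorial illustration: elf_irela above applies R_X86_64_IRELATIVE relocations by invoking the IFUNC resolver and storing its result. At the source level such a relocation typically originates from GCC's ifunc attribute; a hypothetical example with invented names:]

typedef int (*myfunc_t) (void);

static int impl_generic (void) { return 0; }
static int impl_avx2 (void)    { return 1; }

/* Runs during relocation (as R_X86_64_IRELATIVE in static links);
   the returned address is what callers of myfunc reach.  */
static myfunc_t
resolve_myfunc (void)
{
  return __builtin_cpu_supports ("avx2") ? impl_avx2 : impl_generic;
}

int myfunc (void) __attribute__ ((ifunc ("resolve_myfunc")));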
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef dl_machine_h +#define dl_machine_h + +#define ELF_MACHINE_NAME "x86_64" + +#include <sys/param.h> +#include <sysdep.h> +#include <tls.h> +#include <dl-tlsdesc.h> +#include <cpu-features.c> + +/* Return nonzero iff ELF header is compatible with the running host. */ +static inline int __attribute__ ((unused)) +elf_machine_matches_host (const ElfW(Ehdr) *ehdr) +{ + return ehdr->e_machine == EM_X86_64; +} + + +/* Return the link-time address of _DYNAMIC. Conveniently, this is the + first element of the GOT. This must be inlined in a function which + uses global data. */ +static inline ElfW(Addr) __attribute__ ((unused)) +elf_machine_dynamic (void) +{ + /* This produces an IP-relative reloc which is resolved at link time. */ + extern const ElfW(Addr) _GLOBAL_OFFSET_TABLE_[] attribute_hidden; + return _GLOBAL_OFFSET_TABLE_[0]; +} + + +/* Return the run-time load address of the shared object. */ +static inline ElfW(Addr) __attribute__ ((unused)) +elf_machine_load_address (void) +{ + /* Compute the difference between the runtime address of _DYNAMIC as seen + by an IP-relative reference, and the link-time address found in the + special unrelocated first GOT entry. */ + extern ElfW(Dyn) _DYNAMIC[] attribute_hidden; + return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic (); +} + +/* Set up the loaded object described by L so its unrelocated PLT + entries will jump to the on-demand fixup code in dl-runtime.c. */ + +static inline int __attribute__ ((unused, always_inline)) +elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) +{ + Elf64_Addr *got; + extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; + + if (l->l_info[DT_JMPREL] && lazy) + { + /* The GOT entries for functions in the PLT have not yet been filled + in. Their initial contents will arrange when called to push an + offset into the .rel.plt section, push _GLOBAL_OFFSET_TABLE_[1], + and then jump to _GLOBAL_OFFSET_TABLE_[2]. */ + got = (Elf64_Addr *) D_PTR (l, l_info[DT_PLTGOT]); + /* If a library is prelinked but we have to relocate anyway, + we have to be able to undo the prelinking of .got.plt. + The prelinker saved us here address of .plt + 0x16. */ + if (got[1]) + { + l->l_mach.plt = got[1] + l->l_addr; + l->l_mach.gotplt = (ElfW(Addr)) &got[3]; + } + /* Identify this shared object. */ + *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l; + + /* The got[2] entry contains the address of a function which gets + called to get the address of a so far unresolved function and + jump to it. 
The profiling extension of the dynamic linker allows
+	 the calls to be intercepted so that information can be
+	 collected.  In this case we don't store the address in the
+	 GOT so that all future calls also end in this function.  */
+      if (__glibc_unlikely (profile))
+	{
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
+
+	  if (GLRO(dl_profile) != NULL
+	      && _dl_name_match_p (GLRO(dl_profile), l))
+	    /* This is the object we are looking for.  Say that we really
+	       want profiling and the timers are started.  */
+	    GL(dl_profile_map) = l;
+	}
+      else
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	    }
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
+	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	    }
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
+    }
+
+  if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
+    *(ElfW(Addr)*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_GOT)]) + l->l_addr)
+      = (ElfW(Addr)) &_dl_tlsdesc_resolve_rela;
+
+  return lazy;
+}
+
+/* Initial entry point code for the dynamic linker.
+   The C function `_dl_start' is the real entry point;
+   its return value is the user program's entry point.  */
+#define RTLD_START asm ("\n\
+.text\n\
+	.align 16\n\
+.globl _start\n\
+.globl _dl_start_user\n\
+_start:\n\
+	movq %rsp, %rdi\n\
+	call _dl_start\n\
+_dl_start_user:\n\
+	# Save the user entry point address in %r12.\n\
+	movq %rax, %r12\n\
+	# See if we were run as a command with the executable file\n\
+	# name as an extra leading argument.\n\
+	movl _dl_skip_args(%rip), %eax\n\
+	# Pop the original argument count.\n\
+	popq %rdx\n\
+	# Adjust the stack pointer to skip _dl_skip_args words.\n\
+	leaq (%rsp,%rax,8), %rsp\n\
+	# Subtract _dl_skip_args from argc.\n\
+	subl %eax, %edx\n\
+	# Push argc back on the stack.\n\
+	pushq %rdx\n\
+	# Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\
+	# argc -> rsi\n\
+	movq %rdx, %rsi\n\
+	# Save %rsp value in %r13.\n\
+	movq %rsp, %r13\n\
+	# And align stack for the _dl_init call.\n\
+	andq $-16, %rsp\n\
+	# _dl_loaded -> rdi\n\
+	movq _rtld_local(%rip), %rdi\n\
+	# env -> rcx\n\
+	leaq 16(%r13,%rdx,8), %rcx\n\
+	# argv -> rdx\n\
+	leaq 8(%r13), %rdx\n\
+	# Clear %rbp to mark outermost frame obviously even for constructors.\n\
+	xorl %ebp, %ebp\n\
+	# Call the function to run the initializers.\n\
+	call _dl_init\n\
+	# Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\
+	leaq _dl_fini(%rip), %rdx\n\
+	# And make sure %rsp points to argc stored on the stack.\n\
+	movq %r13, %rsp\n\
+	# Jump to the user's entry point.\n\
+	jmp *%r12\n\
+.previous\n\
+");
+
+/* ELF_RTYPE_CLASS_PLT iff TYPE describes a relocation of a PLT entry or
+   a TLS variable, so undefined references should not be allowed to
+   define the value.
+   ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
+   of the main executable's symbols, as for a COPY reloc.
+   ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA iff TYPE describes a relocation
+   against protected data whose address may be external due to copy
+   relocation.  */
+#define elf_machine_type_class(type) \
+  ((((type) == R_X86_64_JUMP_SLOT \
+     || (type) == R_X86_64_DTPMOD64 \
+     || (type) == R_X86_64_DTPOFF64 \
+     || (type) == R_X86_64_TPOFF64 \
+     || (type) == R_X86_64_TLSDESC) \
+    * ELF_RTYPE_CLASS_PLT) \
+   | (((type) == R_X86_64_COPY) * ELF_RTYPE_CLASS_COPY) \
+   | (((type) == R_X86_64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+
+/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries.  */
+#define ELF_MACHINE_JMP_SLOT	R_X86_64_JUMP_SLOT
+
+/* The relative ifunc relocation.  */
+// XXX This is a work-around for a broken linker.  Remove!
+#define ELF_MACHINE_IRELATIVE	R_X86_64_IRELATIVE
+
+/* The x86-64 never uses Elf64_Rel/Elf32_Rel relocations.  */
+#define ELF_MACHINE_NO_REL 1
+#define ELF_MACHINE_NO_RELA 0
+
+/* We define an initialization function.  This is called very early in
+   _dl_sysdep_start.  */
+#define DL_PLATFORM_INIT dl_platform_init ()
+
+static inline void __attribute__ ((unused))
+dl_platform_init (void)
+{
+#if IS_IN (rtld)
+  /* init_cpu_features has been called early from __libc_start_main in
+     a static executable.  */
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
+#else
+  if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
+    /* Avoid an empty string which would disturb us.  */
+    GLRO(dl_platform) = NULL;
+#endif
+}
+
+static inline ElfW(Addr)
+elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+		       const ElfW(Rela) *reloc,
+		       ElfW(Addr) *reloc_addr, ElfW(Addr) value)
+{
+  return *reloc_addr = value;
+}
+
+/* Return the final value of a PLT relocation.  On x86-64 the
+   JUMP_SLOT relocation ignores the addend.  */
+static inline ElfW(Addr)
+elf_machine_plt_value (struct link_map *map, const ElfW(Rela) *reloc,
+		       ElfW(Addr) value)
+{
+  return value;
+}
+
+
+/* Names of the architecture-specific auditing callback functions.  */
+#define ARCH_LA_PLTENTER x86_64_gnu_pltenter
+#define ARCH_LA_PLTEXIT x86_64_gnu_pltexit
+
+#endif /* !dl_machine_h */
+
+#ifdef RESOLVE_MAP
+
+/* Perform the relocation specified by RELOC and SYM (which is fully resolved).
+   MAP is the object containing the reloc.
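
The elf_machine_type_class macro defined above is branch-free: each boolean test is multiplied by a class flag and the products are OR-ed together. A quick harness using the R_X86_64_* constants from <elf.h>; the 1/2/4 flag values are assumptions mirroring ld.so's internal definitions:

    #include <elf.h>
    #include <stdio.h>

    /* Assumed class flag values (ld.so defines these elsewhere).  */
    #define CLASS_PLT    1
    #define CLASS_COPY   2
    #define CLASS_EXTERN 4

    static int
    type_class (unsigned long type)
    {
      return (((type == R_X86_64_JUMP_SLOT
		|| type == R_X86_64_DTPMOD64
		|| type == R_X86_64_DTPOFF64
		|| type == R_X86_64_TPOFF64
		|| type == R_X86_64_TLSDESC) * CLASS_PLT)
	      | ((type == R_X86_64_COPY) * CLASS_COPY)
	      | ((type == R_X86_64_GLOB_DAT) * CLASS_EXTERN));
    }

    int
    main (void)
    {
      printf ("JUMP_SLOT -> %d\n", type_class (R_X86_64_JUMP_SLOT)); /* 1 */
      printf ("COPY      -> %d\n", type_class (R_X86_64_COPY));      /* 2 */
      printf ("GLOB_DAT  -> %d\n", type_class (R_X86_64_GLOB_DAT));  /* 4 */
      printf ("64        -> %d\n", type_class (R_X86_64_64));        /* 0 */
      return 0;
    }
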
*/ + +auto inline void +__attribute__ ((always_inline)) +elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, + const ElfW(Sym) *sym, const struct r_found_version *version, + void *const reloc_addr_arg, int skip_ifunc) +{ + ElfW(Addr) *const reloc_addr = reloc_addr_arg; + const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); + +# if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC + if (__glibc_unlikely (r_type == R_X86_64_RELATIVE)) + { +# if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC + /* This is defined in rtld.c, but nowhere in the static libc.a; + make the reference weak so static programs can still link. + This declaration cannot be done when compiling rtld.c + (i.e. #ifdef RTLD_BOOTSTRAP) because rtld.c contains the + common defn for _dl_rtld_map, which is incompatible with a + weak decl in the same file. */ +# ifndef SHARED + weak_extern (GL(dl_rtld_map)); +# endif + if (map != &GL(dl_rtld_map)) /* Already done in rtld itself. */ +# endif + *reloc_addr = map->l_addr + reloc->r_addend; + } + else +# endif +# if !defined RTLD_BOOTSTRAP + /* l_addr + r_addend may be > 0xffffffff and R_X86_64_RELATIVE64 + relocation updates the whole 64-bit entry. */ + if (__glibc_unlikely (r_type == R_X86_64_RELATIVE64)) + *(Elf64_Addr *) reloc_addr = (Elf64_Addr) map->l_addr + reloc->r_addend; + else +# endif + if (__glibc_unlikely (r_type == R_X86_64_NONE)) + return; + else + { +# ifndef RTLD_BOOTSTRAP + const ElfW(Sym) *const refsym = sym; +# endif + struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type); + ElfW(Addr) value = (sym == NULL ? 0 + : (ElfW(Addr)) sym_map->l_addr + sym->st_value); + + if (sym != NULL + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, + 0) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) + && __builtin_expect (!skip_ifunc, 1)) + { +# ifndef RTLD_BOOTSTRAP + if (sym_map != map + && sym_map->l_type != lt_executable + && !sym_map->l_relocated) + { + const char *strtab + = (const char *) D_PTR (map, l_info[DT_STRTAB]); + _dl_error_printf ("\ +%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); + } +# endif + value = ((ElfW(Addr) (*) (void)) value) (); + } + + switch (r_type) + { +# ifndef RTLD_BOOTSTRAP +# ifdef __ILP32__ + case R_X86_64_SIZE64: + /* Set to symbol size plus addend. */ + *(Elf64_Addr *) (uintptr_t) reloc_addr + = (Elf64_Addr) sym->st_size + reloc->r_addend; + break; + + case R_X86_64_SIZE32: +# else + case R_X86_64_SIZE64: +# endif + /* Set to symbol size plus addend. */ + value = sym->st_size; +# endif + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: + *reloc_addr = value + reloc->r_addend; + break; + +# ifndef RESOLVE_CONFLICT_FIND_MAP + case R_X86_64_DTPMOD64: +# ifdef RTLD_BOOTSTRAP + /* During startup the dynamic linker is always the module + with index 1. + XXX If this relocation is necessary move before RESOLVE + call. */ + *reloc_addr = 1; +# else + /* Get the information from the link map returned by the + resolve function. */ + if (sym_map != NULL) + *reloc_addr = sym_map->l_tls_modid; +# endif + break; + case R_X86_64_DTPOFF64: +# ifndef RTLD_BOOTSTRAP + /* During relocation all TLS symbols are defined and used. + Therefore the offset is already correct. */ + if (sym != NULL) + { + value = sym->st_value + reloc->r_addend; +# ifdef __ILP32__ + /* This relocation type computes a signed offset that is + usually negative. 
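
Stripped of the bootstrap conditionals, the two simplest cases of the switch above reduce to "store base plus addend" and "store resolved symbol plus addend". A self-contained sketch with hypothetical toy types in place of the real ElfW machinery:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t Addr;

    struct toy_rela { Addr r_offset; int r_type; int64_t r_addend; };
    enum { TOY_RELATIVE, TOY_GLOB_DAT };

    static void
    apply (unsigned char *image, Addr l_addr, const struct toy_rela *r,
	   Addr symval)
    {
      Addr *slot = (Addr *) (image + r->r_offset);
      switch (r->r_type)
	{
	case TOY_RELATIVE:
	  *slot = l_addr + r->r_addend;	/* load bias + addend */
	  break;
	case TOY_GLOB_DAT:
	  *slot = symval + r->r_addend;	/* resolved symbol + addend */
	  break;
	}
    }

    int
    main (void)
    {
      _Alignas (Addr) unsigned char image[16] = { 0 };
      struct toy_rela rel = { 0, TOY_RELATIVE, 0x100 };
      struct toy_rela glob = { 8, TOY_GLOB_DAT, 0 };
      apply (image, 0x7f0000000000, &rel, 0);
      apply (image, 0x7f0000000000, &glob, 0x401000);
      printf ("%#llx %#llx\n", (unsigned long long) *(Addr *) image,
	      (unsigned long long) *(Addr *) (image + 8));
      return 0;
    }
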
The symbol and addend values are 32 + bits but the GOT entry is 64 bits wide and the whole + 64-bit entry is used as a signed quantity, so we need + to sign-extend the computed value to 64 bits. */ + *(Elf64_Sxword *) reloc_addr = (Elf64_Sxword) (Elf32_Sword) value; +# else + *reloc_addr = value; +# endif + } +# endif + break; + case R_X86_64_TLSDESC: + { + struct tlsdesc volatile *td = + (struct tlsdesc volatile *)reloc_addr; + +# ifndef RTLD_BOOTSTRAP + if (! sym) + { + td->arg = (void*)reloc->r_addend; + td->entry = _dl_tlsdesc_undefweak; + } + else +# endif + { +# ifndef RTLD_BOOTSTRAP +# ifndef SHARED + CHECK_STATIC_TLS (map, sym_map); +# else + if (!TRY_STATIC_TLS (map, sym_map)) + { + td->arg = _dl_make_tlsdesc_dynamic + (sym_map, sym->st_value + reloc->r_addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif +# endif + { + td->arg = (void*)(sym->st_value - sym_map->l_tls_offset + + reloc->r_addend); + td->entry = _dl_tlsdesc_return; + } + } + break; + } + case R_X86_64_TPOFF64: + /* The offset is negative, forward from the thread pointer. */ +# ifndef RTLD_BOOTSTRAP + if (sym != NULL) +# endif + { +# ifndef RTLD_BOOTSTRAP + CHECK_STATIC_TLS (map, sym_map); +# endif + /* We know the offset of the object the symbol is contained in. + It is a negative value which will be added to the + thread pointer. */ + value = (sym->st_value + reloc->r_addend + - sym_map->l_tls_offset); +# ifdef __ILP32__ + /* The symbol and addend values are 32 bits but the GOT + entry is 64 bits wide and the whole 64-bit entry is used + as a signed quantity, so we need to sign-extend the + computed value to 64 bits. */ + *(Elf64_Sxword *) reloc_addr = (Elf64_Sxword) (Elf32_Sword) value; +# else + *reloc_addr = value; +# endif + } + break; +# endif + +# ifndef RTLD_BOOTSTRAP + case R_X86_64_64: + /* value + r_addend may be > 0xffffffff and R_X86_64_64 + relocation updates the whole 64-bit entry. */ + *(Elf64_Addr *) reloc_addr = (Elf64_Addr) value + reloc->r_addend; + break; +# ifndef __ILP32__ + case R_X86_64_SIZE32: + /* Set to symbol size plus addend. */ + value = sym->st_size; +# endif + case R_X86_64_32: + value += reloc->r_addend; + *(unsigned int *) reloc_addr = value; + + const char *fmt; + if (__glibc_unlikely (value > UINT_MAX)) + { + const char *strtab; + + fmt = "\ +%s: Symbol `%s' causes overflow in R_X86_64_32 relocation\n"; +# ifndef RESOLVE_CONFLICT_FIND_MAP + print_err: +# endif + strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); + + _dl_error_printf (fmt, RTLD_PROGNAME, strtab + refsym->st_name); + } + break; +# ifndef RESOLVE_CONFLICT_FIND_MAP + /* Not needed for dl-conflict.c. */ + case R_X86_64_PC32: + value += reloc->r_addend - (ElfW(Addr)) reloc_addr; + *(unsigned int *) reloc_addr = value; + if (__glibc_unlikely (value != (int) value)) + { + fmt = "\ +%s: Symbol `%s' causes overflow in R_X86_64_PC32 relocation\n"; + goto print_err; + } + break; + case R_X86_64_COPY: + if (sym == NULL) + /* This can happen in trace mode if an object could not be + found. 
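
For the COPY case just below, the interesting detail is the size clamp: only MIN(sym->st_size, refsym->st_size) bytes are copied, with a diagnostic when the definition and the copy slot disagree. Schematically, with hypothetical buffers standing in for the executable's copy slot and the shared object's definition:

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char def_in_so[16] = "shared-object";	/* definition, size 16 */
      char copy_in_exe[8];			/* COPY slot, size 8 */
      size_t sym_size = sizeof def_in_so, ref_size = sizeof copy_in_exe;

      /* Copy the smaller of the two sizes, as the code below does.  */
      memcpy (copy_in_exe, def_in_so,
	      sym_size < ref_size ? sym_size : ref_size);
      if (sym_size != ref_size)
	fprintf (stderr, "symbol has different size in shared object,"
		 " consider re-linking\n");
      return 0;
    }
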
*/ + break; + memcpy (reloc_addr_arg, (void *) value, + MIN (sym->st_size, refsym->st_size)); + if (__builtin_expect (sym->st_size > refsym->st_size, 0) + || (__builtin_expect (sym->st_size < refsym->st_size, 0) + && GLRO(dl_verbose))) + { + fmt = "\ +%s: Symbol `%s' has different size in shared object, consider re-linking\n"; + goto print_err; + } + break; +# endif + case R_X86_64_IRELATIVE: + value = map->l_addr + reloc->r_addend; + value = ((ElfW(Addr) (*) (void)) value) (); + *reloc_addr = value; + break; + default: + _dl_reloc_bad_type (map, r_type, 0); + break; +# endif + } + } +} + +auto inline void +__attribute ((always_inline)) +elf_machine_rela_relative (ElfW(Addr) l_addr, const ElfW(Rela) *reloc, + void *const reloc_addr_arg) +{ + ElfW(Addr) *const reloc_addr = reloc_addr_arg; +#if !defined RTLD_BOOTSTRAP + /* l_addr + r_addend may be > 0xffffffff and R_X86_64_RELATIVE64 + relocation updates the whole 64-bit entry. */ + if (__glibc_unlikely (ELFW(R_TYPE) (reloc->r_info) == R_X86_64_RELATIVE64)) + *(Elf64_Addr *) reloc_addr = (Elf64_Addr) l_addr + reloc->r_addend; + else +#endif + { + assert (ELFW(R_TYPE) (reloc->r_info) == R_X86_64_RELATIVE); + *reloc_addr = l_addr + reloc->r_addend; + } +} + +auto inline void +__attribute ((always_inline)) +elf_machine_lazy_rel (struct link_map *map, + ElfW(Addr) l_addr, const ElfW(Rela) *reloc, + int skip_ifunc) +{ + ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); + + /* Check for unexpected PLT reloc type. */ + if (__glibc_likely (r_type == R_X86_64_JUMP_SLOT)) + { + if (__builtin_expect (map->l_mach.plt, 0) == 0) + *reloc_addr += l_addr; + else + *reloc_addr = + map->l_mach.plt + + (((ElfW(Addr)) reloc_addr) - map->l_mach.gotplt) * 2; + } + else if (__glibc_likely (r_type == R_X86_64_TLSDESC)) + { + struct tlsdesc volatile * __attribute__((__unused__)) td = + (struct tlsdesc volatile *)reloc_addr; + + td->arg = (void*)reloc; + td->entry = (void*)(D_PTR (map, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + + map->l_addr); + } + else if (__glibc_unlikely (r_type == R_X86_64_IRELATIVE)) + { + ElfW(Addr) value = map->l_addr + reloc->r_addend; + if (__glibc_likely (!skip_ifunc)) + value = ((ElfW(Addr) (*) (void)) value) (); + *reloc_addr = value; + } + else + _dl_reloc_bad_type (map, r_type, 1); +} + +#endif /* RESOLVE_MAP */ diff --git a/REORG.TODO/sysdeps/x86_64/dl-procinfo.c b/REORG.TODO/sysdeps/x86_64/dl-procinfo.c new file mode 100644 index 0000000000..17ae800a37 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-procinfo.c @@ -0,0 +1,45 @@ +/* Data for x86-64 version of processor capability information. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
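
In elf_machine_lazy_rel's prelink-undo path above, each .got.plt slot is 8 bytes while each PLT entry is 16, so the slot's byte offset from .got.plt is doubled and added to l_mach.plt (which points at the push instruction of the first PLT entry, .plt + 0x16). A toy check of that arithmetic with made-up addresses:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Hypothetical layout: got[1] held .plt + 0x16, gotplt is &got[3].  */
      uintptr_t plt_push0 = 0x401016;
      uintptr_t gotplt = 0x403018;
      for (int k = 0; k < 3; ++k)
	{
	  uintptr_t reloc_addr = gotplt + 8 * k;	/* the GOT slot */
	  /* 8-byte slot offset doubled -> 16-byte PLT entry offset.  */
	  uintptr_t entry = plt_push0 + (reloc_addr - gotplt) * 2;
	  printf ("got slot %d -> PLT push at %#lx\n", k,
		  (unsigned long) entry);
	}
      return 0;
    }
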
*/ + +/* If anything should be added here check whether the size of each string + is still ok with the given array size. + + All the #ifdefs in the definitions are quite irritating but + necessary if we want to avoid duplicating the information. There + are three different modes: + + - PROCINFO_DECL is defined. This means we are only interested in + declarations. + + - PROCINFO_DECL is not defined: + + + if SHARED is defined the file is included in an array + initializer. The .element = { ... } syntax is needed. + + + if SHARED is not defined a normal array initialization is + needed. + */ + +#ifndef PROCINFO_CLASS +# define PROCINFO_CLASS +#endif + +#include <sysdeps/x86/dl-procinfo.c> + +#undef PROCINFO_DECL +#undef PROCINFO_CLASS diff --git a/REORG.TODO/sysdeps/x86_64/dl-runtime.c b/REORG.TODO/sysdeps/x86_64/dl-runtime.c new file mode 100644 index 0000000000..b625d1e882 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-runtime.c @@ -0,0 +1,9 @@ +/* The ABI calls for the PLT stubs to pass the index of the relocation + and not its offset. In _dl_profile_fixup and _dl_call_pltexit we + also use the index. Therefore it is wasteful to compute the offset + in the trampoline just to reverse the operation immediately + afterwards. */ +#define reloc_offset reloc_arg * sizeof (PLTREL) +#define reloc_index reloc_arg + +#include <elf/dl-runtime.c> diff --git a/REORG.TODO/sysdeps/x86_64/dl-tls.h b/REORG.TODO/sysdeps/x86_64/dl-tls.h new file mode 100644 index 0000000000..4a59d2a924 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-tls.h @@ -0,0 +1,29 @@ +/* Thread-local storage handling in the ELF dynamic linker. x86-64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> + +/* Type used for the representation of TLS information in the GOT. */ +typedef struct dl_tls_index +{ + uint64_t ti_module; + uint64_t ti_offset; +} tls_index; + + +extern void *__tls_get_addr (tls_index *ti); diff --git a/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.S b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.S new file mode 100644 index 0000000000..be3a780a1a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.S @@ -0,0 +1,245 @@ +/* Thread-local storage handling in the ELF dynamic linker. x86_64 version. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
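
The reloc_offset redefinition in dl-runtime.c above exists because x86-64 PLT stubs push a relocation index, while the generic dl-runtime code wants a byte offset into .rela.plt; the conversion is a single multiply by the entry size:

    #include <elf.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* sizeof (Elf64_Rela) is 24: r_offset, r_info, r_addend.  */
      for (unsigned long reloc_arg = 0; reloc_arg < 4; ++reloc_arg)
	printf ("index %lu -> offset %zu\n",
		reloc_arg, reloc_arg * sizeof (Elf64_Rela));
      return 0;
    }
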
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <tls.h> +#include "tlsdesc.h" + + .text + + /* This function is used to compute the TP offset for symbols in + Static TLS, i.e., whose TP offset is the same for all + threads. + + The incoming %rax points to the TLS descriptor, such that + 0(%rax) points to _dl_tlsdesc_return itself, and 8(%rax) holds + the TP offset of the symbol corresponding to the object + denoted by the argument. */ + + .hidden _dl_tlsdesc_return + .global _dl_tlsdesc_return + .type _dl_tlsdesc_return,@function + cfi_startproc + .align 16 +_dl_tlsdesc_return: + movq 8(%rax), %rax + ret + cfi_endproc + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return + + /* This function is used for undefined weak TLS symbols, for + which the base address (i.e., disregarding any addend) should + resolve to NULL. + + %rax points to the TLS descriptor, such that 0(%rax) points to + _dl_tlsdesc_undefweak itself, and 8(%rax) holds the addend. + We return the addend minus the TP, such that, when the caller + adds TP, it gets the addend back. If that's zero, as usual, + that's most likely a NULL pointer. */ + + .hidden _dl_tlsdesc_undefweak + .global _dl_tlsdesc_undefweak + .type _dl_tlsdesc_undefweak,@function + cfi_startproc + .align 16 +_dl_tlsdesc_undefweak: + movq 8(%rax), %rax + subq %fs:0, %rax + ret + cfi_endproc + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak + +#ifdef SHARED + .hidden _dl_tlsdesc_dynamic + .global _dl_tlsdesc_dynamic + .type _dl_tlsdesc_dynamic,@function + + /* %rax points to the TLS descriptor, such that 0(%rax) points to + _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct + tlsdesc_dynamic_arg object. It must return in %rax the offset + between the thread pointer and the object denoted by the + argument, without clobbering any registers. + + The assembly code that follows is a rendition of the following + C code, hand-optimized a little bit. + +ptrdiff_t +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax")) +{ + struct tlsdesc_dynamic_arg *td = tdp->arg; + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); + if (__builtin_expect (td->gen_count <= dtv[0].counter + && (dtv[td->tlsinfo.ti_module].pointer.val + != TLS_DTV_UNALLOCATED), + 1)) + return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset + - __thread_pointer; + + return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer; +} +*/ + cfi_startproc + .align 16 +_dl_tlsdesc_dynamic: + /* Preserve call-clobbered registers that we modify. + We need two scratch regs anyway. */ + movq %rsi, -16(%rsp) + movq %fs:DTV_OFFSET, %rsi + movq %rdi, -8(%rsp) + movq TLSDESC_ARG(%rax), %rdi + movq (%rsi), %rax + cmpq %rax, TLSDESC_GEN_COUNT(%rdi) + ja .Lslow + movq TLSDESC_MODID(%rdi), %rax + salq $4, %rax + movq (%rax,%rsi), %rax + cmpq $-1, %rax + je .Lslow + addq TLSDESC_MODOFF(%rdi), %rax +.Lret: + movq -16(%rsp), %rsi + subq %fs:0, %rax + movq -8(%rsp), %rdi + ret +.Lslow: + /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9, + r10 and r11. Also, align the stack, that's off by 8 bytes. */ + subq $72, %rsp + cfi_adjust_cfa_offset (72) + movq %rdx, 8(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + /* %rdi already points to the tlsinfo data structure. 
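
The C rendition quoted above compiles almost as-is once the thread-pointer plumbing is faked out. A minimal stand-alone model of the DTV fast path (toy types, stubbed slow path):

    #include <stddef.h>
    #include <stdio.h>

    #define TLS_DTV_UNALLOCATED ((void *) -1)

    struct toy_tls_index { size_t ti_module; size_t ti_offset; };
    union toy_dtv { size_t counter; void *pointer; };

    static char module1_block[64];

    /* If the DTV is new enough and the module's block is allocated,
       the address comes straight out of the DTV; otherwise fall back
       (the real code calls __tls_get_addr).  */
    static void *
    toy_tls_get_addr (const struct toy_tls_index *ti, union toy_dtv *dtv,
		      size_t gen_count)
    {
      if (gen_count <= dtv[0].counter
	  && dtv[ti->ti_module].pointer != TLS_DTV_UNALLOCATED)
	return (char *) dtv[ti->ti_module].pointer + ti->ti_offset;
      return NULL;	/* slow path stub */
    }

    int
    main (void)
    {
      union toy_dtv dtv[2];
      dtv[0].counter = 1;		/* generation count */
      dtv[1].pointer = module1_block;	/* module 1's TLS block */
      struct toy_tls_index ti = { 1, 16 };
      printf ("%p\n", toy_tls_get_addr (&ti, dtv, 1));
      return 0;
    }
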
*/ + call HIDDEN_JUMPTARGET (__tls_get_addr) + movq 8(%rsp), %rdx + movq 16(%rsp), %rcx + movq 24(%rsp), %r8 + movq 32(%rsp), %r9 + movq 40(%rsp), %r10 + movq 48(%rsp), %r11 + addq $72, %rsp + cfi_adjust_cfa_offset (-72) + jmp .Lret + cfi_endproc + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic +#endif /* SHARED */ + + /* This function is a wrapper for a lazy resolver for TLS_DESC + RELA relocations. The incoming 0(%rsp) points to the caller's + link map, pushed by the dynamic object's internal lazy TLS + resolver front-end before tail-calling us. We need to pop it + ourselves. %rax points to a TLS descriptor, such that 0(%rax) + holds the address of the internal resolver front-end (unless + some other thread beat us to resolving it) and 8(%rax) holds a + pointer to the relocation. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. */ + + .hidden _dl_tlsdesc_resolve_rela + .global _dl_tlsdesc_resolve_rela + .type _dl_tlsdesc_resolve_rela,@function + cfi_startproc + .align 16 + /* The PLT entry will have pushed the link_map pointer. */ +_dl_tlsdesc_resolve_rela: + cfi_adjust_cfa_offset (8) + /* Save all call-clobbered registers. Add 8 bytes for push in + the PLT entry to align the stack. */ + subq $80, %rsp + cfi_adjust_cfa_offset (80) + movq %rax, (%rsp) + movq %rdi, 8(%rsp) + movq %rax, %rdi /* Pass tlsdesc* in %rdi. */ + movq %rsi, 16(%rsp) + movq 80(%rsp), %rsi /* Pass link_map* in %rsi. */ + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %rdx, 56(%rsp) + movq %rcx, 64(%rsp) + call _dl_tlsdesc_resolve_rela_fixup + movq (%rsp), %rax + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 24(%rsp), %r8 + movq 32(%rsp), %r9 + movq 40(%rsp), %r10 + movq 48(%rsp), %r11 + movq 56(%rsp), %rdx + movq 64(%rsp), %rcx + addq $88, %rsp + cfi_adjust_cfa_offset (-88) + jmp *(%rax) + cfi_endproc + .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela + + /* This function is a placeholder for lazy resolving of TLS + relocations. Once some thread starts resolving a TLS + relocation, it sets up the TLS descriptor to use this + resolver, such that other threads that would attempt to + resolve it concurrently may skip the call to the original lazy + resolver and go straight to a condition wait. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. */ + + .hidden _dl_tlsdesc_resolve_hold + .global _dl_tlsdesc_resolve_hold + .type _dl_tlsdesc_resolve_hold,@function + cfi_startproc + .align 16 +_dl_tlsdesc_resolve_hold: +0: + /* Save all call-clobbered registers. */ + subq $72, %rsp + cfi_adjust_cfa_offset (72) + movq %rax, (%rsp) + movq %rdi, 8(%rsp) + movq %rax, %rdi /* Pass tlsdesc* in %rdi. */ + movq %rsi, 16(%rsp) + /* Pass _dl_tlsdesc_resolve_hold's address in %rsi. */ + leaq . 
- _dl_tlsdesc_resolve_hold(%rip), %rsi + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %rdx, 56(%rsp) + movq %rcx, 64(%rsp) + call _dl_tlsdesc_resolve_hold_fixup +1: + movq (%rsp), %rax + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 24(%rsp), %r8 + movq 32(%rsp), %r9 + movq 40(%rsp), %r10 + movq 48(%rsp), %r11 + movq 56(%rsp), %rdx + movq 64(%rsp), %rcx + addq $72, %rsp + cfi_adjust_cfa_offset (-72) + jmp *(%rax) + cfi_endproc + .size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold diff --git a/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.h b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.h new file mode 100644 index 0000000000..14019a2610 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-tlsdesc.h @@ -0,0 +1,70 @@ +/* Thread-local storage descriptor handling in the ELF dynamic linker. + x86_64 version. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> + +#ifndef _X86_64_DL_TLSDESC_H +# define _X86_64_DL_TLSDESC_H 1 + +/* Type used to represent a TLS descriptor in the GOT. */ +struct tlsdesc +{ + /* Anonymous union is used here to ensure that GOT entry slot is always + 8 bytes for both x32 and x86-64. */ + union + { + ptrdiff_t (*entry) (struct tlsdesc *on_rax); + uint64_t entry_slot; + }; + union + { + void *arg; + uint64_t arg_slot; + }; +}; + +typedef struct dl_tls_index +{ + uint64_t ti_module; + uint64_t ti_offset; +} tls_index; + +/* Type used as the argument in a TLS descriptor for a symbol that + needs dynamic TLS offsets. */ +struct tlsdesc_dynamic_arg +{ + tls_index tlsinfo; + size_t gen_count; +}; + +extern ptrdiff_t attribute_hidden + _dl_tlsdesc_return(struct tlsdesc *on_rax), + _dl_tlsdesc_undefweak(struct tlsdesc *on_rax), + _dl_tlsdesc_resolve_rela(struct tlsdesc *on_rax), + _dl_tlsdesc_resolve_hold(struct tlsdesc *on_rax); + +# ifdef SHARED +extern void *_dl_make_tlsdesc_dynamic (struct link_map *map, + size_t ti_offset) + internal_function attribute_hidden; + +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic(struct tlsdesc *); +# endif + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/dl-trampoline.S b/REORG.TODO/sysdeps/x86_64/dl-trampoline.S new file mode 100644 index 0000000000..c14c61aa58 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-trampoline.S @@ -0,0 +1,147 @@ +/* PLT trampolines. x86-64 version. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
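
A detail of struct tlsdesc worth making explicit: the anonymous unions pin both GOT slots at 8 bytes even on x32, where raw pointers are only 4 bytes wide. A local mirror of the layout with that invariant as compile-time checks (C11; the mirror is not the real declaration):

    #include <stddef.h>
    #include <stdint.h>

    struct tlsdesc_mirror
    {
      union { ptrdiff_t (*entry) (void *); uint64_t entry_slot; };
      union { void *arg; uint64_t arg_slot; };
    };

    _Static_assert (sizeof (struct tlsdesc_mirror) == 16,
		    "TLS descriptor must occupy two 8-byte GOT slots");
    _Static_assert (offsetof (struct tlsdesc_mirror, arg_slot) == 8,
		    "arg must live in the second slot");

    int main (void) { return 0; }
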
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <cpu-features.h> +#include <link-defines.h> + +#ifndef DL_STACK_ALIGNMENT +/* Due to GCC bug: + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 + + __tls_get_addr may be called with 8-byte stack alignment. Although + this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume + that stack will be always aligned at 16 bytes. We use unaligned + 16-byte move to load and store SSE registers, which has no penalty + on modern processors if stack is 16-byte aligned. */ +# define DL_STACK_ALIGNMENT 8 +#endif + +#ifndef DL_RUNTIME_UNALIGNED_VEC_SIZE +/* The maximum size in bytes of unaligned vector load and store in the + dynamic linker. Since SSE optimized memory/string functions with + aligned SSE register load and store are used in the dynamic linker, + we must set this to 8 so that _dl_runtime_resolve_sse will align the + stack before calling _dl_fixup. */ +# define DL_RUNTIME_UNALIGNED_VEC_SIZE 8 +#endif + +/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */ +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ + (VEC_SIZE > DL_STACK_ALIGNMENT \ + && VEC_SIZE > DL_RUNTIME_UNALIGNED_VEC_SIZE) + +/* Align vector register save area to 16 bytes. */ +#define REGISTER_SAVE_VEC_OFF 0 + +/* Area on stack to save and restore registers used for parameter + passing when calling _dl_fixup. */ +#ifdef __ILP32__ +# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) +# define PRESERVE_BND_REGS_PREFIX +#else +/* Align bound register save area to 16 bytes. 
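
With the default 8-byte values above, DL_RUNTIME_RESOLVE_REALIGN_STACK evaluates true for all three instantiated vector sizes; it would only drop to false if DL_RUNTIME_UNALIGNED_VEC_SIZE were raised to permit unaligned saves of the given width. The predicate, evaluated directly:

    #include <stdio.h>

    #define DL_STACK_ALIGNMENT 8
    #define DL_RUNTIME_UNALIGNED_VEC_SIZE 8
    #define REALIGN(vec) ((vec) > DL_STACK_ALIGNMENT \
			  && (vec) > DL_RUNTIME_UNALIGNED_VEC_SIZE)

    int
    main (void)
    {
      int sizes[] = { 16, 32, 64 };	/* SSE, AVX, AVX-512 */
      for (int i = 0; i < 3; ++i)
	printf ("VEC_SIZE=%d -> realign=%d\n", sizes[i], REALIGN (sizes[i]));
      return 0;	/* prints 1 for all three with the defaults */
    }
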
*/ +# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) +# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16) +# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16) +# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16) +# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16) +# ifdef HAVE_MPX_SUPPORT +# define PRESERVE_BND_REGS_PREFIX bnd +# else +# define PRESERVE_BND_REGS_PREFIX .byte 0xf2 +# endif +#endif +#define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8) +#define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) +#define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8) +#define REGISTER_SAVE_RDI (REGISTER_SAVE_RSI + 8) +#define REGISTER_SAVE_R8 (REGISTER_SAVE_RDI + 8) +#define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8) + +#define RESTORE_AVX + +#define VEC_SIZE 64 +#define VMOVA vmovdqa64 +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa64 +#else +# define VMOV vmovdqu64 +#endif +#define VEC(i) zmm##i +#define _dl_runtime_resolve _dl_runtime_resolve_avx512 +#define _dl_runtime_profile _dl_runtime_profile_avx512 +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_profile +#undef VEC +#undef VMOV +#undef VMOVA +#undef VEC_SIZE + +#define VEC_SIZE 32 +#define VMOVA vmovdqa +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa +#else +# define VMOV vmovdqu +#endif +#define VEC(i) ymm##i +#define _dl_runtime_resolve _dl_runtime_resolve_avx +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx_opt +#define _dl_runtime_profile _dl_runtime_profile_avx +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_resolve_opt +#undef _dl_runtime_profile +#undef VEC +#undef VMOV +#undef VMOVA +#undef VEC_SIZE + +/* movaps/movups is 1-byte shorter. */ +#define VEC_SIZE 16 +#define VMOVA movaps +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV movaps +#else +# define VMOV movups +#endif +#define VEC(i) xmm##i +#define _dl_runtime_resolve _dl_runtime_resolve_sse +#define _dl_runtime_profile _dl_runtime_profile_sse +#undef RESTORE_AVX +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_profile +#undef VMOV +#undef VMOVA + +/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt + to preserve the full vector registers with zero upper bits. */ +#define VMOVA vmovdqa +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa +#else +# define VMOV vmovdqu +#endif +#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt +#include "dl-trampoline.h" diff --git a/REORG.TODO/sysdeps/x86_64/dl-trampoline.h b/REORG.TODO/sysdeps/x86_64/dl-trampoline.h new file mode 100644 index 0000000000..8db24c16ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/dl-trampoline.h @@ -0,0 +1,647 @@ +/* PLT trampolines. x86-64 version. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
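
The offset macros above chain together, so the whole save-area layout follows from VEC_SIZE. Reproducing the arithmetic for the AVX-512 instantiation (VEC_SIZE = 64, non-x32, so four 16-byte bound-register slots precede the GPRs):

    #include <stdio.h>

    int
    main (void)
    {
      int vec_size = 64;
      int bnd0 = 0 + vec_size * 8;	/* REGISTER_SAVE_VEC_OFF == 0 */
      int rax = bnd0 + 16 * 4;		/* after BND0..BND3 */
      int rcx = rax + 8, rdx = rcx + 8, rsi = rdx + 8;
      int rdi = rsi + 8, r8 = rdi + 8, r9 = r8 + 8;
      /* 8 * 7 GPRs + 16 * 4 bounds + 64 * 8 vectors = 632 raw bytes.  */
      printf ("BND0=%d RAX=%d R9=%d raw total=%d\n", bnd0, rax, r9, r9 + 8);
      return 0;
    }
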
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#undef REGISTER_SAVE_AREA_RAW +#ifdef __ILP32__ +/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to + VEC7. */ +# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8) +#else +/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as + BND0, BND1, BND2, BND3 and VEC0 to VEC7. */ +# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8) +#endif + +#undef REGISTER_SAVE_AREA +#undef LOCAL_STORAGE_AREA +#undef BASE +#if DL_RUNTIME_RESOLVE_REALIGN_STACK +# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8) +/* Local stack area before jumping to function address: RBX. */ +# define LOCAL_STORAGE_AREA 8 +# define BASE rbx +# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0 +# error REGISTER_SAVE_AREA must be multples of VEC_SIZE +# endif +#else +# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW +/* Local stack area before jumping to function address: All saved + registers. */ +# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA +# define BASE rsp +# if (REGISTER_SAVE_AREA % 16) != 8 +# error REGISTER_SAVE_AREA must be odd multples of 8 +# endif +#endif + + .text +#ifdef _dl_runtime_resolve_opt +/* Use the smallest vector registers to preserve the full YMM/ZMM + registers to avoid SSE transition penalty. */ + +# if VEC_SIZE == 32 +/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero + and preserve %xmm0 - %xmm7 registers with the zero upper bits. Since + there is no SSE transition penalty on AVX512 processors which don't + support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't + provided. */ + .globl _dl_runtime_resolve_avx_slow + .hidden _dl_runtime_resolve_avx_slow + .type _dl_runtime_resolve_avx_slow, @function + .align 16 +_dl_runtime_resolve_avx_slow: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + vorpd %ymm0, %ymm1, %ymm8 + vorpd %ymm2, %ymm3, %ymm9 + vorpd %ymm4, %ymm5, %ymm10 + vorpd %ymm6, %ymm7, %ymm11 + vorpd %ymm8, %ymm9, %ymm9 + vorpd %ymm10, %ymm11, %ymm10 + vpcmpeqd %xmm8, %xmm8, %xmm8 + vorpd %ymm9, %ymm10, %ymm10 + vptest %ymm10, %ymm8 + # Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any + # %ymm0 - %ymm7 registers aren't zero. + PRESERVE_BND_REGS_PREFIX + jnc _dl_runtime_resolve_avx + # Use vzeroupper to avoid SSE transition penalty. + vzeroupper + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits + # when the upper 128 bits of %ymm0 - %ymm7 registers are zero. + PRESERVE_BND_REGS_PREFIX + jmp _dl_runtime_resolve_sse_vex + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow +# endif + +/* Use XGETBV with ECX == 1 to check which bits in vector registers are + non-zero and only preserve the non-zero lower bits with zero upper + bits. 
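
The heart of _dl_runtime_resolve_avx_slow above is a vptest asking "are any upper 128 bits live?". The same test for a single register, written with intrinsics (compile with -mavx2; the assembly first ORs all eight argument registers together before one combined test):

    #include <immintrin.h>
    #include <stdio.h>

    static int
    upper_bits_in_use (__m256i v)
    {
      __m128i hi = _mm256_extracti128_si256 (v, 1);	/* upper 128 bits */
      return !_mm_test_all_zeros (hi, hi);		/* SSE4.1 ptest */
    }

    int
    main (void)
    {
      __m256i zero_upper = _mm256_set_m128i (_mm_setzero_si128 (),
					     _mm_set1_epi32 (-1));
      __m256i full = _mm256_set1_epi32 (-1);
      printf ("%d %d\n", upper_bits_in_use (zero_upper),
	      upper_bits_in_use (full));	/* prints 0 1 */
      return 0;
    }
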
*/ + .globl _dl_runtime_resolve_opt + .hidden _dl_runtime_resolve_opt + .type _dl_runtime_resolve_opt, @function + .align 16 +_dl_runtime_resolve_opt: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + pushq %rax + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rax, 0) + pushq %rcx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rcx, 0) + pushq %rdx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rdx, 0) + movl $1, %ecx + xgetbv + movl %eax, %r11d + popq %rdx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rdx) + popq %rcx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rcx) + popq %rax + cfi_adjust_cfa_offset(-8) + cfi_restore (%rax) +# if VEC_SIZE == 32 + # For YMM registers, check if YMM state is in use. + andl $bit_YMM_state, %r11d + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if + # YMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + jz _dl_runtime_resolve_sse_vex +# elif VEC_SIZE == 16 + # For ZMM registers, check if YMM state and ZMM state are in + # use. + andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d + cmpl $bit_YMM_state, %r11d + # Preserve %zmm0 - %zmm7 registers if ZMM state is in use. + PRESERVE_BND_REGS_PREFIX + jg _dl_runtime_resolve_avx512 + # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if + # ZMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + je _dl_runtime_resolve_avx + # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if + # neither YMM state nor ZMM state are in use. +# else +# error Unsupported VEC_SIZE! +# endif + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt +#endif + .globl _dl_runtime_resolve + .hidden _dl_runtime_resolve + .type _dl_runtime_resolve, @function + .align 16 + cfi_startproc +_dl_runtime_resolve: + cfi_adjust_cfa_offset(16) # Incorporate PLT +#if DL_RUNTIME_RESOLVE_REALIGN_STACK +# if LOCAL_STORAGE_AREA != 8 +# error LOCAL_STORAGE_AREA must be 8 +# endif + pushq %rbx # push subtracts stack by 8. + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rbx, 0) + mov %RSP_LP, %RBX_LP + cfi_def_cfa_register(%rbx) + and $-VEC_SIZE, %RSP_LP +#endif + sub $REGISTER_SAVE_AREA, %RSP_LP +#if !DL_RUNTIME_RESOLVE_REALIGN_STACK + cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) +#endif + # Preserve registers otherwise clobbered. + movq %rax, REGISTER_SAVE_RAX(%rsp) + movq %rcx, REGISTER_SAVE_RCX(%rsp) + movq %rdx, REGISTER_SAVE_RDX(%rsp) + movq %rsi, REGISTER_SAVE_RSI(%rsp) + movq %rdi, REGISTER_SAVE_RDI(%rsp) + movq %r8, REGISTER_SAVE_R8(%rsp) + movq %r9, REGISTER_SAVE_R9(%rsp) + VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp) + VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp) + VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp) + VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp) + VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp) + VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp) + VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp) + VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp) +#ifndef __ILP32__ + # We also have to preserve bound registers. These are nops if + # Intel MPX isn't available or disabled. 
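
The XGETBV(ECX = 1) dispatch above reduces to a small decision function over the XINUSE mask. The bit values below follow the XSAVE state-component numbering (YMM is component 2, the ZMM0-15 high halves component 6) as mirrored from glibc's cpu-features.h; treat them as assumptions, not checked here:

    #include <stdio.h>

    #define bit_YMM_state     (1 << 2)
    #define bit_ZMM0_15_state (1 << 6)

    static const char *
    pick_resolver (unsigned int xinuse)
    {
      unsigned int masked = xinuse & (bit_YMM_state | bit_ZMM0_15_state);
      if (masked > bit_YMM_state)
	return "_dl_runtime_resolve_avx512";	/* ZMM state live */
      if (masked == bit_YMM_state)
	return "_dl_runtime_resolve_avx";	/* only YMM state live */
      return "_dl_runtime_resolve_sse_vex";	/* neither in use */
    }

    int
    main (void)
    {
      printf ("%s\n", pick_resolver (0));
      printf ("%s\n", pick_resolver (bit_YMM_state));
      printf ("%s\n", pick_resolver (bit_YMM_state | bit_ZMM0_15_state));
      return 0;
    }
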
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
+	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
+	call _dl_fixup		# Call resolver.
+	mov %RAX_LP, %R11_LP	# Save return value.
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	mov %RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq (%rsp), %rbx
+	cfi_restore(%rbx)
+#endif
+	# Adjust stack (PLT did 2 pushes).
+	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
+	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
+   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
+   But we don't need another _dl_runtime_profile for XMM registers.  */
+#if !defined PROF && defined _dl_runtime_profile
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
+#  error LR_VECTOR_OFFSET must be a multiple of VEC_SIZE
+# endif
+
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth parameter must be VEC_SIZE-byte aligned.  This must
+	   be explicitly enforced.  We have to set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	sub $32, %RSP_LP	# Allocate the local storage.
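
Stripped of all the register juggling, the lazy-binding contract the resolver trampoline implements is: the first call lands in a stub, the stub computes the target and patches the slot, and every later call is direct. The function-pointer analogue in portable C:

    #include <stdio.h>

    static int resolver_stub (int x);

    /* The "GOT slot": starts out pointing at the stub.  */
    static int (*slot) (int) = resolver_stub;

    static int target (int x) { return x + 1; }

    static int
    resolver_stub (int x)
    {
      slot = target;	/* what _dl_fixup's return value accomplishes */
      return slot (x);	/* the 'jmp *%r11' */
    }

    int
    main (void)
    {
      printf ("%d\n", slot (1));	/* resolves, then calls */
      printf ("%d\n", slot (2));	/* direct */
      return 0;
    }
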
+ cfi_adjust_cfa_offset(32) + movq %rbx, (%rsp) + cfi_rel_offset(%rbx, 0) + + /* On the stack: + 56(%rbx) parameter #1 + 48(%rbx) return address + + 40(%rbx) reloc index + 32(%rbx) link_map + + 24(%rbx) La_x86_64_regs pointer + 16(%rbx) framesize + 8(%rbx) rax + (%rbx) rbx + */ + + movq %rax, 8(%rsp) + mov %RSP_LP, %RBX_LP + cfi_def_cfa_register(%rbx) + + /* Actively align the La_x86_64_regs structure. */ + and $-VEC_SIZE, %RSP_LP + /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers + to detect if any xmm0-xmm7 registers are changed by audit + module. */ + sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP + movq %rsp, 24(%rbx) + + /* Fill the La_x86_64_regs structure. */ + movq %rdx, LR_RDX_OFFSET(%rsp) + movq %r8, LR_R8_OFFSET(%rsp) + movq %r9, LR_R9_OFFSET(%rsp) + movq %rcx, LR_RCX_OFFSET(%rsp) + movq %rsi, LR_RSI_OFFSET(%rsp) + movq %rdi, LR_RDI_OFFSET(%rsp) + movq %rbp, LR_RBP_OFFSET(%rsp) + + lea 48(%rbx), %RAX_LP + movq %rax, LR_RSP_OFFSET(%rsp) + + /* We always store the XMM registers even if AVX is available. + This is to provide backward binary compatibility for existing + audit modules. */ + movaps %xmm0, (LR_XMM_OFFSET)(%rsp) + movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov %bnd0, (LR_BND_OFFSET)(%rsp) # Preserve bound + bndmov %bnd1, (LR_BND_OFFSET + BND_SIZE)(%rsp) # registers. Nops if + bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp) # MPX not available + bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp) # or disabled. +# else + .byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET) + .byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE) + .byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2) + .byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3) +# endif +# endif + +# ifdef RESTORE_AVX + /* This is to support AVX audit modules. */ + VMOVA %VEC(0), (LR_VECTOR_OFFSET)(%rsp) + VMOVA %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + + /* Save xmm0-xmm7 registers to detect if any of them are + changed by audit module. */ + vmovdqa %xmm0, (LR_SIZE)(%rsp) + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) + vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) + vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) + vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) + vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) + vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) + vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) +# endif + + mov %RSP_LP, %RCX_LP # La_x86_64_regs pointer to %rcx. + mov 48(%rbx), %RDX_LP # Load return address if needed. + mov 40(%rbx), %RSI_LP # Copy args pushed by PLT in register. + mov 32(%rbx), %RDI_LP # %rdi: link_map, %rsi: reloc_index + lea 16(%rbx), %R8_LP # Address of framesize + call _dl_profile_fixup # Call resolver. + + mov %RAX_LP, %R11_LP # Save return value. + + movq 8(%rbx), %rax # Get back register content. 
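
The framesize protocol visible above (lea 16(%rbx), %r8 passes the framesize address to _dl_profile_fixup) is driven by the audit module's pltenter hook: writing a non-negative framesize requests the post-return _dl_call_pltexit path. A minimal sketch of such a module; the la_x86_64_gnu_pltenter name matches ARCH_LA_PLTENTER from dl-machine.h, but flag handling and error checking are elided:

    /* Build: gcc -shared -fPIC audit.c -o audit.so
       Run:   LD_AUDIT=./audit.so ./some-program  */
    #define _GNU_SOURCE
    #include <link.h>
    #include <stdio.h>
    #include <string.h>

    unsigned int
    la_version (unsigned int version)
    {
      return version;
    }

    unsigned int
    la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie)
    {
      /* Ask for PLT tracking in both directions.  */
      return LA_FLG_BINDTO | LA_FLG_BINDFROM;
    }

    Elf64_Addr
    la_x86_64_gnu_pltenter (Elf64_Sym *sym, unsigned int ndx,
			    uintptr_t *refcook, uintptr_t *defcook,
			    La_x86_64_regs *regs, unsigned int *flags,
			    const char *symname, long int *framesizep)
    {
      if (strcmp (symname, "malloc") == 0)
	{
	  fprintf (stderr, "pltenter: %s(%lu)\n", symname,
		   (unsigned long) regs->lr_rdi);
	  *framesizep = 128;	/* request the pltexit call */
	}
      return sym->st_value;
    }
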
+ movq LR_RDX_OFFSET(%rsp), %rdx + movq LR_R8_OFFSET(%rsp), %r8 + movq LR_R9_OFFSET(%rsp), %r9 + + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + +# ifdef RESTORE_AVX + /* Check if any xmm0-xmm7 registers are changed by audit + module. */ + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0) + vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1) + vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2) + vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3) + vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4) + vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5) + vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6) + vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 2f + vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + jmp 1f +2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7) + vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +1: +# endif + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov (LR_BND_OFFSET)(%rsp), %bnd0 # Restore bound + bndmov (LR_BND_OFFSET + BND_SIZE)(%rsp), %bnd1 # registers. + bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2 + bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3 +# else + .byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET) + .byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE) + .byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2) + .byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3) +# endif +# endif + + mov 16(%rbx), %R10_LP # Anything in framesize? 
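
Each comparison block above is the same idiom: compare the saved copy against the live register with pcmpeqq, collapse with pmovmskb, and test for 0xffff (all 16 byte lanes equal). One round of it with intrinsics (SSE4.1; the assembly uses the VEX-encoded vpcmpeqq):

    #include <smmintrin.h>
    #include <stdio.h>

    /* All 16 byte-compare bits set means the register still equals
       its saved copy.  */
    static int
    unchanged (__m128i saved, __m128i now)
    {
      __m128i eq = _mm_cmpeq_epi64 (saved, now);
      return _mm_movemask_epi8 (eq) == 0xffff;
    }

    int
    main (void)
    {
      __m128i a = _mm_set_epi64x (1, 2);
      __m128i b = _mm_set_epi64x (1, 3);
      printf ("%d %d\n", unchanged (a, a), unchanged (a, b)); /* 1 0 */
      return 0;
    }
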
+ test %R10_LP, %R10_LP + PRESERVE_BND_REGS_PREFIX + jns 3f + + /* There's nothing in the frame size, so there + will be no call to the _dl_call_pltexit. */ + + /* Get back registers content. */ + movq LR_RCX_OFFSET(%rsp), %rcx + movq LR_RSI_OFFSET(%rsp), %rsi + movq LR_RDI_OFFSET(%rsp), %rdi + + mov %RBX_LP, %RSP_LP + movq (%rsp), %rbx + cfi_restore(%rbx) + cfi_def_cfa_register(%rsp) + + add $48, %RSP_LP # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + PRESERVE_BND_REGS_PREFIX + jmp *%r11 # Jump to function address. + +3: + cfi_adjust_cfa_offset(48) + cfi_rel_offset(%rbx, 0) + cfi_def_cfa_register(%rbx) + + /* At this point we need to prepare new stack for the function + which has to be called. We copy the original stack to a + temporary buffer of the size specified by the 'framesize' + returned from _dl_profile_fixup */ + + lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack + add $8, %R10_LP + and $-16, %R10_LP + mov %R10_LP, %RCX_LP + sub %R10_LP, %RSP_LP + mov %RSP_LP, %RDI_LP + shr $3, %RCX_LP + rep + movsq + + movq 24(%rdi), %rcx # Get back register content. + movq 32(%rdi), %rsi + movq 40(%rdi), %rdi + + PRESERVE_BND_REGS_PREFIX + call *%r11 + + mov 24(%rbx), %RSP_LP # Drop the copied stack content + + /* Now we have to prepare the La_x86_64_retval structure for the + _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, + so we just need to allocate the sizeof(La_x86_64_retval) space on + the stack, since the alignment has already been taken care of. */ +# ifdef RESTORE_AVX + /* sizeof(La_x86_64_retval). Need extra space for 2 SSE + registers to detect if xmm0/xmm1 registers are changed + by audit module. */ + sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP +# else + sub $LRV_SIZE, %RSP_LP # sizeof(La_x86_64_retval) +# endif + mov %RSP_LP, %RCX_LP # La_x86_64_retval argument to %rcx. + + /* Fill in the La_x86_64_retval structure. */ + movq %rax, LRV_RAX_OFFSET(%rcx) + movq %rdx, LRV_RDX_OFFSET(%rcx) + + movaps %xmm0, LRV_XMM0_OFFSET(%rcx) + movaps %xmm1, LRV_XMM1_OFFSET(%rcx) + +# ifdef RESTORE_AVX + /* This is to support AVX audit modules. */ + VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx) + VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx) + + /* Save xmm0/xmm1 registers to detect if they are changed + by audit module. */ + vmovdqa %xmm0, (LRV_SIZE)(%rcx) + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) +# endif + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov %bnd0, LRV_BND0_OFFSET(%rcx) # Preserve returned bounds. + bndmov %bnd1, LRV_BND1_OFFSET(%rcx) +# else + .byte 0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET) + .byte 0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET) +# endif +# endif + + fstpt LRV_ST0_OFFSET(%rcx) + fstpt LRV_ST1_OFFSET(%rcx) + + movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. + movq 40(%rbx), %rsi # Copy args pushed by PLT in register. + movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index + call _dl_call_pltexit + + /* Restore return registers. */ + movq LRV_RAX_OFFSET(%rsp), %rax + movq LRV_RDX_OFFSET(%rsp), %rdx + + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 + +# ifdef RESTORE_AVX + /* Check if xmm0/xmm1 registers are changed by audit module. 
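
The frame copy above sizes the temporary stack with "add $8; and $-16": add room for a return address, then mask down to a 16-byte multiple so the copied frame keeps the ABI stack alignment. The arithmetic by itself:

    #include <stdio.h>

    /* 'add $8, %r10; and $-16, %r10' from the copy path above.  */
    static long
    frame_alloc (long framesize)
    {
      return (framesize + 8) & -16;
    }

    int
    main (void)
    {
      long sizes[] = { 0, 8, 100, 128 };
      for (int i = 0; i < 4; ++i)
	printf ("%ld -> %ld\n", sizes[i], frame_alloc (sizes[i]));
      return 0;	/* 0->0, 8->16, 100->96, 128->128 */
    }
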
*/ + vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + jne 1f + VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0) + +1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + jne 1f + VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1) + +1: +# endif + +# ifndef __ILP32__ +# ifdef HAVE_MPX_SUPPORT + bndmov LRV_BND0_OFFSET(%rsp), %bnd0 # Restore bound registers. + bndmov LRV_BND1_OFFSET(%rsp), %bnd1 +# else + .byte 0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET) + .byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET) +# endif +# endif + + fldt LRV_ST1_OFFSET(%rsp) + fldt LRV_ST0_OFFSET(%rsp) + + mov %RBX_LP, %RSP_LP + movq (%rsp), %rbx + cfi_restore(%rbx) + cfi_def_cfa_register(%rsp) + + add $48, %RSP_LP # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + PRESERVE_BND_REGS_PREFIX + retq + + cfi_endproc + .size _dl_runtime_profile, .-_dl_runtime_profile +#endif diff --git a/REORG.TODO/sysdeps/x86_64/ffs.c b/REORG.TODO/sysdeps/x86_64/ffs.c new file mode 100644 index 0000000000..132812c488 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ffs.c @@ -0,0 +1,39 @@ +/* ffs -- find first set bit in a word, counted from least significant end. + For AMD x86-64. + This file is part of the GNU C Library. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <string.h> + +#undef ffs + +int +__ffs (int x) +{ + int cnt; + int tmp; + + asm ("bsfl %2,%0\n" /* Count low bits in X and store in %1. */ + "cmovel %1,%0\n" /* If number was zero, use -1 as result. */ + : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1)); + + return cnt + 1; +} +weak_alias (__ffs, ffs) +libc_hidden_def (__ffs) +libc_hidden_builtin_def (ffs) diff --git a/REORG.TODO/sysdeps/x86_64/ffsll.c b/REORG.TODO/sysdeps/x86_64/ffsll.c new file mode 100644 index 0000000000..47111ce61b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ffsll.c @@ -0,0 +1,42 @@ +/* ffsll -- find first set bit in a word, counted from least significant end. + For AMD x86-64. + This file is part of the GNU C Library. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
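
The bsf/cmov pair in __ffs above computes the index of the least-significant set bit counting from 1, with the cmov substituting -1 on zero input so the final +1 yields 0. A portable reference using the GCC/Clang ctz builtin:

    #include <limits.h>
    #include <stdio.h>
    #include <strings.h>

    static int
    ffs_ref (int x)
    {
      /* __builtin_ctz is undefined for 0, so guard it explicitly.  */
      return x == 0 ? 0 : __builtin_ctz ((unsigned int) x) + 1;
    }

    int
    main (void)
    {
      int probes[] = { 0, 1, 8, INT_MIN };
      for (int i = 0; i < 4; ++i)
	printf ("ffs(%d) = %d (libc: %d)\n", probes[i],
		ffs_ref (probes[i]), ffs (probes[i]));
      return 0;
    }
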
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef ffsll
+
+int
+ffsll (long long int x)
+{
+ long long int cnt;
+ long long int tmp;
+
+ asm ("bsfq %2,%0\n" /* Count low bits in X and store in %0. */
+ "cmoveq %1,%0\n" /* If number was zero, use -1 as result. */
+ : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1));
+
+ return cnt + 1;
+}
+
+#ifndef __ILP32__
+#undef ffsl
+weak_alias (ffsll, ffsl)
+#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/Implies b/REORG.TODO/sysdeps/x86_64/fpu/Implies new file mode 100644 index 0000000000..2b745a34fb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/Implies @@ -0,0 +1 @@ +x86/fpu diff --git a/REORG.TODO/sysdeps/x86_64/fpu/Makefile b/REORG.TODO/sysdeps/x86_64/fpu/Makefile new file mode 100644 index 0000000000..2b7d69bb50 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/Makefile @@ -0,0 +1,239 @@ +ifeq ($(subdir),mathvec)
+libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
+ svml_d_cos4_core svml_d_cos8_core \
+ svml_d_sin2_core svml_d_sin4_core_avx \
+ svml_d_sin4_core svml_d_sin8_core svml_d_trig_data \
+ svml_s_cosf4_core svml_s_cosf8_core_avx \
+ svml_s_cosf8_core svml_s_cosf16_core svml_s_trig_data \
+ svml_s_sinf4_core svml_s_sinf8_core_avx \
+ svml_s_sinf8_core svml_s_sinf16_core \
+ svml_d_sincos2_core svml_d_sincos4_core_avx \
+ svml_d_sincos4_core svml_d_sincos8_core \
+ svml_d_log2_core svml_d_log4_core_avx svml_d_log4_core \
+ svml_d_log8_core svml_d_log_data svml_s_logf4_core \
+ svml_s_logf8_core_avx svml_s_logf8_core svml_s_logf16_core \
+ svml_s_logf_data svml_d_exp2_core svml_d_exp4_core_avx \
+ svml_d_exp4_core svml_d_exp8_core svml_d_exp_data \
+ svml_s_expf4_core svml_s_expf8_core_avx svml_s_expf8_core \
+ svml_s_expf16_core svml_s_expf_data svml_d_pow2_core \
+ svml_d_pow4_core_avx svml_d_pow4_core svml_d_pow8_core \
+ svml_d_pow_data svml_s_powf4_core svml_s_powf8_core_avx \
+ svml_s_powf8_core svml_s_powf16_core svml_s_powf_data \
+ svml_s_sincosf4_core svml_s_sincosf8_core_avx \
+ svml_s_sincosf8_core svml_s_sincosf16_core svml_finite_alias
+
+libmvec-static-only-routines = svml_finite_alias
+endif
+
+# Variables for libmvec tests. 
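Both ffs.c and ffsll.c above rely on the same trick: bsf sets ZF when its source is zero (leaving the destination undefined), the cmove then substitutes the preloaded -1, and the final "+ 1" turns the 0-based bit index into the 1-based result, or 0 for a zero argument. A minimal caller, as a sketch only (not part of this patch) that assumes nothing beyond the documented ffs/ffsll contract:

    /* Sketch: exercises the ffs/ffsll contract implemented above.
       Build (assumption): gcc -O2 demo-ffs.c && ./a.out  */
    #define _GNU_SOURCE		/* glibc declares ffsll in <string.h>.  */
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      printf ("%d\n", ffs (0));		/* 0: no bit set.  */
      printf ("%d\n", ffs (1));		/* 1: bit 0 is set.  */
      printf ("%d\n", ffs (0x8000));	/* 16: bit 15 is set.  */
      printf ("%d\n", ffsll (1LL << 40));	/* 41: bit 40 is set.  */
      return 0;
    }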
+ifeq ($(subdir),math) +ifeq ($(build-mathvec),yes) +libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2 \ + float-vlen4 float-vlen8 float-vlen8-avx2 +tests += test-double-libmvec-alias test-double-libmvec-alias-avx \ + test-double-libmvec-alias-avx2 test-double-libmvec-alias-main \ + test-double-libmvec-alias-avx-main test-double-libmvec-alias-avx2-main \ + test-float-libmvec-alias test-float-libmvec-alias-avx \ + test-float-libmvec-alias-avx2 test-float-libmvec-alias-main \ + test-float-libmvec-alias-avx-main test-float-libmvec-alias-avx2-main \ + test-double-libmvec-sincos test-double-libmvec-sincos-avx \ + test-double-libmvec-sincos-avx2 test-float-libmvec-sincosf \ + test-float-libmvec-sincosf-avx test-float-libmvec-sincosf-avx2 +modules-names += test-double-libmvec-alias-mod \ + test-double-libmvec-alias-avx-mod \ + test-double-libmvec-alias-avx2-mod \ + test-float-libmvec-alias-mod \ + test-float-libmvec-alias-avx-mod \ + test-float-libmvec-alias-avx2-mod +modules-names-tests += test-double-libmvec-alias-mod \ + test-double-libmvec-alias-avx-mod \ + test-double-libmvec-alias-avx2-mod \ + test-float-libmvec-alias-mod \ + test-float-libmvec-alias-avx-mod \ + test-float-libmvec-alias-avx2-mod +extra-test-objs += test-double-libmvec-sincos-avx-main.o \ + test-double-libmvec-sincos-avx2-main.o \ + test-double-libmvec-sincos-main.o \ + test-float-libmvec-sincosf-avx-main.o \ + test-float-libmvec-sincosf-avx2-main.o\ + test-float-libmvec-sincosf-main.o +test-double-libmvec-alias-mod.so-no-z-defs = yes +test-double-libmvec-alias-avx-mod.so-no-z-defs = yes +test-double-libmvec-alias-avx2-mod.so-no-z-defs = yes +test-float-libmvec-alias-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx2-mod.so-no-z-defs = yes + +$(objpfx)test-double-libmvec-alias: \ + $(objpfx)test-double-libmvec-alias-mod.so +$(objpfx)test-double-libmvec-alias-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx: \ + $(objpfx)test-double-libmvec-alias-avx-mod.so +$(objpfx)test-double-libmvec-alias-avx-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx2: \ + $(objpfx)test-double-libmvec-alias-avx2-mod.so +$(objpfx)test-double-libmvec-alias-avx2-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-main: \ + $(objpfx)test-double-libmvec-alias-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx-main: \ + $(objpfx)test-double-libmvec-alias-avx-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx2-main: \ + $(objpfx)test-double-libmvec-alias-avx2-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias: \ + $(objpfx)test-float-libmvec-alias-mod.so +$(objpfx)test-float-libmvec-alias-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx: \ + $(objpfx)test-float-libmvec-alias-avx-mod.so +$(objpfx)test-float-libmvec-alias-avx-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx2: \ + $(objpfx)test-float-libmvec-alias-avx2-mod.so +$(objpfx)test-float-libmvec-alias-avx2-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-main: \ + $(objpfx)test-float-libmvec-alias-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + 
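The rules above link the alias and sincos tests against libmvec so that vectorized calls resolve at load time. Outside this harness, such calls normally arise from GCC auto-vectorization: with fast-math semantics plus OpenMP SIMD (the same ingredients as the libmvec-sincos-cflags defined further down in this Makefile), a scalar loop over sin may be compiled into calls to vector kernels such as _ZGVbN2v_sin or _ZGVdN4v_sin from the Versions list below. A stand-alone sketch, under the assumption of a libmvec-enabled glibc and a build line roughly like gcc -O2 -ffast-math -fopenmp-simd demo-mvec.c -lmvec -lm:

    /* Sketch only: whether the loop actually becomes _ZGV* calls
       depends on the compiler version and target flags; the results
       are the same either way.  */
    #include <math.h>
    #include <stdio.h>

    #define N 1024
    static double in[N], out[N];

    int
    main (void)
    {
      for (int i = 0; i < N; i++)
        in[i] = i * 0.001;

    #pragma omp simd
      for (int i = 0; i < N; i++)
        out[i] = sin (in[i]);	/* candidate for vectorization */

      printf ("%.6f\n", out[N - 1]);
      return 0;
    }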
+$(objpfx)test-float-libmvec-alias-avx-main: \ + $(objpfx)test-float-libmvec-alias-avx-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx2-main: \ + $(objpfx)test-float-libmvec-alias-avx2-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-sincos: \ + $(objpfx)test-double-libmvec-sincos.o \ + $(objpfx)test-double-libmvec-sincos-main.o $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx: \ + $(objpfx)test-double-libmvec-sincos-avx.o \ + $(objpfx)test-double-libmvec-sincos-avx-main.o $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx2: \ + $(objpfx)test-double-libmvec-sincos-avx2.o \ + $(objpfx)test-double-libmvec-sincos-avx2-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf: \ + $(objpfx)test-float-libmvec-sincosf.o \ + $(objpfx)test-float-libmvec-sincosf-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx: \ + $(objpfx)test-float-libmvec-sincosf-avx.o \ + $(objpfx)test-float-libmvec-sincosf-avx-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx2: \ + $(objpfx)test-float-libmvec-sincosf-avx2.o \ + $(objpfx)test-float-libmvec-sincosf-avx2-main.o $(libmvec) + +ifeq (yes,$(config-cflags-avx512)) +libmvec-tests += double-vlen8 float-vlen16 +tests += test-double-libmvec-alias-avx512 \ + test-float-libmvec-alias-avx512 \ + test-double-libmvec-alias-avx512-main \ + test-float-libmvec-alias-avx512-main \ + test-double-libmvec-sincos-avx512 \ + test-float-libmvec-sincosf-avx512 +modules-names += test-double-libmvec-alias-avx512-mod \ + test-float-libmvec-alias-avx512-mod +modules-names-tests += test-double-libmvec-alias-avx512-mod \ + test-float-libmvec-alias-avx512-mod +extra-test-objs += test-double-libmvec-sincos-avx512-main.o \ + test-float-libmvec-sincosf-avx512-main.o +test-double-libmvec-alias-avx512-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx512-mod.so-no-z-defs = yes + +$(objpfx)test-double-libmvec-alias-avx512: \ + $(objpfx)test-double-libmvec-alias-avx512-mod.so +$(objpfx)test-double-libmvec-alias-avx512-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx512-main: \ + $(objpfx)test-double-libmvec-alias-avx512-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx512: \ + $(objpfx)test-float-libmvec-alias-avx512-mod.so +$(objpfx)test-float-libmvec-alias-avx512-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx512-main: \ + $(objpfx)test-float-libmvec-alias-avx512-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx512: \ + $(objpfx)test-double-libmvec-sincos-avx512.o \ + $(objpfx)test-double-libmvec-sincos-avx512-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx512: \ + $(objpfx)test-float-libmvec-sincosf-avx512.o \ + $(objpfx)test-float-libmvec-sincosf-avx512-main.o $(libmvec) +endif + +double-vlen2-funcs = cos exp log pow sin sincos +double-vlen4-funcs = cos exp log pow sin sincos +double-vlen4-avx2-funcs = cos exp log pow sin sincos +double-vlen8-funcs = cos exp log pow sin sincos +float-vlen4-funcs = cos exp log pow sin sincos +float-vlen8-funcs = cos exp log pow sin sincos +float-vlen8-avx2-funcs = cos exp log pow sin sincos +float-vlen16-funcs = cos exp log pow sin sincos + +double-vlen4-arch-ext-cflags = -mavx +double-vlen4-arch-ext2-cflags = -mavx2 +double-vlen8-arch-ext-cflags = -mavx512f + +float-vlen8-arch-ext-cflags = -mavx 
+float-vlen8-arch-ext2-cflags = -mavx2 +float-vlen16-arch-ext-cflags = -mavx512f + +libmvec-sincos-cflags = $(libm-test-fast-math-cflags) -fno-inline -fopenmp -Wno-unknown-pragmas +libmvec-alias-cflags = $(libmvec-sincos-cflags) -ffloat-store -ffinite-math-only + +CFLAGS-test-double-libmvec-alias-mod.c = $(libmvec-alias-cflags) +CFLAGS-test-double-libmvec-alias-avx-mod.c = $(double-vlen4-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX +CFLAGS-test-double-libmvec-alias-avx2-mod.c = $(double-vlen4-arch-ext2-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX2 +CFLAGS-test-double-libmvec-alias-avx512-mod.c = $(double-vlen8-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX512F + +CFLAGS-test-float-libmvec-alias-mod.c = $(libmvec-alias-cflags) +CFLAGS-test-float-libmvec-alias-avx-mod.c = $(double-vlen4-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX +CFLAGS-test-float-libmvec-alias-avx2-mod.c = $(double-vlen4-arch-ext2-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX2 +CFLAGS-test-float-libmvec-alias-avx512-mod.c = $(double-vlen8-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX512F + +CFLAGS-test-double-vlen4-avx2-wrappers.c = $(double-vlen4-arch-ext2-cflags) + +CFLAGS-test-float-vlen8-avx2-wrappers.c = $(float-vlen8-arch-ext2-cflags) + +CFLAGS-test-double-libmvec-sincos-main.c = $(libmvec-sincos-cflags) +CFLAGS-test-double-libmvec-sincos-avx.c = -DREQUIRE_AVX +CFLAGS-test-double-libmvec-sincos-avx-main.c = $(libmvec-sincos-cflags) $(double-vlen4-arch-ext-cflags) +CFLAGS-test-double-libmvec-sincos-avx2.c = -DREQUIRE_AVX2 +CFLAGS-test-double-libmvec-sincos-avx2-main.c = $(libmvec-sincos-cflags) $(double-vlen4-arch-ext2-cflags) +CFLAGS-test-double-libmvec-sincos-avx512.c = -DREQUIRE_AVX512F +CFLAGS-test-double-libmvec-sincos-avx512-main.c = $(libmvec-sincos-cflags) $(double-vlen8-arch-ext-cflags) + +CFLAGS-test-float-libmvec-sincosf-main.c = $(libmvec-sincos-cflags) +CFLAGS-test-float-libmvec-sincosf-avx.c = -DREQUIRE_AVX +CFLAGS-test-float-libmvec-sincosf-avx-main.c = $(libmvec-sincos-cflags) $(float-vlen8-arch-ext-cflags) +CFLAGS-test-float-libmvec-sincosf-avx2.c = -DREQUIRE_AVX2 +CFLAGS-test-float-libmvec-sincosf-avx2-main.c = $(libmvec-sincos-cflags) $(float-vlen8-arch-ext2-cflags) +CFLAGS-test-float-libmvec-sincosf-avx512.c = -DREQUIRE_AVX512F +CFLAGS-test-float-libmvec-sincosf-avx512-main.c = $(libmvec-sincos-cflags) $(float-vlen16-arch-ext-cflags) +endif +endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/Versions b/REORG.TODO/sysdeps/x86_64/fpu/Versions new file mode 100644 index 0000000000..08132045d6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/Versions @@ -0,0 +1,16 @@ +libmvec { + GLIBC_2.22 { + _ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos; + _ZGVbN2v_sin; _ZGVcN4v_sin; _ZGVdN4v_sin; _ZGVeN8v_sin; + _ZGVbN2vvv_sincos; _ZGVcN4vvv_sincos; _ZGVdN4vvv_sincos; _ZGVeN8vvv_sincos; + _ZGVbN2v_log; _ZGVcN4v_log; _ZGVdN4v_log; _ZGVeN8v_log; + _ZGVbN2v_exp; _ZGVcN4v_exp; _ZGVdN4v_exp; _ZGVeN8v_exp; + _ZGVbN2vv_pow; _ZGVcN4vv_pow; _ZGVdN4vv_pow; _ZGVeN8vv_pow; + _ZGVbN4v_cosf; _ZGVcN8v_cosf; _ZGVdN8v_cosf; _ZGVeN16v_cosf; + _ZGVbN4v_sinf; _ZGVcN8v_sinf; _ZGVdN8v_sinf; _ZGVeN16v_sinf; + _ZGVbN4v_logf; _ZGVcN8v_logf; _ZGVdN8v_logf; _ZGVeN16v_logf; + _ZGVbN4v_expf; _ZGVcN8v_expf; _ZGVdN8v_expf; _ZGVeN16v_expf; + _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf; + _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_acosl.c b/REORG.TODO/sysdeps/x86_64/fpu/e_acosl.c new file mode 
100644 index 0000000000..1ef6d3c94a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_acosl.c @@ -0,0 +1 @@ +#include "sysdeps/i386/fpu/e_acosl.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_atan2l.c b/REORG.TODO/sysdeps/x86_64/fpu/e_atan2l.c new file mode 100644 index 0000000000..bbd549f307 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_atan2l.c @@ -0,0 +1,2 @@ +#include "sysdeps/i386/fpu/e_atan2l.c" + diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_exp10l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_exp10l.S new file mode 100644 index 0000000000..d843e2b5e8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_exp10l.S @@ -0,0 +1,2 @@ +#define USE_AS_EXP10L +#include <e_expl.S> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_exp2l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_exp2l.S new file mode 100644 index 0000000000..0e059b7565 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_exp2l.S @@ -0,0 +1,58 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <x86_64-math-asm.h>
+
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##(%rip)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2l)
+ fldt 8(%rsp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ movzwl 8+8(%rsp), %eax
+ andl $0x7fff, %eax
+ cmpl $0x3fbe, %eax
+ jge 3f
+ /* Argument's exponent below -65, result rounds to 1. */
+ fld1
+ faddp
+ ret
+3: fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_expf.S b/REORG.TODO/sysdeps/x86_64/fpu/e_expf.S new file mode 100644 index 0000000000..4fd2bb1fb5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_expf.S @@ -0,0 +1,339 @@ +/* Optimized __ieee754_expf function.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ * Let K = 64 (table size). 
+ * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) + * where + * x = m*log(2)/K + y, y in [0.0..log(2)/K] + * m = n*K + j, m,n,j - signed integer, j in [0..K-1] + * values of 2^(j/K) are tabulated as T[j]. + * + * P(y) is a minimax polynomial approximation of expf(x)-1 + * on small interval [0.0..log(2)/K]. + * + * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as + * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y + * + * Special cases: + * expf(NaN) = NaN + * expf(+INF) = +INF + * expf(-INF) = 0 + * expf(x) = 1 for subnormals + * for finite argument, only expf(0)=1 is exact + * expf(x) overflows if x>88.7228317260742190 + * expf(x) underflows if x<-103.972076416015620 + */ + + .text +ENTRY(__ieee754_expf) + /* Input: single precision x in %xmm0 */ + cvtss2sd %xmm0, %xmm1 /* Convert x to double precision */ + movd %xmm0, %ecx /* Copy x */ + movsd L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */ + movsd L(DP_P2)(%rip), %xmm3 /* DP P2 */ + movl %ecx, %eax /* x */ + mulsd %xmm1, %xmm2 /* DP x*K/log(2) */ + andl $0x7fffffff, %ecx /* |x| */ + lea L(DP_T)(%rip), %rsi /* address of table T[j] */ + cmpl $0x42ad496b, %ecx /* |x|<125*log(2) ? */ + movsd L(DP_P3)(%rip), %xmm4 /* DP P3 */ + addsd L(DP_RS)(%rip), %xmm2 /* DP x*K/log(2)+RS */ + jae L(special_paths) + + /* Here if |x|<125*log(2) */ + cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */ + jb L(small_arg) + + /* Main path: here if 2^(-28)<=|x|<125*log(2) */ + cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ + movd %xmm2, %eax /* bits of n*K+j with trash */ + subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ + movl %eax, %edx /* n*K+j with trash */ + cvtss2sd %xmm2, %xmm2 /* DP t */ + andl $0x3f, %eax /* bits of j */ + mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ + andl $0xffffffc0, %edx /* bits of n */ +#ifdef __AVX__ + vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ + vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ +#else + addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ + movaps %xmm2, %xmm0 /* DP y */ + mulsd %xmm2, %xmm2 /* DP z=y*y */ +#endif + mulsd %xmm2, %xmm4 /* DP P3*z */ + addl $0x1fc0, %edx /* bits of n + SP exponent bias */ + mulsd %xmm2, %xmm3 /* DP P2*z */ + shll $17, %edx /* SP 2^n */ + addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ + addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ + movd %edx, %xmm1 /* SP 2^n */ + mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ + mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ + addsd %xmm4, %xmm0 /* DP P(y) */ + mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ + addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ + cvtsd2ss %xmm0, %xmm0 /* SP T[j]*(P(y)+1) */ + mulss %xmm1, %xmm0 /* SP result=2^n*(T[j]*(P(y)+1)) */ + ret + + .p2align 4 +L(small_arg): + /* Here if 0<=|x|<2^(-28) */ + addss L(SP_ONE)(%rip), %xmm0 /* 1.0 + x */ + /* Return 1.0 with inexact raised, except for x==0 */ + ret + + .p2align 4 +L(special_paths): + /* Here if 125*log(2)<=|x| */ + shrl $31, %eax /* Get sign bit of x, and depending on it: */ + lea L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */ + cmpl (%rdx,%rax,4), %ecx /* |x|<under/overflow bound ? */ + jbe L(near_under_or_overflow) + + /* Here if |x|>under/overflow bound */ + cmpl $0x7f800000, %ecx /* |x| is finite ? */ + jae L(arg_inf_or_nan) + + /* Here if |x|>under/overflow bound, and x is finite */ + testq %rax, %rax /* sign of x nonzero ? 
*/ + je L(res_overflow) + + /* Here if -inf<x<underflow bound (x<0) */ + movss L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */ + mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */ + ret + + .p2align 4 +L(res_overflow): + /* Here if overflow bound<x<inf (x>0) */ + movss L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */ + mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */ + ret + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(arg_nan) /* |x| is Inf ? */ + + /* Here if |x| is Inf */ + lea L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */ + movss (%rdx,%rax,4), %xmm0 /* return zero or Inf */ + ret + + .p2align 4 +L(arg_nan): + /* Here if |x| is NaN */ + addss %xmm0, %xmm0 /* Return x+x (raise invalid) */ + ret + + .p2align 4 +L(near_under_or_overflow): + /* Here if 125*log(2)<=|x|<under/overflow bound */ + cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ + movd %xmm2, %eax /* bits of n*K+j with trash */ + subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ + movl %eax, %edx /* n*K+j with trash */ + cvtss2sd %xmm2, %xmm2 /* DP t */ + andl $0x3f, %eax /* bits of j */ + mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ + andl $0xffffffc0, %edx /* bits of n */ +#ifdef __AVX__ + vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ + vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ +#else + addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ + movaps %xmm2, %xmm0 /* DP y */ + mulsd %xmm2, %xmm2 /* DP z=y*y */ +#endif + mulsd %xmm2, %xmm4 /* DP P3*z */ + addl $0xffc0, %edx /* bits of n + DP exponent bias */ + mulsd %xmm2, %xmm3 /* DP P2*z */ + shlq $46, %rdx /* DP 2^n */ + addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ + addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ + movd %rdx, %xmm1 /* DP 2^n */ + mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ + mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ + addsd %xmm4, %xmm0 /* DP P(y) */ + mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ + addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ + mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */ + cvtsd2ss %xmm0, %xmm0 /* convert result to single precision */ + ret +END(__ieee754_expf) + + .section .rodata, "a" + .p2align 3 +L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */ + .long 0x00000000, 0x3ff00000 + .long 0x3e778061, 0x3ff02c9a + .long 0xd3158574, 0x3ff059b0 + .long 0x18759bc8, 0x3ff08745 + .long 0x6cf9890f, 0x3ff0b558 + .long 0x32d3d1a2, 0x3ff0e3ec + .long 0xd0125b51, 0x3ff11301 + .long 0xaea92de0, 0x3ff1429a + .long 0x3c7d517b, 0x3ff172b8 + .long 0xeb6fcb75, 0x3ff1a35b + .long 0x3168b9aa, 0x3ff1d487 + .long 0x88628cd6, 0x3ff2063b + .long 0x6e756238, 0x3ff2387a + .long 0x65e27cdd, 0x3ff26b45 + .long 0xf51fdee1, 0x3ff29e9d + .long 0xa6e4030b, 0x3ff2d285 + .long 0x0a31b715, 0x3ff306fe + .long 0xb26416ff, 0x3ff33c08 + .long 0x373aa9cb, 0x3ff371a7 + .long 0x34e59ff7, 0x3ff3a7db + .long 0x4c123422, 0x3ff3dea6 + .long 0x21f72e2a, 0x3ff4160a + .long 0x6061892d, 0x3ff44e08 + .long 0xb5c13cd0, 0x3ff486a2 + .long 0xd5362a27, 0x3ff4bfda + .long 0x769d2ca7, 0x3ff4f9b2 + .long 0x569d4f82, 0x3ff5342b + .long 0x36b527da, 0x3ff56f47 + .long 0xdd485429, 0x3ff5ab07 + .long 0x15ad2148, 0x3ff5e76f + .long 0xb03a5585, 0x3ff6247e + .long 0x82552225, 0x3ff66238 + .long 0x667f3bcd, 0x3ff6a09e + .long 0x3c651a2f, 0x3ff6dfb2 + .long 0xe8ec5f74, 0x3ff71f75 + .long 0x564267c9, 0x3ff75feb + .long 0x73eb0187, 0x3ff7a114 + .long 0x36cf4e62, 0x3ff7e2f3 + .long 0x994cce13, 0x3ff82589 + .long 0x9b4492ed, 0x3ff868d9 + .long 0x422aa0db, 0x3ff8ace5 + .long 0x99157736, 
0x3ff8f1ae + .long 0xb0cdc5e5, 0x3ff93737 + .long 0x9fde4e50, 0x3ff97d82 + .long 0x82a3f090, 0x3ff9c491 + .long 0x7b5de565, 0x3ffa0c66 + .long 0xb23e255d, 0x3ffa5503 + .long 0x5579fdbf, 0x3ffa9e6b + .long 0x995ad3ad, 0x3ffae89f + .long 0xb84f15fb, 0x3ffb33a2 + .long 0xf2fb5e47, 0x3ffb7f76 + .long 0x904bc1d2, 0x3ffbcc1e + .long 0xdd85529c, 0x3ffc199b + .long 0x2e57d14b, 0x3ffc67f1 + .long 0xdcef9069, 0x3ffcb720 + .long 0x4a07897c, 0x3ffd072d + .long 0xdcfba487, 0x3ffd5818 + .long 0x03db3285, 0x3ffda9e6 + .long 0x337b9b5f, 0x3ffdfc97 + .long 0xe78b3ff6, 0x3ffe502e + .long 0xa2a490da, 0x3ffea4af + .long 0xee615a27, 0x3ffefa1b + .long 0x5b6e4540, 0x3fff5076 + .long 0x819e90d8, 0x3fffa7c1 + .type L(DP_T), @object + ASM_SIZE_DIRECTIVE(L(DP_T)) + + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 +L(DP_KLN2): /* double precision K/log(2) */ + .long 0x652b82fe, 0x40571547 + .type L(DP_KLN2), @object + ASM_SIZE_DIRECTIVE(L(DP_KLN2)) + + .p2align 3 +L(DP_NLN2K): /* double precision -log(2)/K */ + .long 0xfefa39ef, 0xbf862e42 + .type L(DP_NLN2K), @object + ASM_SIZE_DIRECTIVE(L(DP_NLN2K)) + + .p2align 3 +L(DP_RS): /* double precision 2^23+2^22 */ + .long 0x00000000, 0x41680000 + .type L(DP_RS), @object + ASM_SIZE_DIRECTIVE(L(DP_RS)) + + .p2align 3 +L(DP_P3): /* double precision polynomial coefficient P3 */ + .long 0xeb78fa85, 0x3fa56420 + .type L(DP_P3), @object + ASM_SIZE_DIRECTIVE(L(DP_P3)) + + .p2align 3 +L(DP_P1): /* double precision polynomial coefficient P1 */ + .long 0x008d6118, 0x3fe00000 + .type L(DP_P1), @object + ASM_SIZE_DIRECTIVE(L(DP_P1)) + + .p2align 3 +L(DP_P2): /* double precision polynomial coefficient P2 */ + .long 0xda752d4f, 0x3fc55550 + .type L(DP_P2), @object + ASM_SIZE_DIRECTIVE(L(DP_P2)) + + .p2align 3 +L(DP_P0): /* double precision polynomial coefficient P0 */ + .long 0xffffe7c6, 0x3fefffff + .type L(DP_P0), @object + ASM_SIZE_DIRECTIVE(L(DP_P0)) + + .p2align 2 +L(SP_RANGE): /* single precision overflow/underflow bounds */ + .long 0x42b17217 /* if x>this bound, then result overflows */ + .long 0x42cff1b4 /* if x<this bound, then result underflows */ + .type L(SP_RANGE), @object + ASM_SIZE_DIRECTIVE(L(SP_RANGE)) + + .p2align 2 +L(SP_INF_0): + .long 0x7f800000 /* single precision Inf */ + .long 0 /* single precision zero */ + .type L(SP_INF_0), @object + ASM_SIZE_DIRECTIVE(L(SP_INF_0)) + + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 +L(SP_RS): /* single precision 2^23+2^22 */ + .long 0x4b400000 + .type L(SP_RS), @object + ASM_SIZE_DIRECTIVE(L(SP_RS)) + + .p2align 2 +L(SP_SMALL): /* single precision small value 2^(-100) */ + .long 0x0d800000 + .type L(SP_SMALL), @object + ASM_SIZE_DIRECTIVE(L(SP_SMALL)) + + .p2align 2 +L(SP_LARGE): /* single precision large value 2^100 */ + .long 0x71800000 + .type L(SP_LARGE), @object + ASM_SIZE_DIRECTIVE(L(SP_LARGE)) + + .p2align 2 +L(SP_ONE): /* single precision 1.0 */ + .long 0x3f800000 + .type L(SP_ONE), @object + ASM_SIZE_DIRECTIVE(L(SP_ONE)) + +strong_alias (__ieee754_expf, __expf_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_expl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_expl.S new file mode 100644 index 0000000000..a4ef023b2b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_expl.S @@ -0,0 +1,219 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. 
+ */ + +/* + * The 8087 method for the exponential function is to calculate + * exp(x) = 2^(x log2(e)) + * after separating integer and fractional parts + * x log2(e) = i + f, |f| <= .5 + * 2^i is immediate but f needs to be precise for long double accuracy. + * Suppress range reduction error in computing f by the following. + * Separate x into integer and fractional parts + * x = xi + xf, |xf| <= .5 + * Separate log2(e) into the sum of an exact number c0 and small part c1. + * c0 + c1 = log2(e) to extra precision + * Then + * f = (c0 xi - i) + c0 xf + c1 x + * where c0 xi is exact and so also is (c0 xi - i). + * -- moshier@na-net.ornl.gov + */ + +#include <machine/asm.h> +#include <x86_64-math-asm.h> + +#ifdef USE_AS_EXP10L +# define IEEE754_EXPL __ieee754_exp10l +# define EXPL_FINITE __exp10l_finite +# define FLDLOG fldl2t +#elif defined USE_AS_EXPM1L +# define IEEE754_EXPL __expm1l +# undef EXPL_FINITE +# define FLDLOG fldl2e +#else +# define IEEE754_EXPL __ieee754_expl +# define EXPL_FINITE __expl_finite +# define FLDLOG fldl2e +#endif + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 4 +#ifdef USE_AS_EXP10L + .type c0,@object +c0: .byte 0, 0, 0, 0, 0, 0, 0x9a, 0xd4, 0x00, 0x40 + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c0) + .type c1,@object +c1: .byte 0x58, 0x92, 0xfc, 0x15, 0x37, 0x9a, 0x97, 0xf0, 0xef, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c1) +#else + .type c0,@object +c0: .byte 0, 0, 0, 0, 0, 0, 0xaa, 0xb8, 0xff, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c0) + .type c1,@object +c1: .byte 0x20, 0xfa, 0xee, 0xc2, 0x5f, 0x70, 0xa5, 0xec, 0xed, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c1) +#endif +#ifndef USE_AS_EXPM1L + .type csat,@object +csat: .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x0e, 0x40 + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(csat) +DEFINE_LDBL_MIN +#endif + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(IEEE754_EXPL) +#ifdef USE_AS_EXPM1L + movzwl 8+8(%rsp), %eax + xorb $0x80, %ah // invert sign bit (now 1 is "positive") + cmpl $0xc006, %eax // is num positive and exp >= 6 (number is >= 128.0)? + jae HIDDEN_JUMPTARGET (__expl) // (if num is denormal, it is at least >= 64.0) +#endif + fldt 8(%rsp) +/* I added the following ugly construct because expl(+-Inf) resulted + in NaN. The ugliness results from the bright minds at Intel. + For the i686 the code can be written better. + -- drepper@cygnus.com. */ + fxam /* Is NaN or +-Inf? */ +#ifdef USE_AS_EXPM1L + xorb $0x80, %ah + cmpl $0xc006, %eax + fstsw %ax + movb $0x45, %dh + jb 4f + + /* Below -64.0 (may be -NaN or -Inf). */ + andb %ah, %dh + cmpb $0x01, %dh + je 6f /* Is +-NaN, jump. */ + jmp 1f /* -large, possibly -Inf. */ + +4: /* In range -64.0 to 64.0 (may be +-0 but not NaN or +-Inf). */ + /* Test for +-0 as argument. */ + andb %ah, %dh + cmpb $0x40, %dh + je 2f + + /* Test for arguments that are small but not subnormal. */ + movzwl 8+8(%rsp), %eax + andl $0x7fff, %eax + cmpl $0x3fbf, %eax + jge 3f + /* Argument's exponent below -64; avoid spurious underflow if + normal. */ + cmpl $0x0001, %eax + jge 2f + /* Force underflow and return the argument, to avoid wrong signs + of zero results from the code below in some rounding modes. */ + fld %st + fmul %st + fstp %st + jmp 2f +#else + movzwl 8+8(%rsp), %eax + andl $0x7fff, %eax + cmpl $0x400d, %eax + jg 5f + cmpl $0x3fbc, %eax + jge 3f + /* Argument's exponent below -67, result rounds to 1. */ + fld1 + faddp + jmp 2f +5: /* Overflow, underflow or infinity or NaN as argument. 
*/
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x01, %dh
+ je 6f /* Is +-NaN, jump. */
+ /* Overflow or underflow; saturate. */
+ fstp %st
+ fldt MO(csat)
+ andb $2, %ah
+ jz 3f
+ fchs
+#endif
+3: FLDLOG /* 1 log2(base) */
+ fmul %st(1), %st /* 1 x log2(base) */
+ /* Set round-to-nearest temporarily. */
+ fstcw -4(%rsp)
+ movl $0xf3ff, %edx
+ andl -4(%rsp), %edx
+ movl %edx, -8(%rsp)
+ fldcw -8(%rsp)
+ frndint /* 1 i */
+ fld %st(1) /* 2 x */
+ frndint /* 2 xi */
+ fldcw -4(%rsp)
+ fld %st(1) /* 3 i */
+ fldt MO(c0) /* 4 c0 */
+ fld %st(2) /* 5 xi */
+ fmul %st(1), %st /* 5 c0 xi */
+ fsubp %st, %st(2) /* 4 f = c0 xi - i */
+ fld %st(4) /* 5 x */
+ fsub %st(3), %st /* 5 xf = x - xi */
+ fmulp %st, %st(1) /* 4 c0 xf */
+ faddp %st, %st(1) /* 3 f = f + c0 xf */
+ fldt MO(c1) /* 4 */
+ fmul %st(4), %st /* 4 c1 * x */
+ faddp %st, %st(1) /* 3 f = f + c1 * x */
+ f2xm1 /* 3 2^(fract(x * log2(base))) - 1 */
+#ifdef USE_AS_EXPM1L
+ fstp %st(1) /* 2 */
+ fscale /* 2 scale factor is st(1); base^x - 2^i */
+ fxch /* 2 i */
+ fld1 /* 3 1.0 */
+ fscale /* 3 2^i */
+ fld1 /* 4 1.0 */
+ fsubrp %st, %st(1) /* 3 2^i - 1.0 */
+ fstp %st(1) /* 2 */
+ faddp %st, %st(1) /* 1 base^x - 1.0 */
+#else
+ fld1 /* 4 1.0 */
+ faddp /* 3 2^(fract(x * log2(base))) */
+ fstp %st(1) /* 2 */
+ fscale /* 2 scale factor is st(1); base^x */
+ fstp %st(1) /* 1 */
+ LDBL_CHECK_FORCE_UFLOW_NONNEG
+#endif
+ fstp %st(1) /* 0 */
+ jmp 2f
+1:
+#ifdef USE_AS_EXPM1L
+ /* For expm1l, only negative sign gets here. */
+ fstp %st
+ fld1
+ fchs
+#else
+ testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+#endif
+2: ret
+6: /* NaN argument. */
+ fadd %st
+ ret
+END(IEEE754_EXPL)
+#ifdef USE_AS_EXPM1L
+libm_hidden_def (__expm1l)
+weak_alias (__expm1l, expm1l)
+#else
+strong_alias (IEEE754_EXPL, EXPL_FINITE)
+#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_fmodl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_fmodl.S new file mode 100644 index 0000000000..07c50df8d1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_fmodl.S @@ -0,0 +1,23 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ieee754_fmodl)
+ fldt 24(%rsp)
+ fldt 8(%rsp)
+1: fprem
+ fstsw %ax
+ and $04,%ah
+ jnz 1b
+ fstp %st(1)
+ ret
+END (__ieee754_fmodl)
+strong_alias (__ieee754_fmodl, __fmodl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_ilogbl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_ilogbl.S new file mode 100644 index 0000000000..ae6c0fe6f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_ilogbl.S @@ -0,0 +1,39 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_ilogbl)
+ fldt 8(%rsp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+ required to return INT_MAX in ISO C99.
+ -- jakub@redhat.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x40, %dh
+ je 2f /* Is +-0, jump. 
*/ + + fxtract + fstp %st + + fistpl -4(%rsp) + fwait + movl -4(%rsp),%eax + + ret + +1: fstp %st + movl $0x7fffffff, %eax + ret +2: fstp %st + movl $0x80000000, %eax /* FP_ILOGB0 */ + ret +END (__ieee754_ilogbl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_log10l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_log10l.S new file mode 100644 index 0000000000..e0cb88e32e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_log10l.S @@ -0,0 +1,92 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log10l) + fldlg2 // log10(2) + fldt 8(%rsp) // x : log10(2) + fxam + fnstsw + fld %st // x : x : log10(2) + testb $1, %ah + jnz 3f // in case x is NaN or ±Inf +4: fsubl MO(one) // x-1 : x : log10(2) + fld %st // x-1 : x-1 : x : log10(2) + fabs // |x-1| : x-1 : x : log10(2) + fcompl MO(limit) // x-1 : x : log10(2) + fnstsw // x-1 : x : log10(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log10(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log10(2) + fyl2xp1 // log10(x) + ret + +2: fstp %st(0) // x : log10(2) + fyl2x // log10(x) + ret + +3: testb $4, %ah + jnz 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END(__ieee754_log10l) + + +ENTRY(__log10l_finite) + fldlg2 // log10(2) + fldt 8(%rsp) // x : log10(2) + fld %st // x : x : log10(2) +4: fsubl MO(one) // x-1 : x : log10(2) + fld %st // x-1 : x-1 : x : log10(2) + fabs // |x-1| : x-1 : x : log10(2) + fcompl MO(limit) // x-1 : x : log10(2) + fnstsw // x-1 : x : log10(2) + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 6f + fabs // log10(1) is +0 in all rounding modes. +6: fstp %st(1) // x-1 : log10(2) + fyl2xp1 // log10(x) + ret +END(__log10l_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_log2l.S b/REORG.TODO/sysdeps/x86_64/fpu/e_log2l.S new file mode 100644 index 0000000000..023ec29164 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_log2l.S @@ -0,0 +1,91 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>. + * Public domain. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. 
*/ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log2l) + fldl MO(one) + fldt 8(%rsp) // x : 1 + fxam + fnstsw + fld %st // x : x : 1 + testb $1, %ah + jnz 3f // in case x is NaN or ±Inf +4: fsub %st(2), %st // x-1 : x : 1 + fld %st // x-1 : x-1 : x : 1 + fabs // |x-1| : x-1 : x : 1 + fcompl MO(limit) // x-1 : x : 1 + fnstsw // x-1 : x : 1 + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log2(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : 1 + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : 1 + fyl2x // log(x) + ret + +3: testb $4, %ah + jnz 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END (__ieee754_log2l) + + +ENTRY(__log2l_finite) + fldl MO(one) + fldt 8(%rsp) // x : 1 + fld %st // x : x : 1 + fsub %st(2), %st // x-1 : x : 1 + fld %st // x-1 : x-1 : x : 1 + fabs // |x-1| : x-1 : x : 1 + fcompl MO(limit) // x-1 : x : 1 + fnstsw // x-1 : x : 1 + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 6f + fabs // log2(1) is +0 in all rounding modes. +6: fstp %st(1) // x-1 : 1 + fyl2xp1 // log(x) + ret +END (__log2l_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_logl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_logl.S new file mode 100644 index 0000000000..0d3576f48b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_logl.S @@ -0,0 +1,94 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_logl) + fldln2 // log(2) + fldt 8(%rsp) // x : log(2) + fxam + fnstsw + fld %st // x : x : log(2) + testb $1, %ah + jnz 3f // in case x is NaN or +-Inf + movzwl 8+8(%rsp), %eax + cmpl $0xc000, %eax + jae 6f // x <= -2, avoid overflow from -LDBL_MAX - 1. +4: fsubl MO(one) // x-1 : x : log(2) +6: fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : log(2) + fyl2x // log(x) + ret + +3: testb $4, %ah + jnz 4b // in case x is +-Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END (__ieee754_logl) + + +ENTRY(__logl_finite) + fldln2 // log(2) + fldt 8(%rsp) // x : log(2) + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 7f + fabs // log(1) is +0 in all rounding modes. 
+7: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END (__logl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_powl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_powl.S new file mode 100644 index 0000000000..571c0a18d5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_powl.S @@ -0,0 +1,433 @@ +/* ix87 specific implementation of pow function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <x86_64-math-asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type p2,@object +p2: .byte 0, 0, 0, 0, 0, 0, 0x10, 0x40 + ASM_SIZE_DIRECTIVE(p2) + .type p63,@object +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) + .type p64,@object +p64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43 + ASM_SIZE_DIRECTIVE(p64) + .type p78,@object +p78: .byte 0, 0, 0, 0, 0, 0, 0xd0, 0x44 + ASM_SIZE_DIRECTIVE(p78) + .type pm79,@object +pm79: .byte 0, 0, 0, 0, 0, 0, 0, 0x3b + ASM_SIZE_DIRECTIVE(pm79) + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 3 + .type infinity,@object +inf_zero: +infinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f + ASM_SIZE_DIRECTIVE(infinity) + .type zero,@object +zero: .double 0.0 + ASM_SIZE_DIRECTIVE(zero) + .type minf_mzero,@object +minf_mzero: +minfinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff +mzero: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + ASM_SIZE_DIRECTIVE(minf_mzero) +DEFINE_LDBL_MIN + +#ifdef PIC +# define MO(op) op##(%rip) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_powl) + fldt 24(%rsp) // y + fxam + + + fnstsw + movb %ah, %dl + andb $0x45, %ah + cmpb $0x40, %ah // is y == 0 ? + je 11f + + cmpb $0x05, %ah // is y == ±inf ? + je 12f + + cmpb $0x01, %ah // is y == NaN ? + je 30f + + fldt 8(%rsp) // x : y + + fxam + fnstsw + movb %ah, %dh + andb $0x45, %ah + cmpb $0x40, %ah + je 20f // x is ±0 + + cmpb $0x05, %ah + je 15f // x is ±inf + + cmpb $0x01, %ah + je 31f // x is NaN + + fxch // y : x + + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fldl MO(p63) // 1L<<63 : y : x + fld %st(1) // y : 1L<<63 : y : x + fabs // |y| : 1L<<63 : y : x + fcomip %st(1), %st // 1L<<63 : y : x + fstp %st(0) // y : x + jnc 2f + + /* First see whether `y' is a natural number. In this case we + can use a more precise algorithm. */ + fld %st // y : y : x + fistpll -8(%rsp) // y : x + fildll -8(%rsp) // int(y) : y : x + fucomip %st(1),%st // y : x + je 9f + + // If y has absolute value at most 0x1p-79, then any finite + // nonzero x will result in 1. Saturate y to those bounds to + // avoid underflow in the calculation of y*log2(x). 
+ fldl MO(pm79) // 0x1p-79 : y : x
+ fld %st(1) // y : 0x1p-79 : y : x
+ fabs // |y| : 0x1p-79 : y : x
+ fcomip %st(1), %st // 0x1p-79 : y : x
+ fstp %st(0) // y : x
+ jnc 3f
+ fstp %st(0) // pop y
+ fldl MO(pm79) // 0x1p-79 : x
+ testb $2, %dl
+ jz 3f // y > 0
+ fchs // -0x1p-79 : x
+ jmp 3f
+
+9: /* OK, we have an integer value for y. Unless very small
+ (we use < 4), use the algorithm for real exponent to avoid
+ accumulation of errors. */
+ fldl MO(p2) // 4 : y : x
+ fld %st(1) // y : 4 : y : x
+ fabs // |y| : 4 : y : x
+ fcomip %st(1), %st // 4 : y : x
+ fstp %st(0) // y : x
+ jnc 3f
+ mov -8(%rsp),%eax
+ mov -4(%rsp),%edx
+ orl $0, %edx
+ fstp %st(0) // x
+ jns 4f // y >= 0, jump
+ fdivrl MO(one) // 1/x (now referred to as x)
+ negl %eax
+ adcl $0, %edx
+ negl %edx
+4: fldl MO(one) // 1 : x
+ fxch
+
+ /* If y is even, take the absolute value of x. Otherwise,
+ ensure all intermediate values that might overflow have the
+ sign of x. */
+ testb $1, %al
+ jnz 6f
+ fabs
+
+6: shrdl $1, %edx, %eax
+ jnc 5f
+ fxch
+ fabs
+ fmul %st(1) // x : ST*x
+ fxch
+5: fld %st // x : x : ST*x
+ fabs // |x| : x : ST*x
+ fmulp // |x|*x : ST*x
+ shrl $1, %edx
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jnz 6b
+ fstp %st(0) // ST*x
+ LDBL_CHECK_FORCE_UFLOW_NONNAN
+ ret
+
+ /* y is ±NAN */
+30: fldt 8(%rsp) // x : y
+ fldl MO(one) // 1.0 : x : y
+ fucomip %st(1),%st // x : y
+ je 32f
+31: /* At least one argument NaN, and result should be NaN. */
+ faddp
+ ret
+32: jc 31b
+ /* pow (1, NaN); check whether the NaN is signaling. */
+ testb $0x40, 31(%rsp)
+ jz 31b
+ fstp %st(1)
+ ret
+
+ .align ALIGNARG(4)
+2: // y is a large integer (absolute value at least 1L<<63).
+ // If y has absolute value at least 1L<<78, then any finite
+ // nonzero x will result in 0 (underflow), 1 or infinity (overflow).
+ // Saturate y to those bounds to avoid overflow in the calculation
+ // of y*log2(x).
+ fldl MO(p78) // 1L<<78 : y : x
+ fld %st(1) // y : 1L<<78 : y : x
+ fabs // |y| : 1L<<78 : y : x
+ fcomip %st(1), %st // 1L<<78 : y : x
+ fstp %st(0) // y : x
+ jc 3f
+ fstp %st(0) // pop y
+ fldl MO(p78) // 1L<<78 : x
+ testb $2, %dl
+ jz 3f // y > 0
+ fchs // -(1L<<78) : x
+ .align ALIGNARG(4)
+3: /* y is a real number. */
+ subq $40, %rsp
+ cfi_adjust_cfa_offset (40)
+ fstpt 16(%rsp) // x
+ fstpt (%rsp) // <empty>
+ call HIDDEN_JUMPTARGET (__powl_helper) // <result>
+ addq $40, %rsp
+ cfi_adjust_cfa_offset (-40)
+ ret
+
+ // pow(x,±0) = 1, unless x is sNaN
+ .align ALIGNARG(4)
+11: fstp %st(0) // pop y
+ fldt 8(%rsp) // x
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 112f // x is NaN
+111: fstp %st(0)
+ fldl MO(one)
+ ret
+
+112: testb $0x40, 15(%rsp)
+ jnz 111b
+ fadd %st(0)
+ ret
+
+ // y == ±inf
+ .align ALIGNARG(4)
+12: fstp %st(0) // pop y
+ fldl MO(one) // 1
+ fldt 8(%rsp) // x : 1
+ fabs // abs(x) : 1
+ fucompp // < 1, == 1, or > 1
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x45, %ah
+ je 13f // jump if x is NaN
+
+ cmpb $0x40, %ah
+ je 14f // jump if |x| == 1
+
+ shlb $1, %ah
+ xorb %ah, %dl
+ andl $2, %edx
+#ifdef PIC
+ lea inf_zero(%rip),%rcx
+ fldl (%rcx, %rdx, 4)
+#else
+ fldl inf_zero(,%rdx, 4)
+#endif
+ ret
+
+ .align ALIGNARG(4)
+14: fldl MO(one)
+ ret
+
+ .align ALIGNARG(4)
+13: fldt 8(%rsp) // load x == NaN
+ fadd %st(0)
+ ret
+
+ .align ALIGNARG(4)
+ // x is ±inf
+15: fstp %st(0) // y
+ testb $2, %dh
+ jz 16f // jump if x == +inf
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, but y
+ // may be odd unless we know |y| >= 1L<<64. 
+ fldl MO(p64) // 1L<<64 : y + fld %st(1) // y : 1L<<64 : y + fabs // |y| : 1L<<64 : y + fcomip %st(1), %st // 1L<<64 : y + fstp %st(0) // y + jnc 16f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + // We must find out whether y is an odd integer. + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 17f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp), %eax + mov -4(%rsp), %edx + andb $1, %al + jz 18f // jump if not odd + // It's an odd integer. + shrl $31, %edx +#ifdef PIC + lea minf_mzero(%rip),%rcx + fldl (%rcx, %rdx, 8) +#else + fldl minf_mzero(,%rdx, 8) +#endif + ret + + .align ALIGNARG(4) +16: fcompl MO(zero) + fnstsw + shrl $5, %eax + andl $8, %eax +#ifdef PIC + lea inf_zero(%rip),%rcx + fldl (%rcx, %rax, 1) +#else + fldl inf_zero(,%rax, 1) +#endif + ret + + .align ALIGNARG(4) +17: shll $30, %edx // sign bit for y in right position +18: shrl $31, %edx +#ifdef PIC + lea inf_zero(%rip),%rcx + fldl (%rcx, %rdx, 8) +#else + fldl inf_zero(,%rdx, 8) +#endif + ret + + .align ALIGNARG(4) + // x is ±0 +20: fstp %st(0) // y + testb $2, %dl + jz 21f // y > 0 + + // x is ±0 and y is < 0. We must find out whether y is an odd integer. + testb $2, %dh + jz 25f + + // fistpll raises invalid exception for |y| >= 1L<<63, but y + // may be odd unless we know |y| >= 1L<<64. + fldl MO(p64) // 1L<<64 : y + fld %st(1) // y : 1L<<64 : y + fabs // |y| : 1L<<64 : y + fcomip %st(1), %st // 1L<<64 : y + fstp %st(0) // y + jnc 25f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 26f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp),%eax + mov -4(%rsp),%edx + andb $1, %al + jz 27f // jump if not odd + // It's an odd integer. + // Raise divide-by-zero exception and get minus infinity value. + fldl MO(one) + fdivl MO(zero) + fchs + ret + +25: fstp %st(0) +26: +27: // Raise divide-by-zero exception and get infinity value. + fldl MO(one) + fdivl MO(zero) + ret + + .align ALIGNARG(4) + // x is ±0 and y is > 0. We must find out whether y is an odd integer. +21: testb $2, %dh + jz 22f + + // fistpll raises invalid exception for |y| >= 1L<<63, but y + // may be odd unless we know |y| >= 1L<<64. + fldl MO(p64) // 1L<<64 : y + fxch // y : 1L<<64 + fcomi %st(1), %st // y : 1L<<64 + fstp %st(1) // y + jnc 22f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 23f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp),%eax + mov -4(%rsp),%edx + andb $1, %al + jz 24f // jump if not odd + // It's an odd integer. + fldl MO(mzero) + ret + +22: fstp %st(0) +23: +24: fldl MO(zero) + ret + +END(__ieee754_powl) +strong_alias (__ieee754_powl, __powl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_remainderl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_remainderl.S new file mode 100644 index 0000000000..4ee0910912 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_remainderl.S @@ -0,0 +1,21 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. 
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderl)
+ fldt 24(%rsp)
+ fldt 8(%rsp)
+1: fprem1
+ fstsw %ax
+ testl $0x400,%eax
+ jnz 1b
+ fstp %st(1)
+ ret
+END (__ieee754_remainderl)
+strong_alias (__ieee754_remainderl, __remainderl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_scalbl.S b/REORG.TODO/sysdeps/x86_64/fpu/e_scalbl.S new file mode 100644 index 0000000000..2982dc3b9e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_scalbl.S @@ -0,0 +1,89 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type zero_nan,@object
+zero_nan:
+ .double 0.0
+nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ ASM_SIZE_DIRECTIVE(zero_nan)
+
+
+#ifdef PIC
+# define MO(op) op##(%rip)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_scalbl)
+ fldt 24(%rsp)
+ fxam
+ fnstsw
+ fldt 8(%rsp)
+ andl $0x4700, %eax
+ cmpl $0x0700, %eax
+ je 1f
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fxam
+ fnstsw
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fld %st(1)
+ frndint
+ fcomip %st(2), %st
+ jne 4f
+ fscale
+ fstp %st(1)
+ ret
+
+ /* y is -inf */
+1: fxam
+ fnstsw
+ movl 16(%rsp), %edx
+ shrl $5, %eax
+ fstp %st
+ fstp %st
+ andl $0x8000, %edx
+ andl $0x0228, %eax
+ cmpl $0x0028, %eax
+ je 4f
+ andl $8, %eax
+ shrl $11, %edx
+ addl %edx, %eax
+#ifdef PIC
+ lea zero_nan(%rip),%rdx
+ fldl (%rdx,%rax,1)
+#else
+ fldl zero_nan(,%rax,1)
+#endif
+ ret
+
+ /* The result is NaN; raise an exception for sNaN arguments. */
+2: faddp
+ ret
+
+ /* Return NaN and raise the invalid exception. */
+4: fstp %st
+ fstp %st
+ fldz
+ fdiv %st
+ ret
+END(__ieee754_scalbl)
+strong_alias (__ieee754_scalbl, __scalbl_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_sqrt.c b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrt.c new file mode 100644 index 0000000000..33b59f67c1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrt.c @@ -0,0 +1,31 @@ +/* Square root of floating point number.
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <math_private.h>
+
+#undef __ieee754_sqrt
+double
+__ieee754_sqrt (double x)
+{
+ double res;
+
+ asm ("sqrtsd %1, %0" : "=x" (res) : "xm" (x));
+
+ return res;
+}
+strong_alias (__ieee754_sqrt, __sqrt_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtf.c b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtf.c new file mode 100644 index 0000000000..386b903c43 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtf.c @@ -0,0 +1,31 @@ +/* Square root of floating point number. 
+ Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math_private.h> + +#undef __ieee754_sqrtf +float +__ieee754_sqrtf (float x) +{ + float res; + + asm ("sqrtss %1, %0" : "=x" (res) : "xm" (x)); + + return res; +} +strong_alias (__ieee754_sqrtf, __sqrtf_finite) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtl.c b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtl.c new file mode 100644 index 0000000000..90e4e164e5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/e_sqrtl.c @@ -0,0 +1 @@ +#include "sysdeps/i386/fpu/e_sqrtl.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fclrexcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/fclrexcpt.c new file mode 100644 index 0000000000..93bf0d341f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fclrexcpt.c @@ -0,0 +1,52 @@ +/* Clear given exceptions in current floating-point environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +feclearexcept (int excepts) +{ + fenv_t temp; + unsigned int mxcsr; + + /* Mask out unsupported bits/exceptions. */ + excepts &= FE_ALL_EXCEPT; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ ("fnstenv %0" : "=m" (*&temp)); + + /* Clear the relevant bits. */ + temp.__status_word &= excepts ^ FE_ALL_EXCEPT; + + /* Put the new data in effect. */ + __asm__ ("fldenv %0" : : "m" (*&temp)); + + /* And the same procedure for SSE. */ + __asm__ ("stmxcsr %0" : "=m" (*&mxcsr)); + + /* Clear the relevant bits. */ + mxcsr &= ~excepts; + + /* And put them into effect. */ + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + /* Success. */ + return 0; +} +libm_hidden_def (feclearexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fedisblxcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/fedisblxcpt.c new file mode 100644 index 0000000000..512987bd03 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fedisblxcpt.c @@ -0,0 +1,46 @@ +/* Disable floating-point exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fedisableexcept (int excepts) +{ + unsigned short int new_exc, old_exc; + unsigned int new; + + excepts &= FE_ALL_EXCEPT; + + /* Get the current control word of the x87 FPU. */ + __asm__ ("fstcw %0" : "=m" (*&new_exc)); + + old_exc = (~new_exc) & FE_ALL_EXCEPT; + + new_exc |= excepts; + __asm__ ("fldcw %0" : : "m" (*&new_exc)); + + /* And now the same for the SSE MXCSR register. */ + __asm__ ("stmxcsr %0" : "=m" (*&new)); + + /* The SSE exception masks are shifted by 7 bits. */ + new |= excepts << 7; + __asm__ ("ldmxcsr %0" : : "m" (*&new)); + + return old_exc; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/feenablxcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/feenablxcpt.c new file mode 100644 index 0000000000..0985d71a00 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/feenablxcpt.c @@ -0,0 +1,46 @@ +/* Enable floating-point exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +feenableexcept (int excepts) +{ + unsigned short int new_exc, old_exc; + unsigned int new; + + excepts &= FE_ALL_EXCEPT; + + /* Get the current control word of the x87 FPU. */ + __asm__ ("fstcw %0" : "=m" (*&new_exc)); + + old_exc = (~new_exc) & FE_ALL_EXCEPT; + + new_exc &= ~excepts; + __asm__ ("fldcw %0" : : "m" (*&new_exc)); + + /* And now the same for the SSE MXCSR register. */ + __asm__ ("stmxcsr %0" : "=m" (*&new)); + + /* The SSE exception masks are shifted by 7 bits. */ + new &= ~(excepts << 7); + __asm__ ("ldmxcsr %0" : : "m" (*&new)); + + return old_exc; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetenv.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetenv.c new file mode 100644 index 0000000000..af7642e990 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetenv.c @@ -0,0 +1,35 @@ +/* Store current floating-point environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
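
The shift by 7 in fedisableexcept encodes the register layouts: the exception mask bits sit in bits 0-5 of the x87 control word but in bits 7-12 of MXCSR, while the sticky flag bits occupy bits 0-5 of both status registers. A sketch making the correspondence visible (hypothetical; the numeric FE_* values quoted are x86-specific):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  /* On x86: FE_INVALID=0x01, FE_DIVBYZERO=0x04, FE_OVERFLOW=0x08,
     FE_UNDERFLOW=0x10, FE_INEXACT=0x20; bit 0x02 is the x86-only
     "denormal operand" exception, excluded from FE_ALL_EXCEPT. */
  printf ("x87 control-word mask bits: %#x\n", FE_ALL_EXCEPT);
  printf ("MXCSR mask bits:            %#x\n", FE_ALL_EXCEPT << 7);
  return 0;
}
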
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__fegetenv (fenv_t *envp) +{ + __asm__ ("fnstenv %0\n" + /* fnstenv changes the exception mask, so load back the + stored environment. */ + "fldenv %0\n" + "stmxcsr %1" : "=m" (*envp), "=m" (envp->__mxcsr)); + + /* Success. */ + return 0; +} +libm_hidden_def (__fegetenv) +weak_alias (__fegetenv, fegetenv) +libm_hidden_weak (fegetenv) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetexcept.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetexcept.c new file mode 100644 index 0000000000..7dbf40401e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetexcept.c @@ -0,0 +1,31 @@ +/* Get enabled floating-point exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fegetexcept (void) +{ + unsigned short int exc; + + /* Get the current control word. */ + __asm__ ("fstcw %0" : "=m" (*&exc)); + + return (~exc) & FE_ALL_EXCEPT; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetmode.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetmode.c new file mode 100644 index 0000000000..4513f80c85 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetmode.c @@ -0,0 +1,28 @@ +/* Store current floating-point control modes. x86_64 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
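
feenableexcept unmasks exceptions in both units, so a later exceptional operation traps (SIGFPE on Linux) instead of quietly setting a flag, and fegetexcept reports the currently unmasked set. A round-trip sketch that never actually traps (hypothetical; the three functions are GNU extensions, hence _GNU_SOURCE):

#define _GNU_SOURCE
#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  feenableexcept (FE_DIVBYZERO);
  printf ("enabled: %#x\n", fegetexcept ());   /* shows FE_DIVBYZERO */
  fedisableexcept (FE_DIVBYZERO);
  printf ("enabled: %#x\n", fegetexcept ());   /* back to 0 */
  return 0;
}
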
*/ + +#include <fenv.h> +#include <fpu_control.h> + +int +fegetmode (femode_t *modep) +{ + _FPU_GETCW (modep->__control_word); + __asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr)); + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fegetround.c b/REORG.TODO/sysdeps/x86_64/fpu/fegetround.c new file mode 100644 index 0000000000..bff3eae102 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fegetround.c @@ -0,0 +1,35 @@ +/* Return current rounding direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__fegetround (void) +{ + int cw; + /* We only check the x87 FPU unit. The SSE unit should be the same + - and if it's not the same there's no way to signal it. */ + + __asm__ ("fnstcw %0" : "=m" (*&cw)); + + return cw & 0xc00; +} +libm_hidden_def (__fegetround) +weak_alias (__fegetround, fegetround) +libm_hidden_weak (fegetround) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/feholdexcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/feholdexcpt.c new file mode 100644 index 0000000000..0a6c836f4f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/feholdexcpt.c @@ -0,0 +1,41 @@ +/* Store current floating-point environment and clear exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__feholdexcept (fenv_t *envp) +{ + unsigned int mxcsr; + + /* Store the environment. Recall that fnstenv has a side effect of + masking all exceptions. Then clear all exceptions. */ + __asm__ ("fnstenv %0\n\t" + "stmxcsr %1\n\t" + "fnclex" + : "=m" (*envp), "=m" (envp->__mxcsr)); + + /* Set the SSE MXCSR register. 
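
__fegetround can return the masked control word unchanged because glibc's x86 bits/fenv.h defines the rounding-mode macros as exactly those control-word bit patterns. A dispatch sketch (hypothetical; the values are x86-specific: FE_TONEAREST=0, FE_DOWNWARD=0x400, FE_UPWARD=0x800, FE_TOWARDZERO=0xc00):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  switch (fegetround ())
    {
    case FE_TONEAREST:  puts ("to nearest");  break;
    case FE_DOWNWARD:   puts ("downward");    break;
    case FE_UPWARD:     puts ("upward");      break;
    case FE_TOWARDZERO: puts ("toward zero"); break;
    }
  return 0;
}
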
*/ + mxcsr = (envp->__mxcsr | 0x1f80) & ~0x3f; + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + return 0; +} +libm_hidden_def (__feholdexcept) +weak_alias (__feholdexcept, feholdexcept) +libm_hidden_weak (feholdexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetenv.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetenv.c new file mode 100644 index 0000000000..90164bf3d3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetenv.c @@ -0,0 +1,114 @@ +/* Install given floating-point environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> +#include <assert.h> + + +/* All exceptions, including the x86-specific "denormal operand" + exception. */ +#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM) + + +int +__fesetenv (const fenv_t *envp) +{ + fenv_t temp; + + /* Install the environment specified by ENVP. But there are a few + values which we do not want to come from the saved environment. + Therefore, we get the current environment and replace the values + we want to use from the environment specified by the parameter. */ + __asm__ ("fnstenv %0\n" + "stmxcsr %1" : "=m" (*&temp), "=m" (*&temp.__mxcsr)); + + if (envp == FE_DFL_ENV) + { + temp.__control_word |= FE_ALL_EXCEPT_X86; + temp.__control_word &= ~FE_TOWARDZERO; + temp.__control_word |= _FPU_EXTENDED; + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + temp.__eip = 0; + temp.__cs_selector = 0; + temp.__opcode = 0; + temp.__data_offset = 0; + temp.__data_selector = 0; + /* Clear SSE exceptions. */ + temp.__mxcsr &= ~FE_ALL_EXCEPT_X86; + /* Set mask for SSE MXCSR. */ + temp.__mxcsr |= (FE_ALL_EXCEPT_X86 << 7); + /* Set rounding to FE_TONEAREST. */ + temp.__mxcsr &= ~ 0x6000; + temp.__mxcsr |= (FE_TONEAREST << 3); + /* Clear the FZ and DAZ bits. */ + temp.__mxcsr &= ~0x8040; + } + else if (envp == FE_NOMASK_ENV) + { + temp.__control_word &= ~(FE_ALL_EXCEPT | FE_TOWARDZERO); + /* Keep the "denormal operand" exception masked. */ + temp.__control_word |= __FE_DENORM; + temp.__control_word |= _FPU_EXTENDED; + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + temp.__eip = 0; + temp.__cs_selector = 0; + temp.__opcode = 0; + temp.__data_offset = 0; + temp.__data_selector = 0; + /* Clear SSE exceptions. */ + temp.__mxcsr &= ~FE_ALL_EXCEPT_X86; + /* Set mask for SSE MXCSR. */ + /* Set rounding to FE_TONEAREST. */ + temp.__mxcsr &= ~ 0x6000; + temp.__mxcsr |= (FE_TONEAREST << 3); + /* Do not mask exceptions. */ + temp.__mxcsr &= ~(FE_ALL_EXCEPT << 7); + /* Keep the "denormal operand" exception masked. */ + temp.__mxcsr |= (__FE_DENORM << 7); + /* Clear the FZ and DAZ bits. 
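
__feholdexcept's MXCSR arithmetic (set the 0x1f80 mask bits, clear the 0x3f flag bits) is the SSE mirror of what fnclex plus fnstenv's masking side effect achieve on the x87, leaving the caller in non-stop mode with clean flags. The canonical pairing with feupdateenv, as a sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fenv_t env;
  volatile double zero = 0.0, r;

  feholdexcept (&env);      /* save env, clear flags, mask everything */
  r = 1.0 / zero;           /* sets FE_DIVBYZERO; cannot trap now */
  (void) r;
  if (fetestexcept (FE_DIVBYZERO))
    puts ("divide-by-zero flagged");
  feupdateenv (&env);       /* restore env, then re-raise saved flags */
  return 0;
}
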
*/ + temp.__mxcsr &= ~0x8040; + } + else + { + temp.__control_word &= ~(FE_ALL_EXCEPT_X86 + | FE_TOWARDZERO + | _FPU_EXTENDED); + temp.__control_word |= (envp->__control_word + & (FE_ALL_EXCEPT_X86 + | FE_TOWARDZERO + | _FPU_EXTENDED)); + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + temp.__status_word |= envp->__status_word & FE_ALL_EXCEPT_X86; + temp.__eip = envp->__eip; + temp.__cs_selector = envp->__cs_selector; + temp.__opcode = envp->__opcode; + temp.__data_offset = envp->__data_offset; + temp.__data_selector = envp->__data_selector; + temp.__mxcsr = envp->__mxcsr; + } + + __asm__ ("fldenv %0\n" + "ldmxcsr %1" : : "m" (temp), "m" (temp.__mxcsr)); + + /* Success. */ + return 0; +} +libm_hidden_def (__fesetenv) +weak_alias (__fesetenv, fesetenv) +libm_hidden_weak (fesetenv) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetexcept.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetexcept.c new file mode 100644 index 0000000000..65683b5697 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetexcept.c @@ -0,0 +1,31 @@ +/* Set given exception flags. x86_64 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fesetexcept (int excepts) +{ + unsigned int mxcsr; + + __asm__ ("stmxcsr %0" : "=m" (*&mxcsr)); + mxcsr |= excepts & FE_ALL_EXCEPT; + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetmode.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetmode.c new file mode 100644 index 0000000000..27429f7887 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetmode.c @@ -0,0 +1,50 @@ +/* Install given floating-point control modes. x86_64 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +/* All exceptions, including the x86-specific "denormal operand" + exception. */ +#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM) + +int +fesetmode (const femode_t *modep) +{ + fpu_control_t cw; + unsigned int mxcsr; + __asm__ ("stmxcsr %0" : "=m" (mxcsr)); + /* Preserve SSE exception flags but restore other state in + MXCSR. 
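
Besides environments captured with fegetenv, __fesetenv accepts the two magic pointers special-cased above: FE_DFL_ENV (the ISO C default: everything masked, round to nearest, flags clear) and the GNU extension FE_NOMASK_ENV (the same, but with exceptions unmasked so they trap). A reset sketch (hypothetical; _GNU_SOURCE also exposes fesetexcept, the TS 18661-1 function added by this patch series):

#define _GNU_SOURCE
#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fesetround (FE_UPWARD);
  fesetexcept (FE_INEXACT);      /* set a flag without raising a trap */

  fesetenv (FE_DFL_ENV);         /* back to the default environment */
  printf ("nearest: %d\n", fegetround () == FE_TONEAREST);  /* 1 */
  printf ("flags:   %#x\n", fetestexcept (FE_ALL_EXCEPT));  /* 0 */
  return 0;
}
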
*/ + mxcsr &= FE_ALL_EXCEPT_X86; + if (modep == FE_DFL_MODE) + { + cw = _FPU_DEFAULT; + /* Default MXCSR state has all bits zero except for those + masking exceptions. */ + mxcsr |= FE_ALL_EXCEPT_X86 << 7; + } + else + { + cw = modep->__control_word; + mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86; + } + _FPU_SETCW (cw); + __asm__ ("ldmxcsr %0" : : "m" (mxcsr)); + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fesetround.c b/REORG.TODO/sysdeps/x86_64/fpu/fesetround.c new file mode 100644 index 0000000000..939297252a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fesetround.c @@ -0,0 +1,48 @@ +/* Set current rounding direction. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__fesetround (int round) +{ + unsigned short int cw; + int mxcsr; + + if ((round & ~0xc00) != 0) + /* ROUND is not a valid rounding mode. */ + return 1; + + /* First set the x87 FPU. */ + asm ("fnstcw %0" : "=m" (*&cw)); + cw &= ~0xc00; + cw |= round; + asm ("fldcw %0" : : "m" (*&cw)); + + /* And now the MXCSR register for SSE; the rounding mode bits are at + different positions in the two units, so we need to shift the value + left by 3 bits. */ + asm ("stmxcsr %0" : "=m" (*&mxcsr)); + mxcsr &= ~ 0x6000; + mxcsr |= round << 3; + asm ("ldmxcsr %0" : : "m" (*&mxcsr)); + + return 0; +} +libm_hidden_def (__fesetround) +weak_alias (__fesetround, fesetround) +libm_hidden_weak (fesetround) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/feupdateenv.c b/REORG.TODO/sysdeps/x86_64/fpu/feupdateenv.c new file mode 100644 index 0000000000..3bc110ce48 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/feupdateenv.c @@ -0,0 +1,52 @@ +/* Install given floating-point environment and raise exceptions. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__feupdateenv (const fenv_t *envp) +{ + fexcept_t temp; + unsigned int xtemp; + + /* Save current exceptions. */ + __asm__ ("fnstsw %0\n\tstmxcsr %1" : "=m" (*&temp), "=m" (xtemp)); + temp = (temp | xtemp) & FE_ALL_EXCEPT; + + /* Install new environment. 
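
Keeping the two rounding fields in sync matters because compiled code may use x87 instructions for long double and SSE for float/double within the same expression. The effect of __fesetround is directly observable; a sketch (hypothetical; the volatile operands, plus -frounding-math, keep the compiler from folding the divisions at translation time):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  volatile double one = 1.0, three = 3.0;

  fesetround (FE_DOWNWARD);
  printf ("down: %.17g\n", one / three);   /* 0.33333333333333331 */
  fesetround (FE_UPWARD);
  printf ("up:   %.17g\n", one / three);   /* 0.33333333333333337 */
  fesetround (FE_TONEAREST);
  return 0;
}
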
*/ + __fesetenv (envp); + + /* Raise the saved exception. Incidentally, for us the implementation + defined format of the values in objects of type fexcept_t is the + same as the ones specified using the FE_* constants. */ + __feraiseexcept ((int) temp); + + /* Success. */ + return 0; +} + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2) +strong_alias (__feupdateenv, __old_feupdateenv) +compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1); +#endif + +libm_hidden_def (__feupdateenv) +libm_hidden_ver (__feupdateenv, feupdateenv) +versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2); diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fgetexcptflg.c b/REORG.TODO/sysdeps/x86_64/fpu/fgetexcptflg.c new file mode 100644 index 0000000000..c1a0c2f872 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fgetexcptflg.c @@ -0,0 +1,35 @@ +/* Store current representation for exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fegetexceptflag (fexcept_t *flagp, int excepts) +{ + fexcept_t temp; + unsigned int mxscr; + + /* Get the current exceptions for the x87 FPU and SSE unit. */ + __asm__ ("fnstsw %0\n" + "stmxcsr %1" : "=m" (*&temp), "=m" (*&mxscr)); + + *flagp = (temp | mxscr) & FE_ALL_EXCEPT & excepts; + + /* Success. */ + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fraiseexcpt.c b/REORG.TODO/sysdeps/x86_64/fpu/fraiseexcpt.c new file mode 100644 index 0000000000..13eb4af331 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fraiseexcpt.c @@ -0,0 +1,121 @@ +/* Raise given exceptions. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <math.h> + +int +__feraiseexcept (int excepts) +{ + /* Raise exceptions represented by EXCEPTS. But we must raise only + one signal at a time. It is important that if the overflow/underflow + exception and the inexact exception are given at the same time, + the overflow/underflow exception follows the inexact exception. */ + + /* First: invalid exception. 
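
As the feupdateenv comment notes, fexcept_t on this port is just the familiar FE_* bit mask, and fegetexceptflag ORs the x87 and SSE flags into it. A capture sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fexcept_t saved;
  volatile double one = 1.0, three = 3.0, r;

  r = one / three;   /* 1/3 is not exactly representable: FE_INEXACT */
  (void) r;
  fegetexceptflag (&saved, FE_ALL_EXCEPT);
  printf ("saved: %#x\n", (unsigned int) saved);   /* FE_INEXACT set */
  return 0;
}
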
*/ + if ((FE_INVALID & excepts) != 0) + { + /* One example of an invalid operation is 0.0 / 0.0. */ + float f = 0.0; + + __asm__ __volatile__ ("divss %0, %0 " : : "x" (f)); + (void) &f; + } + + /* Next: division by zero. */ + if ((FE_DIVBYZERO & excepts) != 0) + { + float f = 1.0; + float g = 0.0; + + __asm__ __volatile__ ("divss %1, %0" : : "x" (f), "x" (g)); + (void) &f; + } + + /* Next: overflow. */ + if ((FE_OVERFLOW & excepts) != 0) + { + /* XXX: Is it ok to only set the x87 FPU? */ + /* There is no way to raise only the overflow flag. Do it the + hard way. */ + fenv_t temp; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp)); + + /* Set the relevant bits. */ + temp.__status_word |= FE_OVERFLOW; + + /* Put the new data in effect. */ + __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp)); + + /* And raise the exception. */ + __asm__ __volatile__ ("fwait"); + } + + /* Next: underflow. */ + if ((FE_UNDERFLOW & excepts) != 0) + { + /* XXX: Is it ok to only set the x87 FPU? */ + /* There is no way to raise only the underflow flag. Do it the + hard way. */ + fenv_t temp; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp)); + + /* Set the relevant bits. */ + temp.__status_word |= FE_UNDERFLOW; + + /* Put the new data in effect. */ + __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp)); + + /* And raise the exception. */ + __asm__ __volatile__ ("fwait"); + } + + /* Last: inexact. */ + if ((FE_INEXACT & excepts) != 0) + { + /* XXX: Is it ok to only set the x87 FPU? */ + /* There is no way to raise only the inexact flag. Do it the + hard way. */ + fenv_t temp; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp)); + + /* Set the relevant bits. */ + temp.__status_word |= FE_INEXACT; + + /* Put the new data in effect. */ + __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp)); + + /* And raise the exception. */ + __asm__ __volatile__ ("fwait"); + } + + /* Success. */ + return 0; +} +libm_hidden_def (__feraiseexcept) +weak_alias (__feraiseexcept, feraiseexcept) +libm_hidden_weak (feraiseexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/fsetexcptflg.c b/REORG.TODO/sysdeps/x86_64/fpu/fsetexcptflg.c new file mode 100644 index 0000000000..ffc44dcad5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/fsetexcptflg.c @@ -0,0 +1,53 @@ +/* Set floating-point environment exception handling. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
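
__feraiseexcept above uses two strategies: for FE_INVALID and FE_DIVBYZERO it executes a genuinely faulting divss, so an unmasked trap fires exactly as it would for user arithmetic, while for overflow, underflow and inexact it can only write the status-word bits and fwait. Either way the flags become visible through fetestexcept; a sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  feclearexcept (FE_ALL_EXCEPT);
  feraiseexcept (FE_DIVBYZERO);   /* the real-division path */
  feraiseexcept (FE_OVERFLOW);    /* the status-word path */
  printf ("flags: %#x\n", fetestexcept (FE_DIVBYZERO | FE_OVERFLOW));
  return 0;
}
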
*/ + +#include <fenv.h> +#include <math.h> + +int +fesetexceptflag (const fexcept_t *flagp, int excepts) +{ + fenv_t temp; + unsigned int mxcsr; + + /* XXX: Do we really need to set the exception in both units? + Shouldn't it be enough to set only the SSE unit? */ + + /* Get the current x87 FPU environment. We have to do this since we + cannot separately set the status word. */ + __asm__ ("fnstenv %0" : "=m" (*&temp)); + + temp.__status_word &= ~(excepts & FE_ALL_EXCEPT); + temp.__status_word |= *flagp & excepts & FE_ALL_EXCEPT; + + /* Store the new status word (along with the rest of the environment). + Possibly new exceptions are set, but they won't be acted on until + the next floating-point instruction. */ + __asm__ ("fldenv %0" : : "m" (*&temp)); + + /* And now the same for SSE. */ + __asm__ ("stmxcsr %0" : "=m" (*&mxcsr)); + + mxcsr &= ~(excepts & FE_ALL_EXCEPT); + mxcsr |= *flagp & excepts & FE_ALL_EXCEPT; + + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + /* Success. */ + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/ftestexcept.c b/REORG.TODO/sysdeps/x86_64/fpu/ftestexcept.c new file mode 100644 index 0000000000..502bdb2c42 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/ftestexcept.c @@ -0,0 +1,33 @@ +/* Test exception in current environment. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fetestexcept (int excepts) +{ + int temp; + unsigned int mxscr; + + /* Get current exceptions. 
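
fesetexceptflag is the restoring half of fegetexceptflag: it copies saved flag bits back into both status registers without executing any floating-point arithmetic, so nothing traps during the restore itself. A save/clear/restore sketch (hypothetical):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  fexcept_t saved;

  feraiseexcept (FE_UNDERFLOW);
  fegetexceptflag (&saved, FE_ALL_EXCEPT);   /* snapshot the flags */

  feclearexcept (FE_ALL_EXCEPT);
  fesetexceptflag (&saved, FE_ALL_EXCEPT);   /* put them back, trap-free */

  printf ("restored: %#x\n", fetestexcept (FE_ALL_EXCEPT));
  return 0;
}
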
*/ + __asm__ ("fnstsw %0\n" + "stmxcsr %1" : "=m" (*&temp), "=m" (*&mxscr)); + + return (temp | mxscr) & excepts & FE_ALL_EXCEPT; +} +libm_hidden_def (fetestexcept) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps new file mode 100644 index 0000000000..61da961a57 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps @@ -0,0 +1,2368 @@ +# Begin of automatic generation + +# Maximal error of functions: +Function: "acos": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "acos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "acos_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "acos_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "acosh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "acosh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "acosh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "acosh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "asin": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "asin_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "asinh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "asinh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "asinh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "atan": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan2_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan2_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "atanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "atanh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "atanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "atanh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "cabs": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacos": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cacos": 
+double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacos_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "cacos_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "cacos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_upward": +double: 5 +float: 7 +idouble: 5 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "cacosh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacosh": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacosh_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cacosh_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacosh_upward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "carg": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "carg_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "carg_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "casin": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "casin_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "casin_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "casin_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casin_upward": +double: 5 +float: 7 +idouble: 5 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "casinh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casinh_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Imaginary part of "casinh_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of 
"casinh_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_upward": +double: 5 +float: 7 +idouble: 5 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "casinh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "catan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "catanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_upward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt_downward": +double: 4 +float: 1 +idouble: 4 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt_upward": +double: 5 +float: 1 +idouble: 5 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary 
part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccosh_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccosh_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_towardzero": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccosh_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccosh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cexp": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cexp": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cexp_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_downward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_towardzero": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cexp_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "clog": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog10": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog10": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "clog10_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_downward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_towardzero": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_towardzero": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_upward": +double: 6 +float: 5 +idouble: 6 +ifloat: 5 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_upward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog_downward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_towardzero": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_towardzero": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_upward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "cos": +ildouble: 1 +ldouble: 1 + +Function: "cos_downward": +double: 1 
+idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "cos_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cos_upward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cos_vlen16": +float: 1 + +Function: "cos_vlen2": +double: 2 + +Function: "cos_vlen4": +double: 2 +float: 1 + +Function: "cos_vlen4_avx2": +double: 2 + +Function: "cos_vlen8": +double: 1 +float: 1 + +Function: "cos_vlen8_avx2": +float: 1 + +Function: "cosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 3 + +Function: "cosh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 3 + +Function: Real part of "cpow": +double: 2 +float: 5 +idouble: 2 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cpow": +float: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "cpow_downward": +double: 4 +float: 8 +idouble: 4 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cpow_towardzero": +double: 4 +float: 8 +idouble: 4 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cpow_upward": +double: 4 +float: 1 +idouble: 4 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cpow_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "csin_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_upward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "csinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "csinh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_upward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csqrt": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "csqrt": +double: 2 +float: 2 +idouble: 2 
+ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csqrt_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "csqrt_downward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "csqrt_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_upward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "csqrt_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ctan": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ctan_downward": +double: 6 +float: 5 +idouble: 6 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_upward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctan_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ctanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ctanh_downward": +double: 4 +float: 2 +idouble: 4 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_downward": +double: 6 +float: 5 +idouble: 6 +ifloat: 5 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_towardzero": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctanh_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: "erf": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erfc": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "erfc_downward": +double: 5 +float: 6 +idouble: 5 +ifloat: 6 +ildouble: 4 +ldouble: 4 + +Function: "erfc_towardzero": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "erfc_upward": +double: 5 +float: 6 +idouble: 5 +ifloat: 6 +ildouble: 5 +ldouble: 5 + +Function: "exp": +ildouble: 1 +ldouble: 1 + +Function: "exp10": +double: 2 +idouble: 2 +ildouble: 1 +ldouble: 1 + +Function: "exp10_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_towardzero": 
+double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp2": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_vlen16": +float: 1 + +Function: "exp_vlen2": +double: 1 + +Function: "exp_vlen4": +double: 1 +float: 1 + +Function: "exp_vlen4_avx2": +double: 1 + +Function: "exp_vlen8": +double: 1 +float: 1 + +Function: "exp_vlen8_avx2": +float: 1 + +Function: "expm1": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "expm1_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "expm1_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "expm1_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "gamma": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "gamma_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "gamma_towardzero": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "gamma_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 6 +ldouble: 6 + +Function: "hypot": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "j0": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "j0_downward": +double: 2 +float: 4 +idouble: 2 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "j0_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "j0_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "j1": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "j1_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "j1_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "j1_upward": +double: 3 +float: 5 +idouble: 3 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: "jn": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "jn_downward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 4 +ldouble: 4 + +Function: "jn_towardzero": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "jn_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "lgamma": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 4 +ldouble: 4 + +Function: "lgamma_downward": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "lgamma_towardzero": +double: 5 +float: 4 +idouble: 5 +ifloat: 4 
+ildouble: 7 +ldouble: 7 + +Function: "lgamma_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 6 +ldouble: 6 + +Function: "log": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log10": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log10_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 2 +ldouble: 2 + +Function: "log10_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "log10_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log1p": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "log1p_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "log1p_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "log1p_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "log2": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log2_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 1 +ldouble: 1 + +Function: "log2_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log2_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 1 +ldouble: 1 + +Function: "log_downward": +float: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "log_towardzero": +float: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "log_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: "log_vlen16": +float: 3 + +Function: "log_vlen2": +double: 1 + +Function: "log_vlen4": +double: 1 +float: 3 + +Function: "log_vlen4_avx2": +double: 1 + +Function: "log_vlen8": +double: 1 +float: 3 + +Function: "log_vlen8_avx2": +float: 2 + +Function: "pow": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow10": +double: 2 +idouble: 2 +ildouble: 1 +ldouble: 1 + +Function: "pow10_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_vlen16": +float: 3 + +Function: "pow_vlen2": +double: 1 + +Function: "pow_vlen4": +double: 1 +float: 3 + +Function: "pow_vlen4_avx2": +double: 1 + +Function: "pow_vlen8": +double: 1 +float: 3 + +Function: "pow_vlen8_avx2": +float: 3 + +Function: "sin": +ildouble: 1 +ldouble: 1 + +Function: "sin_downward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sin_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "sin_upward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sin_vlen16": +float: 1 + +Function: "sin_vlen2": +double: 2 + +Function: "sin_vlen4": +double: 2 +float: 1 + +Function: "sin_vlen4_avx2": +double: 2 + +Function: "sin_vlen8": +double: 2 +float: 1 + +Function: "sin_vlen8_avx2": +float: 1 + +Function: "sincos": +ildouble: 1 +ldouble: 1 + +Function: "sincos_downward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos_towardzero": +double: 1 +idouble: 1 
+ildouble: 2 +ldouble: 2 + +Function: "sincos_upward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos_vlen16": +float: 1 + +Function: "sincos_vlen2": +double: 2 + +Function: "sincos_vlen4": +double: 2 +float: 1 + +Function: "sincos_vlen4_avx2": +double: 2 + +Function: "sincos_vlen8": +double: 1 +float: 1 + +Function: "sincos_vlen8_avx2": +float: 1 + +Function: "sinh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "sinh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "sinh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "sinh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "tan": +float: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "tan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "tan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "tanh": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tanh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "tanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tanh_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "tgamma": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_downward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_towardzero": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_upward": +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 5 +ldouble: 5 + +Function: "y0": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "y0_downward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "y0_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "y0_upward": +double: 3 +float: 5 +idouble: 3 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: "y1": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "y1_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "y1_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y1_upward": +double: 7 +float: 2 +idouble: 7 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "yn": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "yn_downward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "yn_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "yn_upward": +double: 4 +float: 5 +idouble: 4 +ifloat: 5 +ildouble: 4 +ldouble: 4 + +# end of automatic generation diff --git a/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps-name b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps-name new file mode 100644 index 0000000000..1c09346681 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/libm-test-ulps-name @@ -0,0 +1 @@ +x86_64 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/math-tests-arch.h b/REORG.TODO/sysdeps/x86_64/fpu/math-tests-arch.h new file mode 100644 index 0000000000..9278e3440b --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/math-tests-arch.h @@ -0,0 +1,53 @@ +/* Runtime architecture check for math tests. x86_64 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpu-features.h> + +#if defined REQUIRE_AVX + +# define INIT_ARCH_EXT +# define CHECK_ARCH_EXT \ + do \ + { \ + if (!HAS_ARCH_FEATURE (AVX_Usable)) return; \ + } \ + while (0) + +#elif defined REQUIRE_AVX2 + +# define INIT_ARCH_EXT +# define CHECK_ARCH_EXT \ + do \ + { \ + if (!HAS_ARCH_FEATURE (AVX2_Usable)) return; \ + } \ + while (0) + +#elif defined REQUIRE_AVX512F + +# define INIT_ARCH_EXT +# define CHECK_ARCH_EXT \ + do \ + { \ + if (!HAS_ARCH_FEATURE (AVX512F_Usable)) return; \ + } \ + while (0) + +#else +# include <sysdeps/generic/math-tests-arch.h> +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/math_ldbl.h b/REORG.TODO/sysdeps/x86_64/fpu/math_ldbl.h new file mode 100644 index 0000000000..6c5bc13455 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/math_ldbl.h @@ -0,0 +1,100 @@ +/* Manipulation of the bit representation of 'long double' quantities. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _MATH_LDBL_H_ +#define _MATH_LDBL_H_ 1 + +#include <stdint.h> + +/* A union which permits us to convert between a long double and + three 32 bit ints. */ + +typedef union +{ + long double value; + struct + { + uint32_t lsw; + uint32_t msw; + int sign_exponent:16; + unsigned int empty1:16; + unsigned int empty0:32; + } parts; +} ieee_long_double_shape_type; + +/* Get three 32 bit ints from a double. */ + +#define GET_LDOUBLE_WORDS(exp,ix0,ix1,d) \ +do { \ + ieee_long_double_shape_type ew_u; \ + ew_u.value = (d); \ + (exp) = ew_u.parts.sign_exponent; \ + (ix0) = ew_u.parts.msw; \ + (ix1) = ew_u.parts.lsw; \ +} while (0) + +/* Set a double from two 32 bit ints. */ + +#define SET_LDOUBLE_WORDS(d,exp,ix0,ix1) \ +do { \ + ieee_long_double_shape_type iw_u; \ + iw_u.parts.sign_exponent = (exp); \ + iw_u.parts.msw = (ix0); \ + iw_u.parts.lsw = (ix1); \ + (d) = iw_u.value; \ +} while (0) + +/* Get the more significant 32 bits of a long double mantissa. 
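(bits 32-63 of the 64-bit mantissa; the x86 80-bit extended format keeps an explicit integer bit, so the MSB of this word is set for every normalized value).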
*/ + +#define GET_LDOUBLE_MSW(v,d) \ +do { \ + ieee_long_double_shape_type sh_u; \ + sh_u.value = (d); \ + (v) = sh_u.parts.msw; \ +} while (0) + +/* Set the more significant 32 bits of a long double mantissa from an int. */ + +#define SET_LDOUBLE_MSW(d,v) \ +do { \ + ieee_long_double_shape_type sh_u; \ + sh_u.value = (d); \ + sh_u.parts.msw = (v); \ + (d) = sh_u.value; \ +} while (0) + +/* Get int from the exponent of a long double. */ + +#define GET_LDOUBLE_EXP(exp,d) \ +do { \ + ieee_long_double_shape_type ge_u; \ + ge_u.value = (d); \ + (exp) = ge_u.parts.sign_exponent; \ +} while (0) + +/* Set exponent of a long double from an int. */ + +#define SET_LDOUBLE_EXP(d,exp) \ +do { \ + ieee_long_double_shape_type se_u; \ + se_u.value = (d); \ + se_u.parts.sign_exponent = (exp); \ + (d) = se_u.value; \ +} while (0) + +#endif /* math_ldbl.h */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/math_private.h b/REORG.TODO/sysdeps/x86_64/fpu/math_private.h new file mode 100644 index 0000000000..027a6a3a4d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/math_private.h @@ -0,0 +1,133 @@ +#ifndef X86_64_MATH_PRIVATE_H +#define X86_64_MATH_PRIVATE_H 1 + +/* We can do a few things better on x86-64. */ + +#if defined __AVX__ || defined SSE2AVX +# define MOVD "vmovd" +# define MOVQ "vmovq" +#else +# define MOVD "movd" +# define MOVQ "movq" +#endif + +/* Direct movement of float into integer register. */ +#define EXTRACT_WORDS64(i, d) \ + do { \ + int64_t i_; \ + asm (MOVQ " %1, %0" : "=rm" (i_) : "x" ((double) (d))); \ + (i) = i_; \ + } while (0) + +/* And the reverse. */ +#define INSERT_WORDS64(d, i) \ + do { \ + int64_t i_ = i; \ + double d__; \ + asm (MOVQ " %1, %0" : "=x" (d__) : "rm" (i_)); \ + d = d__; \ + } while (0) + +/* Direct movement of float into integer register. */ +#define GET_FLOAT_WORD(i, d) \ + do { \ + int i_; \ + asm (MOVD " %1, %0" : "=rm" (i_) : "x" ((float) (d))); \ + (i) = i_; \ + } while (0) + +/* And the reverse. 
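Move the 32 bit pattern from an integer register or memory straight into the low word of an XMM register, again without a store/load through the stack.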
*/ +#define SET_FLOAT_WORD(f, i) \ + do { \ + int i_ = i; \ + float f__; \ + asm (MOVD " %1, %0" : "=x" (f__) : "rm" (i_)); \ + f = f__; \ + } while (0) + +#include <sysdeps/i386/fpu/fenv_private.h> +#include_next <math_private.h> + +extern __always_inline double +__ieee754_sqrt (double d) +{ + double res; +#if defined __AVX__ || defined SSE2AVX + asm ("vsqrtsd %1, %0, %0" : "=x" (res) : "xm" (d)); +#else + asm ("sqrtsd %1, %0" : "=x" (res) : "xm" (d)); +#endif + return res; +} + +extern __always_inline float +__ieee754_sqrtf (float d) +{ + float res; +#if defined __AVX__ || defined SSE2AVX + asm ("vsqrtss %1, %0, %0" : "=x" (res) : "xm" (d)); +#else + asm ("sqrtss %1, %0" : "=x" (res) : "xm" (d)); +#endif + return res; +} + +extern __always_inline long double +__ieee754_sqrtl (long double d) +{ + long double res; + asm ("fsqrt" : "=t" (res) : "0" (d)); + return res; +} + +#ifdef __SSE4_1__ +extern __always_inline double +__rint (double d) +{ + double res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundsd $4, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundsd $4, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} + +extern __always_inline float +__rintf (float d) +{ + float res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundss $4, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundss $4, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} + +extern __always_inline double +__floor (double d) +{ + double res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundsd $1, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundsd $1, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} + +extern __always_inline float +__floorf (float d) +{ + float res; +# if defined __AVX__ || defined SSE2AVX + asm ("vroundss $1, %1, %0, %0" : "=x" (res) : "xm" (d)); +# else + asm ("roundss $1, %1, %0" : "=x" (res) : "xm" (d)); +# endif + return res; +} +#endif /* __SSE4_1__ */ + +#endif /* X86_64_MATH_PRIVATE_H */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile new file mode 100644 index 0000000000..34542155aa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile @@ -0,0 +1,70 @@ +ifeq ($(subdir),math) +libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ + s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c + +libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ + e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ + mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \ + sincos32-fma4 doasin-fma4 dosincos-fma4 \ + halfulp-fma4 mpexp-fma4 \ + mpatan2-fma4 mpatan-fma4 mpsqrt-fma4 mptan-fma4 + +CFLAGS-doasin-fma4.c = -mfma4 +CFLAGS-dosincos-fma4.c = -mfma4 +CFLAGS-e_asin-fma4.c = -mfma4 +CFLAGS-e_atan2-fma4.c = -mfma4 +CFLAGS-e_exp-fma4.c = -mfma4 +CFLAGS-e_log-fma4.c = -mfma4 +CFLAGS-e_pow-fma4.c = -mfma4 $(config-cflags-nofma) +CFLAGS-halfulp-fma4.c = -mfma4 +CFLAGS-mpa-fma4.c = -mfma4 +CFLAGS-mpatan-fma4.c = -mfma4 +CFLAGS-mpatan2-fma4.c = -mfma4 +CFLAGS-mpexp-fma4.c = -mfma4 +CFLAGS-mplog-fma4.c = -mfma4 +CFLAGS-mpsqrt-fma4.c = -mfma4 +CFLAGS-mptan-fma4.c = -mfma4 +CFLAGS-s_atan-fma4.c = -mfma4 +CFLAGS-sincos32-fma4.c = -mfma4 +CFLAGS-slowexp-fma4.c = -mfma4 +CFLAGS-slowpow-fma4.c = -mfma4 +CFLAGS-s_sin-fma4.c = -mfma4 +CFLAGS-s_tan-fma4.c = -mfma4 + +libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \ + e_atan2-avx s_sin-avx s_tan-avx \ + mplog-avx mpa-avx slowexp-avx \ + mpexp-avx + +CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX 
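+# -msse2avx makes GCC emit the VEX-encoded (AVX) forms of ordinary SSE
+# instructions, and the SSE2AVX define switches the inline asm in
+# math_private.h to the matching vmov*/vsqrt*/vround* spellings, so these
+# variants never mix legacy-SSE and AVX encodings within one function.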
+CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mpa-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mpexp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mplog-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX +CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX +endif + +ifeq ($(subdir),mathvec) +libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ + svml_d_cos8_core_avx512 svml_d_sin2_core_sse4 \ + svml_d_sin4_core_avx2 svml_d_sin8_core_avx512 \ + svml_d_log2_core_sse4 svml_d_log4_core_avx2 \ + svml_d_log8_core_avx512 svml_d_sincos2_core_sse4 \ + svml_d_sincos4_core_avx2 svml_d_sincos8_core_avx512 \ + svml_s_cosf4_core_sse4 svml_s_cosf8_core_avx2 \ + svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \ + svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \ + svml_s_logf4_core_sse4 svml_s_logf8_core_avx2 \ + svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \ + svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 \ + svml_s_expf4_core_sse4 svml_s_expf8_core_avx2 \ + svml_s_expf16_core_avx512 svml_d_pow2_core_sse4 \ + svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 \ + svml_s_powf4_core_sse4 svml_s_powf8_core_avx2 \ + svml_s_powf16_core_avx512 svml_s_sincosf4_core_sse4 \ + svml_s_sincosf8_core_avx2 svml_s_sincosf16_core_avx512 +endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c new file mode 100644 index 0000000000..53eb419472 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c @@ -0,0 +1,4 @@ +#define __doasin __doasin_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/doasin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c new file mode 100644 index 0000000000..1578b2fce0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c @@ -0,0 +1,6 @@ +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/dosincos.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c new file mode 100644 index 0000000000..2657c31f49 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c @@ -0,0 +1,11 @@ +#define __ieee754_acos __ieee754_acos_fma4 +#define __ieee754_asin __ieee754_asin_fma4 +#define __cos32 __cos32_fma4 +#define __doasin __doasin_fma4 +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 +#define __sin32 __sin32_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c new file mode 100644 index 0000000000..111a5b99bd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c @@ -0,0 +1,26 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_acos_sse2 (double); +extern double __ieee754_asin_sse2 (double); +extern double __ieee754_acos_fma4 (double); +extern double __ieee754_asin_fma4 (double); + +libm_ifunc (__ieee754_acos, + HAS_ARCH_FEATURE (FMA4_Usable) + ? 
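/* libm_ifunc emits a GNU indirect function: this conditional runs once,
 + inside the resolver that ld.so invokes when it first binds the symbol,
 + and whichever implementation it selects becomes the permanent call
 + target.  A hand-rolled equivalent (hypothetical names, sketch only,
 + not the macro's actual expansion) would be
 +
 + extern double acos_fma4 (double), acos_sse2 (double);
 + static double (*resolve_my_acos (void)) (double)
 + {
 +   return HAS_ARCH_FEATURE (FMA4_Usable) ? acos_fma4 : acos_sse2;
 + }
 + double my_acos (double) __attribute__ ((ifunc ("resolve_my_acos")));
 +
 + so the CPU-feature check costs nothing per call.  */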
__ieee754_acos_fma4 + : __ieee754_acos_sse2); +strong_alias (__ieee754_acos, __acos_finite) + +libm_ifunc (__ieee754_asin, + HAS_ARCH_FEATURE (FMA4_Usable) + ? __ieee754_asin_fma4 + : __ieee754_asin_sse2); +strong_alias (__ieee754_asin, __asin_finite) + +#define __ieee754_acos __ieee754_acos_sse2 +#define __ieee754_asin __ieee754_asin_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c new file mode 100644 index 0000000000..3012afac37 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c @@ -0,0 +1,9 @@ +#define __ieee754_atan2 __ieee754_atan2_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c new file mode 100644 index 0000000000..f4e986293e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c @@ -0,0 +1,10 @@ +#define __ieee754_atan2 __ieee754_atan2_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mpatan2 __mpatan2_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c new file mode 100644 index 0000000000..9ca3c02a44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_atan2_sse2 (double, double); +extern double __ieee754_atan2_avx (double, double); +extern double __ieee754_atan2_fma4 (double, double); + +libm_ifunc (__ieee754_atan2, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_atan2_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? 
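/* the -msse2avx build: the same C source assembled with VEX-encoded
 + instructions, preferred on AVX hardware so that calls from AVX code
 + avoid SSE/AVX transition penalties */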
__ieee754_atan2_avx : __ieee754_atan2_sse2)); +strong_alias (__ieee754_atan2, __atan2_finite) + +#define __ieee754_atan2 __ieee754_atan2_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c new file mode 100644 index 0000000000..ee5dd6d2dc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c @@ -0,0 +1,6 @@ +#define __ieee754_exp __ieee754_exp_avx +#define __exp1 __exp1_avx +#define __slowexp __slowexp_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c new file mode 100644 index 0000000000..ae6eb67603 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c @@ -0,0 +1,6 @@ +#define __ieee754_exp __ieee754_exp_fma4 +#define __exp1 __exp1_fma4 +#define __slowexp __slowexp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c new file mode 100644 index 0000000000..b7d7b5ff27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_exp_sse2 (double); +extern double __ieee754_exp_avx (double); +extern double __ieee754_exp_fma4 (double); + +libm_ifunc (__ieee754_exp, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_exp_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? __ieee754_exp_avx : __ieee754_exp_sse2)); +strong_alias (__ieee754_exp, __exp_finite) + +#define __ieee754_exp __ieee754_exp_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c new file mode 100644 index 0000000000..c669019bc2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c @@ -0,0 +1,8 @@ +#define __ieee754_log __ieee754_log_avx +#define __mplog __mplog_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c new file mode 100644 index 0000000000..a2346cc618 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c @@ -0,0 +1,8 @@ +#define __ieee754_log __ieee754_log_fma4 +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c new file mode 100644 index 0000000000..cf9533d6c0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_log_sse2 (double); +extern double __ieee754_log_avx (double); +extern double __ieee754_log_fma4 (double); + +libm_ifunc (__ieee754_log, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_log_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? 
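/* each clone also carries its multiprecision slow-path helpers
 + (__mplog, __add, __dbl_mp, __sub) under renamed symbols (see
 + e_log-avx.c above), so a resolved variant never calls back into
 + differently-encoded code */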
__ieee754_log_avx : __ieee754_log_sse2)); +strong_alias (__ieee754_log, __log_finite) + +#define __ieee754_log __ieee754_log_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c new file mode 100644 index 0000000000..5b3ea8e103 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c @@ -0,0 +1,6 @@ +#define __ieee754_pow __ieee754_pow_fma4 +#define __exp1 __exp1_fma4 +#define __slowpow __slowpow_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c new file mode 100644 index 0000000000..a5c5d89c3e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c @@ -0,0 +1,17 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_pow_sse2 (double, double); +extern double __ieee754_pow_fma4 (double, double); + +libm_ifunc (__ieee754_pow, + HAS_ARCH_FEATURE (FMA4_Usable) + ? __ieee754_pow_fma4 + : __ieee754_pow_sse2); +strong_alias (__ieee754_pow, __pow_finite) + +#define __ieee754_pow __ieee754_pow_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c new file mode 100644 index 0000000000..a00c17c016 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c @@ -0,0 +1,4 @@ +#define __halfulp __halfulp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/halfulp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c new file mode 100644 index 0000000000..366b0b7134 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c @@ -0,0 +1,14 @@ +#define __add __add_avx +#define __mul __mul_avx +#define __sqr __sqr_avx +#define __sub __sub_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx + +#define NO___CPY 1 +#define NO___MP_DBL 1 +#define NO___ACR 1 +#define NO__CONST 1 +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c new file mode 100644 index 0000000000..a4a759407e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c @@ -0,0 +1,14 @@ +#define __add __add_fma4 +#define __mul __mul_fma4 +#define __sqr __sqr_fma4 +#define __sub __sub_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 + +#define NO___CPY 1 +#define NO___MP_DBL 1 +#define NO___ACR 1 +#define NO__CONST 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c new file mode 100644 index 0000000000..fbd3bd49a2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c @@ -0,0 +1,10 @@ +#define __mpatan __mpatan_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define AVOID_MPATAN_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpatan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c new file mode 100644 index 0000000000..e6e44d49b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c @@ -0,0 +1,9 @@ +#define __mpatan2 __mpatan2_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpatan __mpatan_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpatan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c new file mode 100644 index 0000000000..87f29c96c9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c @@ -0,0 +1,9 @@ +#define __mpexp __mpexp_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx +#define __mul __mul_avx +#define AVOID_MPEXP_H 1 +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c new file mode 100644 index 0000000000..07ca6e9ad0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c @@ -0,0 +1,9 @@ +#define __mpexp __mpexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mul __mul_fma4 +#define AVOID_MPEXP_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c new file mode 100644 index 0000000000..fd783d9a67 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c @@ -0,0 +1,8 @@ +#define __mplog __mplog_avx +#define __add __add_avx +#define __mpexp __mpexp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c new file mode 100644 index 0000000000..b4733118d7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c @@ -0,0 +1,8 @@ +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c new file mode 100644 index 0000000000..f8a1ba2d92 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c @@ -0,0 +1,8 @@ +#define __mpsqrt __mpsqrt_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define AVOID_MPSQRT_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpsqrt.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c new file mode 100644 index 0000000000..fb4a9d48ca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c @@ -0,0 +1,7 @@ +#define __mptan __mptan_fma4 +#define __c32 __c32_fma4 +#define __dvd __dvd_fma4 +#define __mpranred __mpranred_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mptan.c> diff --git 
a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c new file mode 100644 index 0000000000..b5cb9c3a75 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c @@ -0,0 +1,8 @@ +#define atan __atan_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c new file mode 100644 index 0000000000..9e83e6cdab --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c @@ -0,0 +1,9 @@ +#define atan __atan_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpatan __mpatan_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c new file mode 100644 index 0000000000..742e95cb96 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c @@ -0,0 +1,15 @@ +#include <init-arch.h> +#include <math.h> + +extern double __atan_sse2 (double); +extern double __atan_avx (double); +extern double __atan_fma4 (double); + +libm_ifunc (atan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __atan_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __atan_avx : __atan_sse2)); + +#define atan __atan_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c new file mode 100644 index 0000000000..6a5ea3ff27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c @@ -0,0 +1,2 @@ +#define __ceil __ceil_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_ceil.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S new file mode 100644 index 0000000000..f8eef43eff --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__ceil) + .type __ceil, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __ceil_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __ceil_c(%rip), %rax +2: ret +END(__ceil) +weak_alias (__ceil, ceil) + + +ENTRY(__ceil_sse41) + roundsd $10, %xmm0, %xmm0 + ret +END(__ceil_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c new file mode 100644 index 0000000000..229a6273b2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c @@ -0,0 +1,2 @@ +#define __ceilf __ceilf_c +#include <sysdeps/ieee754/flt-32/s_ceilf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S new file mode 100644 index 0000000000..076f10f0f0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__ceilf) + .type __ceilf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __ceilf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __ceilf_c(%rip), %rax +2: ret +END(__ceilf) +weak_alias (__ceilf, ceilf) + + +ENTRY(__ceilf_sse41) + roundss $10, %xmm0, %xmm0 + ret +END(__ceilf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c new file mode 100644 index 0000000000..68733b69ef --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c @@ -0,0 +1,3 @@ +#undef __floor +#define __floor __floor_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_floor.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S new file mode 100644 index 0000000000..f519ab24f4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__floor) + .type __floor, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __floor_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __floor_c(%rip), %rax +2: ret +END(__floor) +weak_alias (__floor, floor) + + +ENTRY(__floor_sse41) + roundsd $9, %xmm0, %xmm0 + ret +END(__floor_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c new file mode 100644 index 0000000000..2386362328 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c @@ -0,0 +1,3 @@ +#undef __floorf +#define __floorf __floorf_c +#include <sysdeps/ieee754/flt-32/s_floorf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S new file mode 100644 index 0000000000..8613f73acc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__floorf) + .type __floorf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __floorf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __floorf_c(%rip), %rax +2: ret +END(__floorf) +weak_alias (__floorf, floorf) + + +ENTRY(__floorf_sse41) + roundss $9, %xmm0, %xmm0 + ret +END(__floorf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c new file mode 100644 index 0000000000..3ac4fed660 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c @@ -0,0 +1,50 @@ +/* FMA version of fma. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +extern double __fma_sse2 (double x, double y, double z) attribute_hidden; + + +static double +__fma_fma3 (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + + +static double +__fma_fma4 (double x, double y, double z) +{ + asm ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + + +libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable) + ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) + ? __fma_fma4 : __fma_sse2)); +weak_alias (__fma, fma) + +#define __fma __fma_sse2 + +#include <sysdeps/ieee754/dbl-64/s_fma.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c new file mode 100644 index 0000000000..1ae227c1d4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c @@ -0,0 +1,49 @@ +/* FMA version of fmaf. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +extern float __fmaf_sse2 (float x, float y, float z) attribute_hidden; + + +static float +__fmaf_fma3 (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + + +static float +__fmaf_fma4 (float x, float y, float z) +{ + asm ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + + +libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable) + ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) + ? __fmaf_fma4 : __fmaf_sse2)); +weak_alias (__fmaf, fmaf) + +#define __fmaf __fmaf_sse2 + +#include <sysdeps/ieee754/dbl-64/s_fmaf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c new file mode 100644 index 0000000000..f897a2a6a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c @@ -0,0 +1,3 @@ +#undef __nearbyint +#define __nearbyint __nearbyint_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_nearbyint.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S new file mode 100644 index 0000000000..5a734f6027 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__nearbyint) + .type __nearbyint, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __nearbyint_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __nearbyint_c(%rip), %rax +2: ret +END(__nearbyint) +weak_alias (__nearbyint, nearbyint) + + +ENTRY(__nearbyint_sse41) + roundsd $0xc, %xmm0, %xmm0 + ret +END(__nearbyint_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c new file mode 100644 index 0000000000..aa7768233b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c @@ -0,0 +1,3 @@ +#undef __nearbyintf +#define __nearbyintf __nearbyintf_c +#include <sysdeps/ieee754/flt-32/s_nearbyintf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S new file mode 100644 index 0000000000..ad79fd6021 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__nearbyintf) + .type __nearbyintf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __nearbyintf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __nearbyintf_c(%rip), %rax +2: ret +END(__nearbyintf) +weak_alias (__nearbyintf, nearbyintf) + + +ENTRY(__nearbyintf_sse41) + roundss $0xc, %xmm0, %xmm0 + ret +END(__nearbyintf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c new file mode 100644 index 0000000000..162a630ff9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c @@ -0,0 +1,3 @@ +#undef __rint +#define __rint __rint_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_rint.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S new file mode 100644 index 0000000000..4f628a93a4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__rint) + .type __rint, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __rint_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __rint_c(%rip), %rax +2: ret +END(__rint) +weak_alias (__rint, rint) + + +ENTRY(__rint_sse41) + roundsd $4, %xmm0, %xmm0 + ret +END(__rint_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c new file mode 100644 index 0000000000..8505249f34 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c @@ -0,0 +1,3 @@ +#undef __rintf +#define __rintf __rintf_c +#include <sysdeps/ieee754/flt-32/s_rintf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S new file mode 100644 index 0000000000..dee4ad794c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__rintf) + .type __rintf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __rintf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __rintf_c(%rip), %rax +2: ret +END(__rintf) +weak_alias (__rintf, rintf) + + +ENTRY(__rintf_sse41) + roundss $4, %xmm0, %xmm0 + ret +END(__rintf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c new file mode 100644 index 0000000000..e1c6de0259 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c @@ -0,0 +1,5 @@ +#define __cos __cos_avx +#define __sin __sin_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c new file mode 100644 index 0000000000..4c35739dc9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c @@ -0,0 +1,11 @@ +#define __cos __cos_fma4 +#define __sin __sin_fma4 +#define __docos __docos_fma4 +#define __dubsin __dubsin_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c new file mode 100644 index 0000000000..8ffd3e7125 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c @@ -0,0 +1,26 @@ +#include <init-arch.h> +#include <math.h> +#undef NAN + +extern double __cos_sse2 (double); +extern double __sin_sse2 (double); +extern double __cos_avx (double); +extern double __sin_avx (double); +extern double __cos_fma4 (double); +extern double __sin_fma4 (double); + +libm_ifunc (__cos, (HAS_ARCH_FEATURE (FMA4_Usable) ? __cos_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __cos_avx : __cos_sse2)); +weak_alias (__cos, cos) + +libm_ifunc (__sin, (HAS_ARCH_FEATURE (FMA4_Usable) ? __sin_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? 
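/* __sin and __cos resolve independently, but every candidate is built
 + from the same s_sin.c source, so whichever pair is picked stays
 + internally consistent */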
__sin_avx : __sin_sse2)); +weak_alias (__sin, sin) + +#define __cos __cos_sse2 +#define __sin __sin_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c new file mode 100644 index 0000000000..53de5d3c98 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c @@ -0,0 +1,6 @@ +#define tan __tan_avx +#define __dbl_mp __dbl_mp_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c new file mode 100644 index 0000000000..a805440b46 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c @@ -0,0 +1,8 @@ +#define tan __tan_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpranred __mpranred_fma4 +#define __mptan __mptan_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c new file mode 100644 index 0000000000..25f3bca07e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c @@ -0,0 +1,15 @@ +#include <init-arch.h> +#include <math.h> + +extern double __tan_sse2 (double); +extern double __tan_avx (double); +extern double __tan_fma4 (double); + +libm_ifunc (tan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __tan_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __tan_avx : __tan_sse2)); + +#define tan __tan_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c new file mode 100644 index 0000000000..ebbfa18cca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c @@ -0,0 +1,15 @@ +#define __cos32 __cos32_fma4 +#define __sin32 __sin32_fma4 +#define __c32 __c32_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpranred __mpranred_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/sincos32.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c new file mode 100644 index 0000000000..d01c6d71a4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c @@ -0,0 +1,9 @@ +#define __slowexp __slowexp_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __mpexp __mpexp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c new file mode 100644 index 0000000000..3bcde84233 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c @@ -0,0 +1,9 @@ +#define __slowexp __slowexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git 
a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c new file mode 100644 index 0000000000..69d69823bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c @@ -0,0 +1,11 @@ +#define __slowpow __slowpow_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mplog __mplog_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define __halfulp __halfulp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/slowpow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S new file mode 100644 index 0000000000..b209492442 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cos, vector length is 2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_cos) + .type _ZGVbN2v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_cos_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_cos_sse2(%rip), %rax + ret +END (_ZGVbN2v_cos) +libmvec_hidden_def (_ZGVbN2v_cos) + +#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2 +#include "../svml_d_cos2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S new file mode 100644 index 0000000000..858dc6532f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S @@ -0,0 +1,223 @@ +/* Function cos vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2v_cos_sse4) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups __dHalfPI(%rax), %xmm2 + +/* ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + addpd %xmm3, %xmm2 + movups __dInvPI(%rax), %xmm5 + movups __dAbsMask(%rax), %xmm4 + +/* Get absolute argument value: X' = |X'| */ + andps %xmm2, %xmm4 + +/* Y = X'*InvPi + RS : right shifter add */ + mulpd %xmm5, %xmm2 + +/* Check for large arguments path */ + cmpnlepd __dRangeVal(%rax), %xmm4 + movups __dRShifter(%rax), %xmm6 + addpd %xmm6, %xmm2 + movmskpd %xmm4, %ecx + +/* N = Y - RS : right shifter sub */ + movaps %xmm2, %xmm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + psllq $63, %xmm2 + subpd %xmm6, %xmm1 + +/* N = N - 0.5 */ + subpd __dOneHalf(%rax), %xmm1 + movups __dPI1(%rax), %xmm7 + +/* R = X - N*Pi1 */ + mulpd %xmm1, %xmm7 + movups __dPI2(%rax), %xmm4 + +/* R = R - N*Pi2 */ + mulpd %xmm1, %xmm4 + subpd %xmm7, %xmm0 + movups __dPI3(%rax), %xmm5 + +/* R = R - N*Pi3 */ + mulpd %xmm1, %xmm5 + subpd %xmm4, %xmm0 + +/* R = R - N*Pi4 */ + movups __dPI4(%rax), %xmm6 + mulpd %xmm6, %xmm1 + subpd %xmm5, %xmm0 + subpd %xmm1, %xmm0 + +/* POLYNOMIAL APPROXIMATION: R2 = R*R */ + movaps %xmm0, %xmm4 + mulpd %xmm0, %xmm4 + movups __dC7(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC6(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC5(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC4(%rax), %xmm1 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + mulpd %xmm4, %xmm1 + addpd __dC3(%rax), %xmm1 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + mulpd %xmm4, %xmm1 + addpd __dC2(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC1(%rax), %xmm1 + mulpd %xmm1, %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm4, %xmm0 + +/* RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes */ + xorps %xmm2, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + 
movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_cos_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S new file mode 100644 index 0000000000..ff382e9c6c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cos, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_cos) + .type _ZGVdN4v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_cos_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_cos_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_cos) +libmvec_hidden_def (_ZGVdN4v_cos) + +#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper +#include "../svml_d_cos4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S new file mode 100644 index 0000000000..4b6d09743b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S @@ -0,0 +1,207 @@ +/* Function cos vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
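+
+   Editorial note: this AVX2 kernel implements the same algorithm as
+   the SSE4 one above, but folds each multiply/subtract pair into a
+   fused multiply-add.  Because an FMA keeps the product unrounded,
+   the pi reduction needs only three Cody-Waite terms
+   (__dPI1_FMA..__dPI3_FMA) where the SSE4 path used four
+   (__dPI1..__dPI4), and the polynomial is evaluated with
+   vfmadd213pd chains.
+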
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4v_cos_avx2) + +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovapd %ymm0, %ymm1 + vmovupd __dInvPI(%rax), %ymm4 + vmovupd __dRShifter(%rax), %ymm5 + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %ymm1, %ymm7 + +/* Get absolute argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %ymm7, %ymm2 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %ymm5, %ymm4, %ymm7 + vmovupd __dC7(%rax), %ymm4 + +/* Check for large arguments path */ + vcmpnle_uqpd __dRangeVal(%rax), %ymm2, %ymm3 + +/* N = Y - RS : right shifter sub */ + vsubpd %ymm5, %ymm7, %ymm6 + vmovupd __dPI1_FMA(%rax), %ymm2 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm7, %ymm7 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %ymm6, %ymm0 + vmovmskpd %ymm3, %ecx + +/* R = X - N*Pi1 */ + vmovapd %ymm1, %ymm3 + vfnmadd231pd %ymm0, %ymm2, %ymm3 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm0, %ymm3 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %ymm3, %ymm0 + +/* POLYNOMIAL APPROXIMATION: R2 = R*R */ + vmulpd %ymm0, %ymm0, %ymm5 + vfmadd213pd __dC6(%rax), %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + vmulpd %ymm5, %ymm4, %ymm6 + vfmadd213pd %ymm0, %ymm0, %ymm6 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes */ + vxorpd %ymm7, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm1, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), 
%r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(cos) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(cos) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_cos_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S new file mode 100644 index 0000000000..46d35a25d2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized cos, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_cos) + .type _ZGVeN8v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +1: leaq _ZGVeN8v_cos_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_cos_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_cos) + +#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper +#include "../svml_d_cos8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S new file mode 100644 index 0000000000..e7af83c6d5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S @@ -0,0 +1,463 @@ +/* Function cos vectorized with AVX-512, KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
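+
+   Editorial sketch: in all of these kernels the .LBL_* tail handles
+   lanes whose argument failed the __dRangeVal check.  The vector
+   input and result are spilled to the stack, each flagged lane is
+   recomputed by the scalar routine via JUMPTARGET (cos), and the
+   patched result vector is reloaded.  In C, with mask, lanes, x and
+   res as assumed names, the loop amounts to:
+
+     for (int i = 0; i < lanes; i++)
+       if (mask & (1u << i))
+         res[i] = cos (x[i]);
+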
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_cos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + +/* R = X - N*Pi1 */ + vmovaps %zmm0, %zmm7 + +/* Check for large arguments path */ + movq $-1, %rcx + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %zmm0, %zmm5 + vmovups __dInvPI(%rax), %zmm3 + +/* Get absolute argument value: X' = |X'| */ + vpandq __dAbsMask(%rax), %zmm5, %zmm1 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 + vmovups __dPI1_FMA(%rax), %zmm6 + +/* N = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm5, %zmm4 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm5, %zmm12 + vmovups __dC7(%rax), %zmm8 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %zmm4, %zmm10 + vcmppd $22, __dRangeVal(%rax), %zmm1, %k1 + vpbroadcastq %rcx, %zmm2{%k1}{z} + vfnmadd231pd %zmm10, %zmm6, %zmm7 + vptestmq %zmm2, %zmm2, %k0 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm10, %zmm7 + kmovw %k0, %ecx + movzbl %cl, %ecx + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm7, %zmm10 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm10, %zmm10, %zmm9 + vfmadd213pd __dC6(%rax), %zmm9, %zmm8 + vfmadd213pd __dC5(%rax), %zmm9, %zmm8 + vfmadd213pd __dC4(%rax), %zmm9, %zmm8 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm9, %zmm8 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %zmm9, %zmm8 + vfmadd213pd __dC1(%rax), %zmm9, %zmm8 + vmulpd %zmm9, %zmm8, %zmm11 + vfmadd213pd %zmm10, %zmm10, %zmm11 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes + */ + vpxorq %zmm12, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + 
cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(cos) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(cos) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_cos_knl) + +ENTRY (_ZGVeN8v_cos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + +/* R = X - N*Pi1 */ + vmovaps %zmm0, %zmm8 + +/* Check for large arguments path */ + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %zmm0, %zmm6 + vmovups __dInvPI(%rax), %zmm3 + vmovups __dRShifter(%rax), %zmm4 + vmovups __dPI1_FMA(%rax), %zmm7 + vmovups __dC7(%rax), %zmm9 + +/* Get absolute argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %zmm6, %zmm1 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm4, %zmm3, %zmm6 + vcmppd $18, __dRangeVal(%rax), %zmm1, %k1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm6, %zmm13 + +/* N = Y - RS : right shifter sub */ + vsubpd %zmm4, %zmm6, %zmm5 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %zmm5, %zmm11 + vfnmadd231pd %zmm11, %zmm7, %zmm8 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm11, %zmm8 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm8, %zmm11 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm11, %zmm11, %zmm10 + vfmadd213pd __dC6(%rax), %zmm10, %zmm9 + vfmadd213pd __dC5(%rax), %zmm10, %zmm9 + vfmadd213pd __dC4(%rax), %zmm10, %zmm9 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm10, %zmm9 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %zmm10, %zmm9 + vfmadd213pd __dC1(%rax), %zmm10, %zmm9 + vmulpd %zmm10, %zmm9, %zmm12 + vfmadd213pd %zmm11, %zmm11, %zmm12 + vpandnq %zmm1, 
%zmm1, %zmm2{%k1} + vcmppd $3, %zmm2, %zmm2, %k0 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes + */ + vxorpd %zmm13, %zmm12, %zmm1 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_cos_skx) + + .section .rodata, "a" +.L_2il0floatpacket.16: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.16,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S new file mode 100644 index 0000000000..5a17e11a0f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_exp) + .type _ZGVbN2v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_exp_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_exp_sse2(%rip), %rax + ret +END (_ZGVbN2v_exp) +libmvec_hidden_def (_ZGVbN2v_exp) + +#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2 +#include "../svml_d_exp2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S new file mode 100644 index 0000000000..864dc5ae9f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S @@ -0,0 +1,225 @@ +/* Function exp vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVbN2v_exp_sse4) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial. + + The table lookup is skipped if k = 0. 
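+
+   Editorial sketch of the same scheme in scalar form, with K, T[],
+   INVLN2, SHIFTER, LN2HI, LN2LO and PC0..PC2 as assumed stand-ins
+   for the __svml_dexp_data fields:
+
+     k = x * INVLN2;                        (x*2^K/ln2)
+     n = rint (k);                          (dN)
+     m = k + SHIFTER;                       (integer N in low bits)
+     r = x - n*LN2HI - n*LN2LO;             (Cody-Waite reduction)
+     p = PC0 + r*(PC0 + r*(PC1 + r*PC2));   (~ exp(r))
+     j = bits (m) & ((1 << K) - 1);         (table index)
+     result = T[j] * p, scaled by 2^M via an integer add into the
+       exponent field.
+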
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_dexp_data@GOTPCREL(%rip), %r8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + pshufd $221, %xmm3, %xmm7 + movups __dbInvLn2(%r8), %xmm0 + +/* dK = X*dbInvLn2 */ + mulpd %xmm3, %xmm0 + movq __iAbsMask(%r8), %xmm5 + movq __iDomainRange(%r8), %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + pand %xmm5, %xmm7 + +/* iRangeMask = (iAbsX>iDomainRange) */ + pcmpgtd %xmm6, %xmm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + movmskps %xmm7, %eax + +/* dN = rint(X*2^k/Ln2) */ + xorps %xmm7, %xmm7 + movups __dbLn2hi(%r8), %xmm5 + movups __dbLn2lo(%r8), %xmm6 + roundpd $0, %xmm0, %xmm7 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + mulpd %xmm7, %xmm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + mulpd %xmm6, %xmm7 + movups __dbShifter(%r8), %xmm4 + +/* dM = X*dbInvLn2+dbShifter */ + addpd %xmm0, %xmm4 + movaps %xmm3, %xmm0 + subpd %xmm5, %xmm0 + subpd %xmm7, %xmm0 + movups __dPC2(%r8), %xmm5 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + mulpd %xmm0, %xmm5 + addpd __dPC1(%r8), %xmm5 + mulpd %xmm0, %xmm5 + movups __dPC0(%r8), %xmm6 + addpd %xmm6, %xmm5 + mulpd %xmm5, %xmm0 + movdqu __lIndexMask(%r8), %xmm2 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + movdqa %xmm2, %xmm1 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + pandn %xmm4, %xmm2 + pand %xmm4, %xmm1 + +/* lM = lM<<(52-K), 2^M */ + psllq $42, %xmm2 + +/* table lookup for dT[j] = 2^(j/2^k) */ + movd %xmm1, %edx + pextrw $4, %xmm1, %ecx + addpd %xmm0, %xmm6 + shll $3, %edx + shll $3, %ecx + movq (%r8,%rdx), %xmm0 + andl $3, %eax + movhpd (%r8,%rcx), %xmm0 + +/* 2^(j/2^k) * exp(r) */ + mulpd %xmm6, %xmm0 + +/* multiply by 2^M through integer add */ + paddq %xmm2, %xmm0 + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %cl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %eax, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %edx, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call 
JUMPTARGET(__exp_finite) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(__exp_finite) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_exp_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S new file mode 100644 index 0000000000..b994a794cd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_exp) + .type _ZGVdN4v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_exp_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_exp_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_exp) +libmvec_hidden_def (_ZGVdN4v_exp) + +#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper +#include "../svml_d_exp4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S new file mode 100644 index 0000000000..937b3c09a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S @@ -0,0 +1,212 @@ +/* Function exp vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVdN4v_exp_avx2) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. 
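+
+   Reading aid for the FMA forms used below (AT&T operand order
+   op3, op2, op1): vfmadd132pd computes op1 = op1*op3 + op2,
+   vfmadd213pd computes op1 = op2*op1 + op3, and vfmadd231pd
+   computes op1 = op2*op3 + op1; the vfnmadd variants negate the
+   product.  (Editorial note.)
+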
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm2 + vmovupd __dbInvLn2(%rax), %ymm3 + vmovupd __dbShifter(%rax), %ymm1 + vmovupd __lIndexMask(%rax), %ymm4 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %ymm1, %ymm2, %ymm3 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vextracti128 $1, %ymm2, %xmm5 + vshufps $221, %xmm5, %xmm2, %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + vandps __iAbsMask(%rax), %xmm6, %xmm7 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %ymm1, %ymm3, %ymm6 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %xmm7, %xmm0 + vmovupd __dbLn2hi(%rax), %ymm1 + vmovupd __dPC0(%rax), %ymm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %xmm0, %ecx + vmovupd __dPC2(%rax), %ymm0 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovdqa %ymm2, %ymm5 + vfnmadd231pd %ymm6, %ymm1, %ymm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %ymm5, %ymm6 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vandps %ymm4, %ymm3, %ymm1 + +/* table lookup for dT[j] = 2^(j/2^k) */ + vxorpd %ymm6, %ymm6, %ymm6 + vpcmpeqd %ymm5, %ymm5, %ymm5 + vgatherqpd %ymm5, (%rax,%ymm1,8), %ymm6 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandn %ymm3, %ymm4, %ymm3 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %ymm0, %ymm6, %ymm0 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %ymm3, %ymm4 + +/* multiply by 2^M through integer add */ + vpaddq %ymm4, %ymm0, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm2, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call 
JUMPTARGET(__exp_finite) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__exp_finite) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_exp_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S new file mode 100644 index 0000000000..6189080fcc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_exp) + .type _ZGVeN8v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_exp_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_exp_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_exp) + +#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper +#include "../svml_d_exp8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S new file mode 100644 index 0000000000..97ba72c2a0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S @@ -0,0 +1,456 @@ +/* Function exp vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
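+
+   Editorial note: both entry points below are guarded by
+   HAVE_AVX512DQ_ASM_SUPPORT.  When the assembler cannot encode the
+   required instructions, WRAPPER_IMPL_AVX512 from
+   svml_d_wrapper_impl.h is emitted instead, which (as that wrapper
+   header implements it) processes the 512-bit vector as two 256-bit
+   halves through the AVX2 kernel _ZGVdN4v_exp.
+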
*/ + +#include <sysdep.h> +#include "svml_d_exp_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_exp_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + +/* iAbsX = iAbsX&iAbsMask */ + movl $255, %edx + vpmovqd %zmm1, %ymm2 + kmovw %edx, %k2 + +/* iRangeMask = (iAbsX>iDomainRange) */ + movl $-1, %ecx + +/* table lookup for dT[j] = 2^(j/2^k) */ + vpxord %zmm11, %zmm11, %zmm11 + vmovups __dbInvLn2(%rax), %zmm5 + vmovups __dbLn2hi(%rax), %zmm7 + kxnorw %k3, %k3, %k3 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd __dbShifter(%rax), %zmm0, %zmm5 + vmovups __dPC2(%rax), %zmm12 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd __dbShifter(%rax), %zmm5, %zmm9 + vmovups __lIndexMask(%rax), %zmm4 + vfnmadd231pd %zmm9, %zmm7, %zmm8 + vpandd __iAbsMask(%rax), %zmm2, %zmm2{%k2} + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm4, %zmm5, %zmm10 + vgatherqpd (%rax,%zmm10,8), %zmm11{%k3} + vpcmpgtd __iDomainRange(%rax), %zmm2, %k1{%k2} + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm5, %zmm4, %zmm6 + vpbroadcastd %ecx, %zmm3{%k1}{z} + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm6, %zmm14 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm8, %zmm9 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vptestmd %zmm3, %zmm3, %k0{%k2} + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm9, %zmm12 + kmovw %k0, %ecx + movzbl %cl, %ecx + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm12, %zmm11, %zmm13 + +/* multiply by 2^M through integer add */ + vpaddq %zmm14, %zmm13, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + 
movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_exp_knl) + +ENTRY (_ZGVeN8v_exp_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. 
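+
+   Editorial note: relative to the KNL body above, this SKX variant
+   keeps the range check out of the AVX-512 mask registers: KNL
+   compares into %k1 and extracts the lane mask with
+   vpbroadcastd/vptestmd/kmovw, while this version compares in the
+   256-bit integer domain and uses vmovmskps; the table lookup
+   likewise uses vgatherdpd on dword indices instead of vgatherqpd.
+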
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* table lookup for dT[j] = 2^(j/2^k) */ + kxnorw %k1, %k1, %k1 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + vmovups __dbInvLn2(%rax), %zmm7 + vmovups __dbShifter(%rax), %zmm5 + vmovups __lIndexMask(%rax), %zmm6 + vmovups __dbLn2hi(%rax), %zmm9 + vmovups __dPC0(%rax), %zmm12 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %zmm5, %zmm0, %zmm7 + vpmovqd %zmm1, %ymm2 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %zmm5, %zmm7, %zmm11 + +/* iAbsX = iAbsX&iAbsMask */ + vpand __iAbsMask(%rax), %ymm2, %ymm3 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm10 + vfnmadd231pd %zmm11, %zmm9, %zmm10 + vmovups __dPC2(%rax), %zmm9 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm10, %zmm11 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %ymm3, %ymm4 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %ymm4, %ecx + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm6, %zmm7, %zmm13 + vpmovqd %zmm13, %ymm14 + vpxord %zmm15, %zmm15, %zmm15 + vgatherdpd (%rax,%ymm14,8), %zmm15{%k1} + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm9, %zmm15, %zmm10 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm7, %zmm6, %zmm8 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm8, %zmm1 + +/* multiply by 2^M through integer add */ + vpaddq %zmm1, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), 
%zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8v_exp_skx) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S new file mode 100644 index 0000000000..5097add6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_log) + .type _ZGVbN2v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_log_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_log_sse2(%rip), %rax + ret +END (_ZGVbN2v_log) +libmvec_hidden_def (_ZGVbN2v_log) + +#define _ZGVbN2v_log _ZGVbN2v_log_sse2 +#include "../svml_d_log2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S new file mode 100644 index 0000000000..7d4b3c8850 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S @@ -0,0 +1,229 @@ +/* Function log vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" + + .text +ENTRY (_ZGVbN2v_log_sse4) +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding + HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm6 + movq __svml_dlog_data@GOTPCREL(%rip), %r8 + movaps %xmm6, %xmm3 + movaps %xmm6, %xmm2 + +/* isolate exponent bits */ + movaps %xmm6, %xmm1 + psrlq $20, %xmm1 + movups _ExpMask(%r8), %xmm5 + +/* preserve mantissa, set input exponent to 2^(-10) */ + andps %xmm6, %xmm5 + orps _Two10(%r8), %xmm5 + +/* reciprocal approximation good to at least 11 bits */ + cvtpd2ps %xmm5, %xmm7 + cmpltpd _MinNorm(%r8), %xmm3 + cmpnlepd _MaxNorm(%r8), %xmm2 + movlhps %xmm7, %xmm7 + +/* combine and get argument value range mask */ + orps %xmm2, %xmm3 + rcpps %xmm7, %xmm0 + movmskpd %xmm3, %eax + movups _HalfMask(%r8), %xmm2 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + andps %xmm5, %xmm2 + cvtps2pd %xmm0, %xmm4 + subpd %xmm2, %xmm5 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + roundpd $0, %xmm4, %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + subpd _One(%r8), %xmm2 + addpd %xmm2, %xmm5 + movups _Threshold(%r8), %xmm2 + +/* calculate index for table lookup */ + movaps %xmm4, %xmm3 + cmpltpd %xmm4, %xmm2 + pshufd $221, %xmm1, %xmm7 + psrlq $40, %xmm3 + +/* convert biased exponent to DP format */ + cvtdq2pd %xmm7, %xmm0 + movd %xmm3, %edx + movups _poly_coeff_1(%r8), %xmm4 + +/* polynomial computation */ + mulpd %xmm5, %xmm4 + andps _Bias(%r8), %xmm2 + orps _Bias1(%r8), %xmm2 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + subpd %xmm2, %xmm0 + addpd _poly_coeff_2(%r8), %xmm4 + +/* exponent*log(2.0) */ + mulpd _L2(%r8), %xmm0 + movaps %xmm5, %xmm2 + mulpd %xmm5, %xmm2 + movups _poly_coeff_3(%r8), %xmm7 + mulpd %xmm5, %xmm7 + mulpd %xmm2, %xmm4 + addpd _poly_coeff_4(%r8), %xmm7 + addpd %xmm4, %xmm7 + mulpd %xmm7, %xmm2 + movslq %edx, %rdx + pextrd $2, %xmm3, %ecx + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + addpd %xmm2, %xmm5 + movslq %ecx, %rcx + movsd _LogRcp_lookup(%r8,%rdx), %xmm1 + movhpd _LogRcp_lookup(%r8,%rcx), %xmm1 + addpd %xmm5, %xmm1 + addpd %xmm1, %xmm0 + testl %eax, %eax + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm6, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %cl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %eax, %r13d + movq %r14, 152(%rsp) + 
cfi_offset_rel_rsp (14, 152) + movl %edx, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_log_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S new file mode 100644 index 0000000000..1e9a2f48a1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_log) + .type _ZGVdN4v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_log_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_log_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_log) +libmvec_hidden_def (_ZGVdN4v_log) + +#define _ZGVdN4v_log _ZGVdN4v_log_sse_wrapper +#include "../svml_d_log4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S new file mode 100644 index 0000000000..04ea9e0071 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S @@ -0,0 +1,210 @@ +/* Function log vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" + + .text +ENTRY (_ZGVdN4v_log_avx2) +/* ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding + HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm5 + +/* isolate exponent bits */ + vpsrlq $20, %ymm5, %ymm0 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vandpd _ExpMask(%rax), %ymm5, %ymm6 + vorpd _Two10(%rax), %ymm6, %ymm4 + +/* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm4, %xmm7 + vrcpps %xmm7, %xmm1 + vcmplt_oqpd _MinNorm(%rax), %ymm5, %ymm7 + vcvtps2pd %xmm1, %ymm3 + vcmpnle_uqpd _MaxNorm(%rax), %ymm5, %ymm1 + vextracti128 $1, %ymm0, %xmm2 + vshufps $221, %xmm2, %xmm0, %xmm6 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm3, %ymm2 + +/* convert biased exponent to DP format */ + vcvtdq2pd %xmm6, %ymm0 + +/* combine and get argument value range mask */ + vorpd %ymm1, %ymm7, %ymm3 + vmovupd _One(%rax), %ymm1 + vmovmskpd %ymm3, %ecx + +/* calculate index for table lookup */ + vpsrlq $40, %ymm2, %ymm3 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %ymm1, %ymm2, %ymm4 + vcmpgt_oqpd _Threshold(%rax), %ymm2, %ymm2 + vpcmpeqd %ymm6, %ymm6, %ymm6 + vxorpd %ymm1, %ymm1, %ymm1 + vgatherqpd %ymm6, _LogRcp_lookup(%rax,%ymm3), %ymm1 + +/* exponent*log(2.0) */ + vmovupd _poly_coeff_1(%rax), %ymm6 + vmulpd %ymm4, %ymm4, %ymm3 + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rax), %ymm4, %ymm6 + vandpd _Bias(%rax), %ymm2, %ymm7 + vorpd _Bias1(%rax), %ymm7, %ymm2 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %ymm2, %ymm0, %ymm0 + vmovupd _poly_coeff_3(%rax), %ymm2 + vfmadd213pd _poly_coeff_4(%rax), %ymm4, %ymm2 + vfmadd213pd %ymm2, %ymm3, %ymm6 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %ymm4, %ymm3, %ymm6 + vaddpd %ymm1, %ymm6, %ymm4 + vfmadd132pd _L2(%rax), %ymm4, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm5, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + 
cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_log_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S new file mode 100644 index 0000000000..43f572d36c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_log) + .type _ZGVeN8v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_log_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_log_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_log) + +#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper +#include "../svml_d_log8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S new file mode 100644 index 0000000000..d10d5114c6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S @@ -0,0 +1,468 @@ +/* Function log vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_log_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by + rounding HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rdx + movq $-1, %rax + +/* isolate exponent bits */ + vpsrlq $20, %zmm0, %zmm2 + vpsrlq $32, %zmm2, %zmm3 + vpxord %zmm2, %zmm2, %zmm2 + kxnorw %k3, %k3, %k3 + vmovups _Two10(%rdx), %zmm1 + vmovups _One(%rdx), %zmm9 + vpmovqd %zmm3, %ymm4 + +/* convert biased exponent to DP format */ + vcvtdq2pd %ymm4, %zmm13 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rdx), %zmm0, %zmm1 + vcmppd $17, _MinNorm(%rdx), %zmm0, %k1 + +/* reciprocal approximation good to at least 11 bits */ + vrcp28pd %zmm1, %zmm5 + vpbroadcastq %rax, %zmm6{%k1}{z} + vmovups _poly_coeff_3(%rdx), %zmm15 + vcmppd $22, _MaxNorm(%rdx), %zmm0, %k2 + vmovups _Bias1(%rdx), %zmm14 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm11 + vpbroadcastq %rax, %zmm7{%k2}{z} + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %zmm9, %zmm11, %zmm1 + +/* calculate index for table lookup */ + vpsrlq $40, %zmm11, %zmm10 + vgatherqpd _LogRcp_lookup(%rdx,%zmm10), %zmm2{%k3} + vcmppd $30, _Threshold(%rdx), %zmm11, %k1 + +/* combine and get argument value range mask */ + vporq %zmm7, %zmm6, %zmm8 + +/* exponent*log(2.0) */ + vmovups _poly_coeff_1(%rdx), %zmm11 + vmulpd %zmm1, %zmm1, %zmm10 + vptestmq %zmm8, %zmm8, %k0 + vfmadd213pd _poly_coeff_4(%rdx), %zmm1, %zmm15 + kmovw %k0, %ecx + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rdx), %zmm1, %zmm11 + movzbl %cl, %ecx + vpbroadcastq %rax, %zmm12{%k1}{z} + vfmadd213pd %zmm15, %zmm10, %zmm11 + vpternlogq $248, _Bias(%rdx), %zmm12, %zmm14 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %zmm14, %zmm13, %zmm3 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %zmm1, %zmm10, %zmm11 + vaddpd %zmm2, %zmm11, %zmm1 + vfmadd132pd _L2(%rdx), %zmm1, %zmm3 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm3, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: 
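+/* Special-case path: at least one lane was flagged by the range mask.
+   Live vector and mask state is saved, then each flagged lane is redone
+   with a scalar __log_finite call via the bit-test loop below, which
+   walks the mask two bits per iteration (.LBL_1_12 handles the even
+   bit, .LBL_1_10 the odd one).  A minimal C model of the control flow,
+   where mask stands for %r13d and in/out for the 1152(%rsp) and
+   1216(%rsp) spill areas (the names are ours):
+
+       for (int i = 0; i < 8; i++)
+         if (mask & (1 << i))
+           out[i] = __log_finite (in[i]);
+*/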
+ cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm3, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm3 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__log_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__log_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_log_knl) + +ENTRY (_ZGVeN8v_log_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, + obtained by rounding HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm3 + kxnorw %k3, %k3, %k3 + vmovups _Two10(%rax), %zmm2 + vmovups _Threshold(%rax), %zmm14 + vmovups _One(%rax), %zmm11 + vcmppd $21, _MinNorm(%rax), %zmm3, %k1 + vcmppd $18, _MaxNorm(%rax), %zmm3, %k2 + +/* isolate exponent bits */ + vpsrlq $20, %zmm3, %zmm4 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 + vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 + vpsrlq $32, %zmm4, %zmm6 + +/* 
reciprocal approximation good to at least 11 bits */ + vrcp14pd %zmm2, %zmm5 + +/* exponent*log(2.0) */ + vmovups _poly_coeff_1(%rax), %zmm4 + vpmovqd %zmm6, %ymm7 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm0 + +/* calculate index for table lookup */ + vpsrlq $40, %zmm0, %zmm12 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %zmm11, %zmm0, %zmm2 + vpmovqd %zmm12, %ymm13 + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rax), %zmm2, %zmm4 + vmovaps %zmm1, %zmm8 + vmovaps %zmm1, %zmm9 + vpxord %zmm5, %zmm5, %zmm5 + vgatherdpd _LogRcp_lookup(%rax,%ymm13), %zmm5{%k3} + vmovups _Bias1(%rax), %zmm13 + vpandnq %zmm3, %zmm3, %zmm8{%k1} + vcmppd $21, %zmm0, %zmm14, %k1 + vpandnq %zmm14, %zmm14, %zmm1{%k1} + vmulpd %zmm2, %zmm2, %zmm14 + vpternlogq $248, _Bias(%rax), %zmm1, %zmm13 + vmovups _poly_coeff_3(%rax), %zmm1 + vfmadd213pd _poly_coeff_4(%rax), %zmm2, %zmm1 + vfmadd213pd %zmm1, %zmm14, %zmm4 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %zmm2, %zmm14, %zmm4 + vaddpd %zmm5, %zmm4, %zmm2 + vpandnq %zmm3, %zmm3, %zmm9{%k2} + +/* combine and get argument value range mask */ + vorpd %zmm9, %zmm8, %zmm10 + vcmppd $3, %zmm10, %zmm10, %k0 + kmovw %k0, %ecx + +/* convert biased exponent to DP format */ + vcvtdq2pd %ymm7, %zmm15 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %zmm13, %zmm15, %zmm0 + vfmadd132pd _L2(%rax), %zmm2, %zmm0 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm3, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi 
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call JUMPTARGET(__log_finite)
+
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call JUMPTARGET(__log_finite)
+
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN8v_log_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.12:
+ .long 0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.12,@object
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
new file mode 100644
index 0000000000..adb0872e56
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
@@ -0,0 +1,36 @@
+/* Multiple versions of vectorized pow.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2vv_pow)
+ .type _ZGVbN2vv_pow, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ leaq _ZGVbN2vv_pow_sse4(%rip), %rax
+ HAS_CPU_FEATURE (SSE4_1)
+ jz 2f
+ ret
+2: leaq _ZGVbN2vv_pow_sse2(%rip), %rax
+ ret
+END (_ZGVbN2vv_pow)
+libmvec_hidden_def (_ZGVbN2vv_pow)
+
+#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2
+#include "../svml_d_pow2_core.S"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
new file mode 100644
index 0000000000..ad7c215ff0
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
@@ -0,0 +1,432 @@
+/* Function pow vectorized with SSE4.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.
*/ + +#include <sysdep.h> +#include "svml_d_pow_data.h" + + .text +ENTRY (_ZGVbN2vv_pow_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. 
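+
+   A concrete scalar sketch of step 3 (illustrative only; t, k, N, j,
+   Z, p, res are our names, B1..B5 and THI/TLO are the table values
+   described above; nearbyint and scalbn are from <math.h>):
+
+     double t = PH + PL + PLL;                // ~ y*log2|x|
+     int    k = (int) nearbyint (t * 128.0);  // expK = 7, 2^expK = 128
+     int    N = k >> 7, j = k & 127;          // t = N + j/128 + Z
+     double Z = t - k / 128.0;                // |Z| < 2^(-8)
+     double p = Z * (B1 + Z * (B2 + Z * (B3 + Z * (B4 + Z * B5))));
+     double res = THI + (THI * p + TLO);      // ResHi + ResLo
+     double result = scalbn (res, N);         // ex(Res) + N
+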
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + movq __svml_dpow_data@GOTPCREL(%rip), %rdx + movups %xmm14, 80(%rsp) + movups %xmm9, 176(%rsp) + movaps %xmm1, %xmm9 + pshufd $221, %xmm0, %xmm1 + movq _iIndexMask(%rdx), %xmm14 + movq _iIndexAdd(%rdx), %xmm6 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + pand %xmm1, %xmm14 + paddd %xmm6, %xmm14 + psrld $10, %xmm14 + movups %xmm13, 96(%rsp) + +/* Index for reciprocal table */ + movdqa %xmm14, %xmm13 + pslld $3, %xmm13 + +/* Index for log2 table */ + pslld $4, %xmm14 + movd %xmm13, %eax + movups %xmm10, 160(%rsp) + movups _iMantissaMask(%rdx), %xmm10 + movslq %eax, %rax + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + andps %xmm0, %xmm10 + pextrd $1, %xmm13, %ecx + movslq %ecx, %rcx + movups %xmm0, (%rsp) + movdqa %xmm1, %xmm0 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + movq _i3fe7fe00(%rdx), %xmm6 + psubd %xmm6, %xmm0 + movups _iHighMask(%rdx), %xmm6 + psrad $20, %xmm0 + movups %xmm15, 48(%rsp) + movups %xmm12, 112(%rsp) + orps _dbOne(%rdx), %xmm10 + movsd 11712(%rdx,%rax), %xmm12 + movd %xmm14, %r8d + movq _i2p20_2p19(%rdx), %xmm15 + movhpd 11712(%rdx,%rcx), %xmm12 + paddd %xmm15, %xmm0 + pextrd $1, %xmm14, %r9d + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + movaps %xmm6, %xmm14 + andps %xmm10, %xmm14 + movaps %xmm10, %xmm15 + subpd %xmm14, %xmm15 + +/* r1 = x1*rcp1 */ + mulpd %xmm12, %xmm10 + +/* E = -r1+__fence(x1Hi*rcp1) */ + mulpd %xmm12, %xmm14 + +/* E=E+x1Lo*rcp1 */ + mulpd %xmm15, %xmm12 + subpd %xmm10, %xmm14 + pshufd $80, %xmm0, %xmm0 + movslq %r8d, %r8 + andps _iffffffff00000000(%rdx), %xmm0 + subpd _db2p20_2p19(%rdx), %xmm0 + addpd %xmm12, %xmm14 + movslq %r9d, %r9 + +/* T_Rh_Eh = T_Rh + E */ + movaps %xmm14, %xmm15 + movups %xmm8, 208(%rsp) + movups 19968(%rdx,%r8), %xmm8 + movups %xmm11, 144(%rsp) + movaps %xmm8, %xmm11 + +/* cq = c+r1 */ + movups _LHN(%rdx), %xmm13 + movhpd 19968(%rdx,%r9), %xmm11 + addpd %xmm10, %xmm13 + +/* T = k + L1hi */ + addpd %xmm0, %xmm11 + +/* T_Rh = T + cq */ + movaps %xmm13, %xmm12 + addpd %xmm11, %xmm12 + addpd %xmm12, %xmm15 + +/* Rl = T-T_Rh; -> -Rh */ + subpd %xmm12, %xmm11 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + subpd %xmm15, %xmm12 + +/* Rl=Rl+cq; */ + addpd %xmm13, %xmm11 + +/* cq = cq + E */ + addpd %xmm14, %xmm13 + +/* HLL+=E; -> El */ + addpd %xmm14, %xmm12 + +/* HLL+=Rl */ + addpd %xmm12, %xmm11 + unpckhpd 19968(%rdx,%r9), %xmm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + movaps %xmm15, %xmm14 + +/* HLL+=L1lo; */ + addpd %xmm11, %xmm8 + movups _clv_2(%rdx), %xmm11 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + movaps %xmm6, %xmm12 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + mulpd %xmm13, %xmm11 + addpd _clv_3(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_4(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_5(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_6(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_7(%rdx), %xmm11 + mulpd %xmm11, %xmm13 + addpd %xmm13, %xmm8 + addpd %xmm8, %xmm14 + +/* + 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + andps %xmm9, %xmm6 + +/* yL = y-yH; */ + movaps %xmm9, %xmm11 + subpd %xmm6, %xmm11 + andps %xmm14, %xmm12 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + movaps %xmm14, %xmm10 + +/* HL = T_Rh_Eh_HLLhi-HH; */ + subpd %xmm12, %xmm14 + subpd %xmm15, %xmm10 + movq _HIDELTA(%rdx), %xmm2 + +/* pH = yH*HH; */ + movaps %xmm6, 
%xmm13 + movq _LORANGE(%rdx), %xmm3 + paddd %xmm2, %xmm1 + pcmpgtd %xmm1, %xmm3 + +/* pL=yL*HL+yH*HL; pL+=yL*HH; */ + movaps %xmm11, %xmm1 + mulpd %xmm14, %xmm1 + mulpd %xmm14, %xmm6 + mulpd %xmm12, %xmm13 + mulpd %xmm11, %xmm12 + addpd %xmm6, %xmm1 + +/* HLL = HLL - HLLhi */ + subpd %xmm10, %xmm8 + addpd %xmm12, %xmm1 + +/* pLL = y*HLL */ + mulpd %xmm9, %xmm8 + movups _db2p45_2p44(%rdx), %xmm11 + +/* pHH = pH + *(double*)&db2p45_2p44 */ + movaps %xmm11, %xmm12 + addpd %xmm13, %xmm12 + +/* t=pL+pLL; t+=pHL */ + addpd %xmm8, %xmm1 + movq _ABSMASK(%rdx), %xmm5 + pshufd $221, %xmm9, %xmm4 + pand %xmm5, %xmm4 + movq _INF(%rdx), %xmm7 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm7, %xmm2 + pcmpeqd %xmm7, %xmm4 + pshufd $136, %xmm12, %xmm7 + por %xmm4, %xmm2 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + subpd %xmm11, %xmm12 + pshufd $221, %xmm13, %xmm10 + por %xmm2, %xmm3 + +/* pHL = pH - pHH; */ + subpd %xmm12, %xmm13 + pand %xmm5, %xmm10 + movq _DOMAINRANGE(%rdx), %xmm5 + movdqa %xmm10, %xmm4 + addpd %xmm1, %xmm13 + pcmpgtd %xmm5, %xmm4 + pcmpeqd %xmm5, %xmm10 + por %xmm10, %xmm4 + movq _jIndexMask(%rdx), %xmm6 + por %xmm4, %xmm3 + movmskps %xmm3, %eax + +/* j = Lo(pHH)&0x0000007f */ + pand %xmm7, %xmm6 + movq _iOne(%rdx), %xmm3 + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + pslld $13, %xmm7 + paddd %xmm3, %xmm7 + pslld $4, %xmm6 + movups _cev_1(%rdx), %xmm3 + movaps %xmm13, %xmm4 + mulpd %xmm13, %xmm3 + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + movd %xmm6, %r10d + pshufd $80, %xmm7, %xmm0 + andps _ifff0000000000000(%rdx), %xmm0 + addpd _cev_2(%rdx), %xmm3 + mulpd %xmm13, %xmm3 + addpd _cev_3(%rdx), %xmm3 + mulpd %xmm13, %xmm3 + movslq %r10d, %r10 + andl $3, %eax + pextrd $1, %xmm6, %r11d + movslq %r11d, %r11 + addpd _cev_4(%rdx), %xmm3 + movsd 36416(%rdx,%r10), %xmm2 + movhpd 36416(%rdx,%r11), %xmm2 + mulpd %xmm2, %xmm0 + mulpd %xmm3, %xmm13 + mulpd %xmm0, %xmm4 + addpd _cev_5(%rdx), %xmm13 + mulpd %xmm4, %xmm13 + addpd %xmm13, %xmm0 + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 208(%rsp), %xmm8 + movups 176(%rsp), %xmm9 + movups 160(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 112(%rsp), %xmm12 + movups 96(%rsp), %xmm13 + movups 80(%rsp), %xmm14 + movups 48(%rsp), %xmm15 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups (%rsp), %xmm1 + movups %xmm1, 64(%rsp) + movups %xmm9, 128(%rsp) + movups %xmm0, 192(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %cl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %eax, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %edx, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + movups 192(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 72(%rsp,%r15), %xmm0 + movsd 136(%rsp,%r15), %xmm1 + + call JUMPTARGET(__pow_finite) + + movsd %xmm0, 200(%rsp,%r15) + jmp 
.LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 64(%rsp,%r15), %xmm0 + movsd 128(%rsp,%r15), %xmm1 + + call JUMPTARGET(__pow_finite) + + movsd %xmm0, 192(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2vv_pow_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S new file mode 100644 index 0000000000..eea8af6638 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4vv_pow) + .type _ZGVdN4vv_pow, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4vv_pow_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4vv_pow_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4vv_pow) +libmvec_hidden_def (_ZGVdN4vv_pow) + +#define _ZGVdN4vv_pow _ZGVdN4vv_pow_sse_wrapper +#include "../svml_d_pow4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S new file mode 100644 index 0000000000..3092328909 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S @@ -0,0 +1,387 @@ +/* Function pow vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_pow_data.h" + + .text +ENTRY (_ZGVdN4vv_pow_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. 
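+
+   A scalar C sketch of the cascade just described (illustrative only;
+   the variable names are ours):
+
+     double X2 = Rcp1 * X1;        // Rcp1 ~ 1/X1, from table
+     double X3 = Rcp2 * X2;        // Rcp2 ~ 1/X2, from table
+     double cq = Rcp3C * X3 - C;   // X1*Rcp1*Rcp2*Rcp3C = C*(1+q)
+     double poly = cq * ((1.0 + a1) + cq * (a2 + cq * a3));
+     double log2x = k1 + L1 + L2 + L3C + poly;   // HH + HL + HLL
+
+   with L1 = log2(1/Rcp1), L2 = log2(1/Rcp2), L3C = log2(C/Rcp3C) read
+   from the same tables.
+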
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dpow_data@GOTPCREL(%rip), %rax + vmovups %ymm11, 160(%rsp) + vmovups %ymm8, 224(%rsp) + vmovups %ymm10, 352(%rsp) + vmovups %ymm9, 384(%rsp) + vmovups %ymm13, 288(%rsp) + vmovapd %ymm1, %ymm11 + vxorpd %ymm1, %ymm1, %ymm1 + vextracti128 $1, %ymm0, %xmm5 + vshufps $221, %xmm5, %xmm0, %xmm5 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vandps _iIndexMask(%rax), %xmm5, %xmm3 + vpaddd _iIndexAdd(%rax), %xmm3, %xmm6 + vpsrld $10, %xmm6, %xmm8 + +/* Index for reciprocal table */ + vpslld $3, %xmm8, %xmm9 + +/* Index for log2 table */ + vpslld $4, %xmm8, %xmm6 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vandpd _iMantissaMask(%rax), %ymm0, %ymm4 + vorpd _dbOne(%rax), %ymm4, %ymm13 + vpcmpeqd %ymm4, %ymm4, %ymm4 + vpcmpeqd %ymm8, %ymm8, %ymm8 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %xmm5, %xmm3 + vpaddd _HIDELTA(%rax), %xmm5, %xmm5 + vextracti128 $1, %ymm11, %xmm7 + vshufps $221, %xmm7, %xmm11, %xmm2 + vpand _ABSMASK(%rax), %xmm2, %xmm10 + vpcmpeqd %ymm2, %ymm2, %ymm2 + vgatherdpd %ymm2, 11712(%rax,%xmm9), %ymm1 + vmovups _LORANGE(%rax), %xmm7 + vxorpd %ymm2, %ymm2, %ymm2 + vgatherdpd %ymm4, 19968(%rax,%xmm6), %ymm2 + vxorpd %ymm4, %ymm4, %ymm4 + vgatherdpd %ymm8, 19976(%rax,%xmm6), %ymm4 + vpsrad $20, %xmm3, %xmm6 + vpaddd _i2p20_2p19(%rax), %xmm6, %xmm9 + vpshufd $80, %xmm9, %xmm8 + vpshufd $250, %xmm9, %xmm3 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vandpd _iHighMask(%rax), %ymm13, %ymm9 + vinserti128 $1, %xmm3, %ymm8, %ymm6 + vandpd _iffffffff00000000(%rax), %ymm6, %ymm8 + +/* r1 = x1*rcp1 */ + vmulpd %ymm1, %ymm13, %ymm6 + vsubpd %ymm9, %ymm13, %ymm3 + vsubpd _db2p20_2p19(%rax), %ymm8, %ymm8 + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %ymm6, %ymm13 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %ymm6, %ymm1, 
%ymm9 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %ymm9, %ymm1, %ymm3 + +/* T = k + L1hi */ + vaddpd %ymm2, %ymm8, %ymm1 + +/* T_Rh = T + cq */ + vaddpd %ymm13, %ymm1, %ymm8 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %ymm8, %ymm1, %ymm6 + +/* Rl=Rl+cq */ + vaddpd %ymm6, %ymm13, %ymm1 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %ymm3, %ymm8, %ymm6 + +/* cq = cq + E */ + vaddpd %ymm3, %ymm13, %ymm13 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %ymm6, %ymm8, %ymm9 + +/* HLL+=E; -> El */ + vaddpd %ymm9, %ymm3, %ymm2 + +/* HLL+=Rl */ + vaddpd %ymm1, %ymm2, %ymm8 + +/* HLL+=L1lo */ + vaddpd %ymm4, %ymm8, %ymm4 + vmovupd _clv_2(%rax), %ymm8 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_4(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_5(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_6(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_7(%rax), %ymm13, %ymm8 + vfmadd213pd %ymm4, %ymm13, %ymm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %ymm8, %ymm6, %ymm9 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vandpd _iHighMask(%rax), %ymm9, %ymm2 + +/* + 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000; + */ + vandpd _iHighMask(%rax), %ymm11, %ymm1 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %ymm6, %ymm9, %ymm13 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %ymm2, %ymm9, %ymm4 + +/* pH = yH*HH */ + vmulpd %ymm2, %ymm1, %ymm9 + +/* HLL = HLL - HLLhi */ + vsubpd %ymm13, %ymm8, %ymm6 + +/* yL = y-yH */ + vsubpd %ymm1, %ymm11, %ymm8 + vextracti128 $1, %ymm9, %xmm3 + vshufps $221, %xmm3, %xmm9, %xmm13 + vpand _ABSMASK(%rax), %xmm13, %xmm3 + vpcmpgtd %xmm5, %xmm7, %xmm13 + vpcmpgtd _INF(%rax), %xmm10, %xmm7 + vpcmpeqd _INF(%rax), %xmm10, %xmm10 + vpor %xmm10, %xmm7, %xmm7 + vpor %xmm7, %xmm13, %xmm5 + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %ymm4, %ymm8, %ymm7 + vpcmpgtd _DOMAINRANGE(%rax), %xmm3, %xmm13 + vpcmpeqd _DOMAINRANGE(%rax), %xmm3, %xmm10 + vpor %xmm10, %xmm13, %xmm3 + vpor %xmm3, %xmm5, %xmm13 + vfmadd213pd %ymm7, %ymm4, %ymm1 + +/* pLL = y*HLL; + pHH = pH + *(double*)&db2p45_2p44 + */ + vaddpd _db2p45_2p44(%rax), %ymm9, %ymm7 + vmovmskps %xmm13, %ecx + vfmadd213pd %ymm1, %ymm2, %ymm8 + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %ymm11, %ymm6, %ymm8 + vextracti128 $1, %ymm7, %xmm1 + vshufps $136, %xmm1, %xmm7, %xmm10 + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %xmm10, %xmm2 + vpaddd _iOne(%rax), %xmm2, %xmm13 + vpshufd $80, %xmm13, %xmm4 + vpshufd $250, %xmm13, %xmm1 + +/* j = Lo(pHH)&0x0000007f */ + vandps _jIndexMask(%rax), %xmm10, %xmm3 + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + vpcmpeqd %ymm10, %ymm10, %ymm10 + vpslld $4, %xmm3, %xmm5 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd _db2p45_2p44(%rax), %ymm7, %ymm7 + +/* pHL = pH - pHH */ + vsubpd %ymm7, %ymm9, %ymm9 + vaddpd %ymm9, %ymm8, %ymm6 + vinserti128 $1, %xmm1, %ymm4, %ymm2 + vxorpd %ymm1, %ymm1, %ymm1 + vgatherdpd %ymm10, 36416(%rax,%xmm5), %ymm1 + vandpd _ifff0000000000000(%rax), %ymm2, %ymm13 + vmovupd _cev_1(%rax), %ymm2 + vmulpd %ymm1, %ymm13, %ymm1 + vfmadd213pd _cev_2(%rax), %ymm6, %ymm2 + vmulpd %ymm6, %ymm1, %ymm8 + vfmadd213pd _cev_3(%rax), %ymm6, %ymm2 + vfmadd213pd _cev_4(%rax), %ymm6, %ymm2 + vfmadd213pd _cev_5(%rax), %ymm6, %ymm2 + vfmadd213pd %ymm1, %ymm8, %ymm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 224(%rsp), %ymm8 + vmovups 384(%rsp), %ymm9 + vmovups 352(%rsp), %ymm10 + vmovups 160(%rsp), %ymm11 + vmovups 288(%rsp), %ymm13 + vmovdqa 
%ymm2, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm0, 192(%rsp) + vmovupd %ymm11, 256(%rsp) + vmovupd %ymm2, 320(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm12, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 104(%rsp) + movq %rdi, 96(%rsp) + movq %r12, 136(%rsp) + cfi_offset_rel_rsp (12, 136) + movb %dl, %r12b + movq %r13, 128(%rsp) + cfi_offset_rel_rsp (13, 128) + movl %ecx, %r13d + movq %r14, 120(%rsp) + cfi_offset_rel_rsp (14, 120) + movl %eax, %r14d + movq %r15, 112(%rsp) + cfi_offset_rel_rsp (15, 112) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 64(%rsp), %ymm12 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 320(%rsp), %ymm2 + movq 104(%rsp), %rsi + movq 96(%rsp), %rdi + movq 136(%rsp), %r12 + cfi_restore (%r12) + movq 128(%rsp), %r13 + cfi_restore (%r13) + movq 120(%rsp), %r14 + cfi_restore (%r14) + movq 112(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 200(%rsp,%r15), %xmm0 + vmovsd 264(%rsp,%r15), %xmm1 + vzeroupper + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 328(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 192(%rsp,%r15), %xmm0 + vmovsd 256(%rsp,%r15), %xmm1 + vzeroupper + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 320(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4vv_pow_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S new file mode 100644 index 0000000000..68f12b2848 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8vv_pow) + .type _ZGVeN8vv_pow, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8vv_pow_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8vv_pow_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8vv_pow) + +#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper +#include "../svml_d_pow8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S new file mode 100644 index 0000000000..2190c1f6b4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S @@ -0,0 +1,741 @@ +/* Function pow vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_pow_data.h" +#include "svml_d_wrapper_impl.h" + +/* ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. 
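+
+   A note on how PHH is split off below: pH is rounded to a multiple of
+   1/2^expK with the usual add-and-subtract trick using the constant
+   db2p45_2p44 = 2^45 + 2^44 (a scalar C model; the names are ours):
+
+     double big = 0x1.8p45;            // 2^45 + 2^44, ulp = 2^(-7)
+     double PHH = (PH + big) - big;    // N + j/128, rest rounded away
+     double PHL = PH - PHH;
+
+   The low 32-bit word of PH + big holds the fixed-point bits, giving
+   j = lo & 0x0000007f and _n = (lo & 0xffffff80) >> 7 as in the code.
+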
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + .text +ENTRY (_ZGVeN8vv_pow_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + vpsrlq $32, %zmm0, %zmm13 + vmovaps %zmm1, %zmm12 + movq __svml_dpow_data@GOTPCREL(%rip), %rax + movl $255, %edx + vpmovqd %zmm13, %ymm10 + vpsrlq $32, %zmm12, %zmm14 + kmovw %edx, %k1 + movl $-1, %ecx + vpmovqd %zmm14, %ymm15 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vmovups _dbOne(%rax), %zmm6 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vmovaps %zmm10, %zmm5 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %zmm10, %zmm14{%k1} + vpandd _iIndexMask(%rax), %zmm10, %zmm5{%k1} + vpsrad $20, %zmm14, %zmm14{%k1} + vpxord %zmm9, %zmm9, %zmm9 + vpaddd _HIDELTA(%rax), %zmm10, %zmm3{%k1} + vpaddd _iIndexAdd(%rax), %zmm5, %zmm5{%k1} + vpxord %zmm7, %zmm7, %zmm7 + vpaddd _i2p20_2p19(%rax), %zmm14, %zmm14{%k1} + vpcmpd $1, _LORANGE(%rax), %zmm3, %k2{%k1} + vpsrld $10, %zmm5, %zmm5{%k1} + vpandd _ABSMASK(%rax), %zmm15, %zmm2{%k1} + vpbroadcastd %ecx, %zmm1{%k2}{z} + +/* Index for reciprocal table */ + vpslld $3, %zmm5, %zmm8{%k1} + kxnorw %k2, %k2, %k2 + vgatherdpd 11712(%rax,%ymm8), %zmm9{%k2} + vpmovzxdq %ymm14, %zmm10 + +/* Index for log2 table */ + vpslld $4, %zmm5, %zmm13{%k1} + kxnorw %k2, %k2, %k2 + vpsllq $32, %zmm10, %zmm3 + vpxord %zmm8, %zmm8, %zmm8 + vpcmpd $5, _INF(%rax), %zmm2, %k3{%k1} + vpbroadcastd %ecx, %zmm4{%k3}{z} + vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm6 + kxnorw %k3, %k3, %k3 + vpternlogq $168, _iffffffff00000000(%rax), %zmm10, %zmm3 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vpandq _iHighMask(%rax), %zmm6, %zmm2 + vgatherdpd 19976(%rax,%ymm13), %zmm8{%k2} + vpord %zmm4, %zmm1, %zmm11{%k1} + vsubpd _db2p20_2p19(%rax), %zmm3, %zmm1 + vsubpd %zmm2, %zmm6, %zmm5 + +/* r1 = x1*rcp1 */ + vmulpd %zmm9, %zmm6, %zmm6 + vgatherdpd 19968(%rax,%ymm13), %zmm7{%k3} + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %zmm6, %zmm4 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %zmm6, %zmm9, %zmm2 + +/* T = k + L1hi */ + vaddpd %zmm7, %zmm1, %zmm7 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %zmm2, %zmm9, %zmm5 + +/* T_Rh = T + cq */ + vaddpd %zmm4, %zmm7, %zmm3 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %zmm3, %zmm7, %zmm9 + +/* Rl=Rl+cq */ + vaddpd %zmm9, %zmm4, %zmm6 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %zmm5, %zmm3, %zmm9 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %zmm9, %zmm3, %zmm2 + +/* cq = cq + E; */ + vaddpd %zmm5, %zmm4, %zmm4 + +/* HLL+=E; -> El */ + vaddpd %zmm2, %zmm5, %zmm1 + vmovups _clv_2(%rax), %zmm5 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %zmm4, %zmm5 + +/* HLL+=Rl */ + vaddpd %zmm6, %zmm1, %zmm7 + +/* 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + vpandq _iHighMask(%rax), %zmm12, %zmm6 + +/* yL = y-yH */ + vsubpd %zmm6, %zmm12, %zmm2 + vfmadd213pd _clv_4(%rax), %zmm4, %zmm5 + +/* HLL+=L1lo */ + vaddpd %zmm8, %zmm7, %zmm8 + vfmadd213pd _clv_5(%rax), %zmm4, %zmm5 + vfmadd213pd _clv_6(%rax), %zmm4, %zmm5 + vfmadd213pd _clv_7(%rax), %zmm4, %zmm5 + vfmadd213pd %zmm8, %zmm4, 
%zmm5 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %zmm5, %zmm9, %zmm13 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %zmm9, %zmm13, %zmm10 + +/* HLL = HLL - HLLhi */ + vsubpd %zmm10, %zmm5, %zmm3 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vpandq _iHighMask(%rax), %zmm13, %zmm5 + +/* pH = yH*HH */ + vmulpd %zmm5, %zmm6, %zmm1 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %zmm5, %zmm13, %zmm4 + vpsrlq $32, %zmm1, %zmm14 + +/* pLL = y*HLL; + pHH = pH + *(double*)&db2p45_2p44 + */ + vaddpd _db2p45_2p44(%rax), %zmm1, %zmm10 + vpmovqd %zmm14, %ymm15 + vpandd _ABSMASK(%rax), %zmm15, %zmm14{%k1} + vpcmpd $5, _DOMAINRANGE(%rax), %zmm14, %k3{%k1} + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + vpxord %zmm14, %zmm14, %zmm14 + vpbroadcastd %ecx, %zmm13{%k3}{z} + vpord %zmm13, %zmm11, %zmm11{%k1} + vptestmd %zmm11, %zmm11, %k0{%k1} + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %zmm4, %zmm2, %zmm11 + kmovw %k0, %ecx + vfmadd213pd %zmm11, %zmm4, %zmm6 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd _db2p45_2p44(%rax), %zmm10, %zmm11 + vpmovqd %zmm10, %ymm4 + movzbl %cl, %ecx + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %zmm4, %zmm7{%k1} + +/* j = Lo(pHH)&0x0000007f */ + vpandd _jIndexMask(%rax), %zmm4, %zmm9{%k1} + vfmadd213pd %zmm6, %zmm5, %zmm2 + +/* pHL = pH - pHH */ + vsubpd %zmm11, %zmm1, %zmm1 + vpaddd _iOne(%rax), %zmm7, %zmm7{%k1} + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %zmm12, %zmm3, %zmm2 + vpslld $4, %zmm9, %zmm9{%k1} + kxnorw %k1, %k1, %k1 + vgatherdpd 36416(%rax,%ymm9), %zmm14{%k1} + vpmovzxdq %ymm7, %zmm8 + vaddpd %zmm1, %zmm2, %zmm2 + vmovups _cev_1(%rax), %zmm1 + vpsllq $32, %zmm8, %zmm13 + vpternlogq $168, _ifff0000000000000(%rax), %zmm8, %zmm13 + vfmadd213pd _cev_2(%rax), %zmm2, %zmm1 + vmulpd %zmm14, %zmm13, %zmm15 + vfmadd213pd _cev_3(%rax), %zmm2, %zmm1 + vmulpd %zmm2, %zmm15, %zmm3 + vfmadd213pd _cev_4(%rax), %zmm2, %zmm1 + vfmadd213pd _cev_5(%rax), %zmm2, %zmm1 + vfmadd213pd %zmm15, %zmm3, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm12, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), 
%k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1280(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vmovsd 1224(%rsp,%r15), %xmm1 + call JUMPTARGET(__pow_finite) + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vmovsd 1216(%rsp,%r15), %xmm1 + call JUMPTARGET(__pow_finite) + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_1_7 + +#endif +END (_ZGVeN8vv_pow_knl) + +ENTRY (_ZGVeN8vv_pow_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + vpsrlq $32, %zmm0, %zmm10 + kxnorw %k1, %k1, %k1 + kxnorw %k2, %k2, %k2 + kxnorw %k3, %k3, %k3 + vpmovqd %zmm10, %ymm7 + movq __svml_dpow_data@GOTPCREL(%rip), %rax + vmovaps %zmm1, %zmm6 + vpsrlq $32, %zmm6, %zmm13 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vpand _iIndexMask(%rax), %ymm7, %ymm15 + vpaddd _HIDELTA(%rax), %ymm7, %ymm2 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %ymm7, %ymm7 + vmovdqu _ABSMASK(%rax), %ymm4 + vmovdqu _LORANGE(%rax), %ymm3 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vmovups _dbOne(%rax), %zmm11 + vmovdqu _INF(%rax), %ymm5 + vpaddd _iIndexAdd(%rax), %ymm15, %ymm12 + vpmovqd %zmm13, %ymm14 + vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm11 + vpsrld $10, %ymm12, %ymm10 + vpsrad $20, %ymm7, %ymm13 + +/* Index for reciprocal table */ + vpslld $3, %ymm10, %ymm8 + +/* Index for log2 table */ + vpslld $4, %ymm10, %ymm1 + vpcmpgtd %ymm2, %ymm3, %ymm3 + vpand %ymm4, %ymm14, %ymm2 + vpaddd _i2p20_2p19(%rax), %ymm13, %ymm14 + vpmovzxdq %ymm14, %zmm15 + vpsllq $32, %zmm15, %zmm7 + vpternlogq $168, _iffffffff00000000(%rax), %zmm15, %zmm7 + vsubpd _db2p20_2p19(%rax), %zmm7, %zmm13 + vpxord %zmm9, %zmm9, %zmm9 + vgatherdpd 11712(%rax,%ymm8), %zmm9{%k1} + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + kxnorw %k1, %k1, %k1 + vpxord %zmm12, %zmm12, %zmm12 + vpxord %zmm8, %zmm8, %zmm8 + vgatherdpd 19968(%rax,%ymm1), %zmm12{%k2} + vgatherdpd 19976(%rax,%ymm1), %zmm8{%k3} + vmovups _iHighMask(%rax), %zmm1 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vandpd %zmm1, %zmm11, %zmm10 + vsubpd %zmm10, %zmm11, %zmm15 + +/* r1 = x1*rcp1 */ + vmulpd %zmm9, %zmm11, %zmm11 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %zmm11, %zmm9, %zmm10 + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %zmm11, %zmm14 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %zmm10, %zmm9, %zmm15 + +/* T = k + L1hi */ + vaddpd %zmm12, %zmm13, %zmm9 + +/* T_Rh = T + cq */ + vaddpd %zmm14, %zmm9, %zmm11 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %zmm15, %zmm11, %zmm13 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %zmm11, %zmm9, 
%zmm12 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %zmm13, %zmm11, %zmm9 + +/* Rl=Rl+cq */ + vaddpd %zmm12, %zmm14, %zmm10 + +/* HLL+=E; -> El */ + vaddpd %zmm9, %zmm15, %zmm7 + +/* HLL+=Rl */ + vaddpd %zmm10, %zmm7, %zmm12 + +/* 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + vandpd %zmm1, %zmm6, %zmm7 + +/* HLL+=L1lo */ + vaddpd %zmm8, %zmm12, %zmm12 + +/* cq = cq + E */ + vaddpd %zmm15, %zmm14, %zmm8 + vmovups _clv_2(%rax), %zmm14 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_4(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_5(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_6(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_7(%rax), %zmm8, %zmm14 + vfmadd213pd %zmm12, %zmm8, %zmm14 + +/* yL = y-yH */ + vsubpd %zmm7, %zmm6, %zmm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %zmm14, %zmm13, %zmm15 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vandpd %zmm1, %zmm15, %zmm11 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %zmm13, %zmm15, %zmm13 + +/* pH = yH*HH */ + vmulpd %zmm11, %zmm7, %zmm9 + +/* HLL = HLL - HLLhi */ + vsubpd %zmm13, %zmm14, %zmm12 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %zmm11, %zmm15, %zmm10 + vpsrlq $32, %zmm9, %zmm1 + vmovdqu _DOMAINRANGE(%rax), %ymm13 + vpmovqd %zmm1, %ymm1 + vpand %ymm4, %ymm1, %ymm1 + vpcmpgtd %ymm5, %ymm2, %ymm4 + vpcmpeqd %ymm5, %ymm2, %ymm5 + vpternlogd $254, %ymm5, %ymm4, %ymm3 + vpcmpgtd %ymm13, %ymm1, %ymm2 + vpcmpeqd %ymm13, %ymm1, %ymm4 + vpternlogd $254, %ymm4, %ymm2, %ymm3 + +/* pLL = y*HLL */ + vmovups _db2p45_2p44(%rax), %zmm2 + +/* pHH = pH + *(double*)&db2p45_2p44 */ + vaddpd %zmm2, %zmm9, %zmm1 + vpmovqd %zmm1, %ymm5 + +/* j = Lo(pHH)&0x0000007f */ + vpand _jIndexMask(%rax), %ymm5, %ymm14 + vpslld $4, %ymm14, %ymm15 + vmovmskps %ymm3, %ecx + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %zmm10, %zmm8, %zmm3 + vfmadd213pd %zmm3, %zmm10, %zmm7 + vfmadd213pd %zmm7, %zmm11, %zmm8 + +/* _n = Lo(pHH) + _n = _n & 0xffffff80 + _n = _n >> 7 + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %ymm5, %ymm7 + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %zmm6, %zmm12, %zmm8 + vpaddd _iOne(%rax), %ymm7, %ymm10 + vpmovzxdq %ymm10, %zmm11 + vpsllq $32, %zmm11, %zmm3 + vpternlogq $168, _ifff0000000000000(%rax), %zmm11, %zmm3 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd %zmm2, %zmm1, %zmm11 + vmovups _cev_1(%rax), %zmm2 + +/* pHL = pH - pHH */ + vsubpd %zmm11, %zmm9, %zmm9 + vaddpd %zmm9, %zmm8, %zmm8 + vfmadd213pd _cev_2(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_3(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_4(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_5(%rax), %zmm8, %zmm2 + vpxord %zmm4, %zmm4, %zmm4 + vgatherdpd 36416(%rax,%ymm15), %zmm4{%k1} + vmulpd %zmm4, %zmm3, %zmm1 + vmulpd %zmm8, %zmm1, %zmm12 + vfmadd213pd %zmm1, %zmm12, %zmm2 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm2, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm6, 1216(%rsp) + vmovups %zmm2, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups 
%zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1280(%rsp), %zmm2 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1224(%rsp,%r15), %xmm1 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1216(%rsp,%r15), %xmm1 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8vv_pow_skx) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S new file mode 100644 index 0000000000..e35654be8d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sin. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_sin) + .type _ZGVbN2v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_sin_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_sin_sse2(%rip), %rax + ret +END (_ZGVbN2v_sin) +libmvec_hidden_def (_ZGVbN2v_sin) + +#define _ZGVbN2v_sin _ZGVbN2v_sin_sse2 +#include "../svml_d_sin2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S new file mode 100644 index 0000000000..393ba03b76 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S @@ -0,0 +1,229 @@ +/* Function sin vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2v_sin_sse4) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups __dAbsMask(%rax), %xmm3 +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + movaps %xmm3, %xmm4 + +/* SignX - sign bit of X */ + andnps %xmm5, %xmm3 + movups __dInvPI(%rax), %xmm2 + andps %xmm5, %xmm4 + +/* Y = X'*InvPi + RS : right shifter add */ + mulpd %xmm4, %xmm2 + movups __dRShifter(%rax), %xmm6 + +/* R = X' - N*Pi1 */ + movaps %xmm4, %xmm0 + addpd %xmm6, %xmm2 + cmpnlepd __dRangeVal(%rax), %xmm4 + +/* N = Y - RS : right shifter sub */ + movaps %xmm2, %xmm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + psllq $63, %xmm2 + subpd %xmm6, %xmm1 + movmskpd %xmm4, %ecx + movups __dPI1(%rax), %xmm7 + mulpd %xmm1, %xmm7 + movups __dPI2(%rax), %xmm6 + +/* R = R - N*Pi2 */ + mulpd %xmm1, %xmm6 + subpd %xmm7, %xmm0 + movups __dPI3(%rax), %xmm7 + +/* R = R - N*Pi3 */ + mulpd %xmm1, %xmm7 + subpd %xmm6, %xmm0 + movups __dPI4(%rax), %xmm6 + +/* R = R - N*Pi4 */ + mulpd %xmm6, %xmm1 + subpd %xmm7, %xmm0 + subpd %xmm1, %xmm0 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + movaps %xmm0, %xmm1 + mulpd %xmm0, %xmm1 + +/* R = R^SignRes : update sign of reduced argument */ + xorps %xmm2, %xmm0 + movups __dC7_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC6_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC5_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC4_sin(%rax), %xmm2 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + mulpd %xmm1, %xmm2 + addpd __dC3_sin(%rax), %xmm2 + +/* 
Poly = R2*(C1+R2*(C2+R2*Poly)) */ + mulpd %xmm1, %xmm2 + addpd __dC2_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC1_sin(%rax), %xmm2 + mulpd %xmm2, %xmm1 + +/* Poly = Poly*R + R */ + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm0 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + xorps %xmm3, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_sin_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S new file mode 100644 index 0000000000..f4482d3a11 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sin, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_sin) + .type _ZGVdN4v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_sin_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_sin_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_sin) +libmvec_hidden_def (_ZGVdN4v_sin) + +#define _ZGVdN4v_sin _ZGVdN4v_sin_sse_wrapper +#include "../svml_d_sin4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S new file mode 100644 index 0000000000..b035fa1b15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S @@ -0,0 +1,210 @@ +/* Function sin vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4v_sin_avx2) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm4 + vmovupd __dAbsMask(%rax), %ymm2 + vmovupd __dInvPI(%rax), %ymm6 + vmovupd __dRShifter(%rax), %ymm5 + vmovupd __dPI1_FMA(%rax), %ymm7 +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vandpd %ymm2, %ymm4, %ymm3 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %ymm5, %ymm3, %ymm6 + +/* N = Y - RS : right shifter sub */ + vsubpd %ymm5, %ymm6, %ymm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm6, %ymm5 + +/* R = X' - N*Pi1 */ + vmovapd %ymm3, %ymm0 + vfnmadd231pd %ymm1, %ymm7, %ymm0 + vcmpnle_uqpd __dRangeVal(%rax), %ymm3, %ymm3 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm1, %ymm0 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %ymm0, %ymm1 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %ymm1, %ymm1, %ymm0 + +/* R = R^SignRes : update sign of reduced argument */ + vxorpd %ymm5, %ymm1, %ymm6 + vmovupd __dC7_sin(%rax), %ymm1 + vfmadd213pd __dC6_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC5_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC4_sin(%rax), %ymm0, %ymm1 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %ymm0, %ymm1 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC1_sin(%rax), %ymm0, %ymm1 + +/* SignX - sign bit of X */ + vandnpd %ymm4, %ymm2, %ymm7 + vmulpd %ymm0, %ymm1, %ymm2 + +/* Poly = Poly*R + R */ + 
vfmadd213pd %ymm6, %ymm6, %ymm2 + vmovmskpd %ymm3, %ecx + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vxorpd %ymm7, %ymm2, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm4, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_sin_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S new file mode 100644 index 0000000000..2b15889c71 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sin. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_sin) + .type _ZGVeN8v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_sin_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_sin_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_sin) + +#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper +#include "../svml_d_sin8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S new file mode 100644 index 0000000000..7580e60636 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S @@ -0,0 +1,465 @@ +/* Function sin vectorized with AVX-512, KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_sin_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movq $-1, %rdx + vmovups __dAbsMask(%rax), %zmm6 + vmovups __dInvPI(%rax), %zmm1 + +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vpandq %zmm6, %zmm0, %zmm12 + vmovups __dPI1_FMA(%rax), %zmm2 + vmovups __dC7_sin(%rax), %zmm7 + +/* SignX - sign bit of X */ + vpandnq %zmm0, %zmm6, %zmm11 + +/* R = X' - N*Pi1 */ + vmovaps %zmm12, %zmm3 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1 + vcmppd $22, __dRangeVal(%rax), %zmm12, %k1 + vpbroadcastq %rdx, %zmm13{%k1}{z} + +/* N = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm1, %zmm4 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm1, %zmm5 + vptestmq %zmm13, %zmm13, %k0 + vfnmadd231pd %zmm4, %zmm2, %zmm3 + kmovw %k0, %ecx + movzbl %cl, %ecx + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm4, %zmm4, %zmm8 + +/* R = R^SignRes : update sign of reduced argument */ + vpxorq %zmm5, %zmm4, %zmm9 + vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7 + +/* 
Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7 + vmulpd %zmm8, %zmm7, %zmm10 + +/* Poly = Poly*R + R */ + vfmadd213pd %zmm9, %zmm9, %zmm10 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vpxorq %zmm11, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(sin) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(sin) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_sin_knl) + +ENTRY (_ZGVeN8v_sin_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq 
$1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 + vmovups __dPI1_FMA(%rax), %zmm3 + vmovups __dC7_sin(%rax), %zmm8 + +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vandpd %zmm7, %zmm0, %zmm13 + +/* SignX - sign bit of X */ + vandnpd %zmm0, %zmm7, %zmm12 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm1, %zmm13, %zmm2 + vcmppd $18, __dRangeVal(%rax), %zmm13, %k1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm2, %zmm6 + +/* N = Y - RS : right shifter sub */ + vsubpd %zmm1, %zmm2, %zmm5 + +/* R = X' - N*Pi1 */ + vmovaps %zmm13, %zmm4 + vfnmadd231pd %zmm5, %zmm3, %zmm4 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm5, %zmm5, %zmm9 + +/* R = R^SignRes : update sign of reduced argument */ + vxorpd %zmm6, %zmm5, %zmm10 + vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8 + vmulpd %zmm9, %zmm8, %zmm11 + +/* Poly = Poly*R + R */ + vfmadd213pd %zmm10, %zmm10, %zmm11 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vxorpd %zmm12, %zmm11, %zmm1 + vpandnq %zmm13, %zmm13, %zmm14{%k1} + vcmppd $3, %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), 
%zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_sin_skx) + + .section .rodata, "a" +.L_2il0floatpacket.14: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.14,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S new file mode 100644 index 0000000000..13279e3fb7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2vvv_sincos) + .type _ZGVbN2vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2vvv_sincos_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2vvv_sincos_sse2(%rip), %rax + ret +END (_ZGVbN2vvv_sincos) +libmvec_hidden_def (_ZGVbN2vvv_sincos) + +#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2 +#include "../svml_d_sincos2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S new file mode 100644 index 0000000000..c46109f35d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S @@ -0,0 +1,368 @@ +/* Function sincos vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2vl8l8_sincos_sse4) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups %xmm11, 160(%rsp) + movups %xmm12, 144(%rsp) + movups __dSignMask(%rax), %xmm11 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + movaps %xmm11, %xmm4 + +/* Grab sign bit from argument */ + movaps %xmm11, %xmm7 + movups __dInvPI(%rax), %xmm5 + andnps %xmm0, %xmm4 + +/* SinY = X'*InvPi + RS : right shifter add */ + mulpd %xmm4, %xmm5 + addpd __dRShifter(%rax), %xmm5 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + movaps %xmm5, %xmm12 + andps %xmm0, %xmm7 + +/* SinN = Y - RS : right shifter sub */ + subpd __dRShifter(%rax), %xmm5 + movups %xmm10, 176(%rsp) + psllq $63, %xmm12 + movups __dPI1(%rax), %xmm10 + +/* SinR = X' - SinN*Pi1 */ + movaps %xmm10, %xmm1 + mulpd %xmm5, %xmm1 + movups __dPI2(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi1 */ + movaps %xmm6, %xmm2 + mulpd %xmm5, %xmm2 + movups %xmm13, 112(%rsp) + movaps %xmm4, %xmm13 + subpd %xmm1, %xmm13 + subpd %xmm2, %xmm13 + +/* Sine result sign: SinRSign = SignMask & SinR */ + movaps %xmm11, %xmm2 + +/* CosR = SinX - CosN*Pi1 */ + movaps %xmm4, %xmm1 + movups __dOneHalf(%rax), %xmm3 + andps %xmm13, %xmm2 + +/* Set SinRSign to 0.5 */ + orps %xmm2, %xmm3 + +/* Update CosRSign and CosSignRes signs */ + xorps %xmm11, %xmm2 + +/* CosN = SinN +(-)0.5 */ + addpd %xmm5, %xmm3 + cmpnlepd __dRangeVal(%rax), %xmm4 + mulpd %xmm3, %xmm10 + +/* CosR = CosR - CosN*Pi2 */ + mulpd %xmm3, %xmm6 + subpd %xmm10, %xmm1 + movmskpd %xmm4, %ecx + movups __dPI3(%rax), %xmm10 + xorps %xmm12, %xmm2 + subpd %xmm6, %xmm1 + +/* SinR = SinR - SinN*Pi3 */ + movaps %xmm10, %xmm6 + +/* Final reconstruction. 
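+   Only the sign words accumulated above remain to be applied: the sine
+   result is SinPoly ^ (SinSignRes ^ SignX) and the cosine result is
+   CosPoly ^ (CosRSign ^ SinSignRes), both folded in with XOR.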
+ Combine Sin result's sign */ + xorps %xmm7, %xmm12 + mulpd %xmm5, %xmm6 + +/* CosR = CosR - CosN*Pi3 */ + mulpd %xmm3, %xmm10 + subpd %xmm6, %xmm13 + subpd %xmm10, %xmm1 + movups __dPI4(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi4 */ + mulpd %xmm6, %xmm5 + +/* CosR = CosR - CosN*Pi4 */ + mulpd %xmm6, %xmm3 + subpd %xmm5, %xmm13 + subpd %xmm3, %xmm1 + +/* SinR2 = SinR^2 */ + movaps %xmm13, %xmm6 + +/* CosR2 = CosR^2 */ + movaps %xmm1, %xmm10 + mulpd %xmm13, %xmm6 + mulpd %xmm1, %xmm10 + +/* Polynomial approximation */ + movups __dC7(%rax), %xmm5 + movaps %xmm5, %xmm3 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC6(%rax), %xmm3 + addpd __dC6(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC5(%rax), %xmm3 + addpd __dC5(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC4(%rax), %xmm3 + addpd __dC4(%rax), %xmm5 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + mulpd %xmm10, %xmm5 + addpd __dC3(%rax), %xmm3 + addpd __dC3(%rax), %xmm5 + +/* SinPoly = C2 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C2 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC2(%rax), %xmm3 + addpd __dC2(%rax), %xmm5 + +/* SinPoly = C1 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C1 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC1(%rax), %xmm3 + addpd __dC1(%rax), %xmm5 + +/* SinPoly = SinR2*SinPoly */ + mulpd %xmm3, %xmm6 + +/* CosPoly = CosR2*CosPoly */ + mulpd %xmm5, %xmm10 + +/* SinPoly = SinR*SinPoly */ + mulpd %xmm13, %xmm6 + +/* CosPoly = CosR*CosPoly */ + mulpd %xmm1, %xmm10 + addpd %xmm6, %xmm13 + addpd %xmm10, %xmm1 + +/* Update Sin result's sign */ + xorps %xmm12, %xmm13 + +/* Update Cos result's sign */ + xorps %xmm2, %xmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 176(%rsp), %xmm10 + movaps %xmm13, (%rdi) + movups 160(%rsp), %xmm11 + movups 144(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups %xmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 128(%rsp) + movups %xmm13, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 48(%rsp) + movups %xmm9, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 64(%rsp) + movq %r12, 104(%rsp) + cfi_offset_rel_rsp (12, 104) + movb %dl, %r12b + movq %r13, 96(%rsp) + cfi_offset_rel_rsp (13, 96) + movl %eax, %r13d + movq %r14, 88(%rsp) + cfi_offset_rel_rsp (14, 88) + movl %ecx, %r14d + movq %r15, 80(%rsp) + cfi_offset_rel_rsp (15, 80) + movq %rbx, 72(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 48(%rsp), %xmm8 + movq %rbx, %rdi + movups 32(%rsp), %xmm9 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 64(%rsp), %rsi + movq 104(%rsp), %r12 + cfi_restore (%r12) + movq 96(%rsp), %r13 + cfi_restore (%r13) + movq 88(%rsp), %r14 + cfi_restore (%r14) + movq 80(%rsp), %r15 + cfi_restore (%r15) + movq 72(%rsp), %rbx + movups 192(%rsp), %xmm13 + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 136(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 200(%rsp,%r15) + movsd 136(%rsp,%r15), %xmm0 
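+
+/* %xmm0 is overwritten by each scalar call, so the element was reloaded
+   above; the cos half of this lane now goes through the same scalar
+   fallback as the sin half.  */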
+ + call JUMPTARGET(cos) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 128(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 192(%rsp,%r15) + movsd 128(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 +END (_ZGVbN2vl8l8_sincos_sse4) +libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4) + +/* vvv version implemented with wrapper to vl8l8 variant. */ +ENTRY (_ZGVbN2vvv_sincos_sse4) +#ifndef __ILP32__ + subq $72, %rsp + .cfi_def_cfa_offset 80 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movq 32(%rsp), %rdx + movq 48(%rsp), %rsi + movq 40(%rsp), %r8 + movq 56(%rsp), %r10 + movq (%rsp), %rax + movq 16(%rsp), %rcx + movq 8(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq %r9, (%r10) + addq $72, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movdqa 16(%esp), %xmm1 + movsd 32(%esp), %xmm0 + movq %xmm1, %rax + movdqa (%esp), %xmm2 + movsd %xmm0, (%eax) + movsd 40(%esp), %xmm0 + pextrd $1, %xmm1, %eax + movsd %xmm0, (%eax) + movsd 48(%esp), %xmm0 + movq %xmm2, %rax + movsd %xmm0, (%eax) + movsd 56(%esp), %xmm0 + pextrd $1, %xmm2, %eax + movsd %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif +END (_ZGVbN2vvv_sincos_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S new file mode 100644 index 0000000000..8aacb8e76a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4vvv_sincos) + .type _ZGVdN4vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4vvv_sincos_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4vvv_sincos) +libmvec_hidden_def (_ZGVdN4vvv_sincos) + +#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper +#include "../svml_d_sincos4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S new file mode 100644 index 0000000000..a6318c5ca6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S @@ -0,0 +1,373 @@ +/* Function sincos vectorized with AVX2. 
+ Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4vl8l8_sincos_avx2) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovups %ymm14, 288(%rsp) + vmovups %ymm8, 352(%rsp) + vmovupd __dSignMask(%rax), %ymm6 + vmovupd __dInvPI(%rax), %ymm2 + vmovupd __dPI1_FMA(%rax), %ymm5 + vmovups %ymm9, 224(%rsp) + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %ymm0, %ymm6, %ymm1 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %ymm1, %ymm2 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm2, %ymm4 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %ymm2, %ymm2 + +/* SinR = X' - SinN*Pi1 */ + vmovdqa %ymm1, %ymm14 + vfnmadd231pd %ymm2, %ymm5, %ymm14 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm2, %ymm14 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %ymm14, %ymm6, %ymm7 + +/* Set SinRSign to 0.5 */ + vorpd __dOneHalf(%rax), %ymm7, %ymm3 + +/* CosN = SinN +(-)0.5 */ + vaddpd %ymm3, %ymm2, %ymm3 + +/* CosR = SinX - CosN*Pi1 */ + vmovdqa %ymm1, %ymm8 + vfnmadd231pd %ymm3, %ymm5, %ymm8 + vmovupd __dPI3_FMA(%rax), %ymm5 + vcmpnle_uqpd __dRangeVal(%rax), %ymm1, %ymm1 + +/* CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm3, %ymm8 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %ymm14, %ymm5, %ymm2 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %ymm8, %ymm5, %ymm3 + vmovupd __dC6(%rax), %ymm8 + +/* SinR2 = SinR^2 */ + vmulpd %ymm2, %ymm2, %ymm14 + +/* CosR2 = CosR^2 */ + vmulpd %ymm3, %ymm3, %ymm5 + +/* Grab SignX */ + vandpd %ymm0, %ymm6, %ymm9 + +/* Update CosRSign and CosSignRes signs */ + vxorpd %ymm6, %ymm7, %ymm6 + vxorpd %ymm6, %ymm4, %ymm7 + +/* Update sign SinSignRes */ + vxorpd %ymm9, %ymm4, %ymm6 + +/* Polynomial approximation */ + vmovupd __dC7(%rax), %ymm4 + vmovdqa %ymm8, %ymm9 + vfmadd231pd __dC7(%rax), %ymm14, %ymm9 + vfmadd213pd %ymm8, %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm14, %ymm9 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm14, %ymm9 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + 
vfmadd213pd __dC3(%rax), %ymm14, %ymm9 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %ymm14, %ymm9 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %ymm14, %ymm9 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %ymm14, %ymm9, %ymm8 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %ymm5, %ymm4, %ymm4 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %ymm2, %ymm2, %ymm8 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %ymm3, %ymm3, %ymm4 + vmovmskpd %ymm1, %ecx + +/* Final reconstruction + Update Sin result's sign */ + vxorpd %ymm6, %ymm8, %ymm3 + +/* Update Cos result's sign */ + vxorpd %ymm7, %ymm4, %ymm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 352(%rsp), %ymm8 + vmovups 224(%rsp), %ymm9 + vmovups 288(%rsp), %ymm14 + vmovupd %ymm2, (%rsi) + vmovdqa %ymm3, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm0, 256(%rsp) + vmovupd %ymm3, 320(%rsp) + vmovupd %ymm2, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm10, 128(%rsp) + vmovups %ymm11, 96(%rsp) + vmovups %ymm12, 64(%rsp) + vmovups %ymm13, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 160(%rsp) + movq %r12, 200(%rsp) + cfi_offset_rel_rsp (12, 200) + movb %dl, %r12b + movq %r13, 192(%rsp) + cfi_offset_rel_rsp (13, 192) + movl %eax, %r13d + movq %r14, 184(%rsp) + cfi_offset_rel_rsp (14, 184) + movl %ecx, %r14d + movq %r15, 176(%rsp) + cfi_offset_rel_rsp (15, 176) + movq %rbx, 168(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 128(%rsp), %ymm10 + movq %rbx, %rdi + vmovups 96(%rsp), %ymm11 + vmovups 64(%rsp), %ymm12 + vmovups 32(%rsp), %ymm13 + vmovups (%rsp), %ymm15 + vmovupd 320(%rsp), %ymm3 + vmovupd 384(%rsp), %ymm2 + movq 160(%rsp), %rsi + movq 200(%rsp), %r12 + cfi_restore (%r12) + movq 192(%rsp), %r13 + cfi_restore (%r13) + movq 184(%rsp), %r14 + cfi_restore (%r14) + movq 176(%rsp), %r15 + cfi_restore (%r15) + movq 168(%rsp), %rbx + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 264(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 328(%rsp,%r15) + vmovsd 264(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 256(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 320(%rsp,%r15) + vmovsd 256(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4vl8l8_sincos_avx2) +libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2) + +/* vvv version implemented with wrapper to vl8l8 variant. 
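+   The vvv ABI hands over four sine result pointers in %ymm1 and four
+   cosine result pointers in %ymm2, while the vl8l8 kernel wants two
+   linear output buffers in %rdi/%rsi.  The shim below spills the
+   pointer vectors, runs the kernel on stack buffers and scatters the
+   lanes back out.  A rough C model of the LP64 path (hypothetical
+   names; the x32 branch differs only in pointer size):
+
+     void vvv_sincos (__m256d x, double *sp[4], double *cp[4])
+     {
+       double sbuf[4], cbuf[4];
+       vl8l8_sincos (x, sbuf, cbuf);  // kernel fills linear buffers
+       for (int i = 0; i < 4; i++)
+         {
+           *sp[i] = sbuf[i];          // scatter sin results
+           *cp[i] = cbuf[i];          // scatter cos results
+         }
+     }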
*/ +ENTRY (_ZGVdN4vvv_sincos_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $128, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movq 64(%rsp), %rdx + movq 96(%rsp), %rsi + movq 72(%rsp), %r8 + movq 104(%rsp), %r10 + movq (%rsp), %rax + movq 32(%rsp), %rcx + movq 8(%rsp), %rdi + movq 40(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 80(%rsp), %rax + movq 112(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 88(%rsp), %rdi + movq 120(%rsp), %r9 + movq 16(%rsp), %r11 + movq 48(%rsp), %rdx + movq 24(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $104, %esp + vmovaps %xmm1, -96(%ebp) + vmovaps %xmm2, -112(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movl -96(%ebp), %eax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -92(%ebp), %eax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -88(%ebp), %eax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -84(%ebp), %eax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -112(%ebp), %eax + vmovsd -48(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -108(%ebp), %eax + vmovsd -40(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -104(%ebp), %eax + vmovsd -32(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -100(%ebp), %eax + vmovsd -24(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $104, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +END (_ZGVdN4vvv_sincos_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S new file mode 100644 index 0000000000..3c0abc379e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8vvv_sincos) + .type _ZGVeN8vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8vvv_sincos_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8vvv_sincos_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8vvv_sincos) + +#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper +#include "../svml_d_sincos8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S new file mode 100644 index 0000000000..c9207558c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S @@ -0,0 +1,763 @@ +/* Function sincos vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. 
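+
+   An illustrative scalar sketch of the same scheme (the names here
+   are ours, not the symbols used below; the vector code keeps Pi
+   split into the three parts dPI1/dPI2/dPI3 and applies the (-1)^N
+   factors as sign masks instead of branching):
+
+     n  = rint (x * InvPi)                    (SinN, via right shifter)
+     r  = ((x - n*Pi1) - n*Pi2) - n*Pi3       (reduced argument SinR)
+     r2 = r * r
+     p  = C1 + r2*(C2 + r2*(C3 + r2*(C4 + r2*(C5 + r2*(C6 + r2*C7)))))
+     sin(x) ~ (-1)^n * (r + r*r2*p)
+
+   and likewise for cos(x) with n' = rint ((x + Pi/2) * InvPi).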
*/ + + .text +ENTRY (_ZGVeN8vl8l8_sincos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm4 + movq $-1, %rdx + vmovups __dSignMask(%rax), %zmm12 + vmovups __dInvPI(%rax), %zmm5 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vpandnq %zmm4, %zmm12, %zmm3 + vmovups __dPI1_FMA(%rax), %zmm7 + vmovups __dPI3_FMA(%rax), %zmm9 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm3, %zmm8 + +/* CosR = SinX - CosN*Pi1 */ + vmovaps %zmm3, %zmm10 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 + vmovups __dC6(%rax), %zmm13 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm5, %zmm1 + vmovaps %zmm13, %zmm14 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm5, %zmm2 + vcmppd $22, __dRangeVal(%rax), %zmm3, %k1 + +/* Update CosRSign and CosSignRes signs */ + vmovaps %zmm12, %zmm5 + vfnmadd231pd %zmm1, %zmm7, %zmm8 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vpandq %zmm8, %zmm12, %zmm11 + +/* Set SinRSign to 0.5 */ + vporq __dOneHalf(%rax), %zmm11, %zmm6 + vpternlogq $150, %zmm2, %zmm11, %zmm5 + +/* Update sign SinSignRes */ + vpternlogq $120, %zmm4, %zmm12, %zmm2 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm11 + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm6, %zmm1, %zmm0 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %zmm8, %zmm9, %zmm1 + vfnmadd231pd %zmm0, %zmm7, %zmm10 + +/* SinR2 = SinR^2 */ + vmulpd %zmm1, %zmm1, %zmm15 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10 + vfmadd231pd __dC7(%rax), %zmm15, %zmm14 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %zmm10, %zmm9, %zmm0 + vfmadd213pd __dC5(%rax), %zmm15, %zmm14 + +/* CosR2 = CosR^2 */ + vmulpd %zmm0, %zmm0, %zmm12 + vfmadd213pd __dC4(%rax), %zmm15, %zmm14 + vfmadd213pd %zmm13, %zmm12, %zmm11 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm15, %zmm14 + vfmadd213pd __dC5(%rax), %zmm12, %zmm11 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %zmm15, %zmm14 + vfmadd213pd __dC4(%rax), %zmm12, %zmm11 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm13 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm1, %zmm1, %zmm13 + vpbroadcastq %rdx, %zmm1{%k1}{z} + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %zmm12, %zmm11 + vptestmq %zmm1, %zmm1, %k0 + kmovw %k0, %ecx + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm12, %zmm11, %zmm14 + movzbl %cl, %ecx + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm0, %zmm0, %zmm14 + +/* Final reconstruction. 
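+   The sign words accumulated during range reduction (SinSignRes in
+   %zmm2, CosSignRes in %zmm5) are XORed into the polynomial results.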
+ Update Sin result's sign */ + vpxorq %zmm2, %zmm13, %zmm0 + +/* Update Cos result's sign */ + vpxorq %zmm5, %zmm14, %zmm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm2, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm4, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm2, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movq %rbx, %rdi + kmovw 1048(%rsp), %k4 + movq 1056(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + kmovw 1032(%rsp), %k6 + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm2 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_1_7 + +#endif +END (_ZGVeN8vl8l8_sincos_knl) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl) + +ENTRY (_ZGVeN8vl8l8_sincos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm8 + vmovups __dSignMask(%rax), %zmm4 + vmovups __dInvPI(%rax), %zmm9 + vmovups __dRShifter(%rax), %zmm10 + vmovups 
__dPI1_FMA(%rax), %zmm13 + vmovups __dPI2_FMA(%rax), %zmm14 + vmovups __dOneHalf(%rax), %zmm11 + vmovups __dPI3_FMA(%rax), %zmm2 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %zmm8, %zmm4, %zmm7 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm10, %zmm7, %zmm9 + vcmppd $18, __dRangeVal(%rax), %zmm7, %k1 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm9, %zmm6 + +/* SinN = Y - RS : right shifter sub */ + vsubpd %zmm10, %zmm9, %zmm5 + vmovups __dC5(%rax), %zmm9 + vmovups __dC4(%rax), %zmm10 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm7, %zmm15 + vfnmadd231pd %zmm5, %zmm13, %zmm15 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd %zmm5, %zmm14, %zmm15 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %zmm15, %zmm4, %zmm1 + +/* Set SinRSign to 0.5 */ + vorpd %zmm1, %zmm11, %zmm12 + vmovups __dC3(%rax), %zmm11 + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm12, %zmm5, %zmm3 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %zmm15, %zmm2, %zmm5 + vmovups __dC2(%rax), %zmm12 + +/* SinR2 = SinR^2 */ + vmulpd %zmm5, %zmm5, %zmm15 + +/* CosR = SinX - CosN*Pi1 */ + vmovaps %zmm7, %zmm0 + vfnmadd231pd %zmm3, %zmm13, %zmm0 + vmovups __dC1(%rax), %zmm13 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd %zmm3, %zmm14, %zmm0 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %zmm0, %zmm2, %zmm3 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm0 + +/* Update CosRSign and CosSignRes signs */ + vmovaps %zmm4, %zmm2 + vpternlogq $150, %zmm6, %zmm1, %zmm2 + +/* Update sign SinSignRes */ + vpternlogq $120, %zmm8, %zmm4, %zmm6 + +/* CosR2 = CosR^2 */ + vmulpd %zmm3, %zmm3, %zmm1 + vmovups __dC6(%rax), %zmm4 + vmovaps %zmm0, %zmm14 + vfmadd213pd %zmm4, %zmm1, %zmm0 + vfmadd213pd %zmm4, %zmm15, %zmm14 + vfmadd213pd %zmm9, %zmm1, %zmm0 + vfmadd213pd %zmm9, %zmm15, %zmm14 + vfmadd213pd %zmm10, %zmm1, %zmm0 + vfmadd213pd %zmm10, %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd %zmm11, %zmm1, %zmm0 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd %zmm11, %zmm15, %zmm14 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd %zmm12, %zmm1, %zmm0 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd %zmm12, %zmm15, %zmm14 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd %zmm13, %zmm1, %zmm0 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd %zmm13, %zmm15, %zmm14 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm1, %zmm0, %zmm1 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm4 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm3, %zmm3, %zmm1 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 + vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 + +/* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 + +/* Final reconstruction. 
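+   The cosine sign was merged just above; the sine sign correction
+   (SinSignRes, %zmm6) is applied the same way.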
+ Update Sin result's sign */ + vxorpd %zmm6, %zmm4, %zmm0 + vpandnq %zmm7, %zmm7, %zmm3{%k1} + vcmppd $3, %zmm3, %zmm3, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm8, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_2_6: + btl %r13d, %r14d + jc .LBL_2_13 + +.LBL_2_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + movq %rbx, %rdi + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm1 + movq 1056(%rsp), %rsi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8vl8l8_sincos_skx) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx) + +/* Wrapper between vvv and vl8l8 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl8l8 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + /* Encoding for vmovups %zmm1, 128(%rsp). 
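+   The two stores are emitted as raw EVEX bytes, presumably so that
+   this wrapper still assembles when the assembler lacks AVX-512
+   mnemonics.  Note the one-byte displacements: EVEX disp8 is scaled
+   by the 64-byte vector width, so 0x02 encodes 128 and 0x03 encodes
+   192.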
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x02 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movq 64(%rsp), %r10 + movq 72(%rsp), %rax + movq 80(%rsp), %rcx + movq 88(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movq 96(%rsp), %r9 + movq 104(%rsp), %r11 + movq 112(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $232, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + call HIDDEN_JUMPTARGET(\callee) + vmovdqa -208(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -200(%ebp), %rax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -192(%ebp), %rax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -184(%ebp), %rax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovdqa -240(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -232(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -224(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -216(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $232, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN8vvv_sincos_knl) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl +END (_ZGVeN8vvv_sincos_knl) + +ENTRY (_ZGVeN8vvv_sincos_skx) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx +END (_ZGVeN8vvv_sincos_skx) + + .section .rodata, "a" +.L_2il0floatpacket.15: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.15,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S new file mode 100644 index 0000000000..cd67665972 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized cosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_cosf) + .type _ZGVeN16v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_cosf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_cosf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_cosf) + +#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper +#include "../svml_s_cosf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S new file mode 100644 index 0000000000..611bb5dd2d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S @@ -0,0 +1,460 @@ +/* Function cosf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_cosf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rdx + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 + movl $-1, %eax + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rdx), %zmm0, %zmm2 + vmovups __sRShifter(%rdx), %zmm3 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2 + vmovups __sPI1_FMA(%rdx), %zmm5 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm3, %zmm2, %zmm4 + vmovups __sA9_FMA(%rdx), %zmm9 + +/* Check for large and special arguments */ + vpandd __sAbsMask(%rdx), %zmm0, %zmm1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %zmm2, %zmm8 + vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1 + vpbroadcastd %eax, %zmm12{%k1}{z} + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rdx), %zmm4, %zmm7 + vptestmd %zmm12, %zmm12, %k0 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + kmovw %k0, %ecx + vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6 + vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7 + +/* a) Calculate X^2 = X * X */ + vmulps %zmm7, %zmm7, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vpxord %zmm8, %zmm7, %zmm11 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9 + vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9 + vfmadd213ps __sA3(%rdx), %zmm10, %zmm9 + vmulps %zmm10, %zmm9, %zmm1 + vfmadd213ps %zmm11, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 
1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16v_cosf_knl) + +ENTRY (_ZGVeN16v_cosf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 + vmovups .L_2il0floatpacket.13(%rip), %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rax), %zmm0, %zmm2 + +/* Check for large and special arguments */ + vandps __sAbsMask(%rax), %zmm0, %zmm1 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2 + vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1 + +/* + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %zmm2, %zmm8 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm3, %zmm2, %zmm4 + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rax), %zmm4, %zmm7 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6 + vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7 + +/* a) Calculate X^2 = X * X */ + vmulps %zmm7, %zmm7, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %zmm8, %zmm7, %zmm11 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9 + vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9 + vfmadd213ps __sA3(%rax), %zmm10, %zmm9 + vpandnd %zmm1, %zmm1, %zmm12{%k1} + vmulps %zmm10, %zmm9, %zmm1 + vptestmd %zmm12, %zmm12, %k0 + vfmadd213ps %zmm11, %zmm11, %zmm1 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), 
%xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16v_cosf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S new file mode 100644 index 0000000000..d73d7c7e3f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cosf, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_cosf) + .type _ZGVbN4v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_cosf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_cosf_sse2(%rip), %rax + ret +END (_ZGVbN4v_cosf) +libmvec_hidden_def (_ZGVbN4v_cosf) + +#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2 +#include "../svml_s_cosf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S new file mode 100644 index 0000000000..73797e1a93 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S @@ -0,0 +1,227 @@ +/* Function cosf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVbN4v_cosf_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm4 + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups __sHalfPI(%rax), %xmm1 + movups __sRShifter(%rax), %xmm5 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + addps %xmm4, %xmm1 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + mulps __sInvPI(%rax), %xmm1 + movups __sPI1(%rax), %xmm6 + addps %xmm5, %xmm1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + movaps %xmm1, %xmm2 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + subps %xmm5, %xmm1 + movups __sPI2(%rax), %xmm7 + pslld $31, %xmm2 + movups __sPI3(%rax), %xmm5 + movups __sAbsMask(%rax), %xmm3 + +/* Check for large and special arguments */ + andps %xmm4, %xmm3 + +/* g) Subtract 0.5 from result for octant correction */ + subps __sOneHalf(%rax), %xmm1 + cmpnleps __sRangeReductionVal(%rax), %xmm3 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + */ + mulps %xmm1, %xmm6 + mulps %xmm1, %xmm7 + mulps %xmm1, %xmm5 + subps %xmm6, %xmm0 + movmskps %xmm3, %ecx + movups __sPI4(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm6, %xmm1 + subps %xmm5, %xmm0 + subps %xmm1, %xmm0 + +/* a) Calculate X^2 = X * X */ + movaps %xmm0, %xmm1 + mulps %xmm0, %xmm1 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + xorps %xmm2, %xmm0 + movups __sA9(%rax), %xmm2 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + mulps %xmm1, %xmm2 + addps __sA7(%rax), %xmm2 + mulps %xmm1, %xmm2 + addps __sA5(%rax), %xmm2 + mulps %xmm1, %xmm2 + addps __sA3(%rax), %xmm2 + mulps %xmm2, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm4, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc 
.LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 +END (_ZGVbN4v_cosf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S new file mode 100644 index 0000000000..f7530c138a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cosf, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_cosf) + .type _ZGVdN8v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_cosf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_cosf) +libmvec_hidden_def (_ZGVdN8v_cosf) + +#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper +#include "../svml_s_cosf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S new file mode 100644 index 0000000000..c61add3bb9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S @@ -0,0 +1,215 @@ +/* Function cosf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVdN8v_cosf_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups __sRShifter(%rax), %ymm5 + vmovups __sPI1_FMA(%rax), %ymm7 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rax), %ymm2, %ymm4 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %ymm5, %ymm4, %ymm6 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %ymm4, %ymm0 + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rax), %ymm6, %ymm4 + +/* Check for large and special arguments */ + vandps __sAbsMask(%rax), %ymm2, %ymm3 + vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %ymm2, %ymm3 + vfnmadd231ps %ymm4, %ymm7, %ymm3 + vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3 + vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4 + +/* a) Calculate X^2 = X * X */ + vmulps %ymm4, %ymm4, %ymm5 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %ymm0, %ymm4, %ymm6 + vmovups __sA9_FMA(%rax), %ymm0 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))) + */ + vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0 + vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0 + vfmadd213ps __sA3(%rax), %ymm5, %ymm0 + vmulps %ymm5, %ymm0, %ymm0 + vmovmskps %ymm1, %ecx + vfmadd213ps %ymm6, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + 
cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(cosf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(cosf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVdN8v_cosf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S new file mode 100644 index 0000000000..3998f616aa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_expf) + .type _ZGVeN16v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_expf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_expf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_expf) + +#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper +#include "../svml_s_expf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S new file mode 100644 index 0000000000..e80b2be1a7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S @@ -0,0 +1,447 @@ +/* Function expf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_expf_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_expf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + +/* r = x-n*ln2_hi/2^k */ + vmovaps %zmm0, %zmm6 + +/* compare against threshold */ + movl $-1, %ecx + vmovups __sInvLn2(%rax), %zmm3 + vmovups __sLn2hi(%rax), %zmm5 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps __sShifter(%rax), %zmm0, %zmm3 + vmovups __sPC5(%rax), %zmm9 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps __sShifter(%rax), %zmm3, %zmm7 + +/* remove sign of x by "and" operation */ + vpandd __iAbsMask(%rax), %zmm0, %zmm1 + vpaddd __iBias(%rax), %zmm3, %zmm4 + vpcmpgtd __iDomainRange(%rax), %zmm1, %k1 + +/* compute 2^N with "shift" */ + vpslld $23, %zmm4, %zmm8 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + vpbroadcastd %ecx, %zmm2{%k1}{z} + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7 + +/* set mask for overflow/underflow */ + vptestmd %zmm2, %zmm2, %k0 + kmovw %k0, %ecx + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %zmm7, %zmm9 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %zmm7, %zmm9 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %zmm7, %zmm9 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %zmm7, %zmm9 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %zmm7, %zmm9 + +/* 2^N*exp(r) */ + vmulps %zmm9, %zmm8, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups 
%zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__expf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__expf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 + +#endif +END (_ZGVeN16v_expf_knl) + +ENTRY (_ZGVeN16v_expf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. 
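+
+   The code below implements the k = 0 case (table lookup skipped), in
+   rough scalar form (names are ours):
+
+     n = rint (x * InvLn2)                   (via the shifter trick)
+     r = (x - n*Ln2hi) - n*Ln2lo             (ln2 split into two parts)
+     exp(x) ~ 2^n * (C0 + r*(C1 + r*(C2 + r*(C3 + r*(C4 + r*C5)))))
+
+   with 2^n formed by adding the exponent bias and shifting into the
+   exponent field (vpslld $23), and out-of-range lanes deferred to the
+   scalar __expf_finite fallback.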
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + +/* r = x-n*ln2_hi/2^k */ + vmovaps %zmm0, %zmm7 + +/* compare against threshold */ + vmovups .L_2il0floatpacket.13(%rip), %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 + vmovups __sPC5(%rax), %zmm10 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps %zmm1, %zmm0, %zmm4 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps %zmm1, %zmm4, %zmm8 + vpaddd __iBias(%rax), %zmm4, %zmm5 + vfnmadd231ps %zmm8, %zmm6, %zmm7 + +/* compute 2^N with "shift" */ + vpslld $23, %zmm5, %zmm9 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8 + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %zmm8, %zmm10 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %zmm8, %zmm10 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %zmm8, %zmm10 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %zmm8, %zmm10 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %zmm8, %zmm10 + +/* 2^N*exp(r) */ + vmulps %zmm10, %zmm9, %zmm1 + +/* remove sign of x by "and" operation */ + vpandd __iAbsMask(%rax), %zmm0, %zmm2 + vpcmpd $2, __iDomainRange(%rax), %zmm2, %k1 + vpandnd %zmm2, %zmm2, %zmm3{%k1} + +/* set mask for overflow/underflow */ + vptestmd %zmm3, %zmm3, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 
1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 + +#endif +END (_ZGVeN16v_expf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S new file mode 100644 index 0000000000..8051720ec2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_expf) + .type _ZGVbN4v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_expf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_expf_sse2(%rip), %rax + ret +END (_ZGVbN4v_expf) +libmvec_hidden_def (_ZGVbN4v_expf) + +#define _ZGVbN4v_expf _ZGVbN4v_expf_sse2 +#include "../svml_s_expf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S new file mode 100644 index 0000000000..2bc510bbf7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S @@ -0,0 +1,212 @@ +/* Function expf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_expf_data.h" + + .text +ENTRY (_ZGVbN4v_expf_sse4) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_sexp_data@GOTPCREL(%rip), %rax + movups __sInvLn2(%rax), %xmm0 + +/* m = x*2^k/ln2 + shifter */ + mulps %xmm5, %xmm0 + movups __sShifter(%rax), %xmm6 + movups __sLn2hi(%rax), %xmm4 + addps %xmm6, %xmm0 + +/* n = m - shifter = rint(x*2^k/ln2) */ + movaps %xmm0, %xmm2 + +/* remove sign of x by "and" operation */ + movdqu __iAbsMask(%rax), %xmm7 + subps %xmm6, %xmm2 + +/* r = x-n*ln2_hi/2^k */ + mulps %xmm2, %xmm4 + pand %xmm5, %xmm7 + +/* compare against threshold */ + pcmpgtd __iDomainRange(%rax), %xmm7 + movups __sLn2lo(%rax), %xmm1 + +/* set mask for overflow/underflow */ + movmskps %xmm7, %ecx + movaps %xmm5, %xmm7 + movups __sPC5(%rax), %xmm3 + subps %xmm4, %xmm7 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + mulps %xmm1, %xmm2 + +/* compute 2^N with "shift" */ + movdqu __iBias(%rax), %xmm6 + subps %xmm2, %xmm7 + +/* c5*r+c4 */ + mulps %xmm7, %xmm3 + paddd %xmm6, %xmm0 + pslld $23, %xmm0 + addps __sPC4(%rax), %xmm3 + +/* (c5*r+c4)*r+c3 */ + mulps %xmm7, %xmm3 + addps __sPC3(%rax), %xmm3 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + mulps %xmm7, %xmm3 + addps __sPC2(%rax), %xmm3 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + mulps %xmm7, %xmm3 + addps __sPC1(%rax), %xmm3 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + mulps %xmm3, %xmm7 + addps __sPC0(%rax), %xmm7 + +/* 2^N*exp(r) */ + mulps %xmm7, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 
152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4v_expf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S new file mode 100644 index 0000000000..6ffb1fd784 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_expf) + .type _ZGVdN8v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_expf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_expf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_expf) +libmvec_hidden_def (_ZGVdN8v_expf) + +#define _ZGVdN8v_expf _ZGVdN8v_expf_sse_wrapper +#include "../svml_s_expf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S new file mode 100644 index 0000000000..b4a070ac86 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S @@ -0,0 +1,202 @@ +/* Function expf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_expf_data.h" + + .text +ENTRY(_ZGVdN8v_expf_avx2) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups __sInvLn2(%rax), %ymm7 + vmovups __sShifter(%rax), %ymm4 + vmovups __sLn2hi(%rax), %ymm3 + vmovups __sPC5(%rax), %ymm1 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps %ymm4, %ymm2, %ymm7 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps %ymm4, %ymm7, %ymm0 + vpaddd __iBias(%rax), %ymm7, %ymm4 + +/* remove sign of x by "and" operation */ + vandps __iAbsMask(%rax), %ymm2, %ymm5 + +/* compare against threshold */ + vpcmpgtd __iDomainRange(%rax), %ymm5, %ymm6 + +/* r = x-n*ln2_hi/2^k */ + vmovaps %ymm2, %ymm5 + vfnmadd231ps %ymm0, %ymm3, %ymm5 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %ymm5, %ymm0 + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %ymm0, %ymm1 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %ymm0, %ymm1 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %ymm0, %ymm1 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %ymm0, %ymm1 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %ymm0, %ymm1 + +/* set mask for overflow/underflow */ + vmovmskps %ymm6, %ecx + +/* compute 2^N with "shift" */ + vpslld $23, %ymm4, %ymm6 + +/* 2^N*exp(r) */ + vmulps %ymm1, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 
272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_expf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S new file mode 100644 index 0000000000..8ab03195c6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_logf) + .type _ZGVeN16v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_logf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_logf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_logf) + +#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper +#include "../svml_s_logf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S new file mode 100644 index 0000000000..7ff6fff848 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S @@ -0,0 +1,416 @@ +/* Function logf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_logf_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_logf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + movl $-1, %ecx + +/* reduction: compute r,n */ + vpsubd _iBrkValue(%rax), %zmm0, %zmm2 + vmovups _sPoly_7(%rax), %zmm7 + vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %zmm2, %zmm4 + +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) + */ + vpaddd _iHiDelta(%rax), %zmm0, %zmm1 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd _iBrkValue(%rax), %zmm3, %zmm6 + vpcmpd $1, _iLoRange(%rax), %zmm1, %k1 + vcvtdq2ps {rn-sae}, %zmm4, %zmm1 + +/* reduced argument R */ + vsubps _sOne(%rax), %zmm6, %zmm8 + vpbroadcastd %ecx, %zmm5{%k1}{z} + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %zmm8, %zmm7 + vptestmd %zmm5, %zmm5, %k0 + kmovw %k0, %ecx + vfmadd213ps _sPoly_5(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_4(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_3(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_2(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_1(%rax), %zmm8, %zmm7 + vmulps %zmm8, %zmm7, %zmm9 + +/* polynomial evaluation end */ + vfmadd213ps %zmm8, %zmm8, %zmm9 + +/* + final reconstruction: + add exponent_value*log2 to polynomial result + */ + vfmadd132ps _sLn2(%rax), %zmm9, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 
1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__logf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__logf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16v_logf_knl) + +ENTRY (_ZGVeN16v_logf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + vmovups .L_2il0floatpacket.7(%rip), %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +/* + check for working range, + set special argument mask (denormals/zero/Inf/NaN) + */ + vpaddd _iHiDelta(%rax), %zmm0, %zmm1 + +/* reduction: compute r,n */ + vpsubd %zmm4, %zmm0, %zmm2 + vpcmpd $5, _iLoRange(%rax), %zmm1, %k1 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %zmm2, %zmm5 + vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd %zmm4, %zmm3, %zmm7 + +/* reduced argument R */ + vsubps _sOne(%rax), %zmm7, %zmm9 + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_5(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_4(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_3(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_2(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_1(%rax), %zmm9, %zmm8 + vmulps %zmm9, %zmm8, %zmm10 + +/* polynomial evaluation end */ + vfmadd213ps %zmm9, %zmm9, %zmm10 + vpandnd %zmm1, %zmm1, %zmm6{%k1} + vptestmd %zmm6, %zmm6, %k0 + vcvtdq2ps {rn-sae}, %zmm5, %zmm1 + kmovw %k0, %ecx + +/* + final reconstruction: + add exponent_value*log2 to polynomial result + */ + vfmadd132ps _sLn2(%rax), %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups 
%zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 + +#endif +END (_ZGVeN16v_logf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.7: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.7,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S new file mode 100644 index 0000000000..4e0e36d5bd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_logf) + .type _ZGVbN4v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_logf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_logf_sse2(%rip), %rax + ret +END (_ZGVbN4v_logf) +libmvec_hidden_def (_ZGVbN4v_logf) + +#define _ZGVbN4v_logf _ZGVbN4v_logf_sse2 +#include "../svml_s_logf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S new file mode 100644 index 0000000000..156face181 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S @@ -0,0 +1,194 @@ +/* Function logf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_logf_data.h" + + .text +ENTRY (_ZGVbN4v_logf_sse4) +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. 
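   A scalar C sketch of this reduction may make the break-value trick
   easier to follow.  It is illustrative only: the break value and
   the degree-7 Taylor coefficients below are stand-ins for the tuned
   _iBrkValue/_sPoly_* table entries, and the special-case path
   (denormals/zero/Inf/NaN, routed to __logf_finite) is omitted.

     // Hedged sketch: one integer subtract splits x into an exponent
     // n and a mantissa in [2/3, 4/3), giving |R| < 1/3 as promised.
     #include <stdint.h>

     static float logf_sketch (float x)
     {
       const float Ln2 = 0x1.62e43p-1f;            // log(2)
       const float p[7] = { -0.5f,                 // p1 ~ -1/2
                            0x1.555556p-2f,        // p2 ~  1/3
                            -0.25f,                // p3 ~ -1/4
                            0x1.99999ap-3f,        // p4 ~  1/5
                            -0x1.555556p-3f,       // p5 ~ -1/6
                            0x1.24924ap-3f,        // p6 ~  1/7
                            -0.125f };             // p7 ~ -1/8
       union { float f; uint32_t i; } u = { .f = x };
       uint32_t t = u.i - 0x3f2aaaabu;         // ix - iBrkValue (~2/3)
       int32_t n = (int32_t) t >> 23;          // exponent_x (maybe +1)
       u.i = (t & 0x007fffffu) + 0x3f2aaaabu;  // mantissa into [2/3, 4/3)
       float R = u.f - 1.0f;                   // reduced argument
       float q = p[6];
       for (int k = 5; k >= 0; k--)            // Horner over p1..p7
         q = q * R + p[k];
       q = q * R * R + R;                      // log(1+R) ~ R + R^2*(..)
       return (float) n * Ln2 + q;             // add exponent*log(2)
     }

   The kernel below carries out this dataflow four lanes wide, with
   the working-range check folded into one vector compare whose mask
   gates the scalar fallback loop.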
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + +/* reduction: compute r,n */ + movaps %xmm0, %xmm2 + +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) */ + movq __svml_slog_data@GOTPCREL(%rip), %rax + movdqu _iHiDelta(%rax), %xmm1 + movdqu _iLoRange(%rax), %xmm4 + paddd %xmm0, %xmm1 + movdqu _iBrkValue(%rax), %xmm3 + pcmpgtd %xmm1, %xmm4 + movdqu _iOffExpoMask(%rax), %xmm1 + psubd %xmm3, %xmm2 + pand %xmm2, %xmm1 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + psrad $23, %xmm2 + paddd %xmm3, %xmm1 + movups _sPoly_7(%rax), %xmm5 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + cvtdq2ps %xmm2, %xmm6 + +/* reduced argument R */ + subps _sOne(%rax), %xmm1 + movmskps %xmm4, %ecx + +/* final reconstruction: + add exponent_value*log2 to polynomial result */ + mulps _sLn2(%rax), %xmm6 + +/* polynomial evaluation starts here */ + mulps %xmm1, %xmm5 + addps _sPoly_6(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_5(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_4(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_3(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_2(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_1(%rax), %xmm5 + mulps %xmm1, %xmm5 + +/* polynomial evaluation end */ + mulps %xmm1, %xmm5 + addps %xmm5, %xmm1 + addps %xmm6, %xmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movdqa %xmm1, %xmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4v_logf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S new file mode 100644 index 0000000000..f4b82de3d4 --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_logf) + .type _ZGVdN8v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_logf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_logf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_logf) +libmvec_hidden_def (_ZGVdN8v_logf) + +#define _ZGVdN8v_logf _ZGVdN8v_logf_sse_wrapper +#include "../svml_s_logf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S new file mode 100644 index 0000000000..994af91ffe --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S @@ -0,0 +1,184 @@ +/* Function logf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_logf_data.h" + + .text +ENTRY(_ZGVdN8v_logf_avx2) +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. 
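   The reduction and polynomial here match the SSE4 version earlier
   in this patch; the detail worth calling out is the one-compare
   special-case filter built from the _iHiDelta/_iLoRange pair.  A
   hedged scalar equivalent, with standard float constants assumed
   rather than copied from the data table:

     // Hedged sketch: one unsigned compare classifies every argument
     // the fast path cannot take (zero, denormals, negatives, Inf,
     // NaN) so a single branch covers them all.
     #include <stdbool.h>
     #include <stdint.h>

     static bool logf_needs_special_path (float x)
     {
       union { float f; uint32_t i; } u = { .f = x };
       // Subtracting the smallest normal (0x00800000) makes every
       // bad encoding wrap above the Inf threshold unsigned-wise.
       return u.i - 0x00800000u >= 0x7f800000u - 0x00800000u;
     }

   The vector form biases the input first (vpaddd _iHiDelta) so the
   test becomes one signed vpcmpgtd; vmovmskps then reduces the
   per-lane result to the scalar mask that gates the __logf_finite
   fallback loop.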
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups _iBrkValue(%rax), %ymm6 + vmovups _iLoRange(%rax), %ymm1 +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) */ + vpaddd _iHiDelta(%rax), %ymm2, %ymm7 + +/* reduction: compute r,n */ + vpsubd %ymm6, %ymm2, %ymm4 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %ymm4, %ymm3 + vpand _iOffExpoMask(%rax), %ymm4, %ymm5 + vmovups _sPoly_7(%rax), %ymm4 + vcvtdq2ps %ymm3, %ymm0 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd %ymm6, %ymm5, %ymm3 + +/* reduced argument R */ + vsubps _sOne(%rax), %ymm3, %ymm5 + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_5(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_4(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_3(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_2(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_1(%rax), %ymm5, %ymm4 + vmulps %ymm5, %ymm4, %ymm6 + +/* polynomial evaluation end */ + vfmadd213ps %ymm5, %ymm5, %ymm6 + vpcmpgtd %ymm7, %ymm1, %ymm1 + vmovmskps %ymm1, %ecx + +/* final reconstruction: + add exponent_value*log2 to polynomial result */ + vfmadd132ps _sLn2(%rax), %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_logf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S new file mode 100644 index 0000000000..6d10c7576f --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16vv_powf) + .type _ZGVeN16vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16vv_powf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16vv_powf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16vv_powf) + +#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper +#include "../svml_s_powf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S new file mode 100644 index 0000000000..fc91a092b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S @@ -0,0 +1,653 @@ +/* Function powf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" +#include "svml_s_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + We are using the next identity : pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... 
= + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + .text +ENTRY (_ZGVeN16vv_powf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_spow_data@GOTPCREL(%rip), %rdx + vmovaps %zmm1, %zmm9 + vshuff32x4 $238, %zmm0, %zmm0, %zmm7 + kxnorw %k3, %k3, %k3 + vcvtps2pd %ymm0, %zmm14 + vcvtps2pd %ymm7, %zmm10 + movl $-1, %eax + movq $-1, %rcx + vpandd _ABSMASK(%rdx), %zmm9, %zmm4 + vmovups _ExpMask(%rdx), %zmm6 + +/* exponent bits selection */ + vpsrlq $20, %zmm14, %zmm13 + vshuff32x4 $238, %zmm9, %zmm9, %zmm8 + vpcmpd $5, _INF(%rdx), %zmm4, %k2 + vpsrlq $32, %zmm13, %zmm15 + vcvtps2pd %ymm8, %zmm2 + vmovups _Two10(%rdx), %zmm4 + vpmovqd %zmm15, %ymm12 + vcvtps2pd %ymm9, %zmm1 + vpsubd _NMINNORM(%rdx), %zmm0, %zmm3 + vpbroadcastd %eax, %zmm8{%k2}{z} + vpcmpd $5, _NMAXVAL(%rdx), %zmm3, %k1 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vmovaps %zmm6, %zmm3 + vpternlogq $248, %zmm6, %zmm10, %zmm4 + vpsrlq $20, %zmm10, %zmm10 + vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3 + +/* reciprocal approximation good to at least 11 bits */ + vrcp28pd %zmm4, %zmm11 + vpsrlq $32, %zmm10, %zmm14 + vpbroadcastd %eax, %zmm7{%k1}{z} + kxnorw %k1, %k1, %k1 + vrcp28pd %zmm3, %zmm5 + vpmovqd %zmm14, %ymm6 + vshufi32x4 $68, %zmm6, %zmm12, %zmm13 + vmovups _One(%rdx), %zmm6 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm14 + +/* biased exponent in DP format */ + vshuff32x4 $238, %zmm13, %zmm13, %zmm5 + vrndscalepd $8, %zmm11, %zmm11 + vcmppd $30, _Threshold(%rdx), %zmm14, %k2 + vcvtdq2pd %ymm13, %zmm10 + vcvtdq2pd %ymm5, %zmm15 + +/* table lookup */ + vpsrlq $40, %zmm14, %zmm13 + vpxord %zmm5, %zmm5, %zmm5 + vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3} + vfmsub213pd %zmm6, %zmm14, %zmm3 + vfmsub213pd %zmm6, %zmm11, %zmm4 + vcmppd $30, _Threshold(%rdx), %zmm11, %k3 + vpbroadcastq %rcx, %zmm14{%k2}{z} + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + kxnorw %k2, %k2, %k2 + vpsrlq $40, %zmm11, %zmm12 + vpxord %zmm6, %zmm6, %zmm6 + vpbroadcastq %rcx, %zmm11{%k3}{z} + kxnorw %k3, %k3, %k3 + vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1} + vmovups _Bias1(%rdx), %zmm12 + vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14 + vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12 + 
vsubpd %zmm14, %zmm10, %zmm13 + vsubpd %zmm12, %zmm15, %zmm10 + vmovups _poly_coeff_3(%rdx), %zmm11 + vmovups _poly_coeff_4(%rdx), %zmm15 + vfmadd213pd %zmm15, %zmm4, %zmm11 + vmulpd %zmm4, %zmm4, %zmm12 + vmovaps %zmm15, %zmm14 + vmulpd %zmm3, %zmm3, %zmm15 + vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14 + +/* reconstruction */ + vfmadd213pd %zmm4, %zmm12, %zmm11 + vfmadd213pd %zmm3, %zmm15, %zmm14 + vaddpd %zmm6, %zmm11, %zmm11 + vaddpd %zmm5, %zmm14, %zmm3 + vfmadd231pd _L2(%rdx), %zmm10, %zmm11 + vfmadd132pd _L2(%rdx), %zmm3, %zmm13 + vmulpd %zmm2, %zmm11, %zmm12 + vmulpd %zmm1, %zmm13, %zmm10 + vmulpd __dbInvLn2(%rdx), %zmm12, %zmm6 + +/* hi bits */ + vpsrlq $32, %zmm12, %zmm12 + vmulpd __dbInvLn2(%rdx), %zmm10, %zmm1 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rdx), %zmm6, %zmm4 + vpsrlq $32, %zmm10, %zmm11 + vpmovqd %zmm11, %ymm3 + vsubpd __dbHALF(%rdx), %zmm1, %zmm2 + vaddpd __dbShifter(%rdx), %zmm4, %zmm14 + vpmovqd %zmm12, %ymm4 + vshufi32x4 $68, %zmm4, %zmm3, %zmm5 + vpxord %zmm4, %zmm4, %zmm4 + vaddpd __dbShifter(%rdx), %zmm2, %zmm2 + +/* iAbsX = iAbsX&iAbsMask; */ + vpandd __iAbsMask(%rdx), %zmm5, %zmm11 + vpxord %zmm5, %zmm5, %zmm5 + vsubpd __dbShifter(%rdx), %zmm14, %zmm13 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rdx), %zmm11, %k1 + vsubpd __dbShifter(%rdx), %zmm2, %zmm15 + vpbroadcastd %eax, %zmm10{%k1}{z} + vpternlogd $254, %zmm8, %zmm7, %zmm10 + +/* [0..1) */ + vsubpd %zmm15, %zmm1, %zmm1 + +/* low K bits */ + vpandq __lbLOWKBITS(%rdx), %zmm14, %zmm11 + vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3} + vsubpd %zmm13, %zmm6, %zmm7 + vptestmd %zmm10, %zmm10, %k0 + vpandq __lbLOWKBITS(%rdx), %zmm2, %zmm10 + vmulpd __dbC1(%rdx), %zmm1, %zmm1 + vmulpd __dbC1(%rdx), %zmm7, %zmm3 + vpsrlq $11, %zmm2, %zmm8 + vpsrlq $11, %zmm14, %zmm2 + +/* NB : including +/- sign for the exponent!! 
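   That is: after the __dbShifter add, the integer part N of
   y*log2|x| sits, in two's complement, in the bits just above the
   table-index bits, so shifting left by 52 drops the index and
   lands N (sign included) in the double's exponent field; the
   vpaddq below then scales the table value by 2^N with no multiply.
   A hedged scalar sketch of just this step; the 11-bit index width
   mirrors the vpsrlq $11 above and is an assumption about the
   table layout, not a quote of it:

     // Hedged sketch: multiply thi by 2^N by adding N, shifted into
     // the exponent field, to thi's bit pattern.  Valid while the
     // result stays normal; masked lanes take the fallback path.
     #include <stdint.h>

     static double scale_by_2N_sketch (uint64_t m, double thi)
     {
       // m = bits of (P + dbShifter); the low 11 bits hold the
       // table index j, the bits above hold N in two's complement.
       uint64_t expbits = (m >> 11) << 52;  // N into exponent field
       union { double f; uint64_t i; } u = { .f = thi };
       u.i += expbits;                      // thi * 2^N, sign kept
       return u.f;
     }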
*/ + vpsllq $52, %zmm8, %zmm8 + kmovw %k0, %ecx + vpsllq $52, %zmm2, %zmm6 + vfmadd213pd %zmm5, %zmm3, %zmm5 + vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2} + vfmadd213pd %zmm4, %zmm1, %zmm4 + vpaddq %zmm6, %zmm5, %zmm10 + vcvtpd2ps %zmm10, %ymm12 + vpaddq %zmm8, %zmm4, %zmm7 + vcvtpd2ps %zmm7, %ymm11 + vshuff32x4 $68, %zmm12, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm9, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1280(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vmovss 1220(%rsp,%r15,8), %xmm1 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vmovss 1216(%rsp,%r15,8), %xmm1 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16vv_powf_knl) + +ENTRY (_ZGVeN16vv_powf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_spow_data@GOTPCREL(%rip), %rax + vextractf32x8 $1, %zmm1, %ymm14 + vextractf32x8 $1, %zmm0, %ymm15 + vpsubd _NMINNORM(%rax), %zmm0, %zmm9 + vmovups %zmm26, 1280(%rsp) + vmovups _ExpMask(%rax), %zmm6 + vpcmpd $1, _NMAXVAL(%rax), %zmm9, %k1 
+ vcvtps2pd %ymm0, %zmm5 + vcvtps2pd %ymm1, %zmm12 + kxnorw %k3, %k3, %k3 + +/* exponent bits selection */ + vpsrlq $20, %zmm5, %zmm3 + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 + vmovups .L_2il0floatpacket.23(%rip), %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 + vpandnd %zmm9, %zmm9, %zmm26{%k1} + vmovups _Two10(%rax), %zmm9 + kxnorw %k1, %k1, %k1 + vcvtps2pd %ymm15, %zmm4 + vmovaps %zmm14, %zmm15 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, %zmm6, %zmm4, %zmm9 + vpsrlq $20, %zmm4, %zmm4 + +/* reciprocal approximation good to at least 11 bits */ + vrcp14pd %zmm9, %zmm10 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm10, %zmm3 + vmovups _One(%rax), %zmm10 + vfmsub213pd %zmm10, %zmm3, %zmm9 + vpandnd %zmm8, %zmm8, %zmm15{%k2} + vmovaps %zmm6, %zmm8 + vpternlogq $234, _Two10(%rax), %zmm5, %zmm8 + vpsrlq $32, %zmm4, %zmm5 + vrcp14pd %zmm8, %zmm7 + vpmovqd %zmm5, %ymm6 + vrndscalepd $8, %zmm7, %zmm2 + vfmsub213pd %zmm10, %zmm2, %zmm8 + +/* table lookup */ + vpsrlq $40, %zmm2, %zmm10 + vinserti32x8 $1, %ymm6, %zmm11, %zmm4 + vpsrlq $40, %zmm3, %zmm11 + +/* biased exponent in DP format */ + vextracti32x8 $1, %zmm4, %ymm7 + vcvtdq2pd %ymm4, %zmm6 + vpmovqd %zmm10, %ymm4 + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} + vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} + vmovups _Threshold(%rax), %zmm5 + vcmppd $21, %zmm2, %zmm5, %k2 + vcmppd $21, %zmm3, %zmm5, %k3 + vmovups _Bias1(%rax), %zmm3 + vmovaps %zmm4, %zmm2 + vpandnq %zmm5, %zmm5, %zmm2{%k2} + vpternlogq $236, _Bias(%rax), %zmm3, %zmm2 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + kxnorw %k2, %k2, %k2 + vpandnq %zmm5, %zmm5, %zmm4{%k3} + vpternlogq $248, _Bias(%rax), %zmm4, %zmm3 + vsubpd %zmm2, %zmm6, %zmm4 + vmovups _poly_coeff_3(%rax), %zmm6 + vmovups _poly_coeff_4(%rax), %zmm2 + vsubpd %zmm3, %zmm7, %zmm5 + vmulpd %zmm8, %zmm8, %zmm7 + vfmadd213pd %zmm2, %zmm9, %zmm6 + kxnorw %k3, %k3, %k3 + vmovaps %zmm2, %zmm3 + vmulpd %zmm9, %zmm9, %zmm2 + vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3 + +/* reconstruction */ + vfmadd213pd %zmm9, %zmm2, %zmm6 + vfmadd213pd %zmm8, %zmm7, %zmm3 + vaddpd %zmm11, %zmm6, %zmm8 + vaddpd %zmm10, %zmm3, %zmm9 + vfmadd231pd _L2(%rax), %zmm5, %zmm8 + vfmadd132pd _L2(%rax), %zmm9, %zmm4 + vmulpd %zmm13, %zmm8, %zmm13 + vmulpd %zmm12, %zmm4, %zmm3 + vmulpd __dbInvLn2(%rax), %zmm13, %zmm10 + vmulpd __dbInvLn2(%rax), %zmm3, %zmm8 + +/* hi bits */ + vpsrlq $32, %zmm3, %zmm4 + vpsrlq $32, %zmm13, %zmm13 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rax), %zmm8, %zmm12 + vpmovqd %zmm4, %ymm5 + vpmovqd %zmm13, %ymm2 + vsubpd __dbHALF(%rax), %zmm10, %zmm9 + vaddpd __dbShifter(%rax), %zmm12, %zmm7 + vaddpd __dbShifter(%rax), %zmm9, %zmm9 + vsubpd __dbShifter(%rax), %zmm7, %zmm11 + vsubpd __dbShifter(%rax), %zmm9, %zmm12 + vinserti32x8 $1, %ymm2, %zmm5, %zmm3 + +/* iAbsX = iAbsX&iAbsMask */ + vpandd __iAbsMask(%rax), %zmm3, %zmm4 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpd $2, __iDomainRange(%rax), %zmm4, %k1 + vpandnd %zmm4, %zmm4, %zmm14{%k1} + vpternlogd $254, %zmm15, %zmm26, %zmm14 + +/* [0..1) */ + vsubpd %zmm11, %zmm8, %zmm15 + vsubpd %zmm12, %zmm10, %zmm26 + vptestmd %zmm14, %zmm14, %k0 + vpsrlq $11, %zmm7, %zmm8 + vpsrlq $11, 
%zmm9, %zmm10 + vmulpd __dbC1(%rax), %zmm26, %zmm26 + vmulpd __dbC1(%rax), %zmm15, %zmm15 + +/* NB : including +/- sign for the exponent!! */ + vpsllq $52, %zmm10, %zmm13 + vpsllq $52, %zmm8, %zmm12 + kmovw %k0, %ecx + +/* low K bits */ + vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14 + vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6 + vpmovqd %zmm14, %ymm7 + vpmovqd %zmm6, %ymm9 + vpxord %zmm2, %zmm2, %zmm2 + vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3} + vfmadd213pd %zmm2, %zmm26, %zmm2 + vpaddq %zmm13, %zmm2, %zmm2 + vcvtpd2ps %zmm2, %ymm4 + vpxord %zmm11, %zmm11, %zmm11 + vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2} + vfmadd213pd %zmm11, %zmm15, %zmm11 + vpaddq %zmm12, %zmm11, %zmm3 + vcvtpd2ps %zmm3, %ymm5 + vinsertf32x8 $1, %ymm4, %zmm5, %zmm2 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups 1280(%rsp), %zmm26 + vmovaps %zmm2, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1088(%rsp) + vmovups %zmm1, 1152(%rsp) + vmovups %zmm2, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 984(%rsp) + kmovw %k5, 976(%rsp) + kmovw %k6, 968(%rsp) + kmovw %k7, 960(%rsp) + vmovups %zmm16, 896(%rsp) + vmovups %zmm17, 832(%rsp) + vmovups %zmm18, 768(%rsp) + vmovups %zmm19, 704(%rsp) + vmovups %zmm20, 640(%rsp) + vmovups %zmm21, 576(%rsp) + vmovups %zmm22, 512(%rsp) + vmovups %zmm23, 448(%rsp) + vmovups %zmm24, 384(%rsp) + vmovups %zmm25, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1000(%rsp) + movq %rdi, 992(%rsp) + movq %r12, 1032(%rsp) + cfi_offset_rel_rsp (12, 1032) + movb %dl, %r12b + movq %r13, 1024(%rsp) + cfi_offset_rel_rsp (13, 1024) + movl %ecx, %r13d + movq %r14, 1016(%rsp) + cfi_offset_rel_rsp (14, 1016) + movl %eax, %r14d + movq %r15, 1008(%rsp) + cfi_offset_rel_rsp (15, 1008) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 984(%rsp), %k4 + kmovw 976(%rsp), %k5 + kmovw 968(%rsp), %k6 + kmovw 960(%rsp), %k7 + vmovups 896(%rsp), %zmm16 + vmovups 832(%rsp), %zmm17 + vmovups 768(%rsp), %zmm18 + vmovups 704(%rsp), %zmm19 + vmovups 640(%rsp), %zmm20 + vmovups 576(%rsp), %zmm21 + vmovups 512(%rsp), %zmm22 + vmovups 448(%rsp), %zmm23 + vmovups 384(%rsp), %zmm24 + vmovups 320(%rsp), %zmm25 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm2 + movq 1000(%rsp), %rsi + movq 992(%rsp), %rdi + movq 1032(%rsp), %r12 + cfi_restore (%r12) + movq 1024(%rsp), %r13 + cfi_restore (%r13) + movq 1016(%rsp), %r14 + cfi_restore (%r14) + movq 1008(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm1 + vzeroupper + vmovss 1092(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm1 + vzeroupper + vmovss 1088(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16vv_powf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.23: + .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.23,@object +.L_2il0floatpacket.24: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.24,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S new file mode 100644 index 0000000000..785b549882 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4vv_powf) + .type _ZGVbN4vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4vv_powf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4vv_powf_sse2(%rip), %rax + ret +END (_ZGVbN4vv_powf) +libmvec_hidden_def (_ZGVbN4vv_powf) + +#define _ZGVbN4vv_powf _ZGVbN4vv_powf_sse2 +#include "../svml_s_powf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S new file mode 100644 index 0000000000..8b1b4e74bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S @@ -0,0 +1,374 @@ +/* Function powf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" + + .text +ENTRY (_ZGVbN4vv_powf_sse4) +/* + ALGORITHM DESCRIPTION: + + We are using the next identity: pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. 
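+
+      A rough scalar sketch of this decomposition (an illustration only:
+      log2_sketch is a hypothetical helper, and few-bit reciprocals
+      computed on the fly stand in for the tabulated Rcp values and
+      their logarithms):
+
+        #include <math.h>
+        // Model of the log2|x| decomposition above (not the vector code).
+        static double
+        log2_sketch (double x)
+        {
+          int k1;
+          double X1 = 2.0 * frexp (fabs (x), &k1);  // |x| = 2^(k1-1)*X1
+          k1 -= 1;                                  // now 1 <= X1 < 2
+          const double ln2 = 0.6931471805599453;
+          double C = 1.0 / ln2;
+          double Rcp1 = ldexp (rint (ldexp (1.0 / X1, 5)), -5);
+          double X2 = Rcp1 * X1;
+          double Rcp2 = ldexp (rint (ldexp (1.0 / X2, 10)), -10);
+          double X3 = Rcp2 * X2;
+          double Rcp3C = ldexp (rint (ldexp (C / X3, 20)), -20);
+          double q = X1 * Rcp1 * Rcp2 * Rcp3C / C - 1.0;  // very small
+          return k1 - log2 (Rcp1) - log2 (Rcp2) - log2 (Rcp3C / C)
+                 + (q - 0.5 * q * q) / ln2;               // log2(1+q)
+        }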
+ + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + movaps %xmm0, %xmm3 + movhlps %xmm0, %xmm3 + movaps %xmm1, %xmm5 + movups %xmm8, 112(%rsp) + movaps %xmm5, %xmm2 + cvtps2pd %xmm3, %xmm8 + cvtps2pd %xmm5, %xmm7 + movups %xmm9, 96(%rsp) + movaps %xmm0, %xmm4 + cvtps2pd %xmm0, %xmm9 + movq __svml_spow_data@GOTPCREL(%rip), %rdx + movups %xmm10, 176(%rsp) + movups %xmm13, 48(%rsp) + movups _ExpMask(%rdx), %xmm6 + +/* preserve mantissa, set input exponent to 2^(-10) */ + movaps %xmm6, %xmm10 + andps %xmm8, %xmm6 + andps %xmm9, %xmm10 + +/* exponent bits selection */ + psrlq $20, %xmm9 + orps _Two10(%rdx), %xmm6 + psrlq $20, %xmm8 + orps _Two10(%rdx), %xmm10 + +/* reciprocal approximation good to at least 11 bits */ + cvtpd2ps %xmm6, %xmm13 + cvtpd2ps %xmm10, %xmm1 + movlhps %xmm13, %xmm13 + movhlps %xmm5, %xmm2 + movlhps %xmm1, %xmm1 + movups %xmm12, 208(%rsp) + rcpps %xmm13, %xmm12 + movups %xmm11, 80(%rsp) + cvtps2pd %xmm2, %xmm11 + rcpps %xmm1, %xmm2 + movups %xmm14, 144(%rsp) + cvtps2pd %xmm12, %xmm14 + movups %xmm15, 160(%rsp) + cvtps2pd %xmm2, %xmm15 + shufps $221, %xmm8, %xmm9 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + roundpd $0, %xmm14, %xmm14 + +/* biased exponent in DP format */ + pshufd $238, %xmm9, %xmm8 + roundpd $0, %xmm15, %xmm15 + cvtdq2pd %xmm8, %xmm1 + mulpd %xmm15, %xmm10 + mulpd %xmm14, %xmm6 + cvtdq2pd %xmm9, %xmm2 + subpd _One(%rdx), %xmm10 + subpd _One(%rdx), %xmm6 + +/* table lookup */ + movaps %xmm14, %xmm8 + movaps %xmm15, %xmm9 + psrlq $40, %xmm8 + psrlq $40, %xmm9 + movd %xmm8, %r8d + movd %xmm9, %eax + psubd _NMINNORM(%rdx), %xmm4 + movdqu _ABSMASK(%rdx), %xmm3 + pextrd $2, %xmm8, %r9d + pand %xmm5, %xmm3 + movups _Threshold(%rdx), %xmm8 + pextrd $2, %xmm9, %ecx + movaps %xmm8, %xmm9 + cmpltpd %xmm15, %xmm9 + cmpltpd %xmm14, %xmm8 + andps _Bias(%rdx), %xmm9 + movaps %xmm10, %xmm14 + andps 
_Bias(%rdx), %xmm8 + movaps %xmm6, %xmm15 + orps _Bias1(%rdx), %xmm9 + orps _Bias1(%rdx), %xmm8 + subpd %xmm9, %xmm2 + subpd %xmm8, %xmm1 + mulpd %xmm10, %xmm14 + mulpd %xmm6, %xmm15 + mulpd _L2(%rdx), %xmm2 + mulpd _L2(%rdx), %xmm1 + movups _poly_coeff_3(%rdx), %xmm9 + movaps %xmm9, %xmm8 + mulpd %xmm10, %xmm8 + mulpd %xmm6, %xmm9 + addpd _poly_coeff_4(%rdx), %xmm8 + addpd _poly_coeff_4(%rdx), %xmm9 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm9 + +/* reconstruction */ + addpd %xmm8, %xmm10 + addpd %xmm9, %xmm6 + movslq %eax, %rax + movslq %r8d, %r8 + movslq %ecx, %rcx + movslq %r9d, %r9 + movsd _Log2Rcp_lookup(%rdx,%rax), %xmm13 + movsd _Log2Rcp_lookup(%rdx,%r8), %xmm12 + movhpd _Log2Rcp_lookup(%rdx,%rcx), %xmm13 + movhpd _Log2Rcp_lookup(%rdx,%r9), %xmm12 + addpd %xmm10, %xmm13 + addpd %xmm6, %xmm12 + addpd %xmm13, %xmm2 + addpd %xmm12, %xmm1 + mulpd %xmm7, %xmm2 + mulpd %xmm11, %xmm1 + movups __dbInvLn2(%rdx), %xmm11 + movdqa %xmm4, %xmm12 + movaps %xmm11, %xmm10 + mulpd %xmm2, %xmm10 + mulpd %xmm1, %xmm11 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + movaps %xmm10, %xmm8 + movaps %xmm11, %xmm9 + subpd __dbHALF(%rdx), %xmm8 + subpd __dbHALF(%rdx), %xmm9 + addpd __dbShifter(%rdx), %xmm8 + addpd __dbShifter(%rdx), %xmm9 + movaps %xmm8, %xmm6 + movaps %xmm9, %xmm7 + subpd __dbShifter(%rdx), %xmm6 + subpd __dbShifter(%rdx), %xmm7 + +/* [0..1) */ + subpd %xmm6, %xmm10 + subpd %xmm7, %xmm11 + mulpd __dbC1(%rdx), %xmm10 + mulpd __dbC1(%rdx), %xmm11 + +/* hi bits */ + shufps $221, %xmm1, %xmm2 + movdqu _NMAXVAL(%rdx), %xmm1 + pcmpgtd %xmm1, %xmm12 + pcmpeqd %xmm1, %xmm4 + por %xmm4, %xmm12 + movdqa %xmm3, %xmm1 + movdqu _INF(%rdx), %xmm4 + pcmpgtd %xmm4, %xmm1 + pcmpeqd %xmm4, %xmm3 + +/* iAbsX = iAbsX&iAbsMask */ + pand __iAbsMask(%rdx), %xmm2 + por %xmm3, %xmm1 + +/* iRangeMask = (iAbsX>iDomainRange) */ + pcmpgtd __iDomainRange(%rdx), %xmm2 + por %xmm1, %xmm12 + movups __lbLOWKBITS(%rdx), %xmm3 + por %xmm2, %xmm12 + +/* low K bits */ + movaps %xmm3, %xmm2 + andps %xmm9, %xmm3 + andps %xmm8, %xmm2 + psrlq $11, %xmm8 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + movd %xmm2, %r10d + psrlq $11, %xmm9 + movd %xmm3, %ecx + +/* NB : including +/- sign for the exponent!! 
*/ + psllq $52, %xmm8 + psllq $52, %xmm9 + pextrw $4, %xmm2, %r11d + pextrw $4, %xmm3, %r8d + movmskps %xmm12, %eax + shll $3, %r10d + shll $3, %ecx + shll $3, %r11d + shll $3, %r8d + movq 13952(%rdx,%r10), %xmm6 + movq 13952(%rdx,%rcx), %xmm7 + movhpd 13952(%rdx,%r11), %xmm6 + movhpd 13952(%rdx,%r8), %xmm7 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + addpd %xmm10, %xmm6 + addpd %xmm11, %xmm7 + paddq %xmm8, %xmm6 + paddq %xmm9, %xmm7 + cvtpd2ps %xmm6, %xmm1 + cvtpd2ps %xmm7, %xmm4 + movlhps %xmm4, %xmm1 + testl %eax, %eax + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 112(%rsp), %xmm8 + movaps %xmm1, %xmm0 + movups 96(%rsp), %xmm9 + movups 176(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 208(%rsp), %xmm12 + movups 48(%rsp), %xmm13 + movups 144(%rsp), %xmm14 + movups 160(%rsp), %xmm15 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 64(%rsp) + movups %xmm5, 128(%rsp) + movups %xmm1, 192(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %cl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %eax, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %edx, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + movups 192(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 68(%rsp,%r15,8), %xmm0 + movss 132(%rsp,%r15,8), %xmm1 + + call JUMPTARGET(__powf_finite) + + movss %xmm0, 196(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 64(%rsp,%r15,8), %xmm0 + movss 128(%rsp,%r15,8), %xmm1 + + call JUMPTARGET(__powf_finite) + + movss %xmm0, 192(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4vv_powf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S new file mode 100644 index 0000000000..1f6a07315e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8vv_powf) + .type _ZGVdN8vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8vv_powf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8vv_powf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8vv_powf) +libmvec_hidden_def (_ZGVdN8vv_powf) + +#define _ZGVdN8vv_powf _ZGVdN8vv_powf_sse_wrapper +#include "../svml_s_powf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S new file mode 100644 index 0000000000..683932f410 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S @@ -0,0 +1,357 @@ +/* Function powf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" + + .text +ENTRY(_ZGVdN8vv_powf_avx2) +/* + ALGORITHM DESCRIPTION: + + We are using the next identity : pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). 
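+
+   A scalar sketch of this reconstruction (an illustration only:
+   exp2_sketch is a hypothetical helper, exp2() stands in for the
+   THI+TLO table, and a short Taylor polynomial replaces the tuned
+   B1..B5 coefficients):
+
+     #include <math.h>
+     // Model of step 3 with expK = 7 (so 2^expK = 128).
+     static double
+     exp2_sketch (double t)            // t ~= y * log2(x)
+     {
+       double sh = t * 128.0;
+       double nj = rint (sh);          // nj = N*128 + j
+       double Z = (sh - nj) / 128.0;   // |Z| <= 2^(-8)
+       int N = (int) floor (nj / 128.0);
+       int j = (int) nj - N * 128;     // 0 <= j <= 127
+       const double ln2 = 0.6931471805599453;
+       double p = 1.0 + Z * ln2 * (1.0 + 0.5 * Z * ln2);  // ~= 2^Z
+       return ldexp (exp2 (j / 128.0) * p, N);  // 2^N * T * (1 + poly)
+     }
+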
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + lea __VPACK_ODD_ind.6357.0.1(%rip), %rcx + vmovups %ymm14, 320(%rsp) + +/* hi bits */ + lea __VPACK_ODD_ind.6358.0.1(%rip), %rax + vmovups %ymm12, 256(%rsp) + vmovups %ymm9, 96(%rsp) + vmovups %ymm13, 224(%rsp) + vmovups %ymm15, 352(%rsp) + vmovups %ymm11, 384(%rsp) + vmovups %ymm10, 288(%rsp) + vmovups (%rcx), %ymm10 + vmovups %ymm8, 160(%rsp) + vmovdqa %ymm1, %ymm9 + movq __svml_spow_data@GOTPCREL(%rip), %rdx + vextractf128 $1, %ymm0, %xmm7 + vcvtps2pd %xmm0, %ymm14 + vcvtps2pd %xmm7, %ymm12 + vpsubd _NMINNORM(%rdx), %ymm0, %ymm7 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vandpd _ExpMask(%rdx), %ymm14, %ymm3 + vandpd _ExpMask(%rdx), %ymm12, %ymm13 + +/* exponent bits selection */ + vpsrlq $20, %ymm12, %ymm12 + vpsrlq $20, %ymm14, %ymm14 + vextractf128 $1, %ymm9, %xmm2 + vcvtps2pd %xmm9, %ymm1 + vpand _ABSMASK(%rdx), %ymm9, %ymm8 + vcvtps2pd %xmm2, %ymm6 + vorpd _Two10(%rdx), %ymm3, %ymm2 + vorpd _Two10(%rdx), %ymm13, %ymm3 + +/* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm2, %xmm5 + vcvtpd2ps %ymm3, %xmm15 + vrcpps %xmm5, %xmm4 + vrcpps %xmm15, %xmm11 + vcvtps2pd %xmm4, %ymm13 + vcvtps2pd %xmm11, %ymm4 + vpermps %ymm12, %ymm10, %ymm11 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm13, %ymm12 + vpermps %ymm14, %ymm10, %ymm5 + vroundpd $0, %ymm4, %ymm14 + vmovupd _One(%rdx), %ymm4 + +/* table lookup */ + vpsrlq $40, %ymm12, %ymm10 + vfmsub213pd %ymm4, %ymm12, %ymm2 + vfmsub213pd %ymm4, %ymm14, %ymm3 + vcmpgt_oqpd _Threshold(%rdx), %ymm12, %ymm12 + vxorpd %ymm4, %ymm4, %ymm4 + vandpd _Bias(%rdx), %ymm12, %ymm12 + +/* biased exponent in DP format */ + vcvtdq2pd %xmm11, %ymm13 + vpcmpeqd %ymm11, %ymm11, %ymm11 + vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm4 + vpsrlq $40, %ymm14, %ymm10 + vcmpgt_oqpd _Threshold(%rdx), %ymm14, %ymm14 + vpcmpeqd %ymm11, %ymm11, %ymm11 + vandpd _Bias(%rdx), %ymm14, %ymm14 + vcvtdq2pd %xmm5, %ymm15 + vxorpd %ymm5, %ymm5, %ymm5 + vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm5 + vorpd _Bias1(%rdx), %ymm12, %ymm11 + vorpd _Bias1(%rdx), %ymm14, %ymm10 + vsubpd %ymm11, %ymm15, %ymm11 + vsubpd %ymm10, %ymm13, %ymm14 + vmovupd _poly_coeff_4(%rdx), %ymm15 + vmovupd _poly_coeff_3(%rdx), %ymm13 + vmulpd %ymm3, %ymm3, %ymm10 + vfmadd213pd %ymm15, %ymm3, %ymm13 + vmovdqa %ymm15, %ymm12 + vfmadd231pd _poly_coeff_3(%rdx), %ymm2, %ymm12 + vmulpd %ymm2, %ymm2, %ymm15 + +/* reconstruction */ + vfmadd213pd %ymm3, %ymm10, %ymm13 + vfmadd213pd %ymm2, %ymm15, %ymm12 + vaddpd %ymm5, %ymm13, %ymm13 + vaddpd %ymm4, %ymm12, %ymm2 + vfmadd231pd _L2(%rdx), %ymm14, %ymm13 + vfmadd132pd _L2(%rdx), %ymm2, %ymm11 + vmulpd %ymm6, %ymm13, %ymm2 + vmulpd %ymm1, %ymm11, %ymm10 + vmulpd __dbInvLn2(%rdx), %ymm2, %ymm6 + vmulpd __dbInvLn2(%rdx), %ymm10, %ymm15 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rdx), %ymm6, %ymm3 + vsubpd __dbHALF(%rdx), %ymm15, %ymm1 + vaddpd __dbShifter(%rdx), %ymm3, %ymm13 + vaddpd __dbShifter(%rdx), %ymm1, %ymm14 + vsubpd __dbShifter(%rdx), %ymm13, %ymm12 + vmovups (%rax), %ymm1 + vsubpd __dbShifter(%rdx), %ymm14, %ymm11 + +/* [0..1) */ + vsubpd %ymm12, %ymm6, %ymm6 + vpermps 
%ymm10, %ymm1, %ymm3 + vpermps %ymm2, %ymm1, %ymm10 + vpcmpgtd _NMAXVAL(%rdx), %ymm7, %ymm4 + vpcmpgtd _INF(%rdx), %ymm8, %ymm1 + vpcmpeqd _NMAXVAL(%rdx), %ymm7, %ymm7 + vpcmpeqd _INF(%rdx), %ymm8, %ymm8 + vpor %ymm7, %ymm4, %ymm2 + vpor %ymm8, %ymm1, %ymm1 + vsubpd %ymm11, %ymm15, %ymm7 + vinsertf128 $1, %xmm10, %ymm3, %ymm10 + vpor %ymm1, %ymm2, %ymm3 + +/* iAbsX = iAbsX&iAbsMask */ + vandps __iAbsMask(%rdx), %ymm10, %ymm10 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rdx), %ymm10, %ymm4 + vpor %ymm4, %ymm3, %ymm5 + vmulpd __dbC1(%rdx), %ymm7, %ymm4 + vmovmskps %ymm5, %ecx + vmulpd __dbC1(%rdx), %ymm6, %ymm5 + +/* low K bits */ + vandps __lbLOWKBITS(%rdx), %ymm14, %ymm6 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + vxorpd %ymm7, %ymm7, %ymm7 + vpcmpeqd %ymm1, %ymm1, %ymm1 + vandps __lbLOWKBITS(%rdx), %ymm13, %ymm2 + vxorpd %ymm10, %ymm10, %ymm10 + vpcmpeqd %ymm3, %ymm3, %ymm3 + vgatherqpd %ymm1, 13952(%rdx,%ymm6,8), %ymm7 + vgatherqpd %ymm3, 13952(%rdx,%ymm2,8), %ymm10 + vpsrlq $11, %ymm14, %ymm14 + vpsrlq $11, %ymm13, %ymm13 + vfmadd213pd %ymm7, %ymm4, %ymm7 + vfmadd213pd %ymm10, %ymm5, %ymm10 + +/* NB : including +/- sign for the exponent!! */ + vpsllq $52, %ymm14, %ymm8 + vpsllq $52, %ymm13, %ymm11 + vpaddq %ymm8, %ymm7, %ymm12 + vpaddq %ymm11, %ymm10, %ymm1 + vcvtpd2ps %ymm12, %xmm15 + vcvtpd2ps %ymm1, %xmm2 + vinsertf128 $1, %xmm2, %ymm15, %ymm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 160(%rsp), %ymm8 + vmovups 96(%rsp), %ymm9 + vmovups 288(%rsp), %ymm10 + vmovups 384(%rsp), %ymm11 + vmovups 256(%rsp), %ymm12 + vmovups 224(%rsp), %ymm13 + vmovups 320(%rsp), %ymm14 + vmovups 352(%rsp), %ymm15 + vmovdqa %ymm1, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm0, 64(%rsp) + vmovups %ymm9, 128(%rsp) + vmovups %ymm1, 192(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %dl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %ecx, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %eax, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + vmovups 192(%rsp), %ymm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 68(%rsp,%r15,8), %xmm0 + vmovss 132(%rsp,%r15,8), %xmm1 + vzeroupper + + call JUMPTARGET(__powf_finite) + + vmovss %xmm0, 196(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 64(%rsp,%r15,8), %xmm0 + vmovss 128(%rsp,%r15,8), %xmm1 + vzeroupper + + call JUMPTARGET(__powf_finite) + + vmovss %xmm0, 192(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8vv_powf_avx2) + + .section .rodata, "a" +__VPACK_ODD_ind.6357.0.1: + .long 1 + .long 3 + .long 5 + .long 7 + .long 0 + .long 0 + .long 0 + .long 0 + .space 32, 0x00 +__VPACK_ODD_ind.6358.0.1: + .long 1 + .long 3 + .long 5 + .long 7 + .long 0 + .long 0 + .long 0 + .long 0 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S new file mode 100644 index 0000000000..0545460952 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16vvv_sincosf) + .type _ZGVeN16vvv_sincosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16vvv_sincosf) + +#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper +#include "../svml_s_sincosf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S new file mode 100644 index 0000000000..f73ab7de7c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S @@ -0,0 +1,806 @@ +/* Function sincosf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/4; +Pi/4] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer S for destination sign setting. + SS = ((S-S&1)&2)<<30; For sin part + SC = ((S+S&1)&2)<<30; For cos part + f) Change destination sign if source sign is negative + using XOR operation. 
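+
+         A scalar sketch of the bookkeeping in steps c)-f) (an
+         illustration only: signs_sketch is a hypothetical helper and
+         lrintf() stands in for the "Right Shifter" addition trick):
+
+           #include <math.h>
+           #include <stdint.h>
+           static void
+           signs_sketch (float x, uint32_t *ss, uint32_t *sc)
+           {
+             const float inv_pio2 = 0.63661977f;        // 2/Pi
+             long S = lrintf (fabsf (x) * inv_pio2);    // octant number
+             *ss = ((uint32_t) ((S - (S & 1)) & 2)) << 30;  // sin sign
+             *sc = ((uint32_t) ((S + (S & 1)) & 2)) << 30;  // cos sign
+             if (x < 0.0f)       // f) flip the sin sign for negative x
+               *ss ^= 0x80000000u;
+           }
+
+         Steps g) and h) below then deliver the reduced argument X.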
+ g) Subtract "Right Shifter" (0x4B000000) value + h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); + c) Swap RS & RC if if first bit of obtained value after + Right Shifting is set to 1. Using And, Andnot & Or operations. + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R1 = XOR( RS, SS ); + R2 = XOR( RC, SC ). */ + + .text +ENTRY (_ZGVeN16vl4l4_sincosf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm2 + movl $-1, %edx + vmovups __sAbsMask(%rax), %zmm0 + vmovups __sInvPI(%rax), %zmm3 + +/* Absolute argument computation */ + vpandd %zmm0, %zmm2, %zmm1 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sSignMask(%rax), %zmm9 + vpandnd %zmm2, %zmm0, %zmm0 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 */ + vmovaps %zmm1, %zmm6 + vmovaps %zmm1, %zmm8 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value */ + vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3 + vmovups __sPI3_FMA(%rax), %zmm7 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + vsubps __sRShifter(%rax), %zmm3, %zmm12 + +/* e) Treat obtained value as integer S for destination sign setting */ + vpslld $31, %zmm3, %zmm13 + vmovups __sA7_FMA(%rax), %zmm14 + vfnmadd231ps %zmm12, %zmm5, %zmm6 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + vmovaps %zmm14, %zmm15 + vmovups __sA9_FMA(%rax), %zmm3 + vcmpps $22, __sRangeReductionVal(%rax), %zmm1, %k1 + vpbroadcastd %edx, %zmm1{%k1}{z} + vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6 + vptestmd %zmm1, %zmm1, %k0 + vpandd %zmm6, %zmm9, %zmm11 + kmovw %k0, %ecx + vpxord __sOneHalf(%rax), %zmm11, %zmm4 + +/* Result sign calculations */ + vpternlogd $150, %zmm13, %zmm9, %zmm11 + +/* Add correction term 0.5 for cos() part */ + vaddps %zmm4, %zmm12, %zmm10 + vfnmadd213ps %zmm6, %zmm7, %zmm12 + vfnmadd231ps %zmm10, %zmm5, %zmm8 + vpxord %zmm13, %zmm12, %zmm13 + vmulps %zmm13, %zmm13, %zmm12 + vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8 + vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15 + vfnmadd213ps %zmm8, %zmm7, %zmm10 + vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15 + vpxord %zmm11, %zmm10, %zmm5 + vmulps %zmm5, %zmm5, %zmm4 + vfmadd213ps __sA3(%rax), %zmm12, %zmm15 + vfmadd213ps %zmm14, %zmm4, %zmm3 + vmulps %zmm12, %zmm15, %zmm14 + vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3 + vfmadd213ps %zmm13, %zmm13, %zmm14 + vfmadd213ps __sA3(%rax), %zmm4, %zmm3 + vpxord %zmm0, %zmm14, %zmm0 + vmulps %zmm4, %zmm3, %zmm3 + vfmadd213ps %zmm5, %zmm5, %zmm3 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm3, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + 
cfi_restore_state + vmovups %zmm2, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm3, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movq %rbx, %rdi + kmovw 1048(%rsp), %k4 + movq 1056(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + kmovw 1032(%rsp), %k6 + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm3 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16vl4l4_sincosf_knl) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl) + +ENTRY (_ZGVeN16vl4l4_sincosf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm4 + vmovups __sAbsMask(%rax), %zmm3 + vmovups __sInvPI(%rax), %zmm5 + vmovups __sRShifter(%rax), %zmm6 + vmovups __sPI1_FMA(%rax), %zmm9 + vmovups __sPI2_FMA(%rax), %zmm10 + vmovups __sSignMask(%rax), %zmm14 + vmovups __sOneHalf(%rax), %zmm7 + vmovups __sPI3_FMA(%rax), %zmm12 + +/* Absolute argument computation */ + vandps %zmm3, %zmm4, %zmm2 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value */ + vfmadd213ps %zmm6, %zmm2, %zmm5 + vcmpps $18, __sRangeReductionVal(%rax), 
%zmm2, %k1 + +/* e) Treat obtained value as integer S for destination sign setting */ + vpslld $31, %zmm5, %zmm0 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm6, %zmm5, %zmm5 + vmovups __sA3(%rax), %zmm6 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 */ + vmovaps %zmm2, %zmm11 + vfnmadd231ps %zmm5, %zmm9, %zmm11 + vfnmadd231ps %zmm5, %zmm10, %zmm11 + vandps %zmm11, %zmm14, %zmm1 + vxorps %zmm1, %zmm7, %zmm8 + +/* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 + vmovups .L_2il0floatpacket.13(%rip), %zmm14 + +/* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 + vfnmadd213ps %zmm11, %zmm12, %zmm5 + vandnps %zmm4, %zmm3, %zmm11 + vmovups __sA7_FMA(%rax), %zmm3 + vmovaps %zmm2, %zmm13 + vfnmadd231ps %zmm15, %zmm9, %zmm13 + vxorps %zmm0, %zmm5, %zmm9 + vmovups __sA5_FMA(%rax), %zmm0 + vfnmadd231ps %zmm15, %zmm10, %zmm13 + vmulps %zmm9, %zmm9, %zmm8 + vfnmadd213ps %zmm13, %zmm12, %zmm15 + vmovups __sA9_FMA(%rax), %zmm12 + vxorps %zmm1, %zmm15, %zmm1 + vmulps %zmm1, %zmm1, %zmm13 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + vmovaps %zmm12, %zmm7 + vfmadd213ps %zmm3, %zmm8, %zmm7 + vfmadd213ps %zmm3, %zmm13, %zmm12 + vfmadd213ps %zmm0, %zmm8, %zmm7 + vfmadd213ps %zmm0, %zmm13, %zmm12 + vfmadd213ps %zmm6, %zmm8, %zmm7 + vfmadd213ps %zmm6, %zmm13, %zmm12 + vmulps %zmm8, %zmm7, %zmm10 + vmulps %zmm13, %zmm12, %zmm3 + vfmadd213ps %zmm9, %zmm9, %zmm10 + vfmadd213ps %zmm1, %zmm1, %zmm3 + vxorps %zmm11, %zmm10, %zmm0 + vpandnd %zmm2, %zmm2, %zmm14{%k1} + vptestmd %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm3, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm4, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm3, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_2_6: + btl %r13d, %r14d + jc .LBL_2_13 + +.LBL_2_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + movq %rbx, %rdi + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + 
vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm3 + movq 1056(%rsp), %rsi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_13: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16vl4l4_sincosf_skx) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx) + +/* Wrapper between vvv and vl4l4 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl4l4 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $384, %rsp + /* Encoding for vmovups %zmm1, 128(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x02 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + /* Encoding for vmovups %zmm3, 256(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x5f + .byte 0x04 + /* Encoding for vmovups %zmm4, 320(%rdi). 
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x67 + .byte 0x05 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $296, %esp + /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x8d + .byte 0x10 + .byte 0xff + .byte 0xff + .byte 0xff + /* Encoding for vmovdqa64 %zmm2, -304(%ebp). 
*/ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x95 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $296, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN16vvv_sincosf_knl) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl +END (_ZGVeN16vvv_sincosf_knl) + +ENTRY (_ZGVeN16vvv_sincosf_skx) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx +END (_ZGVeN16vvv_sincosf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S new file mode 100644 index 0000000000..a249be33d1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincosf. 
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+    .text
+ENTRY (_ZGVbN4vvv_sincosf)
+    .type _ZGVbN4vvv_sincosf, @gnu_indirect_function
+    LOAD_RTLD_GLOBAL_RO_RDX
+    leaq _ZGVbN4vvv_sincosf_sse4(%rip), %rax
+    HAS_CPU_FEATURE (SSE4_1)
+    jz 2f
+    ret
+2:  leaq _ZGVbN4vvv_sincosf_sse2(%rip), %rax
+    ret
+END (_ZGVbN4vvv_sincosf)
+libmvec_hidden_def (_ZGVbN4vvv_sincosf)
+
+#define _ZGVbN4vvv_sincosf _ZGVbN4vvv_sincosf_sse2
+#include "../svml_s_sincosf4_core.S"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
new file mode 100644
index 0000000000..74a6ac1157
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
@@ -0,0 +1,346 @@
+/* Function sincosf vectorized with SSE4.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+    .text
+ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
+/*
+   ALGORITHM DESCRIPTION:
+
+   1) Range reduction to [-Pi/4; +Pi/4] interval
+      a) Grab sign from source argument and save it.
+      b) Remove sign using AND operation
+      c) Getting octant Y by 2/Pi multiplication
+      d) Add "Right Shifter" value
+      e) Treat obtained value as integer S for destination sign setting.
+         SS = ((S-S&1)&2)<<30; For sin part
+         SC = ((S+S&1)&2)<<30; For cos part
+      f) Change destination sign if source sign is negative
+         using XOR operation.
+      g) Subtract "Right Shifter" (0x4B000000) value
+      h) Subtract Y*(PI/2) from X argument, where PI/2 is divided
+         into 4 parts:
+         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+   2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+      a) Calculate X^2 = X * X
+      b) Calculate 2 polynomials for sin and cos:
+         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
+      c) Swap RS & RC if the first bit of the obtained value after
+         Right Shifting is set to 1, using And, Andnot & Or operations.
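+
+      A scalar sketch of the step-2 polynomial pair (an illustration
+      only: poly_sketch is a hypothetical helper and plain Taylor
+      coefficients stand in for the tuned minimax A0..A3 and B0..B4):
+
+        // Model of step 2 for |X| <= Pi/4.
+        static void
+        poly_sketch (float X, float *rs, float *rc)
+        {
+          float x2 = X * X;
+          *rs = X * (1.0f + x2 * (-1.0f / 6 + x2 * (1.0f / 120
+                     + x2 * (-1.0f / 5040))));            // ~= sin (X)
+          *rc = 1.0f + x2 * (-0.5f + x2 * (1.0f / 24 + x2 * (-1.0f / 720
+                       + x2 * (1.0f / 40320))));          // ~= cos (X)
+        }
+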
+ 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R1 = XOR( RS, SS ); + R2 = XOR( RC, SC ). */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups %xmm12, 176(%rsp) + movups %xmm9, 160(%rsp) + movups __sAbsMask(%rax), %xmm12 + +/* Absolute argument computation */ + movaps %xmm12, %xmm5 + andnps %xmm0, %xmm12 + movups __sInvPI(%rax), %xmm7 + andps %xmm0, %xmm5 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value. */ + mulps %xmm5, %xmm7 + movups %xmm10, 144(%rsp) + movups __sPI1(%rax), %xmm10 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3. */ + movaps %xmm10, %xmm1 + addps __sRShifter(%rax), %xmm7 + +/* e) Treat obtained value as integer S for destination sign setting */ + movaps %xmm7, %xmm9 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + subps __sRShifter(%rax), %xmm7 + mulps %xmm7, %xmm1 + pslld $31, %xmm9 + movups __sPI2(%rax), %xmm6 + movups %xmm13, 112(%rsp) + movaps %xmm5, %xmm13 + movaps %xmm6, %xmm2 + subps %xmm1, %xmm13 + mulps %xmm7, %xmm2 + movups __sSignMask(%rax), %xmm3 + movaps %xmm5, %xmm1 + movups __sOneHalf(%rax), %xmm4 + subps %xmm2, %xmm13 + cmpnleps __sRangeReductionVal(%rax), %xmm5 + movaps %xmm3, %xmm2 + andps %xmm13, %xmm2 + xorps %xmm2, %xmm4 + +/* Result sign calculations */ + xorps %xmm2, %xmm3 + xorps %xmm9, %xmm3 + +/* Add correction term 0.5 for cos() part */ + addps %xmm7, %xmm4 + movmskps %xmm5, %ecx + mulps %xmm4, %xmm10 + mulps %xmm4, %xmm6 + subps %xmm10, %xmm1 + movups __sPI3(%rax), %xmm10 + subps %xmm6, %xmm1 + movaps %xmm10, %xmm6 + mulps %xmm7, %xmm6 + mulps %xmm4, %xmm10 + subps %xmm6, %xmm13 + subps %xmm10, %xmm1 + movups __sPI4(%rax), %xmm6 + mulps %xmm6, %xmm7 + mulps %xmm6, %xmm4 + subps %xmm7, %xmm13 + subps %xmm4, %xmm1 + xorps %xmm9, %xmm13 + xorps %xmm3, %xmm1 + movaps %xmm13, %xmm4 + movaps %xmm1, %xmm2 + mulps %xmm13, %xmm4 + mulps %xmm1, %xmm2 + movups __sA9(%rax), %xmm7 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + movaps %xmm7, %xmm3 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA7(%rax), %xmm3 + addps __sA7(%rax), %xmm7 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA5(%rax), %xmm3 + addps __sA5(%rax), %xmm7 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA3(%rax), %xmm3 + addps __sA3(%rax), %xmm7 + mulps %xmm3, %xmm4 + mulps %xmm7, %xmm2 + mulps %xmm13, %xmm4 + mulps %xmm1, %xmm2 + addps %xmm4, %xmm13 + addps %xmm2, %xmm1 + xorps %xmm12, %xmm13 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 160(%rsp), %xmm9 + movaps %xmm13, (%rdi) + movups 144(%rsp), %xmm10 + movups 176(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups %xmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 128(%rsp) + movups %xmm13, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 48(%rsp) + movups %xmm11, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 64(%rsp) + movq %r12, 104(%rsp) + cfi_offset_rel_rsp (12, 104) + movb 
%dl, %r12b + movq %r13, 96(%rsp) + cfi_offset_rel_rsp (13, 96) + movl %eax, %r13d + movq %r14, 88(%rsp) + cfi_offset_rel_rsp (14, 88) + movl %ecx, %r14d + movq %r15, 80(%rsp) + cfi_offset_rel_rsp (15, 80) + movq %rbx, 72(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 48(%rsp), %xmm8 + movq %rbx, %rdi + movups 32(%rsp), %xmm11 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 64(%rsp), %rsi + movq 104(%rsp), %r12 + cfi_restore (%r12) + movq 96(%rsp), %r13 + cfi_restore (%r13) + movq 88(%rsp), %r14 + cfi_restore (%r14) + movq 80(%rsp), %r15 + cfi_restore (%r15) + movq 72(%rsp), %rbx + movups 192(%rsp), %xmm13 + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 132(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 196(%rsp,%r15,8) + movss 132(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + movss 128(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 192(%rsp,%r15,8) + movss 128(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4vl4l4_sincosf_sse4) +libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVbN4vvv_sincosf_sse4) +#ifndef __ILP32__ + subq $104, %rsp + .cfi_def_cfa_offset 112 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + movdqu %xmm3, 48(%rsi) + movdqu %xmm4, 64(%rsi) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movq 32(%rsp), %rdx + movq 40(%rsp), %rsi + movq 48(%rsp), %r8 + movq 56(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 64(%rsp), %rax + movq 72(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 80(%rsp), %rdi + movq 88(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movl %r8d, (%r9) + addq $104, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movl 16(%esp), %eax + movss 32(%esp), %xmm0 + movss %xmm0, (%eax) + movl 20(%esp), %eax + movss 36(%esp), %xmm0 + movss %xmm0, (%eax) + movl 24(%esp), %eax + movss 40(%esp), %xmm0 + movss %xmm0, (%eax) + movl 28(%esp), %eax + movss 44(%esp), %xmm0 + movss %xmm0, (%eax) + movl (%esp), %eax + movss 48(%esp), %xmm0 + movss %xmm0, (%eax) + movl 4(%esp), %eax + movss 52(%esp), %xmm0 + movss %xmm0, (%eax) + movl 8(%esp), %eax + movss 56(%esp), %xmm0 + movss %xmm0, (%eax) + movl 12(%esp), %eax + movss 60(%esp), %xmm0 + movss %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif +END (_ZGVbN4vvv_sincosf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S new file mode 100644 index 0000000000..320fd861a5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+    .text
+ENTRY (_ZGVdN8vvv_sincosf)
+    .type _ZGVdN8vvv_sincosf, @gnu_indirect_function
+    LOAD_RTLD_GLOBAL_RO_RDX
+    leaq _ZGVdN8vvv_sincosf_avx2(%rip), %rax
+    HAS_ARCH_FEATURE (AVX2_Usable)
+    jz 2f
+    ret
+2:  leaq _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax
+    ret
+END (_ZGVdN8vvv_sincosf)
+libmvec_hidden_def (_ZGVdN8vvv_sincosf)
+
+#define _ZGVdN8vvv_sincosf _ZGVdN8vvv_sincosf_sse_wrapper
+#include "../svml_s_sincosf8_core.S"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
new file mode 100644
index 0000000000..9e4e2c71c5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
@@ -0,0 +1,389 @@
+/* Function sincosf vectorized with AVX2.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+    .text
+ENTRY (_ZGVdN8vl4l4_sincosf_avx2)
+/*
+   ALGORITHM DESCRIPTION:
+
+   1) Range reduction to [-Pi/4; +Pi/4] interval
+      a) Grab sign from source argument and save it.
+      b) Remove sign using AND operation
+      c) Getting octant Y by 2/Pi multiplication
+      d) Add "Right Shifter" value
+      e) Treat obtained value as integer S for destination sign setting.
+         SS = ((S-S&1)&2)<<30; For sin part
+         SC = ((S+S&1)&2)<<30; For cos part
+      f) Change destination sign if source sign is negative
+         using XOR operation.
+      g) Subtract "Right Shifter" (0x4B000000) value
+      h) Subtract Y*(PI/2) from X argument, where PI/2 is divided
+         into 4 parts:
+         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+   2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+      a) Calculate X^2 = X * X
+      b) Calculate 2 polynomials for sin and cos:
+         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
+      c) Swap RS & RC if the first bit of the obtained value after
+         Right Shifting is set to 1, using And, Andnot & Or operations.
+   3) Destination sign setting
+      a) Set shifted destination sign using XOR operation:
+         R1 = XOR( RS, SS );
+         R2 = XOR( RC, SC ).
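+
+   A scalar sketch of the 2c) swap and this sign selection (an
+   illustration only: bits_sel and finish_sketch are hypothetical
+   helpers operating on the float bit patterns):
+
+     #include <stdint.h>
+     #include <string.h>
+     static uint32_t
+     bits_sel (uint32_t mask, uint32_t a, uint32_t b)
+     {
+       return (mask & a) | (~mask & b);     // And, Andnot & Or select
+     }
+     static void
+     finish_sketch (long S, uint32_t rs, uint32_t rc,
+                    uint32_t ss, uint32_t sc, float *sinp, float *cosp)
+     {
+       uint32_t swap = (S & 1) ? 0xffffffffu : 0u;  // first bit of S
+       uint32_t r1 = bits_sel (swap, rc, rs) ^ ss;  // R1 = XOR( RS, SS )
+       uint32_t r2 = bits_sel (swap, rs, rc) ^ sc;  // R2 = XOR( RC, SC )
+       memcpy (sinp, &r1, 4);
+       memcpy (cosp, &r2, 4);
+     }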
*/
+
+	pushq	%rbp
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (%rbp, 0)
+	movq	%rsp, %rbp
+	cfi_def_cfa_register (%rbp)
+	andq	$-64, %rsp
+	subq	$448, %rsp
+	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
+	vmovdqa	%ymm0, %ymm5
+	vmovups	%ymm13, 352(%rsp)
+	vmovups	__sAbsMask(%rax), %ymm2
+	vmovups	__sInvPI(%rax), %ymm1
+	vmovups	__sPI1_FMA(%rax), %ymm13
+	vmovups	%ymm15, 288(%rsp)
+
+/* Absolute argument computation */
+	vandps	%ymm2, %ymm5, %ymm4
+
+/* c) Getting octant Y by 2/Pi multiplication
+   d) Add "Right Shifter" value */
+	vfmadd213ps __sRShifter(%rax), %ymm4, %ymm1
+
+/* e) Treat obtained value as integer S for destination sign setting */
+	vpslld	$31, %ymm1, %ymm0
+
+/* g) Subtract "Right Shifter" (0x4B000000) value */
+	vsubps	__sRShifter(%rax), %ymm1, %ymm1
+
+/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided into 3 parts:
+      X = X - Y*PI1 - Y*PI2 - Y*PI3 */
+	vmovdqa	%ymm4, %ymm7
+	vfnmadd231ps %ymm1, %ymm13, %ymm7
+	vfnmadd231ps __sPI2_FMA(%rax), %ymm1, %ymm7
+	vandps	__sSignMask(%rax), %ymm7, %ymm15
+	vxorps	__sOneHalf(%rax), %ymm15, %ymm6
+
+/* Add correction term 0.5 for cos() part */
+	vaddps	%ymm6, %ymm1, %ymm6
+	vmovdqa	%ymm4, %ymm3
+	vfnmadd231ps %ymm6, %ymm13, %ymm3
+	vmovups	__sPI3_FMA(%rax), %ymm13
+	vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4
+	vfnmadd231ps __sPI2_FMA(%rax), %ymm6, %ymm3
+	vfnmadd213ps %ymm7, %ymm13, %ymm1
+	vfnmadd213ps %ymm3, %ymm13, %ymm6
+
+/* Result sign calculations */
+	vxorps	__sSignMask(%rax), %ymm15, %ymm3
+	vxorps	%ymm0, %ymm3, %ymm7
+	vxorps	%ymm7, %ymm6, %ymm3
+	vxorps	%ymm0, %ymm1, %ymm15
+	vandnps	%ymm5, %ymm2, %ymm6
+	vmovups	__sA7_FMA(%rax), %ymm2
+	vmulps	%ymm15, %ymm15, %ymm13
+	vmovups	__sA9_FMA(%rax), %ymm7
+	vmulps	%ymm3, %ymm3, %ymm1
+
+/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+      a) Calculate X^2 = X * X
+      b) Calculate 2 polynomials for sin and cos:
+         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
+	vmovdqa	%ymm2, %ymm0
+	vfmadd231ps __sA9_FMA(%rax), %ymm13, %ymm0
+	vfmadd213ps %ymm2, %ymm1, %ymm7
+	vfmadd213ps __sA5_FMA(%rax), %ymm13, %ymm0
+	vfmadd213ps __sA5_FMA(%rax), %ymm1, %ymm7
+	vfmadd213ps __sA3(%rax), %ymm13, %ymm0
+	vfmadd213ps __sA3(%rax), %ymm1, %ymm7
+	vmulps	%ymm13, %ymm0, %ymm13
+	vmulps	%ymm1, %ymm7, %ymm1
+	vfmadd213ps %ymm15, %ymm15, %ymm13
+	vfmadd213ps %ymm3, %ymm3, %ymm1
+	vmovmskps %ymm4, %ecx
+	vxorps	%ymm6, %ymm13, %ymm0
+	testl	%ecx, %ecx
+	jne	.LBL_1_3
+
+.LBL_1_2:
+	cfi_remember_state
+	vmovups	352(%rsp), %ymm13
+	vmovups	288(%rsp), %ymm15
+	vmovups	%ymm0, (%rdi)
+	vmovups	%ymm1, (%rsi)
+	movq	%rbp, %rsp
+	cfi_def_cfa_register (%rsp)
+	popq	%rbp
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (%rbp)
+	ret
+
+.LBL_1_3:
+	cfi_restore_state
+	vmovups	%ymm5, 256(%rsp)
+	vmovups	%ymm0, 320(%rsp)
+	vmovups	%ymm1, 384(%rsp)
+	je	.LBL_1_2
+
+	xorb	%dl, %dl
+	xorl	%eax, %eax
+	vmovups	%ymm8, 160(%rsp)
+	vmovups	%ymm9, 128(%rsp)
+	vmovups	%ymm10, 96(%rsp)
+	vmovups	%ymm11, 64(%rsp)
+	vmovups	%ymm12, 32(%rsp)
+	vmovups	%ymm14, (%rsp)
+	movq	%rsi, 192(%rsp)
+	movq	%r12, 232(%rsp)
+	cfi_offset_rel_rsp (12, 232)
+	movb	%dl, %r12b
+	movq	%r13, 224(%rsp)
+	cfi_offset_rel_rsp (13, 224)
+	movl	%eax, %r13d
+	movq	%r14, 216(%rsp)
+	cfi_offset_rel_rsp (14, 216)
+	movl	%ecx, %r14d
+	movq	%r15, 208(%rsp)
+	cfi_offset_rel_rsp (15, 208)
+	movq	%rbx, 200(%rsp)
+	movq	%rdi, %rbx
+	cfi_remember_state
+
+.LBL_1_6:
+	btl	%r13d, %r14d
+	jc	.LBL_1_13
+
+.LBL_1_7:
+	lea	1(%r13), %esi
+	btl	%esi, %r14d
+	jc	.LBL_1_10
+
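The .LBL_1_6/.LBL_1_7 loop that starts here (its handlers follow below) scans the mask produced by vmovmskps and reroutes any special-value lane to the scalar routines. A C model of that fallback, with the lane count and names being assumptions of the sketch:

    #include <math.h>
    #include <stdint.h>

    /* Sketch of the scalar tail: every lane whose bit survived the range
       check is recomputed with scalar sinf/cosf, as the code below does
       via JUMPTARGET(sinf)/JUMPTARGET(cosf).  */
    static void
    sincosf8_fallback (uint32_t mask, const float x[8], float s[8], float c[8])
    {
      for (int lane = 0; lane < 8; lane++)
        if (mask & (1u << lane))
          {
            s[lane] = sinf (x[lane]);
            c[lane] = cosf (x[lane]);
          }
    }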
+.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 160(%rsp), %ymm8 + movq %rbx, %rdi + vmovups 128(%rsp), %ymm9 + vmovups 96(%rsp), %ymm10 + vmovups 64(%rsp), %ymm11 + vmovups 32(%rsp), %ymm12 + vmovups (%rsp), %ymm14 + vmovups 320(%rsp), %ymm0 + vmovups 384(%rsp), %ymm1 + movq 192(%rsp), %rsi + movq 232(%rsp), %r12 + cfi_restore (%r12) + movq 224(%rsp), %r13 + cfi_restore (%r13) + movq 216(%rsp), %r14 + cfi_restore (%r14) + movq 208(%rsp), %r15 + cfi_restore (%r15) + movq 200(%rsp), %rbx + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 260(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 324(%rsp,%r15,8) + vmovss 260(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + vmovss 256(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 320(%rsp,%r15,8) + vmovss 256(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVdN8vl4l4_sincosf_avx2) +libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVdN8vvv_sincosf_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $192, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $136, %esp + vmovdqa %ymm1, -112(%ebp) + vmovdqa %ymm2, -144(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + vmovdqa -112(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -76(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -104(%ebp), %rax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -68(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -96(%ebp), %rax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -60(%ebp), %xmm0 + shrq $32, %rax + 
vmovss %xmm0, (%eax) + movq -88(%ebp), %rax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -52(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + vmovdqa -144(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -48(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -44(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -136(%ebp), %rax + vmovss -40(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -36(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -128(%ebp), %rax + vmovss -32(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -28(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -120(%ebp), %rax + vmovss -24(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -20(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + addl $136, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +END (_ZGVdN8vvv_sincosf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S new file mode 100644 index 0000000000..2c18dbce53 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sinf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_sinf) + .type _ZGVeN16v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_sinf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_sinf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_sinf) + +#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper +#include "../svml_s_sinf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S new file mode 100644 index 0000000000..8670673a29 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S @@ -0,0 +1,479 @@ +/* Function sinf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVeN16v_sinf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* Check for large and special values */ + movl $-1, %edx + vmovups __sAbsMask(%rax), %zmm4 + vmovups __sInvPI(%rax), %zmm1 + +/* b) Remove sign using AND operation */ + vpandd %zmm4, %zmm0, %zmm12 + vmovups __sPI1_FMA(%rax), %zmm2 + vmovups __sA9(%rax), %zmm7 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + vpandnd %zmm0, %zmm4, %zmm11 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovaps %zmm12, %zmm3 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1 + vcmpps $22, __sRangeReductionVal(%rax), %zmm12, %k1 + vpbroadcastd %edx, %zmm13{%k1}{z} + +/* g) Subtract "Right Shifter" value */ + vsubps __sRShifter(%rax), %zmm1, %zmm5 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %zmm1, %zmm6 + vptestmd %zmm13, %zmm13, %k0 + vfnmadd231ps %zmm5, %zmm2, %zmm3 + kmovw %k0, %ecx + vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3 + vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
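The "Right Shifter" addition in steps c)-d) is the usual trick of adding a large power of two so that the wanted integer lands in the low mantissa bits of the float. A scalar C sketch; the constant follows the 0x4B000000 (0x1p23) cited in the sincosf header earlier, and the exact value of __sRShifter is an assumption here:

    #include <stdint.h>
    #include <string.h>

    /* Sketch of steps c)-e): y = |x|*(1/Pi) + RShifter leaves the nearest
       integer in the low mantissa bits of y; bit 0, shifted to bit 31
       (vpslld $31 in the code), becomes the destination sign word.  */
    static inline int32_t
    octant_bits (float ax, float invpi)
    {
      const float rshifter = 0x1p23f;   /* assumed: bits 0x4B000000 */
      float y = ax * invpi + rshifter;
      int32_t s;
      memcpy (&s, &y, sizeof s);        /* reinterpret the float bits */
      return s;                         /* low bits hold the octant */
    }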
+ */ + vmulps %zmm5, %zmm5, %zmm8 + vpxord %zmm6, %zmm5, %zmm9 + vfmadd213ps __sA7(%rax), %zmm8, %zmm7 + vfmadd213ps __sA5(%rax), %zmm8, %zmm7 + vfmadd213ps __sA3(%rax), %zmm8, %zmm7 + vmulps %zmm8, %zmm7, %zmm10 + vfmadd213ps %zmm9, %zmm9, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vpxord %zmm11, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(sinf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(sinf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END(_ZGVeN16v_sinf_knl) + +ENTRY (_ZGVeN16v_sinf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. 
+ g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* Check for large and special values */ + vmovups .L_2il0floatpacket.11(%rip), %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 + vmovups __sPI1_FMA(%rax), %zmm3 + vmovups __sA9(%rax), %zmm8 + +/* b) Remove sign using AND operation */ + vandps %zmm5, %zmm0, %zmm13 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + vandnps %zmm0, %zmm5, %zmm12 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps %zmm2, %zmm13, %zmm1 + vcmpps $18, __sRangeReductionVal(%rax), %zmm13, %k1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %zmm1, %zmm7 + +/* g) Subtract "Right Shifter" value */ + vsubps %zmm2, %zmm1, %zmm6 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovaps %zmm13, %zmm4 + vfnmadd231ps %zmm6, %zmm3, %zmm4 + vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4 + vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
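The polynomial block that follows evaluates this approximation in Horner form, one vfmadd213ps per coefficient. A scalar sketch with symbolic coefficients standing in for __sA3..__sA9:

    /* Sketch of the sin polynomial: R = t + t*y*(A3 + y*(A5 + y*(A7 + y*A9)))
       with y = t*t; each "* y +" step matches one vfmadd213ps below.  */
    static inline float
    sin_poly (float t, float a3, float a5, float a7, float a9)
    {
      float y = t * t;              /* vmulps */
      float p = a9;
      p = p * y + a7;               /* vfmadd213ps __sA7(%rax) */
      p = p * y + a5;               /* vfmadd213ps __sA5(%rax) */
      p = p * y + a3;               /* vfmadd213ps __sA3(%rax) */
      return t + t * (y * p);       /* vmulps, then the final vfmadd213ps */
    }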
+ */ + vmulps %zmm6, %zmm6, %zmm9 + vxorps %zmm7, %zmm6, %zmm10 + vfmadd213ps __sA7(%rax), %zmm9, %zmm8 + vfmadd213ps __sA5(%rax), %zmm9, %zmm8 + vfmadd213ps __sA3(%rax), %zmm9, %zmm8 + vmulps %zmm9, %zmm8, %zmm11 + vfmadd213ps %zmm10, %zmm10, %zmm11 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %zmm12, %zmm11, %zmm1 + vpandnd %zmm13, %zmm13, %zmm14{%k1} + vptestmd %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16v_sinf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.11: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.11,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S new file mode 100644 index 0000000000..3556473899 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sinf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_sinf) + .type _ZGVbN4v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_sinf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_sinf_sse2(%rip), %rax + ret +END (_ZGVbN4v_sinf) +libmvec_hidden_def (_ZGVbN4v_sinf) + +#define _ZGVbN4v_sinf _ZGVbN4v_sinf_sse2 +#include "../svml_s_sinf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S new file mode 100644 index 0000000000..c690150964 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S @@ -0,0 +1,224 @@ +/* Function sinf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY(_ZGVbN4v_sinf_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
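Step h) of this description subtracts Y*Pi in four pieces, Cody-Waite style, so each partial product stays close to exact and the reduced argument keeps its accuracy; the SSE4 body below does exactly that with __sPI1..__sPI4. A scalar sketch with symbolic constants:

    /* Sketch of step h): t = |x| - y*PI1 - y*PI2 - y*PI3 - y*PI4, where
       PI1..PI4 sum to Pi, largest piece first.  */
    static inline float
    reduce_cody_waite (float ax, float y,
                       float pi1, float pi2, float pi3, float pi4)
    {
      float t = ax;
      t -= y * pi1;
      t -= y * pi2;
      t -= y * pi3;
      t -= y * pi4;
      return t;
    }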
+ 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups __sAbsMask(%rax), %xmm2 + +/* b) Remove sign using AND operation */ + movaps %xmm2, %xmm4 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + andnps %xmm5, %xmm2 + movups __sInvPI(%rax), %xmm1 + andps %xmm5, %xmm4 + +/* c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value */ + mulps %xmm4, %xmm1 + +/* h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4 */ + movaps %xmm4, %xmm0 + +/* Check for large and special values */ + cmpnleps __sRangeReductionVal(%rax), %xmm4 + movups __sRShifter(%rax), %xmm6 + movups __sPI1(%rax), %xmm7 + addps %xmm6, %xmm1 + movmskps %xmm4, %ecx + +/* e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position */ + movaps %xmm1, %xmm3 + +/* g) Subtract "Right Shifter" value */ + subps %xmm6, %xmm1 + mulps %xmm1, %xmm7 + pslld $31, %xmm3 + movups __sPI2(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm1, %xmm6 + movups __sPI3(%rax), %xmm7 + subps %xmm6, %xmm0 + mulps %xmm1, %xmm7 + movups __sPI4(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm6, %xmm1 + subps %xmm1, %xmm0 + +/* 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... */ + movaps %xmm0, %xmm1 + mulps %xmm0, %xmm1 + xorps %xmm3, %xmm0 + movups __sA9(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA7(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA5(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA3(%rax), %xmm3 + mulps %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm0 + +/* 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); */ + xorps %xmm2, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore 
(%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVbN4v_sinf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S new file mode 100644 index 0000000000..674e88bd55 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sinf, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_sinf) + .type _ZGVdN8v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +1: leaq _ZGVdN8v_sinf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_sinf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_sinf) +libmvec_hidden_def (_ZGVdN8v_sinf) + +#define _ZGVdN8v_sinf _ZGVdN8v_sinf_sse_wrapper +#include "../svml_s_sinf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S new file mode 100644 index 0000000000..d34870fa3a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S @@ -0,0 +1,219 @@ +/* Function sinf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY(_ZGVdN8v_sinf_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm5 + vmovups __sAbsMask(%rax), %ymm3 + vmovups __sInvPI(%rax), %ymm7 + vmovups __sRShifter(%rax), %ymm0 + vmovups __sPI1_FMA(%rax), %ymm1 + +/* b) Remove sign using AND operation */ + vandps %ymm3, %ymm5, %ymm4 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps %ymm0, %ymm4, %ymm7 + +/* g) Subtract "Right Shifter" value */ + vsubps %ymm0, %ymm7, %ymm2 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %ymm7, %ymm6 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovdqa %ymm4, %ymm0 + vfnmadd231ps %ymm2, %ymm1, %ymm0 + +/* Check for large and special values */ + vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4 + vfnmadd231ps __sPI2_FMA(%rax), %ymm2, %ymm0 + vfnmadd132ps __sPI3_FMA(%rax), %ymm0, %ymm2 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + */ + vmulps %ymm2, %ymm2, %ymm1 + +/* + f) Change destination sign if source sign is negative + using XOR operation. 
+ */ + vandnps %ymm5, %ymm3, %ymm0 + vxorps %ymm6, %ymm2, %ymm3 + vmovups __sA9(%rax), %ymm2 + vfmadd213ps __sA7(%rax), %ymm1, %ymm2 + vfmadd213ps __sA5(%rax), %ymm1, %ymm2 + vfmadd213ps __sA3(%rax), %ymm1, %ymm2 + vmulps %ymm1, %ymm2, %ymm6 + vfmadd213ps %ymm3, %ymm3, %ymm6 + vmovmskps %ymm4, %ecx + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %ymm0, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm5, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_sinf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/printf_fphex.c b/REORG.TODO/sysdeps/x86_64/fpu/printf_fphex.c new file mode 100644 index 0000000000..fd68eaeebf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/printf_fphex.c @@ -0,0 +1,93 @@ +/* Print floating point number in hexadecimal notation according to ISO C99. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#ifndef LONG_DOUBLE_DENORM_BIAS
+# define LONG_DOUBLE_DENORM_BIAS (IEEE854_LONG_DOUBLE_BIAS - 1)
+#endif
+
+#define PRINT_FPHEX_LONG_DOUBLE \
+do { \
+      /* The "strange" 80 bit format on ix86 and m68k has an explicit \
+	 leading digit in the 64 bit mantissa.  */ \
+      unsigned long long int num; \
+      union ieee854_long_double u; \
+      u.d = fpnum.ldbl; \
+ \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32 \
+	     | u.ieee.mantissa1); \
+ \
+      zero_mantissa = num == 0; \
+ \
+      if (sizeof (unsigned long int) > 6) \
+	{ \
+	  numstr = _itoa_word (num, numbuf + sizeof numbuf, 16, \
+			       info->spec == 'A'); \
+	  wnumstr = _itowa_word (num, \
+				 wnumbuf + sizeof (wnumbuf) / sizeof (wchar_t), \
+				 16, info->spec == 'A'); \
+	} \
+      else \
+	{ \
+	  numstr = _itoa (num, numbuf + sizeof numbuf, 16, info->spec == 'A'); \
+	  wnumstr = _itowa (num, \
+			    wnumbuf + sizeof (wnumbuf) / sizeof (wchar_t), \
+			    16, info->spec == 'A'); \
+	} \
+ \
+      /* Fill with zeroes.  */ \
+      while (numstr > numbuf + (sizeof numbuf - 64 / 4)) \
+	{ \
+	  *--numstr = '0'; \
+	  *--wnumstr = L'0'; \
+	} \
+ \
+      /* We use a full nibble for the leading digit.  */ \
+      leading = *numstr++; \
+      wnumstr++; \
+ \
+      /* We have 3 bits from the mantissa in the leading nibble. \
+	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */ \
+      exponent = u.ieee.exponent; \
+ \
+      if (exponent == 0) \
+	{ \
+	  if (zero_mantissa) \
+	    expnegative = 0; \
+	  else \
+	    { \
+	      /* This is a denormalized number.  */ \
+	      expnegative = 1; \
+	      /* This is a hook for the m68k long double format, where the \
+		 exponent bias is the same for normalized and denormalized \
+		 numbers.  */ \
+	      exponent = LONG_DOUBLE_DENORM_BIAS + 3; \
+	    } \
+	} \
+      else if (exponent >= IEEE854_LONG_DOUBLE_BIAS + 3) \
+	{ \
+	  expnegative = 0; \
+	  exponent -= IEEE854_LONG_DOUBLE_BIAS + 3; \
+	} \
+      else \
+	{ \
+	  expnegative = 1; \
+	  exponent = -(exponent - (IEEE854_LONG_DOUBLE_BIAS + 3)); \
+	} \
+} while (0)
+
+#include <stdio-common/printf_fphex.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_atanl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_atanl.c
new file mode 100644
index 0000000000..fd4a455b55
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_atanl.c
@@ -0,0 +1 @@
+#include "sysdeps/i386/fpu/s_atanl.c"
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_ceill.S b/REORG.TODO/sysdeps/x86_64/fpu/s_ceill.S
new file mode 100644
index 0000000000..9d8b79dbee
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_ceill.S
@@ -0,0 +1,36 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Changes for x86-64 by Andreas Jaeger <aj@suse.de>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+
+ENTRY(__ceill)
+	fldt	8(%rsp)
+
+	fnstenv	-28(%rsp)		/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x0800,%edx		/* round towards +oo */
+	orl	-28(%rsp),%edx
+	andl	$0xfbff,%edx
+	movl	%edx,-32(%rsp)
+	fldcw	-32(%rsp)		/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.
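__ceill forces the x87 rounding-control field to round-toward-+Inf, rounds with frndint, then puts the saved environment back; the instructions just below re-raise the "invalid" flag that restoring the environment would otherwise drop. A portable C sketch of the same idea (fesetround does not clobber exception flags, so the sketch needs no such fixup):

    #include <fenv.h>
    #include <math.h>

    /* Portable analog of the control-word dance above: round to an
       integer under a forced rounding mode, then restore the caller's.  */
    long double
    ceill_sketch (long double x)
    {
      int saved = fegetround ();
      fesetround (FE_UPWARD);          /* the movl/orl/andl/fldcw sequence */
      long double r = rintl (x);       /* plays the role of frndint */
      fesetround (saved);
      return r;
    }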
*/
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, -24(%rsp)
+
+	fldenv	-28(%rsp)		/* restore original environment */
+
+	ret
+END (__ceill)
+weak_alias (__ceill, ceill)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_copysign.S b/REORG.TODO/sysdeps/x86_64/fpu/s_copysign.S
new file mode 100644
index 0000000000..8939dffd99
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_copysign.S
@@ -0,0 +1,50 @@
+/* copy sign, double version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <machine/asm.h>
+
+	.section .rodata.cst16,"aM",@progbits,16
+
+	.align ALIGNARG(4)
+	.type signmask,@object
+signmask:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
+	.byte 0, 0, 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(signmask)
+	.type othermask,@object
+othermask:
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f
+	.byte 0, 0, 0, 0, 0, 0, 0, 0
+	ASM_SIZE_DIRECTIVE(othermask)
+
+#ifdef PIC
+#define MO(op) op##(%rip)
+#else
+#define MO(op) op
+#endif
+
+	.text
+ENTRY(__copysign)
+	andpd MO(othermask),%xmm0
+	andpd MO(signmask),%xmm1
+	orpd %xmm1,%xmm0
+	ret
+END (__copysign)
+
+weak_alias (__copysign, copysign)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_copysignf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignf.S
new file mode 100644
index 0000000000..213c2d3c2c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignf.S
@@ -0,0 +1,45 @@
+/* copy sign, float version.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.
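The three instructions in __copysign above are pure bit surgery: othermask keeps everything but the sign bit of x, signmask isolates the sign bit of y, and the OR merges them. The same operation in portable C:

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the andpd/andpd/orpd sequence for doubles.  */
    static inline double
    copysign_sketch (double x, double y)
    {
      uint64_t bx, by;
      memcpy (&bx, &x, sizeof bx);
      memcpy (&by, &y, sizeof by);
      bx = (bx & UINT64_C (0x7fffffffffffffff))     /* othermask */
         | (by & UINT64_C (0x8000000000000000));    /* signmask */
      memcpy (&x, &bx, sizeof bx);
      return x;
    }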
*/ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type mask,@object +mask: + .byte 0xff, 0xff, 0xff, 0x7f + ASM_SIZE_DIRECTIVE(mask) + +#ifdef PIC +#define MO(op) op##(%rip) +#else +#define MO(op) op +#endif + + .text +ENTRY(__copysignf) + movss MO(mask),%xmm3 + andps %xmm3,%xmm0 + andnps %xmm1,%xmm3 + orps %xmm3,%xmm0 + retq +END (__copysignf) + +weak_alias (__copysignf, copysignf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_copysignl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignl.S new file mode 100644 index 0000000000..2ffd612d65 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_copysignl.S @@ -0,0 +1,22 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com>. + * Adopted for x86-64 by Andreas Jaeger <aj@suse.de>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +ENTRY(__copysignl) + movl 32(%rsp),%edx + movl 16(%rsp),%eax + andl $0x8000,%edx + andl $0x7fff,%eax + orl %edx,%eax + movl %eax,16(%rsp) + fldt 8(%rsp) + ret +END (__copysignl) +weak_alias (__copysignl, copysignl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_cosf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_cosf.S new file mode 100644 index 0000000000..e9fdc7e56e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_cosf.S @@ -0,0 +1,533 @@ +/* Optimized cosf function. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define __need_Emath +#include <bits/errno.h> + +/* Short algorithm description: + * + * 1) if |x| == 0: return 1.0-|x|. + * 2) if |x| < 2^-27: return 1.0-|x|. + * 3) if |x| < 2^-5 : return 1.0+x^2*DP_COS2_0+x^5*DP_COS2_1. + * 4) if |x| < Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). + * 5) if |x| < 9*Pi/4: + * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3, + * t=|x|-j*Pi/4. + * 5.2) Reconstruction: + * s = (-1.0)^((n>>2)&1) + * if(n&2 != 0) { + * using cos(t) polynomial for |t|<Pi/4, result is + * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). + * } else { + * using sin(t) polynomial for |t|<Pi/4, result is + * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). + * } + * 6) if |x| < 2^23, large args: + * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, + * t=|x|-j*Pi/4. + * 6.2) Reconstruction same as (5.2). + * 7) if |x| >= 2^23, very large args: + * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, + * t=|x|-j*Pi/4. + * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x. + * + * Special cases: + * cos(+-0) = 1 not raising inexact, + * cos(subnormal) raises inexact, + * cos(min_normalized) raises inexact, + * cos(normalized) raises inexact, + * cos(Inf) = NaN, raises invalid, sets errno to EDOM, + * cos(NaN) = NaN. 
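Steps 5.1)-5.2) above reduce |x| to an octant count k and a remainder t, then pick the sin or cos polynomial and the sign from n = k+3. A runnable C model of that reconstruction, with the library sin/cos standing in for the |t|<Pi/4 polynomials (double intermediates, as in the assembly; M_PI_4 from math.h):

    #include <math.h>

    /* Model of steps 5.1)-5.2) for Pi/4 <= |x| < 9*Pi/4.  */
    static float
    cosf_reconstruct (float x)
    {
      double ax = fabs ((double) x);
      int k = (int) (ax / M_PI_4);                  /* trunc(|x|/(Pi/4)) */
      int j = (k + 1) & 0x0e;
      int n = k + 3;
      double t = ax - j * M_PI_4;                   /* |t| <= Pi/4 */
      double sign = ((n >> 2) & 1) ? -1.0 : 1.0;    /* (-1.0)^((n>>2)&1) */
      return (float) (sign * ((n & 2) ? cos (t) : sin (t)));
    }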
+ */ + + .text +ENTRY(__cosf) + /* Input: single precision x in %xmm0 */ + + movd %xmm0, %eax /* Bits of x */ + movaps %xmm0, %xmm7 /* Copy of x */ + cvtss2sd %xmm0, %xmm0 /* DP x */ + movss L(SP_ABS_MASK)(%rip), %xmm3 + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + andps %xmm7, %xmm3 /* SP |x| */ + andpd L(DP_ABS_MASK)(%rip), %xmm0 /* DP |x| */ + movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + lea L(PIO4J)(%rip), %rsi + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + addl $2, %eax /* n */ + subsd (%rsi,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t */ + testl $2, %eax /* n&2 != 0? */ + jz L(sin_poly) + +/*L(cos_poly):*/ + /* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) + */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_C4)(%rip), %xmm4 /* C4 */ + mulsd %xmm0, %xmm4 /* z*C4 */ + movsd L(DP_C3)(%rip), %xmm3 /* C3 */ + mulsd %xmm0, %xmm3 /* z*C3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_C2)(%rip), %xmm4 /* C2+z*C4 */ + mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ + addsd L(DP_C1)(%rip), %xmm3 /* C1+z*C3 */ + mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ + addsd L(DP_C0)(%rip), %xmm4 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ + + addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd L(DP_ONES)(%rip), %xmm3 + + mulsd (%rsi,%rax,8), %xmm3 /* DP result */ + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(sin_poly): + /* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) + */ + + movaps %xmm0, %xmm4 /* t */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_S4)(%rip), %xmm2 /* S4 */ + mulsd %xmm0, %xmm2 /* z*S4 */ + movsd L(DP_S3)(%rip), %xmm3 /* S3 */ + mulsd %xmm0, %xmm3 /* z*S3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_S2)(%rip), %xmm2 /* S2+z*S4 */ + mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ + addsd L(DP_S1)(%rip), %xmm3 /* S1+z*S3 */ + mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ + addsd L(DP_S0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ + mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ + mulsd (%rsi,%rax,8), %xmm4 + /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm2, %xmm3 + /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + mulsd %xmm4, %xmm3 + /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm4, %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23? 
*/
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1		/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax		/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax		/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4		/* DP j */
+	movsd	L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+	mulsd	%xmm4, %xmm2		/* -j*PIO4HI */
+	movsd	L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0		/* |x| - j*PIO4HI */
+	addl	$2, %eax		/* n */
+	mulsd	%xmm3, %xmm4		/* j*PIO4LO */
+	addsd	%xmm4, %xmm0		/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align 4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax		/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax
+	movl	$28, %ecx		/* %cl=28 */
+	movl	%eax, %edx		/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl			/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3		/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx		/* j*28 */
+	lea	L(_FPI)(%rip), %rsi
+	movsd	L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */
+	movapd	%xmm0, %xmm5		/* |x| */
+	mulsd	-16(%rsi,%rax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1		/* |x| */
+	mulsd	-8(%rsi,%rax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	(%rsi,%rax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx		/* j*28+19 */
+	mulsd	8(%rsi,%rax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx		/* bitpos>=j*28+19? */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4		/* HI(tmp3) */
+	subsd	%xmm4, %xmm3		/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	L(DP_2POW52)(%rip), %xmm6
+	movapd	%xmm5, %xmm2		/* tmp2 copy */
+	addsd	%xmm3, %xmm5		/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6		/* tmp6 = tmp5 + 2^52 */
+	movsd	8+L(DP_2POW52)(%rip), %xmm4
+	movd	%xmm6, %eax		/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4		/* tmp4 = tmp6 - 2^52 */
+	comisd	%xmm5, %xmm4		/* tmp4 > tmp5? */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax		/* k-- */
+	addsd	8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx		/* k&1 */
+	lea	L(DP_ZERONE)(%rip), %rsi
+	subsd	%xmm4, %xmm3		/* tmp3 -= tmp4 */
+	addsd	(%rsi,%rdx,8), %xmm3	/* t = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3		/* t += tmp2 */
+	addsd	%xmm3, %xmm0		/* t += tmp0 */
+	addl	$3, %eax		/* n=k+3 */
+	addsd	%xmm1, %xmm0		/* t += tmp1 */
+	mulsd	L(DP_PIO4)(%rip), %xmm0	/* t *= PIO4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align 4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?
*/ + jl L(arg_less_2pn5) + + /* Here if 2^-5<=|x|<Pi/4 */ + mulsd %xmm0, %xmm0 /* y=x^2 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=x^4 */ + movsd L(DP_C4)(%rip), %xmm3 /* C4 */ + mulsd %xmm0, %xmm3 /* z*C4 */ + movsd L(DP_C3)(%rip), %xmm5 /* C3 */ + mulsd %xmm0, %xmm5 /* z*C3 */ + addsd L(DP_C2)(%rip), %xmm3 /* C2+z*C4 */ + mulsd %xmm0, %xmm3 /* z*(C2+z*C4) */ + addsd L(DP_C1)(%rip), %xmm5 /* C1+z*C3 */ + mulsd %xmm0, %xmm5 /* z*(C1+z*C3) */ + addsd L(DP_C0)(%rip), %xmm3 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm3 /* y*(C0+z*(C2+z*C4)) */ + /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd %xmm5, %xmm3 + /* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd L(DP_ONES)(%rip), %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(arg_less_2pn5): + /* Here if |x|<2^-5 */ + cmpl $0x32000000, %eax /* |x|<2^-27? */ + jl L(arg_less_2pn27) + + /* Here if 2^-27<=|x|<2^-5 */ + mulsd %xmm0, %xmm0 /* DP x^2 */ + movsd L(DP_COS2_1)(%rip), %xmm3 /* DP DP_COS2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_1 */ + addsd L(DP_COS2_0)(%rip), %xmm3 /* DP DP_COS2_0+x^2*DP_COS2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */ + /* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */ + addsd L(DP_ONES)(%rip), %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(arg_less_2pn27): + /* Here if |x|<2^-27 */ + andps L(SP_ABS_MASK)(%rip),%xmm7 /* |x| */ + movss L(SP_ONE)(%rip), %xmm0 /* 1.0 */ + subss %xmm7, %xmm0 /* result is 1.0-|x| */ + ret + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(skip_errno_setting) /* in case of x is NaN */ + + /* Align stack to 16 bytes. */ + subq $8, %rsp + cfi_adjust_cfa_offset (8) + /* Here if x is Inf. Set errno to EDOM. */ + call JUMPTARGET(__errno_location) + addq $8, %rsp + cfi_adjust_cfa_offset (-8) + + movl $EDOM, (%rax) + + .p2align 4 +L(skip_errno_setting): + /* Here if |x| is Inf or NAN. Continued. */ + movaps %xmm7, %xmm0 /* load x */ + subss %xmm0, %xmm0 /* Result is NaN */ + ret +END(__cosf) + + .section .rodata, "a" + .p2align 3 +L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ + .long 0x00000000,0x00000000 + .long 0x54442d18,0x3fe921fb + .long 0x54442d18,0x3ff921fb + .long 0x7f3321d2,0x4002d97c + .long 0x54442d18,0x400921fb + .long 0x2955385e,0x400f6a7a + .long 0x7f3321d2,0x4012d97c + .long 0xe9bba775,0x4015fdbb + .long 0x54442d18,0x401921fb + .long 0xbeccb2bb,0x401c463a + .long 0x2955385e,0x401f6a7a + .type L(PIO4J), @object + ASM_SIZE_DIRECTIVE(L(PIO4J)) + + .p2align 3 +L(_FPI): /* 4/Pi broken into sum of positive DP values */ + .long 0x00000000,0x00000000 + .long 0x6c000000,0x3ff45f30 + .long 0x2a000000,0x3e3c9c88 + .long 0xa8000000,0x3c54fe13 + .long 0xd0000000,0x3aaf47d4 + .long 0x6c000000,0x38fbb81b + .long 0xe0000000,0x3714acc9 + .long 0x7c000000,0x3560e410 + .long 0x56000000,0x33bca2c7 + .long 0xac000000,0x31fbd778 + .long 0xe0000000,0x300b7246 + .long 0xe8000000,0x2e5d2126 + .long 0x48000000,0x2c970032 + .long 0xe8000000,0x2ad77504 + .long 0xe0000000,0x290921cf + .long 0xb0000000,0x274deb1c + .long 0xe0000000,0x25829a73 + .long 0xbe000000,0x23fd1046 + .long 0x10000000,0x2224baed + .long 0x8e000000,0x20709d33 + .long 0x80000000,0x1e535a2f + .long 0x64000000,0x1cef904e + .long 0x30000000,0x1b0d6398 + .long 0x24000000,0x1964ce7d + .long 0x16000000,0x17b908bf + .type L(_FPI), @object + ASM_SIZE_DIRECTIVE(L(_FPI)) + +/* Coefficients of polynomial + for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5. 
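In C, that |x|<2^-5 path is a single Horner step over the two coefficients defined just below; the computation runs in double and is rounded back to float at the end, as in the assembly. A sketch with the coefficients passed in for DP_COS2_0/DP_COS2_1:

    /* Sketch of the |x|<2^-5 path: cos(x) ~= 1 + x^2*c0 + x^4*c1.  */
    static inline float
    cos_small (float x, double c0, double c1)
    {
      double x2 = (double) x * x;
      return (float) (1.0 + x2 * (c0 + x2 * c1));
    }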
*/
+	.p2align 3
+L(DP_COS2_0):
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_COS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
+
+	.p2align 3
+L(DP_COS2_1):
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_COS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+   for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+	.p2align 3
+L(DP_S3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.type L(DP_S3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+	.p2align 3
+L(DP_S1):
+	.long	0x10c2688b,0x3f811111
+	.type L(DP_S1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+	.p2align 3
+L(DP_S4):
+	.long	0x1674b58a,0xbe5a947e
+	.type L(DP_S4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+	.p2align 3
+L(DP_S2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.type L(DP_S2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+	.p2align 3
+L(DP_S0):
+	.long	0x55551cd9,0xbfc55555
+	.type L(DP_S0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+/* Coefficients of polynomial
+   for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+	.p2align 3
+L(DP_C3):
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_C3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+	.p2align 3
+L(DP_C1):
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_C1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+	.p2align 3
+L(DP_C4):
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_C4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+	.p2align 3
+L(DP_C2):
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_C2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+	.p2align 3
+L(DP_C0):
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_C0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983	/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000	/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias(__cosf, cosf)
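
The octant logic of __cosf above, restated as a rough C model: reduce with k=trunc(|x|/(Pi/4)), j=(k+1)&~1, n=k+3, t=|x|-j*Pi/4, then reconstruct cos(x) = (-1)^((n>>2)&1) * (n&2 ? cos(t) : sin(t)). This is only a sketch: cosf_model is an illustrative name, libm sin()/cos() stand in for the S0..S4/C0..C4 polynomial kernels, and the plain j*Pi/4 subtraction stands in for the split PIO4HI/PIO4LO and _FPI reductions, so it is faithful only for moderate, finite arguments.

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI_4
    # define M_PI_4 0.78539816339744830962
    #endif

    static float
    cosf_model (float x)
    {
      double ax = fabs ((double) x);
      unsigned int k = (unsigned int) (ax / M_PI_4); /* trunc(|x|/(Pi/4)) */
      unsigned int j = (k + 1) & ~1u;                /* even multiple of Pi/4 */
      unsigned int n = k + 3;                        /* octant selector */
      double t = ax - j * M_PI_4;                    /* reduced arg, |t| <= ~Pi/4 */
      double r = (n & 2) ? cos (t) : sin (t);        /* polynomial stand-ins */
      return (float) (((n >> 2) & 1) ? -r : r);      /* sign from octant */
    }

    int
    main (void)
    {
      for (float x = 0.5f; x < 20.0f; x += 2.7f)
        printf ("%g: model=%g libm=%g\n", x, cosf_model (x), cosf (x));
      return 0;
    }

diff --git 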
a/REORG.TODO/sysdeps/x86_64/fpu/s_expm1l.S b/REORG.TODO/sysdeps/x86_64/fpu/s_expm1l.S new file mode 100644 index 0000000000..7fbd99b0db --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_expm1l.S @@ -0,0 +1,2 @@ +#define USE_AS_EXPM1L +#include <e_expl.S> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fabs.c b/REORG.TODO/sysdeps/x86_64/fpu/s_fabs.c new file mode 100644 index 0000000000..f5d3ee87e9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fabs.c @@ -0,0 +1,26 @@ +/* Absolute value of floating point number. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> + +double +__fabs (double x) +{ + return __builtin_fabs (x); +} +weak_alias (__fabs, fabs) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fabsf.c b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsf.c new file mode 100644 index 0000000000..9956cce757 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsf.c @@ -0,0 +1,26 @@ +/* Absolute value of floating point number. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> + +float +__fabsf (float x) +{ + return __builtin_fabsf (x); +} +weak_alias (__fabsf, fabsf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fabsl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsl.S new file mode 100644 index 0000000000..1aef8318d9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fabsl.S @@ -0,0 +1,27 @@ +/* Absolute value of floating point number. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fabsl)
+	fldt	8(%rsp)
+	fabs
+	ret
+END(__fabsl)
+weak_alias (__fabsl, fabsl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_finitel.S b/REORG.TODO/sysdeps/x86_64/fpu/s_finitel.S
new file mode 100644
index 0000000000..9e49796901
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_finitel.S
@@ -0,0 +1,16 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitel)
+	movl	16(%rsp),%eax	/* sign and exponent word of the long double */
+	orl	$0xffff8000, %eax /* set every bit above the 15 exponent bits */
+	incl	%eax		/* wraps to 0 only if the exponent is 0x7fff */
+	shrl	$31, %eax	/* 1 for finite, 0 for Inf/NaN */
+	ret
+END (__finitel)
+weak_alias (__finitel, finitel)
+hidden_def (__finitel)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_floorl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_floorl.S
new file mode 100644
index 0000000000..535fdd8571
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_floorl.S
@@ -0,0 +1,35 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Changes for x86-64 by Andreas Jaeger <aj@suse.de>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__floorl)
+	fldt	8(%rsp)
+
+	fnstenv	-28(%rsp)		/* store fpu environment */
+
+	/* We use here %edx although only the low 16 bits are defined.
+	   But none of the operations should care and they are faster
+	   than the 16 bit operations.  */
+	movl	$0x400,%edx		/* round towards -oo */
+	orl	-28(%rsp),%edx
+	andl	$0xf7ff,%edx
+	movl	%edx,-32(%rsp)
+	fldcw	-32(%rsp)		/* load modified control word */
+
+	frndint				/* round */
+
+	/* Preserve "invalid" exceptions from sNaN input.  */
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, -24(%rsp)
+
+	fldenv	-28(%rsp)		/* restore original environment */
+
+	ret
+END (__floorl)
+weak_alias (__floorl, floorl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmax.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmax.S
new file mode 100644
index 0000000000..f93c9f9371
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmax.S
@@ -0,0 +1,52 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+   Copyright (C) 2002-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__fmax)
+	ucomisd	%xmm0, %xmm1
+	jp	1f		// jump if unordered
+	maxsd	%xmm1, %xmm0
+	jmp	2f
+
+1:	ucomisd	%xmm1, %xmm1	// Is xmm1 a NaN?
+	jp	3f
+	// xmm0 is a NaN; xmm1 is not.  Test if xmm0 is signaling.
+	movsd	%xmm0, -8(%rsp)
+	testb	$0x8, -2(%rsp)
+	jz	4f
+	movsd	%xmm1, %xmm0	// otherwise return xmm1
+	ret
+
+3:	// xmm1 is a NaN; xmm0 may or may not be.
+	ucomisd	%xmm0, %xmm0
+	jp	4f
+	// xmm1 is a NaN; xmm0 is not.  Test if xmm1 is signaling.
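+	// (The quiet-NaN flag of a double is mantissa bit 51; the byte at
+	// -2(%rsp) holds bits 48-55 of the value stored just below, so the
+	// 0x8 mask selects exactly that bit.)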
+ movsd %xmm1, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addsd %xmm1, %xmm0 + +2: ret +END(__fmax) +weak_alias (__fmax, fmax) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxf.S new file mode 100644 index 0000000000..82989feb4b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxf.S @@ -0,0 +1,52 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmaxf) + ucomiss %xmm0, %xmm1 + jp 1f // jump if unordered + maxss %xmm1, %xmm0 + jmp 2f + +1: ucomiss %xmm1, %xmm1 // Is xmm1 a NaN? + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movss %xmm0, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + movss %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomiss %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movss %xmm1, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addss %xmm1, %xmm0 + +2: ret +END(__fmaxf) +weak_alias (__fmaxf, fmaxf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxl.S new file mode 100644 index 0000000000..2d3321fce4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmaxl.S @@ -0,0 +1,58 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmaxl) + fldt 8(%rsp) // x + fldt 24(%rsp) // x : y + + fucomi %st(1), %st + jp 2f + fcmovb %st(1), %st + + fstp %st(1) + + ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 15(%rsp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. 
+ fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 31(%rsp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret +END(__fmaxl) +weak_alias (__fmaxl, fmaxl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fmin.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fmin.S new file mode 100644 index 0000000000..718bf489df --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fmin.S @@ -0,0 +1,52 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmin) + ucomisd %xmm0, %xmm1 + jp 1f // jump if unordered + minsd %xmm1, %xmm0 + jmp 2f + +1: ucomisd %xmm1, %xmm1 // Is xmm1 a NaN? + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movsd %xmm0, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + movsd %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomisd %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movsd %xmm1, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addsd %xmm1, %xmm0 + +2: ret +END(__fmin) +weak_alias (__fmin, fmin) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fminf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fminf.S new file mode 100644 index 0000000000..8e8c9360ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fminf.S @@ -0,0 +1,52 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminf) + ucomiss %xmm0, %xmm1 + jp 1f // jump if unordered + minss %xmm1, %xmm0 + jmp 2f + +1: ucomiss %xmm1, %xmm1 // Is xmm1 a NaN? + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. 
+ movss %xmm0, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + movss %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomiss %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movss %xmm1, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addss %xmm1, %xmm0 + +2: ret +END(__fminf) +weak_alias (__fminf, fminf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fminl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_fminl.S new file mode 100644 index 0000000000..33eed7b30b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fminl.S @@ -0,0 +1,58 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminl) + fldt 8(%rsp) // x + fldt 24(%rsp) // x : y + + fucomi %st(1), %st + jp 2f + fcmovnb %st(1), %st + + fstp %st(1) + + ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 15(%rsp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. + fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 31(%rsp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret +END(__fminl) +weak_alias (__fminl, fminl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_fpclassifyl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_fpclassifyl.c new file mode 100644 index 0000000000..856854b0f5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_fpclassifyl.c @@ -0,0 +1,2 @@ +#include <sysdeps/i386/fpu/s_fpclassifyl.c> + diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_isinfl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_isinfl.c new file mode 100644 index 0000000000..ca818b5e90 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_isinfl.c @@ -0,0 +1 @@ +#include <sysdeps/i386/fpu/s_isinfl.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_isnanl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_isnanl.c new file mode 100644 index 0000000000..06e69c3aeb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_isnanl.c @@ -0,0 +1 @@ +#include <sysdeps/i386/fpu/s_isnanl.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_llrint.S b/REORG.TODO/sysdeps/x86_64/fpu/s_llrint.S new file mode 100644 index 0000000000..af7bbce585 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_llrint.S @@ -0,0 +1,32 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.d>, 2002. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__llrint) + cvtsd2si %xmm0,%rax + ret +END(__llrint) +weak_alias (__llrint, llrint) +#ifndef __ILP32__ +strong_alias (__llrint, __lrint) +weak_alias (__llrint, lrint) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_llrintf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintf.S new file mode 100644 index 0000000000..9edb78bf1d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintf.S @@ -0,0 +1,32 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.d>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__llrintf) + cvtss2si %xmm0,%rax + ret +END(__llrintf) +weak_alias (__llrintf, llrintf) +#ifndef __ILP32__ +strong_alias (__llrintf, __lrintf) +weak_alias (__llrintf, lrintf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_llrintl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintl.S new file mode 100644 index 0000000000..e5bbf0106e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_llrintl.S @@ -0,0 +1,34 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + .text +ENTRY(__llrintl) + fldt 8(%rsp) + fistpll -8(%rsp) + fwait + movq -8(%rsp),%rax + ret +END(__llrintl) +weak_alias (__llrintl, llrintl) +#ifndef __ILP32__ +strong_alias (__llrintl, __lrintl) +weak_alias (__llrintl, lrintl) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_log1pl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_log1pl.S new file mode 100644 index 0000000000..947e5e4552 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_log1pl.S @@ -0,0 +1,74 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Adapted for x86-64 by Andreas Jaeger <aj@suse.de>. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $") + + .section .rodata + + .align ALIGNARG(4) + /* The fyl2xp1 can only be used for values in + -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2 + 0.29 is a safe value. + */ +limit: .tfloat 0.29 + /* Please note: we use a double value here. Since 1.0 has + an exact representation this does not effect the accuracy + but it helps to optimize the code. */ +one: .double 1.0 + +/* + * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29, + * otherwise fyl2x with the needed extra computation. + */ +#ifdef PIC +#define MO(op) op##(%rip) +#else +#define MO(op) op +#endif + + .text +ENTRY(__log1pl) + fldln2 + + fldt 8(%rsp) + + fxam + fnstsw + fld %st + testb $1, %ah + jnz 3f // in case x is NaN or ±Inf +4: + fabs + fldt MO(limit) + fcompp + fnstsw + andb $1,%ah + jz 2f + + movzwl 8+8(%rsp), %eax + xorb $0x80, %ah + cmpl $0xc040, %eax + jae 5f + + faddl MO(one) +5: fyl2x + ret + +2: fyl2xp1 + ret + +3: testb $4, %ah + jnz 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret + +END (__log1pl) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_logbl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_logbl.c new file mode 100644 index 0000000000..4791ba64e8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_logbl.c @@ -0,0 +1 @@ +#include <sysdeps/i386/fpu/s_logbl.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_lrint.S b/REORG.TODO/sysdeps/x86_64/fpu/s_lrint.S new file mode 100644 index 0000000000..dfc31359a0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_lrint.S @@ -0,0 +1 @@ +/* Not needed, see s_llrint.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_lrintf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintf.S new file mode 100644 index 0000000000..fcdc4dca9a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintf.S @@ -0,0 +1 @@ +/* Not needed, see s_llrintf.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_lrintl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintl.S new file mode 100644 index 0000000000..ef9c45d00d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_lrintl.S @@ -0,0 +1 @@ +/* Not needed, see s_llrintl.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nearbyintl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_nearbyintl.S new file mode 100644 index 0000000000..31b21a5037 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nearbyintl.S @@ -0,0 +1,19 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ +/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. 
*/
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintl)
+	fldt	8(%rsp)
+	fnstenv	-28(%rsp)	/* save FPU environment */
+	frndint			/* round per current rounding mode */
+	fnstsw			/* status word -> %ax */
+	andl	$0x1, %eax	/* keep only the "invalid" flag (sNaN input) */
+	orl	%eax, -24(%rsp)	/* merge it into the saved status word */
+	fldenv	-28(%rsp)	/* reload env: drops the spurious "inexact" */
+	ret
+END (__nearbyintl)
+weak_alias (__nearbyintl, nearbyintl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nextafterl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_nextafterl.c
new file mode 100644
index 0000000000..f59f16848f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nextafterl.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_nextafterl.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nexttoward.c b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttoward.c
new file mode 100644
index 0000000000..aee2bb5895
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttoward.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_nexttoward.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_nexttowardf.c b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttowardf.c
new file mode 100644
index 0000000000..55e95f6916
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_nexttowardf.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_nexttowardf.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_rintl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_rintl.c
new file mode 100644
index 0000000000..1cad42e921
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_rintl.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_rintl.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_scalbnl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_scalbnl.S
new file mode 100644
index 0000000000..6c7683c32b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_scalbnl.S
@@ -0,0 +1,17 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Changes for x86-64 by Andreas Jaeger <aj@suse.de>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__scalbnl)
+	movl	%edi,-4(%rsp)	/* spill the integer argument n */
+	fildl	-4(%rsp)	/* st(0) = (long double) n */
+	fldt	8(%rsp)		/* st(0) = x, st(1) = n */
+	fscale			/* st(0) = x * 2^n */
+	fstp	%st(1)		/* pop n, leaving the result */
+	ret
+END (__scalbnl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_signbit.S b/REORG.TODO/sysdeps/x86_64/fpu/s_signbit.S
new file mode 100644
index 0000000000..a24757cd48
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_signbit.S
@@ -0,0 +1,26 @@
+/* Return nonzero value if number is negative.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2009.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY(__signbit)
+	pmovmskb %xmm0, %eax	/* %eax = MSB of each byte of %xmm0 */
+	andl	$0x80, %eax	/* bit 7 = sign bit (bit 63) of the double */
+	ret
+END(__signbit)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_signbitf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_signbitf.S
new file mode 100644
index 0000000000..7739424bf6
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_signbitf.S
@@ -0,0 +1,26 @@
+/* Return nonzero value if number is negative.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2009.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY(__signbitf)
+	pmovmskb %xmm0, %eax	/* %eax = MSB of each byte of %xmm0 */
+	andl	$0x8, %eax	/* bit 3 = sign bit (bit 31) of the float */
+	ret
+END(__signbitf)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_significandl.c b/REORG.TODO/sysdeps/x86_64/fpu/s_significandl.c
new file mode 100644
index 0000000000..a4ad986164
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_significandl.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/fpu/s_significandl.c>
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_sincosf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_sincosf.S
new file mode 100644
index 0000000000..e6ed81ed91
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_sincosf.S
@@ -0,0 +1,564 @@
+/* Optimized sincosf function.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ *  1) if |x|==0:    sin(x)=x,
+ *                   cos(x)=1.
+ *  2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed,
+ *                   cos(x)=1-|x|.
+ *  3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1,
+ *                   cos(x)=1+1*x^2*DP_COS2_0+x^4*DP_COS2_1
+ *  4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+ *                   cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ *  5) if |x| < 9*Pi/4:
+ *      5.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4.
+ *      5.2) Reconstruction:
+ *          sign_sin = sign(x) * (-1.0)^((n>>2)&1)
+ *          sign_cos = (-1.0)^(((n+2)>>2)&1)
+ *          poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t
+ *          poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*1.0+1.0
+ *          if(n&2 != 0) {
+ *              using cos(t) and sin(t) polynomials for |t|<Pi/4, results are
+ *              cos(x) = poly_sin * sign_cos
+ *              sin(x) = poly_cos * sign_sin
+ *          } else {
+ *              sin(x) = poly_sin * sign_sin
+ *              cos(x) = poly_cos * sign_cos
+ *          }
+ *  6) if |x| < 2^23, large args:
+ *      6.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4
+ *      6.2) Reconstruction same as (5.2).
+ *  7) if |x| >= 2^23, very large args:
+ *      7.1) Range reduction:
+ *          k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x. + * + * Special cases: + * sin/cos(+-0) = +-0/1 not raising inexact/underflow, + * sin/cos(subnormal) raises inexact/underflow, + * sin/cos(min_normalized) raises inexact/underflow, + * sin/cos(normalized) raises inexact, + * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM, + * sin/cos(NaN) = NaN. + */ + +# define ARG_SIN_PTR %rdi +# define ARG_COS_PTR %rsi + + .text +ENTRY(__sincosf) + /* Input: %xmm0 contains single precision argument x */ + /* %rdi points to sin result */ + /* %rsi points to cos result */ + + movd %xmm0, %eax /* Bits of x */ + movaps %xmm0, %xmm7 /* Copy of x */ + cvtss2sd %xmm0, %xmm0 /* DP x */ + movss L(SP_ABS_MASK)(%rip), %xmm3 + movl %eax, %r8d /* Copy of x bits */ + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4 ? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + andps %xmm7, %xmm3 /* SP |x| */ + andpd L(DP_ABS_MASK)(%rip),%xmm0 /* DP |x| */ + movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4 ? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + movl %r8d, %ecx /* Load x */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + lea L(PIO4J)(%rip), %r9 + shrl $29, %ecx /* (sign of x) << 2 */ + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + subsd (%r9,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */ + + movaps %xmm0, %xmm4 /* t */ + movhpd L(DP_ONES)(%rip), %xmm4 /* 1|t */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + movl $2, %edx + unpcklpd %xmm0, %xmm0 /* y|y */ + addl %eax, %edx /* k+2 */ + movaps %xmm0, %xmm1 /* y|y */ + mulpd %xmm0, %xmm0 /* z=t^4|z=t^4 */ + + movaps L(DP_SC4)(%rip), %xmm2 /* S4 */ + mulpd %xmm0, %xmm2 /* z*S4 */ + movaps L(DP_SC3)(%rip), %xmm3 /* S3 */ + mulpd %xmm0, %xmm3 /* z*S3 */ + xorl %eax, %ecx /* (sign_x ^ (k>>2))<<2 */ + addpd L(DP_SC2)(%rip), %xmm2 /* S2+z*S4 */ + mulpd %xmm0, %xmm2 /* z*(S2+z*S4) */ + shrl $2, %edx /* (k+2)>>2 */ + addpd L(DP_SC1)(%rip), %xmm3 /* S1+z*S3 */ + mulpd %xmm0, %xmm3 /* z*(S1+z*S3) */ + shrl $2, %ecx /* sign_x ^ k>>2 */ + addpd L(DP_SC0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ + andl $1, %edx /* sign_cos = ((k+2)>>2)&1 */ + mulpd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + andl $1, %ecx /* sign_sin = sign_x ^ ((k>>2)&1) */ + addpd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + lea L(DP_ONES)(%rip), %r9 + mulpd %xmm4, %xmm3 /*t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/ + testl $2, %eax /* n&2 != 0 ? 
*/ + addpd %xmm4, %xmm3 /*t+t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/ + jnz L(sin_result_sin_poly) + +/*L(sin_result_cos_poly):*/ + /* + * Here if + * cos(x) = poly_sin * sign_cos + * sin(x) = poly_cos * sign_sin + */ + movsd (%r9,%rcx,8), %xmm4 /* 0|sign_sin */ + movhpd (%r9,%rdx,8), %xmm4 /* sign_cos|sign_sin */ + mulpd %xmm4, %xmm3 /* result_cos|result_sin */ + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movss %xmm0, (ARG_SIN_PTR) /* store sin(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */ + movss %xmm0, (ARG_COS_PTR) /* store cos(x) */ + ret + + .p2align 4 +L(sin_result_sin_poly): + /* + * Here if + * sin(x) = poly_sin * sign_sin + * cos(x) = poly_cos * sign_cos + */ + movsd (%r9,%rdx,8), %xmm4 /* 0|sign_cos */ + movhpd (%r9,%rcx,8), %xmm4 /* sign_sin|sign_cos */ + mulpd %xmm4, %xmm3 /* result_sin|result_cos */ + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movss %xmm0, (ARG_COS_PTR) /* store cos(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move sin(x) to xmm0[0] */ + movss %xmm0, (ARG_SIN_PTR) /* store sin(x) */ + ret + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN ? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23 ? */ + jae L(very_large_args) + + /* Here if 9*Pi/4<=|x|<2^23 */ + movsd L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */ + mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */ + cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */ + addl $1, %eax /* k+1 */ + movl %eax, %edx + andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */ + cvtsi2sdl %edx, %xmm4 /* DP j */ + movl %r8d, %ecx /* Load x */ + movsd L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */ + shrl $29, %ecx /* (sign of x) << 2 */ + mulsd %xmm4, %xmm2 /* -j*PIO4HI */ + movsd L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */ + addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */ + mulsd %xmm3, %xmm4 /* j*PIO4LO */ + addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */ + jmp L(reconstruction) + + .p2align 4 +L(very_large_args): + /* Here if finite |x|>=2^23 */ + + /* bitpos = (ix>>23) - BIAS_32 + 59; */ + shrl $23, %eax /* eb = biased exponent of x */ + subl $68, %eax /* bitpos=eb-0x7f+59, where 0x7f */ + /*is exponent bias */ + movl $28, %ecx /* %cl=28 */ + movl %eax, %edx /* bitpos copy */ + + /* j = bitpos/28; */ + div %cl /* j in register %al=%ax/%cl */ + movapd %xmm0, %xmm3 /* |x| */ + andl $0xff, %eax /* clear unneeded remainder from %ah*/ + + imull $28, %eax, %ecx /* j*28 */ + lea L(_FPI)(%rip), %r9 + movsd L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */ + movapd %xmm0, %xmm5 /* |x| */ + mulsd -16(%r9,%rax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */ + movapd %xmm0, %xmm1 /* |x| */ + mulsd -8(%r9,%rax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */ + mulsd (%r9,%rax,8), %xmm0 /* tmp0 = FPI[j]*|x| */ + addl $19, %ecx /* j*28+19 */ + mulsd 8(%r9,%rax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */ + cmpl %ecx, %edx /* bitpos>=j*28+19 ? */ + jl L(very_large_skip1) + + /* Here if bitpos>=j*28+19 */ + andpd %xmm3, %xmm4 /* HI(tmp3) */ + subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */ +L(very_large_skip1): + + movsd L(DP_2POW52)(%rip), %xmm6 + movapd %xmm5, %xmm2 /* tmp2 copy */ + addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */ + movl $1, %edx + addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */ + movsd 8+L(DP_2POW52)(%rip), %xmm4 + movd %xmm6, %eax /* k = I64_LO(tmp6); */ + addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */ + movl %r8d, %ecx /* Load x */ + comisd %xmm5, %xmm4 /* tmp4 > tmp5 ? 
*/
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax	/* k-- */
+	addsd	8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx	/* k&1 */
+	lea	L(DP_ZERONE)(%rip), %r9
+	subsd	%xmm4, %xmm3	/* tmp3 -= tmp4 */
+	addsd	(%r9,%rdx,8), %xmm3	/* t = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3	/* t += tmp2 */
+	shrl	$29, %ecx	/* (sign of x) << 2 */
+	addsd	%xmm3, %xmm0	/* t += tmp0 */
+	addl	$1, %eax	/* n=k+1 */
+	addsd	%xmm1, %xmm0	/* t += tmp1 */
+	mulsd	L(DP_PIO4)(%rip), %xmm0	/* t *= PIO4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align 4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5 ? */
+	jl	L(arg_less_2pn5)
+
+	/* Here if 2^-5<=|x|<Pi/4 */
+	movaps	%xmm0, %xmm3	/* DP x */
+	movhpd	L(DP_ONES)(%rip), %xmm3	/* DP 1|x */
+	mulsd	%xmm0, %xmm0	/* DP y=x^2 */
+	unpcklpd %xmm0, %xmm0	/* DP y|y */
+	movaps	%xmm0, %xmm1	/* y|y */
+	mulpd	%xmm0, %xmm0	/* z=x^4|z=x^4 */
+
+	movapd	L(DP_SC4)(%rip), %xmm4	/* S4 */
+	mulpd	%xmm0, %xmm4	/* z*S4 */
+	movapd	L(DP_SC3)(%rip), %xmm5	/* S3 */
+	mulpd	%xmm0, %xmm5	/* z*S3 */
+	addpd	L(DP_SC2)(%rip), %xmm4	/* S2+z*S4 */
+	mulpd	%xmm0, %xmm4	/* z*(S2+z*S4) */
+	addpd	L(DP_SC1)(%rip), %xmm5	/* S1+z*S3 */
+	mulpd	%xmm0, %xmm5	/* z*(S1+z*S3) */
+	addpd	L(DP_SC0)(%rip), %xmm4	/* S0+z*(S2+z*S4) */
+	mulpd	%xmm1, %xmm4	/* y*(S0+z*(S2+z*S4)) */
+	mulpd	%xmm3, %xmm5	/* x*z*(S1+z*S3) */
+	mulpd	%xmm3, %xmm4	/* x*y*(S0+z*(S2+z*S4)) */
+	addpd	%xmm5, %xmm4	/*x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+	addpd	%xmm4, %xmm3	/*x+x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/
+	cvtpd2ps %xmm3, %xmm0	/* SP results */
+	movss	%xmm0, (ARG_SIN_PTR)	/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (ARG_COS_PTR)	/* store cos(x) */
+	ret
+
+	.p2align 4
+L(arg_less_2pn5):
+	/* Here if |x|<2^-5 */
+	cmpl	$0x32000000, %eax	/* |x|<2^-27 ? */
+	jl	L(arg_less_2pn27)
+
+	/* Here if 2^-27<=|x|<2^-5 */
+	movaps	%xmm0, %xmm1	/* DP x */
+	movhpd	L(DP_ONES)(%rip), %xmm1	/* DP 1|x */
+	mulsd	%xmm0, %xmm0	/* DP x^2 */
+	unpcklpd %xmm0, %xmm0	/* DP x^2|x^2 */
+
+	movaps	L(DP_SINCOS2_1)(%rip), %xmm3	/* DP DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3	/* DP x^2*DP_SIN2_1 */
+	addpd	L(DP_SINCOS2_0)(%rip), %xmm3	/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+	mulpd	%xmm0, %xmm3	/* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+	mulpd	%xmm1, %xmm3	/* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	addpd	%xmm1, %xmm3	/* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+	cvtpd2ps %xmm3, %xmm0	/* SP results */
+	movss	%xmm0, (ARG_SIN_PTR)	/* store sin(x) from xmm0[0] */
+	shufps	$1, %xmm0, %xmm0	/* move cos(x) to xmm0[0] */
+	movss	%xmm0, (ARG_COS_PTR)	/* store cos(x) */
+	ret
+
+	.p2align 4
+L(arg_less_2pn27):
+	cmpl	$0, %eax	/* x=0 ?
*/ + je L(arg_zero) /* in case x=0 return sin(+-0)==+-0 */ + /* Here if |x|<2^-27 */ + /* + * Special cases here: + * sin(subnormal) raises inexact/underflow + * sin(min_normalized) raises inexact/underflow + * sin(normalized) raises inexact + * cos(here)=1-|x| (raising inexact) + */ + movaps %xmm0, %xmm3 /* DP x */ + mulsd L(DP_SMALL)(%rip), %xmm0/* DP x*DP_SMALL */ + subsd %xmm0, %xmm3 /* DP sin result is x-x*DP_SMALL */ + andps L(SP_ABS_MASK)(%rip), %xmm7/* SP |x| */ + cvtsd2ss %xmm3, %xmm0 /* sin(x) */ + movss L(SP_ONE)(%rip), %xmm1 /* SP 1.0 */ + movss %xmm0, (ARG_SIN_PTR) /* sin(x) store */ + subss %xmm7, %xmm1 /* cos(x) */ + movss %xmm1, (ARG_COS_PTR) /* cos(x) store */ + ret + + .p2align 4 +L(arg_zero): + movss L(SP_ONE)(%rip), %xmm0 /* 1.0 */ + movss %xmm7, (ARG_SIN_PTR) /* sin(+-0)==x */ + movss %xmm0, (ARG_COS_PTR) /* cos(+-0)==1 */ + ret + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(skip_errno_setting) /* in case of x is NaN */ + + /* Align stack to 16 bytes. */ + subq $8, %rsp + cfi_adjust_cfa_offset (8) + /* Here if x is Inf. Set errno to EDOM. */ + call JUMPTARGET(__errno_location) + addq $8, %rsp + cfi_adjust_cfa_offset (-8) + + movl $EDOM, (%rax) + + .p2align 4 +L(skip_errno_setting): + /* Here if |x| is Inf or NAN. Continued. */ + subss %xmm7, %xmm7 /* x-x, result is NaN */ + movss %xmm7, (ARG_SIN_PTR) + movss %xmm7, (ARG_COS_PTR) + ret +END(__sincosf) + + .section .rodata, "a" + .p2align 3 +L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ + .long 0x00000000,0x00000000 + .long 0x54442d18,0x3fe921fb + .long 0x54442d18,0x3ff921fb + .long 0x7f3321d2,0x4002d97c + .long 0x54442d18,0x400921fb + .long 0x2955385e,0x400f6a7a + .long 0x7f3321d2,0x4012d97c + .long 0xe9bba775,0x4015fdbb + .long 0x54442d18,0x401921fb + .long 0xbeccb2bb,0x401c463a + .long 0x2955385e,0x401f6a7a + .type L(PIO4J), @object + ASM_SIZE_DIRECTIVE(L(PIO4J)) + + .p2align 3 +L(_FPI): /* 4/Pi broken into sum of positive DP values */ + .long 0x00000000,0x00000000 + .long 0x6c000000,0x3ff45f30 + .long 0x2a000000,0x3e3c9c88 + .long 0xa8000000,0x3c54fe13 + .long 0xd0000000,0x3aaf47d4 + .long 0x6c000000,0x38fbb81b + .long 0xe0000000,0x3714acc9 + .long 0x7c000000,0x3560e410 + .long 0x56000000,0x33bca2c7 + .long 0xac000000,0x31fbd778 + .long 0xe0000000,0x300b7246 + .long 0xe8000000,0x2e5d2126 + .long 0x48000000,0x2c970032 + .long 0xe8000000,0x2ad77504 + .long 0xe0000000,0x290921cf + .long 0xb0000000,0x274deb1c + .long 0xe0000000,0x25829a73 + .long 0xbe000000,0x23fd1046 + .long 0x10000000,0x2224baed + .long 0x8e000000,0x20709d33 + .long 0x80000000,0x1e535a2f + .long 0x64000000,0x1cef904e + .long 0x30000000,0x1b0d6398 + .long 0x24000000,0x1964ce7d + .long 0x16000000,0x17b908bf + .type L(_FPI), @object + ASM_SIZE_DIRECTIVE(L(_FPI)) + +/* Coefficients of polynomials for */ +/* sin(x)~=x+x*x^2*(DP_SIN2_0+x^2*DP_SIN2_1) in low DP part, */ +/* cos(x)~=1+1*x^2*(DP_COS2_0+x^2*DP_COS2_1) in high DP part, */ +/* for |x|<2^-5. 
*/
+	.p2align 4
+L(DP_SINCOS2_0):
+	.long	0x5543d49d,0xbfc55555
+	.long	0xff5cc6fd,0xbfdfffff
+	.type L(DP_SINCOS2_0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_0))
+
+	.p2align 4
+L(DP_SINCOS2_1):
+	.long	0x75cec8c5,0x3f8110f4
+	.long	0xb178dac5,0x3fa55514
+	.type L(DP_SINCOS2_1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_1))
+
+
+	.p2align 3
+L(DP_ZERONE):
+	.long	0x00000000,0x00000000	/* 0.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ZERONE), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+	.p2align 3
+L(DP_ONES):
+	.long	0x00000000,0x3ff00000	/* +1.0 */
+	.long	0x00000000,0xbff00000	/* -1.0 */
+	.type L(DP_ONES), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomials for */
+/* sin(t)~=t+t*t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))) in low DP part, */
+/* cos(t)~=1+1*t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))) in high DP part, */
+/* for |t|<Pi/4. */
+	.p2align 4
+L(DP_SC4):
+	.long	0x1674b58a,0xbe5a947e
+	.long	0xdd8844d7,0xbe923c97
+	.type L(DP_SC4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC4))
+
+	.p2align 4
+L(DP_SC3):
+	.long	0x64e6b5b4,0x3ec71d72
+	.long	0x9ac43cc0,0x3efa00eb
+	.type L(DP_SC3), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC3))
+
+	.p2align 4
+L(DP_SC2):
+	.long	0x8b4bd1f9,0xbf2a019f
+	.long	0x348b6874,0xbf56c16b
+	.type L(DP_SC2), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC2))
+
+	.p2align 4
+L(DP_SC1):
+	.long	0x10c2688b,0x3f811111
+	.long	0x545c50c7,0x3fa55555
+	.type L(DP_SC1), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC1))
+
+	.p2align 4
+L(DP_SC0):
+	.long	0x55551cd9,0xbfc55555
+	.long	0xfffe98ae,0xbfdfffff
+	.type L(DP_SC0), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SC0))
+
+	.p2align 3
+L(DP_SMALL):
+	.long	0x00000000,0x3cd00000	/* 2^(-50) */
+	.type L(DP_SMALL), @object
+	ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+	.p2align 3
+L(DP_PIO4):
+	.long	0x54442d18,0x3fe921fb	/* Pi/4 */
+	.type L(DP_PIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+	.p2align 3
+L(DP_2POW52):
+	.long	0x00000000,0x43300000	/* +2^52 */
+	.long	0x00000000,0xc3300000	/* -2^52 */
+	.type L(DP_2POW52), @object
+	ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+	.p2align 3
+L(DP_INVPIO4):
+	.long	0x6dc9c883,0x3ff45f30	/* 4/Pi */
+	.type L(DP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+	.p2align 3
+L(DP_PIO4HI):
+	.long	0x54000000,0xbfe921fb	/* High part of -Pi/4 */
+	.type L(DP_PIO4HI), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+	.p2align 3
+L(DP_PIO4LO):
+	.long	0x11A62633,0xbe010b46	/* Low part of -Pi/4 */
+	.type L(DP_PIO4LO), @object
+	ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+	.p2align 2
+L(SP_INVPIO4):
+	.long	0x3fa2f983	/* 4/Pi */
+	.type L(SP_INVPIO4), @object
+	ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+	.p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+	.long	0xffffffff,0x7fffffff
+	.long	0xffffffff,0x7fffffff
+	.type L(DP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+	.p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+	.long	0x00000000,0xffffffff
+	.type L(DP_HI_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+	.p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+	.long	0x7fffffff,0x7fffffff
+	.long	0x7fffffff,0x7fffffff
+	.type L(SP_ABS_MASK), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+	.p2align 2
+L(SP_ONE):
+	.long	0x3f800000	/* 1.0 */
+	.type L(SP_ONE), @object
+	ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias(__sincosf, sincosf)
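
The sincosf kernel above adds one trick on top of the sinf/cosf scheme: the sin and cos polynomials share the same Horner skeleton, so the code evaluates the pair (t, 1.0) against the packed tables DP_SC0..DP_SC4 and obtains both polynomials from a single SSE2 pass, then signs and swaps the two lanes per octant. A rough C model of that bookkeeping follows; sincosf_model is an illustrative name, libm sin()/cos() stand in for the packed polynomial pair, and the plain j*Pi/4 subtraction replaces the split PIO4HI/PIO4LO and _FPI reductions, so the sketch is faithful only for moderate, finite arguments.

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI_4
    # define M_PI_4 0.78539816339744830962
    #endif

    static void
    sincosf_model (float x, float *sp, float *cp)
    {
      double ax = fabs ((double) x);
      double sgn = x < 0.0f ? -1.0 : 1.0;
      unsigned int k = (unsigned int) (ax / M_PI_4); /* trunc(|x|/(Pi/4)) */
      unsigned int n = k + 1;
      double t = ax - ((k + 1) & ~1u) * M_PI_4;      /* |t| <= ~Pi/4 */
      double ps = sin (t), pc = cos (t);             /* the packed pair */
      double ssin = ((n >> 2) & 1) ? -sgn : sgn;     /* sign_sin */
      double scos = (((n + 2) >> 2) & 1) ? -1.0 : 1.0; /* sign_cos */
      if (n & 2)
        { /* odd octant pair: the two lanes swap roles */
          *sp = (float) (ssin * pc);
          *cp = (float) (scos * ps);
        }
      else
        {
          *sp = (float) (ssin * ps);
          *cp = (float) (scos * pc);
        }
    }

    int
    main (void)
    {
      float s, c;
      for (float x = -8.0f; x < 8.0f; x += 1.9f)
        {
          sincosf_model (x, &s, &c);
          printf ("%g: sin=%g (%g)  cos=%g (%g)\n", x, s, sinf (x), c, cosf (x));
        }
      return 0;
    }

diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_sinf.S b/REORG.TODO/sysdeps/x86_64/fpu/s_sinf.S
new file mode 100644
index 0000000000..0aa5d43d8c
--- /dev/null
+++ 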
b/REORG.TODO/sysdeps/x86_64/fpu/s_sinf.S @@ -0,0 +1,559 @@ +/* Optimized sinf function. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define __need_Emath +#include <bits/errno.h> + +/* Short algorithm description: + * + * 1) if |x| == 0: return x. + * 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed. + * 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1. + * 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). + * 5) if |x| < 9*Pi/4: + * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, + * t=|x|-j*Pi/4. + * 5.2) Reconstruction: + * s = sign(x) * (-1.0)^((n>>2)&1) + * if(n&2 != 0) { + * using cos(t) polynomial for |t|<Pi/4, result is + * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). + * } else { + * using sin(t) polynomial for |t|<Pi/4, result is + * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). + * } + * 6) if |x| < 2^23, large args: + * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, + * t=|x|-j*Pi/4. + * 6.2) Reconstruction same as (5.2). + * 7) if |x| >= 2^23, very large args: + * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, + * t=|x|-j*Pi/4. + * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x. + * + * Special cases: + * sin(+-0) = +-0 not raising inexact/underflow, + * sin(subnormal) raises inexact/underflow, + * sin(min_normalized) raises inexact/underflow, + * sin(normalized) raises inexact, + * sin(Inf) = NaN, raises invalid, sets errno to EDOM, + * sin(NaN) = NaN. + */ + + .text +ENTRY(__sinf) + /* Input: single precision x in %xmm0 */ + + movd %xmm0, %eax /* Bits of x */ + movaps %xmm0, %xmm7 /* Copy of x */ + cvtss2sd %xmm0, %xmm0 /* DP x */ + movss L(SP_ABS_MASK)(%rip), %xmm3 + movl %eax, %edi /* Copy of x bits */ + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + andps %xmm7, %xmm3 /* SP |x| */ + andpd L(DP_ABS_MASK)(%rip),%xmm0 /* DP |x| */ + movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + movl %edi, %ecx /* Load x */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + lea L(PIO4J)(%rip), %rsi + shrl $31, %ecx /* sign of x */ + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + subsd (%rsi,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */ + testl $2, %eax /* n&2 != 0? 
*/ + jz L(sin_poly) + +/*L(cos_poly):*/ + /* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) + */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_C4)(%rip), %xmm4 /* C4 */ + mulsd %xmm0, %xmm4 /* z*C4 */ + xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ + movsd L(DP_C3)(%rip), %xmm3 /* C3 */ + mulsd %xmm0, %xmm3 /* z*C3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_C2)(%rip), %xmm4 /* C2+z*C4 */ + mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ + addsd L(DP_C1)(%rip), %xmm3 /* C1+z*C3 */ + mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ + addsd L(DP_C0)(%rip), %xmm4 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ + + /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd %xmm4, %xmm3 + /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd L(DP_ONES)(%rip), %xmm3 + + mulsd (%rsi,%rcx,8), %xmm3 /* DP result */ + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(sin_poly): + /* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) + */ + + movaps %xmm0, %xmm4 /* t */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd L(DP_S4)(%rip), %xmm2 /* S4 */ + mulsd %xmm0, %xmm2 /* z*S4 */ + movsd L(DP_S3)(%rip), %xmm3 /* S3 */ + mulsd %xmm0, %xmm3 /* z*S3 */ + lea L(DP_ONES)(%rip), %rsi + addsd L(DP_S2)(%rip), %xmm2 /* S2+z*S4 */ + mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ + addsd L(DP_S1)(%rip), %xmm3 /* S1+z*S3 */ + mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ + addsd L(DP_S0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ + mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ + mulsd (%rsi,%rcx,8), %xmm4 + /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm2, %xmm3 + /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + mulsd %xmm4, %xmm3 + /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm4, %xmm3 + cvtsd2ss %xmm3, %xmm0 /* SP result */ + ret + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23? 
*/
+	jae	L(very_large_args)
+
+	/* Here if 9*Pi/4<=|x|<2^23 */
+	movsd	L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */
+	mulsd	%xmm0, %xmm1	/* |x|/(Pi/4) */
+	cvttsd2si %xmm1, %eax	/* k=trunc(|x|/(Pi/4)) */
+	addl	$1, %eax	/* k+1 */
+	movl	%eax, %edx
+	andl	$0xfffffffe, %edx	/* j=(k+1)&0xfffffffe */
+	cvtsi2sdl %edx, %xmm4	/* DP j */
+	movl	%edi, %ecx	/* Load x */
+	movsd	L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+	shrl	$31, %ecx	/* sign bit of x */
+	mulsd	%xmm4, %xmm2	/* -j*PIO4HI */
+	movsd	L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+	addsd	%xmm2, %xmm0	/* |x| - j*PIO4HI */
+	mulsd	%xmm3, %xmm4	/* j*PIO4LO */
+	addsd	%xmm4, %xmm0	/* t = |x| - j*PIO4HI - j*PIO4LO */
+	jmp	L(reconstruction)
+
+	.p2align 4
+L(very_large_args):
+	/* Here if finite |x|>=2^23 */
+
+	/* bitpos = (ix>>23) - BIAS_32 + 59; */
+	shrl	$23, %eax	/* eb = biased exponent of x */
+	/* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+	subl	$68, %eax
+	movl	$28, %ecx	/* %cl=28 */
+	movl	%eax, %edx	/* bitpos copy */
+
+	/* j = bitpos/28; */
+	div	%cl		/* j in register %al=%ax/%cl */
+	movapd	%xmm0, %xmm3	/* |x| */
+	/* clear unneeded remainder from %ah */
+	andl	$0xff, %eax
+
+	imull	$28, %eax, %ecx	/* j*28 */
+	lea	L(_FPI)(%rip), %rsi
+	movsd	L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */
+	movapd	%xmm0, %xmm5	/* |x| */
+	mulsd	-16(%rsi,%rax,8), %xmm3	/* tmp3 = FPI[j-2]*|x| */
+	movapd	%xmm0, %xmm1	/* |x| */
+	mulsd	-8(%rsi,%rax,8), %xmm5	/* tmp2 = FPI[j-1]*|x| */
+	mulsd	(%rsi,%rax,8), %xmm0	/* tmp0 = FPI[j]*|x| */
+	addl	$19, %ecx	/* j*28+19 */
+	mulsd	8(%rsi,%rax,8), %xmm1	/* tmp1 = FPI[j+1]*|x| */
+	cmpl	%ecx, %edx	/* bitpos>=j*28+19? */
+	jl	L(very_large_skip1)
+
+	/* Here if bitpos>=j*28+19 */
+	andpd	%xmm3, %xmm4	/* HI(tmp3) */
+	subsd	%xmm4, %xmm3	/* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+	movsd	L(DP_2POW52)(%rip), %xmm6
+	movapd	%xmm5, %xmm2	/* tmp2 copy */
+	addsd	%xmm3, %xmm5	/* tmp5 = tmp3 + tmp2 */
+	movl	$1, %edx
+	addsd	%xmm5, %xmm6	/* tmp6 = tmp5 + 2^52 */
+	movsd	8+L(DP_2POW52)(%rip), %xmm4
+	movd	%xmm6, %eax	/* k = I64_LO(tmp6); */
+	addsd	%xmm6, %xmm4	/* tmp4 = tmp6 - 2^52 */
+	movl	%edi, %ecx	/* Load x */
+	comisd	%xmm5, %xmm4	/* tmp4 > tmp5? */
+	jbe	L(very_large_skip2)
+
+	/* Here if tmp4 > tmp5 */
+	subl	$1, %eax	/* k-- */
+	addsd	8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+	andl	%eax, %edx	/* k&1 */
+	lea	L(DP_ZERONE)(%rip), %rsi
+	subsd	%xmm4, %xmm3	/* tmp3 -= tmp4 */
+	addsd	(%rsi,%rdx,8), %xmm3	/* t = DP_ZERONE[k&1] + tmp3 */
+	addsd	%xmm2, %xmm3	/* t += tmp2 */
+	shrl	$31, %ecx	/* sign of x */
+	addsd	%xmm3, %xmm0	/* t += tmp0 */
+	addl	$1, %eax	/* n=k+1 */
+	addsd	%xmm1, %xmm0	/* t += tmp1 */
+	mulsd	L(DP_PIO4)(%rip), %xmm0	/* t *= PIO4 */
+
+	jmp	L(reconstruction)	/* end of very_large_args path */
+
+	.p2align 4
+L(arg_less_pio4):
+	/* Here if |x|<Pi/4 */
+	cmpl	$0x3d000000, %eax	/* |x|<2^-5?
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ movaps %xmm0, %xmm3 /* x */
+ mulsd %xmm0, %xmm0 /* y=x^2 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=x^4 */
+ movsd L(DP_S4)(%rip), %xmm4 /* S4 */
+ mulsd %xmm0, %xmm4 /* z*S4 */
+ movsd L(DP_S3)(%rip), %xmm5 /* S3 */
+ mulsd %xmm0, %xmm5 /* z*S3 */
+ addsd L(DP_S2)(%rip), %xmm4 /* S2+z*S4 */
+ mulsd %xmm0, %xmm4 /* z*(S2+z*S4) */
+ addsd L(DP_S1)(%rip), %xmm5 /* S1+z*S3 */
+ mulsd %xmm0, %xmm5 /* z*(S1+z*S3) */
+ addsd L(DP_S0)(%rip), %xmm4 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */
+ mulsd %xmm3, %xmm5 /* x*z*(S1+z*S3) */
+ mulsd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */
+ /* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm5, %xmm4
+ /* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm4, %xmm3
+ cvtsd2ss %xmm3, %xmm0 /* SP result */
+ ret
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ movaps %xmm0, %xmm1 /* DP x */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ movsd L(DP_SIN2_1)(%rip), %xmm3 /* DP DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */
+ addsd L(DP_SIN2_0)(%rip), %xmm3 /* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+ mulsd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ addsd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ cvtsd2ss %xmm3, %xmm0 /* SP result */
+ ret
+
+ .p2align 4
+L(arg_less_2pn27):
+ cmpl $0, %eax /* x=0? */
+ je L(arg_zero) /* if x = 0, return sin(+-0) == +-0 */
+ /* Here if |x|<2^-27 */
+ /*
+ * Special cases here:
+ * sin(subnormal) raises inexact/underflow
+ * sin(min_normalized) raises inexact/underflow
+ * sin(normalized) raises inexact
+ */
+ movaps %xmm0, %xmm3 /* Copy of DP x */
+ mulsd L(DP_SMALL)(%rip), %xmm0 /* x*DP_SMALL */
+ subsd %xmm0, %xmm3 /* Result is x-x*DP_SMALL */
+ cvtsd2ss %xmm3, %xmm0 /* Result converted to SP */
+ ret
+
+ .p2align 4
+L(arg_zero):
+ movaps %xmm7, %xmm0 /* SP x */
+ ret
+
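+/* Editorial note: a C model of the |x| < 2^-27 path above (a sketch,
+   not the implementation).  sin(x) rounds to x at this magnitude; the
+   scaled subtraction exists only to raise the IEEE inexact (and, for
+   subnormal results, underflow) exceptions that returning x directly
+   would miss.  DP_SMALL is 2^-50:
+
+     static float sinf_tiny (double x) // 0 < |x| < 2^-27
+     {
+       return (float) (x - x * 0x1p-50); // rounds back to x
+     }
+*/
+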
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NaN */
+ jne L(skip_errno_setting) /* if x is NaN */
+
+ /* Align stack to 16 bytes. */
+ subq $8, %rsp
+ cfi_adjust_cfa_offset (8)
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ addq $8, %rsp
+ cfi_adjust_cfa_offset (-8)
+
+ movl $EDOM, (%rax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NaN. Continued. */
+ movaps %xmm7, %xmm0 /* load x */
+ subss %xmm0, %xmm0 /* Result is NaN */
+ ret
+END(__sinf)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5. */
+ .p2align 3
+L(DP_SIN2_0):
+ .long 0x5543d49d,0xbfc55555
+ .type L(DP_SIN2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_0))
+
+ .p2align 3
+L(DP_SIN2_1):
+ .long 0x75cec8c5,0x3f8110f4
+ .type L(DP_SIN2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ZERONE), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_S3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .type L(DP_S3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+ .p2align 3
+L(DP_S1):
+ .long 0x10c2688b,0x3f811111
+ .type L(DP_S1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+ .p2align 3
+L(DP_S4):
+ .long 0x1674b58a,0xbe5a947e
+ .type L(DP_S4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+ .p2align 3
+L(DP_S2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .type L(DP_S2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+ .p2align 3
+L(DP_S0):
+ .long 0x55551cd9,0xbfc55555
+ .type L(DP_S0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+ .p2align 3
+L(DP_SMALL):
+ .long 0x00000000,0x3cd00000 /* 2^(-50) */
+ .type L(DP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
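+/* Editorial note: L(_FPI) above stores 4/Pi split into chunks of at
+   most 28 significant bits each, so every product FPI[j]*|x| in the
+   |x| >= 2^23 path is exact in double precision.  A sketch of how
+   that path selects its window into 4/Pi (illustration only; ix is
+   the single-precision bit pattern of |x|):
+
+     int bitpos = (ix >> 23) - 127 + 59; // the subl $68 above
+     int j = bitpos / 28;                // chunk index into FPI[]
+
+   The code then forms FPI[j-2]*|x| ... FPI[j+1]*|x|, strips the
+   integral part, and scales the fractional remainder by Pi/4.  */
+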
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_C3):
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_C3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+ .p2align 3
+L(DP_C1):
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_C1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+ .p2align 3
+L(DP_C4):
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_C4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+ .p2align 3
+L(DP_C2):
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_C2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+ .p2align 3
+L(DP_C0):
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_C0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+ .long 0x54000000,0xbfe921fb /* High part of -Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+ .long 0x11A62633,0xbe010b46 /* Low part of -Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 32 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK),@object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+ .p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+ .long 0x7fffffff,0x7fffffff
+ .long 0x7fffffff,0x7fffffff
+ .type L(SP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+weak_alias(__sinf, sinf)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/s_truncl.S b/REORG.TODO/sysdeps/x86_64/fpu/s_truncl.S
new file mode 100644
index 0000000000..b6ca0bae7b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/s_truncl.S
@@ -0,0 +1,36 @@
+/* Truncate long double value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. 
*/
+
+#include <machine/asm.h>
+
+ENTRY(__truncl)
+ fldt 8(%rsp) /* Load the long double argument.  */
+ fnstenv -28(%rsp) /* Save the FPU environment; the control word
+ is at -28(%rsp), the status word at -24(%rsp).  */
+ movl $0xc00, %edx /* RC=0b11: round toward zero (truncate).  */
+ orl -28(%rsp), %edx
+ movl %edx, -32(%rsp)
+ fldcw -32(%rsp) /* Install the truncating rounding mode.  */
+ frndint /* Round to an integral value.  */
+ fnstsw /* Fetch the status word...  */
+ andl $0x1, %eax /* ...keep only the invalid-operation flag
+ (raised for signaling NaN input)...  */
+ orl %eax, -24(%rsp) /* ...and merge it into the saved status, so
+ restoring the environment below keeps it while
+ discarding frndint's spurious inexact.  */
+ fldenv -28(%rsp)
+ ret
+END(__truncl)
+weak_alias (__truncl, truncl)
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos2_core.S
new file mode 100644
index 0000000000..db4fd3f62f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos2_core.S
@@ -0,0 +1,29 @@
+/* Function cos vectorized with SSE2.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_cos) +WRAPPER_IMPL_AVX _ZGVbN2v_cos +END (_ZGVcN4v_cos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos8_core.S new file mode 100644 index 0000000000..5432bc701e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_cos8_core.S @@ -0,0 +1,25 @@ +/* Function cos vectorized with AVX-512, wrapper to AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_cos) +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +END (_ZGVeN8v_cos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp2_core.S new file mode 100644 index 0000000000..92b328331d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp2_core.S @@ -0,0 +1,29 @@ +/* Function exp vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_exp) +WRAPPER_IMPL_SSE2 __exp_finite +END (_ZGVbN2v_exp) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_exp) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core.S new file mode 100644 index 0000000000..e062263d7a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core.S @@ -0,0 +1,29 @@ +/* Function exp vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_exp) +WRAPPER_IMPL_AVX _ZGVbN2v_exp +END (_ZGVdN4v_exp) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_exp) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S new file mode 100644 index 0000000000..21ae29d330 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S @@ -0,0 +1,25 @@ +/* Function exp vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_exp) +WRAPPER_IMPL_AVX _ZGVbN2v_exp +END (_ZGVcN4v_exp) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp8_core.S new file mode 100644 index 0000000000..28bfa98dde --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp8_core.S @@ -0,0 +1,25 @@ +/* Function exp vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_exp) +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +END (_ZGVeN8v_exp) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.S new file mode 100644 index 0000000000..521537e3f6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.S @@ -0,0 +1,1088 @@ +/* Data for vector function exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_exp_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function exp. + * The table may contain polynomial, reduction, lookup + * coefficients and other constants obtained through different + * methods of research and experimental work. */ + .globl __svml_dexp_data +__svml_dexp_data: + +/* Lookup table of 2^(j/2^K): */ +.if .-__svml_dexp_data != __dbT +.err +.endif + .quad 0x3ff0000000000000 + .quad 0x3ff002c605e2e8cf + .quad 0x3ff0058c86da1c0a + .quad 0x3ff0085382faef83 + .quad 0x3ff00b1afa5abcbf + .quad 0x3ff00de2ed0ee0f5 + .quad 0x3ff010ab5b2cbd11 + .quad 0x3ff0137444c9b5b5 + .quad 0x3ff0163da9fb3335 + .quad 0x3ff019078ad6a19f + .quad 0x3ff01bd1e77170b4 + .quad 0x3ff01e9cbfe113ef + .quad 0x3ff02168143b0281 + .quad 0x3ff02433e494b755 + .quad 0x3ff027003103b10e + .quad 0x3ff029ccf99d720a + .quad 0x3ff02c9a3e778061 + .quad 0x3ff02f67ffa765e6 + .quad 0x3ff032363d42b027 + .quad 0x3ff03504f75ef071 + .quad 0x3ff037d42e11bbcc + .quad 0x3ff03aa3e170aafe + .quad 0x3ff03d7411915a8a + .quad 0x3ff04044be896ab6 + .quad 0x3ff04315e86e7f85 + .quad 0x3ff045e78f5640b9 + .quad 0x3ff048b9b35659d8 + .quad 0x3ff04b8c54847a28 + .quad 0x3ff04e5f72f654b1 + .quad 0x3ff051330ec1a03f + .quad 0x3ff0540727fc1762 + .quad 0x3ff056dbbebb786b + .quad 0x3ff059b0d3158574 + .quad 0x3ff05c866520045b + .quad 0x3ff05f5c74f0bec2 + .quad 0x3ff06233029d8216 + .quad 0x3ff0650a0e3c1f89 + .quad 0x3ff067e197e26c14 + .quad 0x3ff06ab99fa6407c + .quad 0x3ff06d92259d794d + .quad 0x3ff0706b29ddf6de + .quad 0x3ff07344ac7d9d51 + .quad 0x3ff0761ead925493 + .quad 0x3ff078f92d32085d + .quad 0x3ff07bd42b72a836 + .quad 0x3ff07eafa86a2771 + .quad 0x3ff0818ba42e7d30 + .quad 0x3ff084681ed5a462 + .quad 0x3ff0874518759bc8 + .quad 0x3ff08a22912465f2 + .quad 0x3ff08d0088f8093f + .quad 0x3ff08fdf00068fe2 + .quad 0x3ff092bdf66607e0 + .quad 0x3ff0959d6c2c830d + .quad 0x3ff0987d61701716 + .quad 0x3ff09b5dd646dd77 + .quad 0x3ff09e3ecac6f383 + .quad 0x3ff0a1203f067a63 + .quad 0x3ff0a402331b9715 + .quad 0x3ff0a6e4a71c726e + .quad 0x3ff0a9c79b1f3919 + .quad 0x3ff0acab0f3a1b9c + .quad 0x3ff0af8f03834e52 + .quad 0x3ff0b27378110974 + .quad 0x3ff0b5586cf9890f + .quad 0x3ff0b83de2530d11 + .quad 0x3ff0bb23d833d93f + .quad 0x3ff0be0a4eb2353b + .quad 0x3ff0c0f145e46c85 + .quad 0x3ff0c3d8bde0ce7a + .quad 0x3ff0c6c0b6bdae53 + 
.quad 0x3ff0c9a93091632a + .quad 0x3ff0cc922b7247f7 + .quad 0x3ff0cf7ba776bb94 + .quad 0x3ff0d265a4b520ba + .quad 0x3ff0d5502343de02 + .quad 0x3ff0d83b23395dec + .quad 0x3ff0db26a4ac0ed5 + .quad 0x3ff0de12a7b26300 + .quad 0x3ff0e0ff2c62d096 + .quad 0x3ff0e3ec32d3d1a2 + .quad 0x3ff0e6d9bb1be415 + .quad 0x3ff0e9c7c55189c6 + .quad 0x3ff0ecb6518b4874 + .quad 0x3ff0efa55fdfa9c5 + .quad 0x3ff0f294f0653b45 + .quad 0x3ff0f58503328e6d + .quad 0x3ff0f875985e389b + .quad 0x3ff0fb66affed31b + .quad 0x3ff0fe584a2afb21 + .quad 0x3ff1014a66f951ce + .quad 0x3ff1043d06807c2f + .quad 0x3ff1073028d7233e + .quad 0x3ff10a23ce13f3e2 + .quad 0x3ff10d17f64d9ef1 + .quad 0x3ff1100ca19ad92f + .quad 0x3ff11301d0125b51 + .quad 0x3ff115f781cae1fa + .quad 0x3ff118edb6db2dc1 + .quad 0x3ff11be46f5a032c + .quad 0x3ff11edbab5e2ab6 + .quad 0x3ff121d36afe70c9 + .quad 0x3ff124cbae51a5c8 + .quad 0x3ff127c4756e9e05 + .quad 0x3ff12abdc06c31cc + .quad 0x3ff12db78f613d5b + .quad 0x3ff130b1e264a0e9 + .quad 0x3ff133acb98d40a2 + .quad 0x3ff136a814f204ab + .quad 0x3ff139a3f4a9d922 + .quad 0x3ff13ca058cbae1e + .quad 0x3ff13f9d416e77af + .quad 0x3ff1429aaea92de0 + .quad 0x3ff14598a092ccb7 + .quad 0x3ff1489717425438 + .quad 0x3ff14b9612cec861 + .quad 0x3ff14e95934f312e + .quad 0x3ff1519598da9a9a + .quad 0x3ff154962388149e + .quad 0x3ff15797336eb333 + .quad 0x3ff15a98c8a58e51 + .quad 0x3ff15d9ae343c1f2 + .quad 0x3ff1609d83606e12 + .quad 0x3ff163a0a912b6ac + .quad 0x3ff166a45471c3c2 + .quad 0x3ff169a88594c157 + .quad 0x3ff16cad3c92df73 + .quad 0x3ff16fb279835224 + .quad 0x3ff172b83c7d517b + .quad 0x3ff175be85981992 + .quad 0x3ff178c554eaea89 + .quad 0x3ff17bccaa8d0888 + .quad 0x3ff17ed48695bbc0 + .quad 0x3ff181dce91c506a + .quad 0x3ff184e5d23816c9 + .quad 0x3ff187ef4200632b + .quad 0x3ff18af9388c8dea + .quad 0x3ff18e03b5f3f36b + .quad 0x3ff1910eba4df41f + .quad 0x3ff1941a45b1f487 + .quad 0x3ff1972658375d2f + .quad 0x3ff19a32f1f59ab4 + .quad 0x3ff19d4013041dc2 + .quad 0x3ff1a04dbb7a5b13 + .quad 0x3ff1a35beb6fcb75 + .quad 0x3ff1a66aa2fbebc7 + .quad 0x3ff1a979e2363cf8 + .quad 0x3ff1ac89a936440d + .quad 0x3ff1af99f8138a1c + .quad 0x3ff1b2aacee59c53 + .quad 0x3ff1b5bc2dc40bf0 + .quad 0x3ff1b8ce14c66e4c + .quad 0x3ff1bbe084045cd4 + .quad 0x3ff1bef37b95750b + .quad 0x3ff1c206fb91588f + .quad 0x3ff1c51b040fad15 + .quad 0x3ff1c82f95281c6b + .quad 0x3ff1cb44aef2547a + .quad 0x3ff1ce5a51860746 + .quad 0x3ff1d1707cfaeaed + .quad 0x3ff1d4873168b9aa + .quad 0x3ff1d79e6ee731d7 + .quad 0x3ff1dab6358e15e8 + .quad 0x3ff1ddce85752c71 + .quad 0x3ff1e0e75eb44027 + .quad 0x3ff1e400c1631fdb + .quad 0x3ff1e71aad999e82 + .quad 0x3ff1ea35236f9330 + .quad 0x3ff1ed5022fcd91d + .quad 0x3ff1f06bac594fa0 + .quad 0x3ff1f387bf9cda38 + .quad 0x3ff1f6a45cdf6085 + .quad 0x3ff1f9c18438ce4d + .quad 0x3ff1fcdf35c1137a + .quad 0x3ff1fffd7190241e + .quad 0x3ff2031c37bdf872 + .quad 0x3ff2063b88628cd6 + .quad 0x3ff2095b6395e1d2 + .quad 0x3ff20c7bc96ffc18 + .quad 0x3ff20f9cba08e483 + .quad 0x3ff212be3578a819 + .quad 0x3ff215e03bd7580c + .quad 0x3ff21902cd3d09b9 + .quad 0x3ff21c25e9c1d6aa + .quad 0x3ff21f49917ddc96 + .quad 0x3ff2226dc4893d64 + .quad 0x3ff2259282fc1f27 + .quad 0x3ff228b7cceeac25 + .quad 0x3ff22bdda27912d1 + .quad 0x3ff22f0403b385d2 + .quad 0x3ff2322af0b63bff + .quad 0x3ff2355269997062 + .quad 0x3ff2387a6e756238 + .quad 0x3ff23ba2ff6254f4 + .quad 0x3ff23ecc1c78903a + .quad 0x3ff241f5c5d05fe6 + .quad 0x3ff2451ffb82140a + .quad 0x3ff2484abda600ef + .quad 0x3ff24b760c547f15 + .quad 0x3ff24ea1e7a5eb35 + .quad 0x3ff251ce4fb2a63f + .quad 0x3ff254fb44931561 + .quad 
0x3ff25828c65fa1ff + .quad 0x3ff25b56d530b9bc + .quad 0x3ff25e85711ece75 + .quad 0x3ff261b49a425645 + .quad 0x3ff264e450b3cb82 + .quad 0x3ff26814948bacc3 + .quad 0x3ff26b4565e27cdd + .quad 0x3ff26e76c4d0c2e5 + .quad 0x3ff271a8b16f0a30 + .quad 0x3ff274db2bd5e254 + .quad 0x3ff2780e341ddf29 + .quad 0x3ff27b41ca5f98cb + .quad 0x3ff27e75eeb3ab98 + .quad 0x3ff281aaa132b832 + .quad 0x3ff284dfe1f56381 + .quad 0x3ff28815b11456b1 + .quad 0x3ff28b4c0ea83f36 + .quad 0x3ff28e82fac9ceca + .quad 0x3ff291ba7591bb70 + .quad 0x3ff294f27f18bf72 + .quad 0x3ff2982b17779965 + .quad 0x3ff29b643ec70c27 + .quad 0x3ff29e9df51fdee1 + .quad 0x3ff2a1d83a9add08 + .quad 0x3ff2a5130f50d65c + .quad 0x3ff2a84e735a9eec + .quad 0x3ff2ab8a66d10f13 + .quad 0x3ff2aec6e9cd037b + .quad 0x3ff2b203fc675d1f + .quad 0x3ff2b5419eb90148 + .quad 0x3ff2b87fd0dad990 + .quad 0x3ff2bbbe92e5d3e3 + .quad 0x3ff2befde4f2e280 + .quad 0x3ff2c23dc71afbf7 + .quad 0x3ff2c57e39771b2f + .quad 0x3ff2c8bf3c203f5f + .quad 0x3ff2cc00cf2f6c18 + .quad 0x3ff2cf42f2bda93d + .quad 0x3ff2d285a6e4030b + .quad 0x3ff2d5c8ebbb8a15 + .quad 0x3ff2d90cc15d5346 + .quad 0x3ff2dc5127e277e3 + .quad 0x3ff2df961f641589 + .quad 0x3ff2e2dba7fb4e33 + .quad 0x3ff2e621c1c14833 + .quad 0x3ff2e9686ccf2e3b + .quad 0x3ff2ecafa93e2f56 + .quad 0x3ff2eff777277ef0 + .quad 0x3ff2f33fd6a454d2 + .quad 0x3ff2f688c7cded23 + .quad 0x3ff2f9d24abd886b + .quad 0x3ff2fd1c5f8c6b93 + .quad 0x3ff300670653dfe4 + .quad 0x3ff303b23f2d330b + .quad 0x3ff306fe0a31b715 + .quad 0x3ff30a4a677ac276 + .quad 0x3ff30d975721b004 + .quad 0x3ff310e4d93fdefb + .quad 0x3ff31432edeeb2fd + .quad 0x3ff3178195479413 + .quad 0x3ff31ad0cf63eeac + .quad 0x3ff31e209c5d33a0 + .quad 0x3ff32170fc4cd831 + .quad 0x3ff324c1ef4c560a + .quad 0x3ff3281375752b40 + .quad 0x3ff32b658ee0da54 + .quad 0x3ff32eb83ba8ea32 + .quad 0x3ff3320b7be6e633 + .quad 0x3ff3355f4fb45e20 + .quad 0x3ff338b3b72ae62d + .quad 0x3ff33c08b26416ff + .quad 0x3ff33f5e41798daa + .quad 0x3ff342b46484ebb4 + .quad 0x3ff3460b1b9fd712 + .quad 0x3ff3496266e3fa2d + .quad 0x3ff34cba466b03e1 + .quad 0x3ff35012ba4ea77d + .quad 0x3ff3536bc2a89cc4 + .quad 0x3ff356c55f929ff1 + .quad 0x3ff35a1f912671b1 + .quad 0x3ff35d7a577dd72b + .quad 0x3ff360d5b2b299fc + .quad 0x3ff36431a2de883b + .quad 0x3ff3678e281b7475 + .quad 0x3ff36aeb428335b4 + .quad 0x3ff36e48f22fa77c + .quad 0x3ff371a7373aa9cb + .quad 0x3ff3750611be211c + .quad 0x3ff3786581d3f669 + .quad 0x3ff37bc587961726 + .quad 0x3ff37f26231e754a + .quad 0x3ff3828754870746 + .quad 0x3ff385e91be9c811 + .quad 0x3ff3894b7960b71f + .quad 0x3ff38cae6d05d866 + .quad 0x3ff39011f6f3345f + .quad 0x3ff393761742d808 + .quad 0x3ff396dace0ed4e1 + .quad 0x3ff39a401b7140ef + .quad 0x3ff39da5ff8436bc + .quad 0x3ff3a10c7a61d55b + .quad 0x3ff3a4738c244064 + .quad 0x3ff3a7db34e59ff7 + .quad 0x3ff3ab4374c020bd + .quad 0x3ff3aeac4bcdf3ea + .quad 0x3ff3b215ba294f39 + .quad 0x3ff3b57fbfec6cf4 + .quad 0x3ff3b8ea5d318bef + .quad 0x3ff3bc559212ef89 + .quad 0x3ff3bfc15eaadfb1 + .quad 0x3ff3c32dc313a8e5 + .quad 0x3ff3c69abf679c2e + .quad 0x3ff3ca0853c10f28 + .quad 0x3ff3cd76803a5c00 + .quad 0x3ff3d0e544ede173 + .quad 0x3ff3d454a1f602d0 + .quad 0x3ff3d7c4976d27fa + .quad 0x3ff3db35256dbd67 + .quad 0x3ff3dea64c123422 + .quad 0x3ff3e2180b7501cc + .quad 0x3ff3e58a63b0a09b + .quad 0x3ff3e8fd54df8f5c + .quad 0x3ff3ec70df1c5175 + .quad 0x3ff3efe502816ee3 + .quad 0x3ff3f359bf29743f + .quad 0x3ff3f6cf152ef2b8 + .quad 0x3ff3fa4504ac801c + .quad 0x3ff3fdbb8dbcb6d2 + .quad 0x3ff40132b07a35df + .quad 0x3ff404aa6cffa0e5 + .quad 0x3ff40822c367a024 + .quad 
0x3ff40b9bb3cce07c + .quad 0x3ff40f153e4a136a + .quad 0x3ff4128f62f9ef0e + .quad 0x3ff4160a21f72e2a + .quad 0x3ff419857b5c901f + .quad 0x3ff41d016f44d8f5 + .quad 0x3ff4207dfdcad153 + .quad 0x3ff423fb2709468a + .quad 0x3ff42778eb1b0a8b + .quad 0x3ff42af74a1af3f1 + .quad 0x3ff42e764423ddfd + .quad 0x3ff431f5d950a897 + .quad 0x3ff4357609bc3850 + .quad 0x3ff438f6d5817663 + .quad 0x3ff43c783cbb50b4 + .quad 0x3ff43ffa3f84b9d4 + .quad 0x3ff4437cddf8a8fe + .quad 0x3ff4470018321a1a + .quad 0x3ff44a83ee4c0dbd + .quad 0x3ff44e086061892d + .quad 0x3ff4518d6e8d965b + .quad 0x3ff4551318eb43ec + .quad 0x3ff458995f95a532 + .quad 0x3ff45c2042a7d232 + .quad 0x3ff45fa7c23ce7a4 + .quad 0x3ff4632fde7006f4 + .quad 0x3ff466b8975c563e + .quad 0x3ff46a41ed1d0057 + .quad 0x3ff46dcbdfcd34c8 + .quad 0x3ff471566f8827d0 + .quad 0x3ff474e19c691265 + .quad 0x3ff4786d668b3237 + .quad 0x3ff47bf9ce09c9ab + .quad 0x3ff47f86d3001fe5 + .quad 0x3ff48314758980bf + .quad 0x3ff486a2b5c13cd0 + .quad 0x3ff48a3193c2a96c + .quad 0x3ff48dc10fa920a1 + .quad 0x3ff491512990013f + .quad 0x3ff494e1e192aed2 + .quad 0x3ff4987337cc91a5 + .quad 0x3ff49c052c5916c4 + .quad 0x3ff49f97bf53affd + .quad 0x3ff4a32af0d7d3de + .quad 0x3ff4a6bec100fdba + .quad 0x3ff4aa532feaada6 + .quad 0x3ff4ade83db0687a + .quad 0x3ff4b17dea6db7d7 + .quad 0x3ff4b514363e2a20 + .quad 0x3ff4b8ab213d5283 + .quad 0x3ff4bc42ab86c8f1 + .quad 0x3ff4bfdad5362a27 + .quad 0x3ff4c3739e6717aa + .quad 0x3ff4c70d073537ca + .quad 0x3ff4caa70fbc35a1 + .quad 0x3ff4ce41b817c114 + .quad 0x3ff4d1dd00638ed8 + .quad 0x3ff4d578e8bb586b + .quad 0x3ff4d915713adc1e + .quad 0x3ff4dcb299fddd0d + .quad 0x3ff4e05063202327 + .quad 0x3ff4e3eeccbd7b2a + .quad 0x3ff4e78dd6f1b6a6 + .quad 0x3ff4eb2d81d8abff + .quad 0x3ff4eecdcd8e3669 + .quad 0x3ff4f26eba2e35f0 + .quad 0x3ff4f61047d48f73 + .quad 0x3ff4f9b2769d2ca7 + .quad 0x3ff4fd5546a3fc17 + .quad 0x3ff500f8b804f127 + .quad 0x3ff5049ccadc0412 + .quad 0x3ff508417f4531ee + .quad 0x3ff50be6d55c7ca9 + .quad 0x3ff50f8ccd3deb0d + .quad 0x3ff51333670588bf + .quad 0x3ff516daa2cf6642 + .quad 0x3ff51a8280b798f4 + .quad 0x3ff51e2b00da3b14 + .quad 0x3ff521d423536bbe + .quad 0x3ff5257de83f4eef + .quad 0x3ff529284fba0d84 + .quad 0x3ff52cd359dfd53d + .quad 0x3ff5307f06ccd8ba + .quad 0x3ff5342b569d4f82 + .quad 0x3ff537d8496d75fc + .quad 0x3ff53b85df598d78 + .quad 0x3ff53f34187ddc28 + .quad 0x3ff542e2f4f6ad27 + .quad 0x3ff5469274e05078 + .quad 0x3ff54a4298571b06 + .quad 0x3ff54df35f7766a3 + .quad 0x3ff551a4ca5d920f + .quad 0x3ff55556d92600f1 + .quad 0x3ff559098bed1bdf + .quad 0x3ff55cbce2cf505b + .quad 0x3ff56070dde910d2 + .quad 0x3ff564257d56d4a2 + .quad 0x3ff567dac1351819 + .quad 0x3ff56b90a9a05c72 + .quad 0x3ff56f4736b527da + .quad 0x3ff572fe68900573 + .quad 0x3ff576b63f4d854c + .quad 0x3ff57a6ebb0a3c6d + .quad 0x3ff57e27dbe2c4cf + .quad 0x3ff581e1a1f3bd60 + .quad 0x3ff5859c0d59ca07 + .quad 0x3ff589571e31939f + .quad 0x3ff58d12d497c7fd + .quad 0x3ff590cf30a919ed + .quad 0x3ff5948c32824135 + .quad 0x3ff59849da3ffa96 + .quad 0x3ff59c0827ff07cc + .quad 0x3ff59fc71bdc2f8e + .quad 0x3ff5a386b5f43d92 + .quad 0x3ff5a746f664028b + .quad 0x3ff5ab07dd485429 + .quad 0x3ff5aec96abe0d1f + .quad 0x3ff5b28b9ee20d1e + .quad 0x3ff5b64e79d138d8 + .quad 0x3ff5ba11fba87a03 + .quad 0x3ff5bdd62484bf56 + .quad 0x3ff5c19af482fc8f + .quad 0x3ff5c5606bc02a6d + .quad 0x3ff5c9268a5946b7 + .quad 0x3ff5cced506b543a + .quad 0x3ff5d0b4be135acc + .quad 0x3ff5d47cd36e6747 + .quad 0x3ff5d84590998b93 + .quad 0x3ff5dc0ef5b1de9e + .quad 0x3ff5dfd902d47c65 + .quad 0x3ff5e3a3b81e85ec + .quad 
0x3ff5e76f15ad2148 + .quad 0x3ff5eb3b1b9d799a + .quad 0x3ff5ef07ca0cbf0f + .quad 0x3ff5f2d5211826e8 + .quad 0x3ff5f6a320dceb71 + .quad 0x3ff5fa71c9784c0b + .quad 0x3ff5fe411b078d26 + .quad 0x3ff6021115a7f849 + .quad 0x3ff605e1b976dc09 + .quad 0x3ff609b306918c13 + .quad 0x3ff60d84fd15612a + .quad 0x3ff611579d1fb925 + .quad 0x3ff6152ae6cdf6f4 + .quad 0x3ff618feda3d829f + .quad 0x3ff61cd3778bc944 + .quad 0x3ff620a8bed63d1f + .quad 0x3ff6247eb03a5585 + .quad 0x3ff628554bd58ee5 + .quad 0x3ff62c2c91c56acd + .quad 0x3ff6300482276fe8 + .quad 0x3ff633dd1d1929fd + .quad 0x3ff637b662b829f5 + .quad 0x3ff63b90532205d8 + .quad 0x3ff63f6aee7458cd + .quad 0x3ff6434634ccc320 + .quad 0x3ff647222648ea3d + .quad 0x3ff64afec30678b7 + .quad 0x3ff64edc0b231e41 + .quad 0x3ff652b9febc8fb7 + .quad 0x3ff656989df08719 + .quad 0x3ff65a77e8dcc390 + .quad 0x3ff65e57df9f096b + .quad 0x3ff6623882552225 + .quad 0x3ff66619d11cdc5f + .quad 0x3ff669fbcc140be7 + .quad 0x3ff66dde735889b8 + .quad 0x3ff671c1c70833f6 + .quad 0x3ff675a5c740edf5 + .quad 0x3ff6798a7420a036 + .quad 0x3ff67d6fcdc5386a + .quad 0x3ff68155d44ca973 + .quad 0x3ff6853c87d4eb62 + .quad 0x3ff68923e87bfb7a + .quad 0x3ff68d0bf65fdc34 + .quad 0x3ff690f4b19e9538 + .quad 0x3ff694de1a563367 + .quad 0x3ff698c830a4c8d4 + .quad 0x3ff69cb2f4a86cca + .quad 0x3ff6a09e667f3bcd + .quad 0x3ff6a48a86475795 + .quad 0x3ff6a877541ee718 + .quad 0x3ff6ac64d0241683 + .quad 0x3ff6b052fa75173e + .quad 0x3ff6b441d3301fee + .quad 0x3ff6b8315a736c75 + .quad 0x3ff6bc21905d3df0 + .quad 0x3ff6c012750bdabf + .quad 0x3ff6c404089d8e7d + .quad 0x3ff6c7f64b30aa09 + .quad 0x3ff6cbe93ce38381 + .quad 0x3ff6cfdcddd47645 + .quad 0x3ff6d3d12e21e2fb + .quad 0x3ff6d7c62dea2f8a + .quad 0x3ff6dbbbdd4bc720 + .quad 0x3ff6dfb23c651a2f + .quad 0x3ff6e3a94b549e71 + .quad 0x3ff6e7a10a38cee8 + .quad 0x3ff6eb9979302bdd + .quad 0x3ff6ef9298593ae5 + .quad 0x3ff6f38c67d286dd + .quad 0x3ff6f786e7ba9fef + .quad 0x3ff6fb8218301b90 + .quad 0x3ff6ff7df9519484 + .quad 0x3ff7037a8b3daadb + .quad 0x3ff70777ce1303f6 + .quad 0x3ff70b75c1f04a84 + .quad 0x3ff70f7466f42e87 + .quad 0x3ff71373bd3d6551 + .quad 0x3ff71773c4eaa988 + .quad 0x3ff71b747e1abb24 + .quad 0x3ff71f75e8ec5f74 + .quad 0x3ff72378057e611a + .quad 0x3ff7277ad3ef9011 + .quad 0x3ff72b7e545ec1a8 + .quad 0x3ff72f8286ead08a + .quad 0x3ff733876bb29cb8 + .quad 0x3ff7378d02d50b8f + .quad 0x3ff73b934c7107c7 + .quad 0x3ff73f9a48a58174 + .quad 0x3ff743a1f7916e05 + .quad 0x3ff747aa5953c849 + .quad 0x3ff74bb36e0b906d + .quad 0x3ff74fbd35d7cbfd + .quad 0x3ff753c7b0d785e8 + .quad 0x3ff757d2df29ce7c + .quad 0x3ff75bdec0edbb6b + .quad 0x3ff75feb564267c9 + .quad 0x3ff763f89f46f40f + .quad 0x3ff768069c1a861d + .quad 0x3ff76c154cdc4937 + .quad 0x3ff77024b1ab6e09 + .quad 0x3ff77434caa72aa7 + .quad 0x3ff7784597eeba8f + .quad 0x3ff77c5719a15ea6 + .quad 0x3ff780694fde5d3f + .quad 0x3ff7847c3ac50219 + .quad 0x3ff7888fda749e5d + .quad 0x3ff78ca42f0c88a5 + .quad 0x3ff790b938ac1cf6 + .quad 0x3ff794cef772bcc9 + .quad 0x3ff798e56b7fcf03 + .quad 0x3ff79cfc94f2bfff + .quad 0x3ff7a11473eb0187 + .quad 0x3ff7a52d08880ad9 + .quad 0x3ff7a94652e958aa + .quad 0x3ff7ad60532e6d20 + .quad 0x3ff7b17b0976cfdb + .quad 0x3ff7b59675e20def + .quad 0x3ff7b9b2988fb9ec + .quad 0x3ff7bdcf719f6bd7 + .quad 0x3ff7c1ed0130c132 + .quad 0x3ff7c60b47635cf9 + .quad 0x3ff7ca2a4456e7a3 + .quad 0x3ff7ce49f82b0f24 + .quad 0x3ff7d26a62ff86f0 + .quad 0x3ff7d68b84f407f8 + .quad 0x3ff7daad5e2850ac + .quad 0x3ff7decfeebc24fe + .quad 0x3ff7e2f336cf4e62 + .quad 0x3ff7e71736819bcd + .quad 0x3ff7eb3bedf2e1b9 + .quad 
0x3ff7ef615d42fa24 + .quad 0x3ff7f3878491c491 + .quad 0x3ff7f7ae63ff260a + .quad 0x3ff7fbd5fbab091f + .quad 0x3ff7fffe4bb55dec + .quad 0x3ff80427543e1a12 + .quad 0x3ff80851156538be + .quad 0x3ff80c7b8f4abaa9 + .quad 0x3ff810a6c20ea617 + .quad 0x3ff814d2add106d9 + .quad 0x3ff818ff52b1ee50 + .quad 0x3ff81d2cb0d1736a + .quad 0x3ff8215ac84fb2a6 + .quad 0x3ff82589994cce13 + .quad 0x3ff829b923e8ed53 + .quad 0x3ff82de968443d9a + .quad 0x3ff8321a667ef1b2 + .quad 0x3ff8364c1eb941f7 + .quad 0x3ff83a7e91136c5d + .quad 0x3ff83eb1bdadb46d + .quad 0x3ff842e5a4a8634a + .quad 0x3ff8471a4623c7ad + .quad 0x3ff84b4fa24035ea + .quad 0x3ff84f85b91e07f1 + .quad 0x3ff853bc8add9d4c + .quad 0x3ff857f4179f5b21 + .quad 0x3ff85c2c5f83ac35 + .quad 0x3ff8606562ab00ec + .quad 0x3ff8649f2135cf48 + .quad 0x3ff868d99b4492ed + .quad 0x3ff86d14d0f7cd1d + .quad 0x3ff87150c27004c2 + .quad 0x3ff8758d6fcdc666 + .quad 0x3ff879cad931a436 + .quad 0x3ff87e08febc3608 + .quad 0x3ff88247e08e1957 + .quad 0x3ff886877ec7f144 + .quad 0x3ff88ac7d98a6699 + .quad 0x3ff88f08f0f627cb + .quad 0x3ff8934ac52be8f7 + .quad 0x3ff8978d564c63e7 + .quad 0x3ff89bd0a478580f + .quad 0x3ff8a014afd08a94 + .quad 0x3ff8a4597875c644 + .quad 0x3ff8a89efe88dba1 + .quad 0x3ff8ace5422aa0db + .quad 0x3ff8b12c437bf1d4 + .quad 0x3ff8b574029db01e + .quad 0x3ff8b9bc7fb0c302 + .quad 0x3ff8be05bad61778 + .quad 0x3ff8c24fb42ea033 + .quad 0x3ff8c69a6bdb5598 + .quad 0x3ff8cae5e1fd35c4 + .quad 0x3ff8cf3216b5448c + .quad 0x3ff8d37f0a248b7f + .quad 0x3ff8d7ccbc6c19e6 + .quad 0x3ff8dc1b2dad04c4 + .quad 0x3ff8e06a5e0866d9 + .quad 0x3ff8e4ba4d9f60a1 + .quad 0x3ff8e90afc931857 + .quad 0x3ff8ed5c6b04b9f6 + .quad 0x3ff8f1ae99157736 + .quad 0x3ff8f60186e68793 + .quad 0x3ff8fa553499284b + .quad 0x3ff8fea9a24e9c5c + .quad 0x3ff902fed0282c8a + .quad 0x3ff90754be472760 + .quad 0x3ff90bab6ccce12c + .quad 0x3ff91002dbdab403 + .quad 0x3ff9145b0b91ffc6 + .quad 0x3ff918b3fc142a19 + .quad 0x3ff91d0dad829e70 + .quad 0x3ff921681ffece05 + .quad 0x3ff925c353aa2fe2 + .quad 0x3ff92a1f48a640dc + .quad 0x3ff92e7bff148396 + .quad 0x3ff932d977168083 + .quad 0x3ff93737b0cdc5e5 + .quad 0x3ff93b96ac5be7d1 + .quad 0x3ff93ff669e2802b + .quad 0x3ff94456e9832ead + .quad 0x3ff948b82b5f98e5 + .quad 0x3ff94d1a2f996a33 + .quad 0x3ff9517cf65253d1 + .quad 0x3ff955e07fac0ccd + .quad 0x3ff95a44cbc8520f + .quad 0x3ff95ea9dac8e658 + .quad 0x3ff9630faccf9243 + .quad 0x3ff9677641fe2446 + .quad 0x3ff96bdd9a7670b3 + .quad 0x3ff97045b65a51ba + .quad 0x3ff974ae95cba768 + .quad 0x3ff9791838ec57ab + .quad 0x3ff97d829fde4e50 + .quad 0x3ff981edcac37d05 + .quad 0x3ff98659b9bddb5b + .quad 0x3ff98ac66cef66c8 + .quad 0x3ff98f33e47a22a2 + .quad 0x3ff993a220801829 + .quad 0x3ff9981121235681 + .quad 0x3ff99c80e685f2b5 + .quad 0x3ff9a0f170ca07ba + .quad 0x3ff9a562c011b66d + .quad 0x3ff9a9d4d47f2598 + .quad 0x3ff9ae47ae3481ed + .quad 0x3ff9b2bb4d53fe0d + .quad 0x3ff9b72fb1ffd285 + .quad 0x3ff9bba4dc5a3dd3 + .quad 0x3ff9c01acc858463 + .quad 0x3ff9c49182a3f090 + .quad 0x3ff9c908fed7d2aa + .quad 0x3ff9cd81414380f2 + .quad 0x3ff9d1fa4a09579d + .quad 0x3ff9d674194bb8d5 + .quad 0x3ff9daeeaf2d0cb8 + .quad 0x3ff9df6a0bcfc15e + .quad 0x3ff9e3e62f564ad5 + .quad 0x3ff9e86319e32323 + .quad 0x3ff9ece0cb98ca4b + .quad 0x3ff9f15f4499c647 + .quad 0x3ff9f5de8508a311 + .quad 0x3ff9fa5e8d07f29e + .quad 0x3ff9fedf5cba4ce0 + .quad 0x3ffa0360f4424fcb + .quad 0x3ffa07e353c29f50 + .quad 0x3ffa0c667b5de565 + .quad 0x3ffa10ea6b36d1fe + .quad 0x3ffa156f23701b15 + .quad 0x3ffa19f4a42c7ca9 + .quad 0x3ffa1e7aed8eb8bb + .quad 0x3ffa2301ffb99757 + .quad 
0x3ffa2789dacfe68c + .quad 0x3ffa2c127ef47a74 + .quad 0x3ffa309bec4a2d33 + .quad 0x3ffa352622f3def6 + .quad 0x3ffa39b1231475f7 + .quad 0x3ffa3e3ceccede7c + .quad 0x3ffa42c980460ad8 + .quad 0x3ffa4756dd9cf36e + .quad 0x3ffa4be504f696b1 + .quad 0x3ffa5073f675f924 + .quad 0x3ffa5503b23e255d + .quad 0x3ffa599438722c03 + .quad 0x3ffa5e25893523d4 + .quad 0x3ffa62b7a4aa29a1 + .quad 0x3ffa674a8af46052 + .quad 0x3ffa6bde3c36f0e6 + .quad 0x3ffa7072b8950a73 + .quad 0x3ffa75080031e22b + .quad 0x3ffa799e1330b358 + .quad 0x3ffa7e34f1b4bf62 + .quad 0x3ffa82cc9be14dca + .quad 0x3ffa876511d9ac32 + .quad 0x3ffa8bfe53c12e59 + .quad 0x3ffa909861bb2e1d + .quad 0x3ffa95333beb0b7e + .quad 0x3ffa99cee2742c9d + .quad 0x3ffa9e6b5579fdbf + .quad 0x3ffaa308951ff14d + .quad 0x3ffaa7a6a1897fd2 + .quad 0x3ffaac457ada2803 + .quad 0x3ffab0e521356eba + .quad 0x3ffab58594bedefa + .quad 0x3ffaba26d59a09ee + .quad 0x3ffabec8e3ea86ee + .quad 0x3ffac36bbfd3f37a + .quad 0x3ffac80f6979f340 + .quad 0x3ffaccb3e100301e + .quad 0x3ffad159268a5a1c + .quad 0x3ffad5ff3a3c2774 + .quad 0x3ffadaa61c395493 + .quad 0x3ffadf4dcca5a413 + .quad 0x3ffae3f64ba4dec6 + .quad 0x3ffae89f995ad3ad + .quad 0x3ffaed49b5eb5803 + .quad 0x3ffaf1f4a17a4735 + .quad 0x3ffaf6a05c2b82e9 + .quad 0x3ffafb4ce622f2ff + .quad 0x3ffafffa3f84858c + .quad 0x3ffb04a868742ee4 + .quad 0x3ffb09576115e994 + .quad 0x3ffb0e07298db666 + .quad 0x3ffb12b7c1ff9c61 + .quad 0x3ffb17692a8fa8cd + .quad 0x3ffb1c1b6361ef31 + .quad 0x3ffb20ce6c9a8952 + .quad 0x3ffb2582465d973c + .quad 0x3ffb2a36f0cf3f3a + .quad 0x3ffb2eec6c13addd + .quad 0x3ffb33a2b84f15fb + .quad 0x3ffb3859d5a5b0b1 + .quad 0x3ffb3d11c43bbd62 + .quad 0x3ffb41ca843581ba + .quad 0x3ffb468415b749b1 + .quad 0x3ffb4b3e78e56786 + .quad 0x3ffb4ff9ade433c6 + .quad 0x3ffb54b5b4d80d4a + .quad 0x3ffb59728de5593a + .quad 0x3ffb5e303930830c + .quad 0x3ffb62eeb6ddfc87 + .quad 0x3ffb67ae07123dc3 + .quad 0x3ffb6c6e29f1c52a + .quad 0x3ffb712f1fa1177b + .quad 0x3ffb75f0e844bfc6 + .quad 0x3ffb7ab384014f76 + .quad 0x3ffb7f76f2fb5e47 + .quad 0x3ffb843b35578a51 + .quad 0x3ffb89004b3a7804 + .quad 0x3ffb8dc634c8d228 + .quad 0x3ffb928cf22749e4 + .quad 0x3ffb9754837a96b7 + .quad 0x3ffb9c1ce8e77680 + .quad 0x3ffba0e62292ad7d + .quad 0x3ffba5b030a1064a + .quad 0x3ffbaa7b133751e3 + .quad 0x3ffbaf46ca7a67a7 + .quad 0x3ffbb413568f255a + .quad 0x3ffbb8e0b79a6f1f + .quad 0x3ffbbdaeedc12f82 + .quad 0x3ffbc27df9285775 + .quad 0x3ffbc74dd9f4de4f + .quad 0x3ffbcc1e904bc1d2 + .quad 0x3ffbd0f01c520628 + .quad 0x3ffbd5c27e2cb5e5 + .quad 0x3ffbda95b600e20b + .quad 0x3ffbdf69c3f3a207 + .quad 0x3ffbe43ea82a13b5 + .quad 0x3ffbe91462c95b60 + .quad 0x3ffbedeaf3f6a3c2 + .quad 0x3ffbf2c25bd71e09 + .quad 0x3ffbf79a9a9001d2 + .quad 0x3ffbfc73b0468d30 + .quad 0x3ffc014d9d2004aa + .quad 0x3ffc06286141b33d + .quad 0x3ffc0b03fcd0ea5c + .quad 0x3ffc0fe06ff301f4 + .quad 0x3ffc14bdbacd586a + .quad 0x3ffc199bdd85529c + .quad 0x3ffc1e7ad8405be6 + .quad 0x3ffc235aab23e61e + .quad 0x3ffc283b56556999 + .quad 0x3ffc2d1cd9fa652c + .quad 0x3ffc31ff36385e29 + .quad 0x3ffc36e26b34e065 + .quad 0x3ffc3bc679157e38 + .quad 0x3ffc40ab5fffd07a + .quad 0x3ffc45912019768c + .quad 0x3ffc4a77b9881650 + .quad 0x3ffc4f5f2c715c31 + .quad 0x3ffc544778fafb22 + .quad 0x3ffc59309f4aac9f + .quad 0x3ffc5e1a9f8630ad + .quad 0x3ffc630579d34ddd + .quad 0x3ffc67f12e57d14b + .quad 0x3ffc6cddbd398ea4 + .quad 0x3ffc71cb269e601f + .quad 0x3ffc76b96aac2686 + .quad 0x3ffc7ba88988c933 + .quad 0x3ffc8098835a3611 + .quad 0x3ffc8589584661a1 + .quad 0x3ffc8a7b087346f4 + .quad 0x3ffc8f6d9406e7b5 + .quad 
0x3ffc9460fb274c22 + .quad 0x3ffc99553dfa8313 + .quad 0x3ffc9e4a5ca6a1f8 + .quad 0x3ffca3405751c4db + .quad 0x3ffca8372e220e61 + .quad 0x3ffcad2ee13da7cb + .quad 0x3ffcb22770cac0f9 + .quad 0x3ffcb720dcef9069 + .quad 0x3ffcbc1b25d25337 + .quad 0x3ffcc1164b994d23 + .quad 0x3ffcc6124e6ac88b + .quad 0x3ffccb0f2e6d1675 + .quad 0x3ffcd00cebc68e87 + .quad 0x3ffcd50b869d8f0f + .quad 0x3ffcda0aff187d02 + .quad 0x3ffcdf0b555dc3fa + .quad 0x3ffce40c8993d63d + .quad 0x3ffce90e9be12cb9 + .quad 0x3ffcee118c6c4709 + .quad 0x3ffcf3155b5bab74 + .quad 0x3ffcf81a08d5e6ec + .quad 0x3ffcfd1f95018d17 + .quad 0x3ffd022600053845 + .quad 0x3ffd072d4a07897c + .quad 0x3ffd0c35732f2870 + .quad 0x3ffd113e7ba2c38c + .quad 0x3ffd164863890fee + .quad 0x3ffd1b532b08c968 + .quad 0x3ffd205ed248b287 + .quad 0x3ffd256b596f948c + .quad 0x3ffd2a78c0a43f72 + .quad 0x3ffd2f87080d89f2 + .quad 0x3ffd34962fd2517a + .quad 0x3ffd39a638197a3c + .quad 0x3ffd3eb72109ef21 + .quad 0x3ffd43c8eacaa1d6 + .quad 0x3ffd48db95828ac7 + .quad 0x3ffd4def2158a91f + .quad 0x3ffd53038e7402ce + .quad 0x3ffd5818dcfba487 + .quad 0x3ffd5d2f0d16a1c3 + .quad 0x3ffd62461eec14be + .quad 0x3ffd675e12a31e7f + .quad 0x3ffd6c76e862e6d3 + .quad 0x3ffd7190a0529c51 + .quad 0x3ffd76ab3a99745b + .quad 0x3ffd7bc6b75eab1f + .quad 0x3ffd80e316c98398 + .quad 0x3ffd86005901478f + .quad 0x3ffd8b1e7e2d479d + .quad 0x3ffd903d8674db2b + .quad 0x3ffd955d71ff6075 + .quad 0x3ffd9a7e40f43c89 + .quad 0x3ffd9f9ff37adb4a + .quad 0x3ffda4c289baaf6e + .quad 0x3ffda9e603db3285 + .quad 0x3ffdaf0a6203e4f5 + .quad 0x3ffdb42fa45c4dfd + .quad 0x3ffdb955cb0bfbb6 + .quad 0x3ffdbe7cd63a8315 + .quad 0x3ffdc3a4c60f7fea + .quad 0x3ffdc8cd9ab294e4 + .quad 0x3ffdcdf7544b6b92 + .quad 0x3ffdd321f301b460 + .quad 0x3ffdd84d76fd269e + .quad 0x3ffddd79e065807d + .quad 0x3ffde2a72f628712 + .quad 0x3ffde7d5641c0658 + .quad 0x3ffded047eb9d12d + .quad 0x3ffdf2347f63c159 + .quad 0x3ffdf7656641b78c + .quad 0x3ffdfc97337b9b5f + .quad 0x3ffe01c9e7395b56 + .quad 0x3ffe06fd81a2ece1 + .quad 0x3ffe0c3202e04c5d + .quad 0x3ffe11676b197d17 + .quad 0x3ffe169dba768949 + .quad 0x3ffe1bd4f11f8220 + .quad 0x3ffe210d0f3c7fba + .quad 0x3ffe264614f5a129 + .quad 0x3ffe2b8002730c71 + .quad 0x3ffe30bad7dcee90 + .quad 0x3ffe35f6955b7b78 + .quad 0x3ffe3b333b16ee12 + .quad 0x3ffe4070c9378842 + .quad 0x3ffe45af3fe592e8 + .quad 0x3ffe4aee9f495ddc + .quad 0x3ffe502ee78b3ff6 + .quad 0x3ffe557018d3970b + .quad 0x3ffe5ab2334ac7ee + .quad 0x3ffe5ff537193e75 + .quad 0x3ffe653924676d76 + .quad 0x3ffe6a7dfb5dceca + .quad 0x3ffe6fc3bc24e350 + .quad 0x3ffe750a66e532eb + .quad 0x3ffe7a51fbc74c83 + .quad 0x3ffe7f9a7af3c60b + .quad 0x3ffe84e3e4933c7e + .quad 0x3ffe8a2e38ce53df + .quad 0x3ffe8f7977cdb740 + .quad 0x3ffe94c5a1ba18bd + .quad 0x3ffe9a12b6bc3181 + .quad 0x3ffe9f60b6fcc1c7 + .quad 0x3ffea4afa2a490da + .quad 0x3ffea9ff79dc6d14 + .quad 0x3ffeaf503ccd2be5 + .quad 0x3ffeb4a1eb9fa9d1 + .quad 0x3ffeb9f4867cca6e + .quad 0x3ffebf480d8d786d + .quad 0x3ffec49c80faa594 + .quad 0x3ffec9f1e0ed4ac2 + .quad 0x3ffecf482d8e67f1 + .quad 0x3ffed49f67070435 + .quad 0x3ffed9f78d802dc2 + .quad 0x3ffedf50a122f9e6 + .quad 0x3ffee4aaa2188510 + .quad 0x3ffeea059089f2d0 + .quad 0x3ffeef616ca06dd6 + .quad 0x3ffef4be368527f6 + .quad 0x3ffefa1bee615a27 + .quad 0x3ffeff7a945e4487 + .quad 0x3fff04da28a52e59 + .quad 0x3fff0a3aab5f6609 + .quad 0x3fff0f9c1cb6412a + .quad 0x3fff14fe7cd31c7b + .quad 0x3fff1a61cbdf5be7 + .quad 0x3fff1fc60a046a84 + .quad 0x3fff252b376bba97 + .quad 0x3fff2a91543ec595 + .quad 0x3fff2ff860a70c22 + .quad 0x3fff35605cce1613 + .quad 
0x3fff3ac948dd7274
+ .quad 0x3fff403324feb781
+ .quad 0x3fff459df15b82ac
+ .quad 0x3fff4b09ae1d78a1
+ .quad 0x3fff50765b6e4540
+ .quad 0x3fff55e3f9779ba5
+ .quad 0x3fff5b5288633625
+ .quad 0x3fff60c2085ad652
+ .quad 0x3fff6632798844f8
+ .quad 0x3fff6ba3dc155226
+ .quad 0x3fff7116302bd526
+ .quad 0x3fff768975f5ac86
+ .quad 0x3fff7bfdad9cbe14
+ .quad 0x3fff8172d74af6e1
+ .quad 0x3fff86e8f32a4b45
+ .quad 0x3fff8c600164b6dc
+ .quad 0x3fff91d802243c89
+ .quad 0x3fff9750f592e677
+ .quad 0x3fff9ccadbdac61d
+ .quad 0x3fffa245b525f439
+ .quad 0x3fffa7c1819e90d8
+ .quad 0x3fffad3e416ec354
+ .quad 0x3fffb2bbf4c0ba54
+ .quad 0x3fffb83a9bbeabd1
+ .quad 0x3fffbdba3692d514
+ .quad 0x3fffc33ac5677ab8
+ .quad 0x3fffc8bc4866e8ad
+ .quad 0x3fffce3ebfbb7237
+ .quad 0x3fffd3c22b8f71f1
+ .quad 0x3fffd9468c0d49cc
+ .quad 0x3fffdecbe15f6314
+ .quad 0x3fffe4522bb02e6e
+ .quad 0x3fffe9d96b2a23d9
+ .quad 0x3fffef619ff7c2b3
+ .quad 0x3ffff4eaca4391b6
+ .quad 0x3ffffa74ea381efc
+
+/* Range reduction coefficients:
+ * log(2) inverted = 2^k/ln2 */
+double_vector __dbInvLn2 0x40971547652b82fe
+
+/* right-shifter value = 1.5*2^52 (= 3*2^51) */
+double_vector __dbShifter 0x4338000000000000
+
+/* log(2) high part = ln2/2^k(52-k-9 hibits) */
+double_vector __dbLn2hi 0x3f462e42fec00000
+
+/* log(2) low part = ln2/2^k(52-k-9..104-k-9 lobits) */
+double_vector __dbLn2lo 0x3d5d1cf79abc9e3b
+
+/* Polynomial coefficients (k=10, deg=3): */
+double_vector __dPC0 0x3ff0000000000000
+double_vector __dPC1 0x3fe0000001ebfbe0
+double_vector __dPC2 0x3fc5555555555556
+
+/* Other constants:
+ * index mask = 2^k-1 */
+double_vector __lIndexMask 0x00000000000003ff
+
+/* absolute value mask (SP) */
+float_vector __iAbsMask 0x7fffffff
+
+/* domain range (SP) (>= 0x4086232B) */
+float_vector __iDomainRange 0x4086232a
+ .type __svml_dexp_data,@object
+ .size __svml_dexp_data,.-__svml_dexp_data
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.h
new file mode 100644
index 0000000000..70e7660739
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_exp_data.h
@@ -0,0 +1,52 @@
+/* Offsets for data table for function exp.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. 
*/ + +#ifndef D_EXP_DATA_H +#define D_EXP_DATA_H + +#define __dbT 0 +#define __dbInvLn2 8192 +#define __dbShifter 8256 +#define __dbLn2hi 8320 +#define __dbLn2lo 8384 +#define __dPC0 8448 +#define __dPC1 8512 +#define __dPC2 8576 +#define __lIndexMask 8640 +#define __iAbsMask 8704 +#define __iDomainRange 8768 + +.macro double_vector offset value +.if .-__svml_dexp_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +.macro float_vector offset value +.if .-__svml_dexp_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log2_core.S new file mode 100644 index 0000000000..4e2d9b9640 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log2_core.S @@ -0,0 +1,29 @@ +/* Function log vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_log) +WRAPPER_IMPL_SSE2 __log_finite +END (_ZGVbN2v_log) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_log) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core.S new file mode 100644 index 0000000000..2db872682d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core.S @@ -0,0 +1,29 @@ +/* Function log vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_log) +WRAPPER_IMPL_AVX _ZGVbN2v_log +END (_ZGVdN4v_log) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_log) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S new file mode 100644 index 0000000000..72cb77a1b7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S @@ -0,0 +1,25 @@ +/* Function log vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_log) +WRAPPER_IMPL_AVX _ZGVbN2v_log +END (_ZGVcN4v_log) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log8_core.S new file mode 100644 index 0000000000..d4c4850fdc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log8_core.S @@ -0,0 +1,25 @@ +/* Function log vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_log) +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +END (_ZGVeN8v_log) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.S new file mode 100644 index 0000000000..b17874100c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.S @@ -0,0 +1,1662 @@ +/* Data for function log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_log_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function log. + The table may contain polynomial, reduction, lookup coefficients + and other constants obtained through different methods + of research and experimental work. 
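+
+ Editorial note: a rough C model of the table-driven scheme such data
+ supports (an illustrative sketch only, not this exact kernel; rcp9()
+ and idx9() are hypothetical helpers): with x = 2^e * m, 1 <= m < 2,
+ and r = rcp9(m) a 9-bit-accurate reciprocal of m,
+
+ double q = m * r - 1.0; // |q| is small because m*r ~= 1
+ double poly = q - 0.5 * q * q + q * q * q / 3.0; // log(1+q)
+ double result = e * M_LN2 + tbl_hi[idx9(m)] + tbl_lo[idx9(m)] + poly;
+
+ since log(x) = e*log(2) + log(m) and log(m) = -log(r) + log(m*r),
+ with -log(r) read from the high+low pairs below.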
*/ + .globl __svml_dlog_data +__svml_dlog_data: + +/* Lookup table in high+low parts and 9-bit index for + -log(mRcp), where mRcp is mantissa of 1/x 9-bit accurate reciprocal: */ +.if .-__svml_dlog_data != _Log_HA_table +.err +.endif + .quad 0xc086232bdd7a8300 + .quad 0xbe1ce91eef3fb100 + .quad 0xc086232fdc7ad828 + .quad 0xbe1cefcffda73b6a + .quad 0xc0862333d97d2ba0 + .quad 0xbe1cef406748f1ff + .quad 0xc0862337d48378e0 + .quad 0xbe1cef2a9429925a + .quad 0xc086233bcd8fb878 + .quad 0xbe1cf138d17ebecb + .quad 0xc086233fc4a3e018 + .quad 0xbe1ceff2dbbbb29e + .quad 0xc0862343b9c1e270 + .quad 0xbe1cf1a42aae437b + .quad 0xc0862347acebaf68 + .quad 0xbe1cef3b152048af + .quad 0xc086234b9e2333f0 + .quad 0xbe1cef20e127805e + .quad 0xc086234f8d6a5a30 + .quad 0xbe1cf00ad6052cf4 + .quad 0xc08623537ac30980 + .quad 0xbe1cefc4642ee597 + .quad 0xc0862357662f2660 + .quad 0xbe1cf1f277d36e16 + .quad 0xc086235b4fb092a0 + .quad 0xbe1ceed009e8d8e6 + .quad 0xc086235f37492d28 + .quad 0xbe1cf1e4038cb362 + .quad 0xc08623631cfad250 + .quad 0xbe1cf0b0873b8557 + .quad 0xc086236700c75b98 + .quad 0xbe1cf15bb3227c0b + .quad 0xc086236ae2b09fe0 + .quad 0xbe1cf151ef8ca9ed + .quad 0xc086236ec2b87358 + .quad 0xbe1cefe1dc2cd2ed + .quad 0xc0862372a0e0a780 + .quad 0xbe1cf0d1eec5454f + .quad 0xc08623767d2b0b48 + .quad 0xbe1ceeefd570bbce + .quad 0xc086237a57996af0 + .quad 0xbe1cee99ae91b3a7 + .quad 0xc086237e302d9028 + .quad 0xbe1cf0412830fbd1 + .quad 0xc086238206e94218 + .quad 0xbe1ceee898588610 + .quad 0xc0862385dbce4548 + .quad 0xbe1cee9a1fbcaaea + .quad 0xc0862389aede5bc0 + .quad 0xbe1ceed8e7cc1ad6 + .quad 0xc086238d801b4500 + .quad 0xbe1cf10c8d059da6 + .quad 0xc08623914f86be18 + .quad 0xbe1ceee6c63a8165 + .quad 0xc08623951d228180 + .quad 0xbe1cf0c3592d2ff1 + .quad 0xc0862398e8f04758 + .quad 0xbe1cf0026cc4cb1b + .quad 0xc086239cb2f1c538 + .quad 0xbe1cf15d48d8e670 + .quad 0xc08623a07b28ae60 + .quad 0xbe1cef359363787c + .quad 0xc08623a44196b390 + .quad 0xbe1cefdf1ab2e82c + .quad 0xc08623a8063d8338 + .quad 0xbe1cefe43c02aa84 + .quad 0xc08623abc91ec960 + .quad 0xbe1cf044f5ae35b7 + .quad 0xc08623af8a3c2fb8 + .quad 0xbe1cf0b0b4001e1b + .quad 0xc08623b349975d98 + .quad 0xbe1cf1bae76dfbcf + .quad 0xc08623b70731f810 + .quad 0xbe1cef0a72e13a62 + .quad 0xc08623bac30da1c8 + .quad 0xbe1cf184007d2b6b + .quad 0xc08623be7d2bfb40 + .quad 0xbe1cf16f4b239e98 + .quad 0xc08623c2358ea2a0 + .quad 0xbe1cf0976acada87 + .quad 0xc08623c5ec3733d0 + .quad 0xbe1cf066318a16ff + .quad 0xc08623c9a1274880 + .quad 0xbe1ceffaa7148798 + .quad 0xc08623cd54607820 + .quad 0xbe1cf23ab02e9b6e + .quad 0xc08623d105e45800 + .quad 0xbe1cefdfef7d4fde + .quad 0xc08623d4b5b47b20 + .quad 0xbe1cf17fece44f2b + .quad 0xc08623d863d27270 + .quad 0xbe1cf18f907d0d7c + .quad 0xc08623dc103fccb0 + .quad 0xbe1cee61fe072c98 + .quad 0xc08623dfbafe1668 + .quad 0xbe1cf022dd891e2f + .quad 0xc08623e3640eda20 + .quad 0xbe1ceecc1daf4358 + .quad 0xc08623e70b73a028 + .quad 0xbe1cf0173c4fa380 + .quad 0xc08623eab12deec8 + .quad 0xbe1cf16a2150c2f4 + .quad 0xc08623ee553f4a30 + .quad 0xbe1cf1bf980b1f4b + .quad 0xc08623f1f7a93480 + .quad 0xbe1cef8b731663c2 + .quad 0xc08623f5986d2dc0 + .quad 0xbe1cee9a664d7ef4 + .quad 0xc08623f9378cb3f0 + .quad 0xbe1cf1eda2af6400 + .quad 0xc08623fcd5094320 + .quad 0xbe1cf1923f9d68d7 + .quad 0xc086240070e45548 + .quad 0xbe1cf0747cd3e03a + .quad 0xc08624040b1f6260 + .quad 0xbe1cf22ee855bd6d + .quad 0xc0862407a3bbe078 + .quad 0xbe1cf0d57360c00b + .quad 0xc086240b3abb4398 + .quad 0xbe1ceebc815cd575 + .quad 0xc086240ed01efdd0 + .quad 0xbe1cf03bfb970951 + .quad 0xc086241263e87f50 
+ .quad 0xbe1cf16e74768529 + .quad 0xc0862415f6193658 + .quad 0xbe1cefec64b8becb + .quad 0xc086241986b28f30 + .quad 0xbe1cf0838d210baa + .quad 0xc086241d15b5f448 + .quad 0xbe1cf0ea86e75b11 + .quad 0xc0862420a324ce28 + .quad 0xbe1cf1708d11d805 + .quad 0xc08624242f008380 + .quad 0xbe1ceea988c5a417 + .quad 0xc0862427b94a7910 + .quad 0xbe1cef166a7bbca5 + .quad 0xc086242b420411d0 + .quad 0xbe1cf0c9d9e86a38 + .quad 0xc086242ec92eaee8 + .quad 0xbe1cef0946455411 + .quad 0xc08624324ecbaf98 + .quad 0xbe1cefea60907739 + .quad 0xc0862435d2dc7160 + .quad 0xbe1cf1ed0934ce42 + .quad 0xc086243955624ff8 + .quad 0xbe1cf191ba746c7d + .quad 0xc086243cd65ea548 + .quad 0xbe1ceeec78cf2a7e + .quad 0xc086244055d2c968 + .quad 0xbe1cef345284c119 + .quad 0xc0862443d3c012b8 + .quad 0xbe1cf24f77355219 + .quad 0xc08624475027d5e8 + .quad 0xbe1cf05bf087e114 + .quad 0xc086244acb0b65d0 + .quad 0xbe1cef3504a32189 + .quad 0xc086244e446c1398 + .quad 0xbe1ceff54b2a406f + .quad 0xc0862451bc4b2eb8 + .quad 0xbe1cf0757d54ed4f + .quad 0xc086245532aa04f0 + .quad 0xbe1cf0c8099fdfd5 + .quad 0xc0862458a789e250 + .quad 0xbe1cf0b173796a31 + .quad 0xc086245c1aec1138 + .quad 0xbe1cf11d8734540d + .quad 0xc086245f8cd1da60 + .quad 0xbe1cf1916a723ceb + .quad 0xc0862462fd3c84d8 + .quad 0xbe1cf19a911e1da7 + .quad 0xc08624666c2d5608 + .quad 0xbe1cf23a9ef72e4f + .quad 0xc0862469d9a591c0 + .quad 0xbe1cef503d947663 + .quad 0xc086246d45a67a18 + .quad 0xbe1cf0fceeb1a0b2 + .quad 0xc0862470b0314fa8 + .quad 0xbe1cf107e27e4fbc + .quad 0xc086247419475160 + .quad 0xbe1cf03dd9922331 + .quad 0xc086247780e9bc98 + .quad 0xbe1cefce1a10e129 + .quad 0xc086247ae719cd18 + .quad 0xbe1ceea47f73c4f6 + .quad 0xc086247e4bd8bd10 + .quad 0xbe1ceec0ac56d100 + .quad 0xc0862481af27c528 + .quad 0xbe1cee8a6593278a + .quad 0xc086248511081c70 + .quad 0xbe1cf2231dd9dec7 + .quad 0xc0862488717af888 + .quad 0xbe1cf0b4b8ed7da8 + .quad 0xc086248bd0818d68 + .quad 0xbe1cf1bd8d835002 + .quad 0xc086248f2e1d0d98 + .quad 0xbe1cf259acc107f4 + .quad 0xc08624928a4eaa20 + .quad 0xbe1cee897636b00c + .quad 0xc0862495e5179270 + .quad 0xbe1cee757f20c326 + .quad 0xc08624993e78f490 + .quad 0xbe1cefafd3aa54a4 + .quad 0xc086249c9673fd10 + .quad 0xbe1cee7298d38b97 + .quad 0xc086249fed09d6f8 + .quad 0xbe1ceedc158d4ceb + .quad 0xc08624a3423babe0 + .quad 0xbe1cf2282987cb2e + .quad 0xc08624a6960aa400 + .quad 0xbe1cefe7381ecc4b + .quad 0xc08624a9e877e600 + .quad 0xbe1cef328dbbce80 + .quad 0xc08624ad39849728 + .quad 0xbe1cefde45f3cc71 + .quad 0xc08624b08931db58 + .quad 0xbe1cefa8b89433b9 + .quad 0xc08624b3d780d500 + .quad 0xbe1cef6773c0b139 + .quad 0xc08624b72472a528 + .quad 0xbe1cf031c931c11f + .quad 0xc08624ba70086b78 + .quad 0xbe1cf088f49275e7 + .quad 0xc08624bdba434630 + .quad 0xbe1cf17de0eaa86d + .quad 0xc08624c103245238 + .quad 0xbe1cefd492f1ba75 + .quad 0xc08624c44aacab08 + .quad 0xbe1cf1253e154466 + .quad 0xc08624c790dd6ad0 + .quad 0xbe1cf0fb09ee6d55 + .quad 0xc08624cad5b7aa58 + .quad 0xbe1cf1f08dd048fe + .quad 0xc08624ce193c8120 + .quad 0xbe1ceeca0809697f + .quad 0xc08624d15b6d0538 + .quad 0xbe1cef8d5662d968 + .quad 0xc08624d49c4a4b78 + .quad 0xbe1cee97b556ed78 + .quad 0xc08624d7dbd56750 + .quad 0xbe1cf1b14b6acb75 + .quad 0xc08624db1a0f6b00 + .quad 0xbe1cef1e860623f2 + .quad 0xc08624de56f96758 + .quad 0xbe1ceeaf4d156f3d + .quad 0xc08624e192946bf0 + .quad 0xbe1ceecc12b400ed + .quad 0xc08624e4cce18710 + .quad 0xbe1cf180c40c794f + .quad 0xc08624e805e1c5c8 + .quad 0xbe1cf185a08f7f65 + .quad 0xc08624eb3d9633d8 + .quad 0xbe1cef45fc924078 + .quad 0xc08624ee73ffdbb0 + .quad 0xbe1cf1e4f457f32a + .quad 
0xc08624f1a91fc6a0 + .quad 0xbe1cf040147b8a5a + .quad 0xc08624f4dcf6fc98 + .quad 0xbe1cf1effca0dfb2 + .quad 0xc08624f80f868468 + .quad 0xbe1cf0470146e5bc + .quad 0xc08624fb40cf6390 + .quad 0xbe1cef4dd186e501 + .quad 0xc08624fe70d29e60 + .quad 0xbe1ceebe257f66c7 + .quad 0xc08625019f9137f0 + .quad 0xbe1ceefb7a1c395c + .quad 0xc0862504cd0c3220 + .quad 0xbe1cf209dedfed8c + .quad 0xc0862507f9448db0 + .quad 0xbe1cf082da464994 + .quad 0xc086250b243b4a18 + .quad 0xbe1cee88694a73cf + .quad 0xc086250e4df165a0 + .quad 0xbe1cf0b61e8f0531 + .quad 0xc08625117667dd78 + .quad 0xbe1cf1106599c962 + .quad 0xc08625149d9fad98 + .quad 0xbe1ceff1ee88af1f + .quad 0xc0862517c399d0c8 + .quad 0xbe1cf0f746994ef6 + .quad 0xc086251ae85740b8 + .quad 0xbe1cefe8a1d077e4 + .quad 0xc086251e0bd8f5e0 + .quad 0xbe1cf1a1da036092 + .quad 0xc08625212e1fe7a8 + .quad 0xbe1cf0f8a7786fcd + .quad 0xc08625244f2d0c48 + .quad 0xbe1cefa1174a07a7 + .quad 0xc08625276f0158d8 + .quad 0xbe1cef1043aa5b25 + .quad 0xc086252a8d9dc150 + .quad 0xbe1cf15d521c169d + .quad 0xc086252dab033898 + .quad 0xbe1cf220bba8861f + .quad 0xc0862530c732b078 + .quad 0xbe1cef51e310eae2 + .quad 0xc0862533e22d1988 + .quad 0xbe1cf222fcedd8ae + .quad 0xc0862536fbf36370 + .quad 0xbe1cefdb4da4bda8 + .quad 0xc086253a14867ca0 + .quad 0xbe1ceeafc1112171 + .quad 0xc086253d2be75280 + .quad 0xbe1cee99dfb4b408 + .quad 0xc08625404216d160 + .quad 0xbe1cf22d2536f06b + .quad 0xc08625435715e498 + .quad 0xbe1cef6abbf2e268 + .quad 0xc08625466ae57648 + .quad 0xbe1cf093a14789f5 + .quad 0xc08625497d866fa0 + .quad 0xbe1cf0f93655603c + .quad 0xc086254c8ef9b8b8 + .quad 0xbe1cf1cc40c9aafc + .quad 0xc086254f9f4038a8 + .quad 0xbe1ceeea5f4e9157 + .quad 0xc0862552ae5ad568 + .quad 0xbe1cefa9f52d4997 + .quad 0xc0862555bc4a7400 + .quad 0xbe1cefa490a638ff + .quad 0xc0862558c90ff868 + .quad 0xbe1cef7fcf797d6f + .quad 0xc086255bd4ac4590 + .quad 0xbe1cf1b4c51113c9 + .quad 0xc086255edf203d78 + .quad 0xbe1cef55e5b4a55d + .quad 0xc0862561e86cc100 + .quad 0xbe1cf0d37a25f9dc + .quad 0xc0862564f092b028 + .quad 0xbe1ceebe9efc19d9 + .quad 0xc0862567f792e9d8 + .quad 0xbe1cee8ad30a57b5 + .quad 0xc086256afd6e4c08 + .quad 0xbe1cef4e1817b90b + .quad 0xc086256e0225b3b8 + .quad 0xbe1cee7fa9229996 + .quad 0xc086257105b9fce0 + .quad 0xbe1cf0b54963d945 + .quad 0xc0862574082c0298 + .quad 0xbe1cee5f2f3c7995 + .quad 0xc0862577097c9ee0 + .quad 0xbe1cf0828e303a2c + .quad 0xc086257a09acaae0 + .quad 0xbe1cf172c3078947 + .quad 0xc086257d08bcfec0 + .quad 0xbe1cf189252afa22 + .quad 0xc086258006ae71b8 + .quad 0xbe1cefdb80426923 + .quad 0xc08625830381da08 + .quad 0xbe1ceef1391a0372 + .quad 0xc0862585ff380d00 + .quad 0xbe1cf17720c78d13 + .quad 0xc0862588f9d1df18 + .quad 0xbe1ceef1f9027d83 + .quad 0xc086258bf35023b8 + .quad 0xbe1cf06fac99dec9 + .quad 0xc086258eebb3ad78 + .quad 0xbe1cf1373eeb45c0 + .quad 0xc0862591e2fd4e00 + .quad 0xbe1cef777536bb81 + .quad 0xc0862594d92dd600 + .quad 0xbe1cf0f43ca40766 + .quad 0xc0862597ce461558 + .quad 0xbe1cefb2cfc6766b + .quad 0xc086259ac246daf0 + .quad 0xbe1ceea49e64ffa2 + .quad 0xc086259db530f4c8 + .quad 0xbe1cf250fa457dec + .quad 0xc08625a0a7053018 + .quad 0xbe1cf17d8bb2a44e + .quad 0xc08625a397c45918 + .quad 0xbe1cf1d5906d54b7 + .quad 0xc08625a6876f3b30 + .quad 0xbe1cf08fe7b31780 + .quad 0xc08625a97606a0e0 + .quad 0xbe1cef13edfc9d11 + .quad 0xc08625ac638b53c8 + .quad 0xbe1cef9d2b107219 + .quad 0xc08625af4ffe1cb0 + .quad 0xbe1cf1ddd4ff6160 + .quad 0xc08625b23b5fc390 + .quad 0xbe1cefa02a996495 + .quad 0xc08625b525b10f68 + .quad 0xbe1cf166a7e37ee5 + .quad 0xc08625b80ef2c680 + .quad 
0xbe1cef0b171068a5 + .quad 0xc08625baf725ae28 + .quad 0xbe1cf05c80779283 + .quad 0xc08625bdde4a8af0 + .quad 0xbe1cf1bbfbffb889 + .quad 0xc08625c0c4622090 + .quad 0xbe1cf0b8666c0124 + .quad 0xc08625c3a96d31e0 + .quad 0xbe1cf0a8fcf47a86 + .quad 0xc08625c68d6c80f0 + .quad 0xbe1cef46e18cb092 + .quad 0xc08625c97060cef0 + .quad 0xbe1cf1458a350efb + .quad 0xc08625cc524adc58 + .quad 0xbe1ceeea1dadce12 + .quad 0xc08625cf332b68b0 + .quad 0xbe1cf0a1bfdc44c7 + .quad 0xc08625d2130332d0 + .quad 0xbe1cef96d02da73e + .quad 0xc08625d4f1d2f8a8 + .quad 0xbe1cf2451c3c7701 + .quad 0xc08625d7cf9b7778 + .quad 0xbe1cf10d08f83812 + .quad 0xc08625daac5d6ba0 + .quad 0xbe1ceec5b4895c5e + .quad 0xc08625dd881990b0 + .quad 0xbe1cf14e1325c5e4 + .quad 0xc08625e062d0a188 + .quad 0xbe1cf21d0904be12 + .quad 0xc08625e33c835838 + .quad 0xbe1ceed0839bcf21 + .quad 0xc08625e615326df0 + .quad 0xbe1cf1bb944889d2 + .quad 0xc08625e8ecde9b48 + .quad 0xbe1cee738e85eece + .quad 0xc08625ebc38897e0 + .quad 0xbe1cf25c2bc6ef12 + .quad 0xc08625ee99311ac8 + .quad 0xbe1cf132b70a41ad + .quad 0xc08625f16dd8da28 + .quad 0xbe1cf1984236a6e3 + .quad 0xc08625f441808b78 + .quad 0xbe1cf19ae74998f9 + .quad 0xc08625f71428e370 + .quad 0xbe1cef3e175d61a1 + .quad 0xc08625f9e5d295f8 + .quad 0xbe1cf101f9868fd9 + .quad 0xc08625fcb67e5658 + .quad 0xbe1cee69db83dcd2 + .quad 0xc08625ff862cd6f8 + .quad 0xbe1cf081b636af51 + .quad 0xc086260254dec9a8 + .quad 0xbe1cee62c7d59b3e + .quad 0xc08626052294df58 + .quad 0xbe1cf1b745c57716 + .quad 0xc0862607ef4fc868 + .quad 0xbe1cef3d2800ea23 + .quad 0xc086260abb103458 + .quad 0xbe1cef480ff1acd2 + .quad 0xc086260d85d6d200 + .quad 0xbe1cf2424c9a17ef + .quad 0xc08626104fa44f90 + .quad 0xbe1cf12cfde90fd5 + .quad 0xc086261318795a68 + .quad 0xbe1cf21f590dd5b6 + .quad 0xc0862615e0569f48 + .quad 0xbe1cf0c50f9cd28a + .quad 0xc0862618a73cca30 + .quad 0xbe1ceedbdb520545 + .quad 0xc086261b6d2c8668 + .quad 0xbe1cf0b030396011 + .quad 0xc086261e32267e98 + .quad 0xbe1cf19917010e96 + .quad 0xc0862620f62b5cb0 + .quad 0xbe1cf07331355985 + .quad 0xc0862623b93bc9e8 + .quad 0xbe1cf01ae921a1c3 + .quad 0xc08626267b586ed0 + .quad 0xbe1cefe5cf0dbf0c + .quad 0xc08626293c81f348 + .quad 0xbe1cf01b258aeb50 + .quad 0xc086262bfcb8fe88 + .quad 0xbe1cee6b9e7f4c68 + .quad 0xc086262ebbfe3710 + .quad 0xbe1cee684a9b21c9 + .quad 0xc08626317a5242b8 + .quad 0xbe1cf1f8bcde9a8b + .quad 0xc086263437b5c6c0 + .quad 0xbe1cf1d063d36238 + .quad 0xc0862636f42967a8 + .quad 0xbe1cf1e31a19075e + .quad 0xc0862639afadc950 + .quad 0xbe1cf1d8efdf7e7d + .quad 0xc086263c6a438ef0 + .quad 0xbe1cf1812ee72dba + .quad 0xc086263f23eb5b18 + .quad 0xbe1cf1449a9a2279 + .quad 0xc0862641dca5cfb8 + .quad 0xbe1cee96edce5085 + .quad 0xc086264494738e08 + .quad 0xbe1cf06797bd03b2 + .quad 0xc08626474b5536b8 + .quad 0xbe1cef91b9b7ffc1 + .quad 0xc086264a014b69c0 + .quad 0xbe1cef4b6721278f + .quad 0xc086264cb656c678 + .quad 0xbe1cf1942925eb4a + .quad 0xc086264f6a77eba8 + .quad 0xbe1cefa2c7bc2e39 + .quad 0xc08626521daf7758 + .quad 0xbe1cf252595aceb3 + .quad 0xc0862654cffe0718 + .quad 0xbe1cee8e9ae47ec2 + .quad 0xc0862657816437a8 + .quad 0xbe1cf1bf913828fa + .quad 0xc086265a31e2a558 + .quad 0xbe1cf23475d6b366 + .quad 0xc086265ce179ebc8 + .quad 0xbe1cef8df00a922b + .quad 0xc086265f902aa5f0 + .quad 0xbe1cef279bfa43e0 + .quad 0xc08626623df56e38 + .quad 0xbe1cf080e10b8365 + .quad 0xc0862664eadade70 + .quad 0xbe1cf1a518f9b544 + .quad 0xc086266796db8fd0 + .quad 0xbe1cef9308fed9e9 + .quad 0xc086266a41f81ae8 + .quad 0xbe1ceea3ae6b19c9 + .quad 0xc086266cec3117b8 + .quad 0xbe1ceef06003d4c2 + .quad 
0xc086266f95871da8 + .quad 0xbe1cf0b8457ffb0c + .quad 0xc08626723dfac390 + .quad 0xbe1cf0c526745ad6 + .quad 0xc0862674e58c9fa8 + .quad 0xbe1cf0cf91ff7b5d + .quad 0xc08626778c3d4798 + .quad 0xbe1cefe260819380 + .quad 0xc086267a320d5070 + .quad 0xbe1ceebd90aa27a3 + .quad 0xc086267cd6fd4ea8 + .quad 0xbe1cf0388121dffa + .quad 0xc086267f7b0dd630 + .quad 0xbe1cf1a3881435f1 + .quad 0xc08626821e3f7a68 + .quad 0xbe1cef28e9d9ac52 + .quad 0xc0862684c092ce08 + .quad 0xbe1cf02d300062dd + .quad 0xc086268762086350 + .quad 0xbe1cefaee1edfa35 + .quad 0xc086268a02a0cbe0 + .quad 0xbe1cf0a5a052e936 + .quad 0xc086268ca25c98d8 + .quad 0xbe1cee60a4a497ed + .quad 0xc086268f413c5ab0 + .quad 0xbe1cf0e4a5d0cf49 + .quad 0xc0862691df40a170 + .quad 0xbe1cf149235a4e6e + .quad 0xc08626947c69fc80 + .quad 0xbe1cf215180b9fcc + .quad 0xc086269718b8fac8 + .quad 0xbe1cef9b156a9840 + .quad 0xc0862699b42e2a90 + .quad 0xbe1cf054c91441be + .quad 0xc086269c4eca19a8 + .quad 0xbe1cf13ded26512c + .quad 0xc086269ee88d5550 + .quad 0xbe1cf22ea4d8ac06 + .quad 0xc08626a181786a40 + .quad 0xbe1cf2354666ee2e + .quad 0xc08626a4198be4a8 + .quad 0xbe1cefef936752b3 + .quad 0xc08626a6b0c85020 + .quad 0xbe1cf1e360a9db68 + .quad 0xc08626a9472e37d8 + .quad 0xbe1ceed6aeb812c5 + .quad 0xc08626abdcbe2650 + .quad 0xbe1cf227340b4986 + .quad 0xc08626ae7178a5b0 + .quad 0xbe1cf0215a0cbe0d + .quad 0xc08626b1055e3f70 + .quad 0xbe1cf256adf0ae26 + .quad 0xc08626b3986f7ca8 + .quad 0xbe1ceff3c67aed06 + .quad 0xc08626b62aace5c8 + .quad 0xbe1cf2159fb93652 + .quad 0xc08626b8bc1702e0 + .quad 0xbe1cf01e6dbd1c7f + .quad 0xc08626bb4cae5b60 + .quad 0xbe1cf009e75d1c0c + .quad 0xc08626bddc737648 + .quad 0xbe1ceec10a020e73 + .quad 0xc08626c06b66da08 + .quad 0xbe1cf06d5783eee7 + .quad 0xc08626c2f9890ca0 + .quad 0xbe1cf0cb8f169ffe + .quad 0xc08626c586da9388 + .quad 0xbe1cef7de2452430 + .quad 0xc08626c8135bf3b0 + .quad 0xbe1cf05da6f783ae + .quad 0xc08626ca9f0db198 + .quad 0xbe1cefcc877d681d + .quad 0xc08626cd29f05138 + .quad 0xbe1cef0531954ab3 + .quad 0xc08626cfb4045608 + .quad 0xbe1cf06b8565ea3d + .quad 0xc08626d23d4a4310 + .quad 0xbe1cefdc455d9d7e + .quad 0xc08626d4c5c29ad0 + .quad 0xbe1ceefc47e8fa64 + .quad 0xc08626d74d6ddf48 + .quad 0xbe1cf1872bf033f2 + .quad 0xc08626d9d44c9210 + .quad 0xbe1cf19d91087f9d + .quad 0xc08626dc5a5f3438 + .quad 0xbe1cf012d444c6ab + .quad 0xc08626dedfa64650 + .quad 0xbe1cf0ba528ee153 + .quad 0xc08626e164224880 + .quad 0xbe1ceeb431709788 + .quad 0xc08626e3e7d3ba60 + .quad 0xbe1cf0b9af31a6a5 + .quad 0xc08626e66abb1b28 + .quad 0xbe1cf168fb2e135b + .quad 0xc08626e8ecd8e990 + .quad 0xbe1cef9097461c93 + .quad 0xc08626eb6e2da3d0 + .quad 0xbe1cee7a434735d8 + .quad 0xc08626edeeb9c7a8 + .quad 0xbe1cf235732b86f2 + .quad 0xc08626f06e7dd280 + .quad 0xbe1cefe1510b89e6 + .quad 0xc08626f2ed7a4120 + .quad 0xbe1cf1f64b9b80ef + .quad 0xc08626f56baf9000 + .quad 0xbe1cf08f320ca339 + .quad 0xc08626f7e91e3b08 + .quad 0xbe1cf1b1de2808a1 + .quad 0xc08626fa65c6bdc0 + .quad 0xbe1cf1976d778b28 + .quad 0xc08626fce1a99338 + .quad 0xbe1ceef40a4f076f + .quad 0xc08626ff5cc73600 + .quad 0xbe1cef3e45869ce3 + .quad 0xc0862701d7202048 + .quad 0xbe1ceef601b4c9d6 + .quad 0xc086270450b4cbc0 + .quad 0xbe1cf1eaf0b57fd6 + .quad 0xc0862706c985b1c0 + .quad 0xbe1cef82a44990f3 + .quad 0xc086270941934b10 + .quad 0xbe1ceefe32981f2c + .quad 0xc086270bb8de1018 + .quad 0xbe1cefbf6f5a0445 + .quad 0xc086270e2f6678d0 + .quad 0xbe1cf18dba75792c + .quad 0xc0862710a52cfcc8 + .quad 0xbe1cf0da64ce995f + .quad 0xc08627131a321318 + .quad 0xbe1cef04ac0fb802 + .quad 0xc08627158e763268 + .quad 
0xbe1cee9d4e2ad9bd + .quad 0xc086271801f9d0f8 + .quad 0xbe1cefa9b55407b5 + .quad 0xc086271a74bd64a0 + .quad 0xbe1cefe6bd329570 + .quad 0xc086271ce6c162c8 + .quad 0xbe1cef0b1205dc85 + .quad 0xc086271f58064068 + .quad 0xbe1cef092a785e3f + .quad 0xc0862721c88c7210 + .quad 0xbe1cf050dcdaac30 + .quad 0xc086272438546be8 + .quad 0xbe1cf210907ded8b + .quad 0xc0862726a75ea1b8 + .quad 0xbe1cee760be44f99 + .quad 0xc086272915ab86c0 + .quad 0xbe1ceeeee07c2bcc + .quad 0xc086272b833b8df0 + .quad 0xbe1cf06874992df5 + .quad 0xc086272df00f29d0 + .quad 0xbe1cef8fac5d4899 + .quad 0xc08627305c26cc70 + .quad 0xbe1cf1103241cc99 + .quad 0xc0862732c782e788 + .quad 0xbe1cf1d35fef83fe + .quad 0xc08627353223ec68 + .quad 0xbe1cef3ec8133e1d + .quad 0xc08627379c0a4be8 + .quad 0xbe1cef7261daccd8 + .quad 0xc086273a05367688 + .quad 0xbe1cf18656c50806 + .quad 0xc086273c6da8dc68 + .quad 0xbe1cf1c8736e049a + .quad 0xc086273ed561ed38 + .quad 0xbe1cf1f93bff4911 + .quad 0xc08627413c621848 + .quad 0xbe1cf188a4ea680c + .quad 0xc0862743a2a9cc80 + .quad 0xbe1cf1d270930c80 + .quad 0xc086274608397868 + .quad 0xbe1cf25a328c28e2 + .quad 0xc08627486d118a28 + .quad 0xbe1cf106f90aa3b8 + .quad 0xc086274ad1326f80 + .quad 0xbe1cee5e9d2e885a + .quad 0xc086274d349c95c0 + .quad 0xbe1cf1c0bac27228 + .quad 0xc086274f975069f8 + .quad 0xbe1cf1a1500f9b1c + .quad 0xc0862751f94e58c0 + .quad 0xbe1cefc30663ac44 + .quad 0xc08627545a96ce48 + .quad 0xbe1cf17123e427a2 + .quad 0xc0862756bb2a3678 + .quad 0xbe1cefb92749fea4 + .quad 0xc08627591b08fcc0 + .quad 0xbe1cefa40e1ea74a + .quad 0xc086275b7a338c40 + .quad 0xbe1cee6f4612c3e9 + .quad 0xc086275dd8aa4fa8 + .quad 0xbe1cf1c54a053627 + .quad 0xc0862760366db168 + .quad 0xbe1ceff5eb503d9e + .quad 0xc0862762937e1b70 + .quad 0xbe1cf02e47f10cee + .quad 0xc0862764efdbf768 + .quad 0xbe1ceeb06e1d0dad + .quad 0xc08627674b87ae88 + .quad 0xbe1cf10aadd6dba5 + .quad 0xc0862769a681a9c0 + .quad 0xbe1cf24e9913d30f + .quad 0xc086276c00ca51a0 + .quad 0xbe1cef47b301e312 + .quad 0xc086276e5a620e48 + .quad 0xbe1ceeb1cefc2e85 + .quad 0xc0862770b3494788 + .quad 0xbe1cf16f1fbbe011 + .quad 0xc08627730b8064e8 + .quad 0xbe1ceebdf75174c7 + .quad 0xc08627756307cd70 + .quad 0xbe1cf06e3871a0da + .quad 0xc0862777b9dfe7f0 + .quad 0xbe1cef16799fd554 + .quad 0xc086277a10091ac0 + .quad 0xbe1cf248dabf5377 + .quad 0xc086277c6583cc00 + .quad 0xbe1cf0c78d92a2cd + .quad 0xc086277eba506158 + .quad 0xbe1cf0b911b029f0 + .quad 0xc08627810e6f4028 + .quad 0xbe1cefdc24719766 + .quad 0xc086278361e0cd70 + .quad 0xbe1cefbb6562b7e7 + .quad 0xc0862785b4a56dd8 + .quad 0xbe1cf1e0afb349ec + .quad 0xc086278806bd85c0 + .quad 0xbe1cf008292e52fc + .quad 0xc086278a58297918 + .quad 0xbe1cf053073872bf + .quad 0xc086278ca8e9ab88 + .quad 0xbe1cf17a0a55a947 + .quad 0xc086278ef8fe8068 + .quad 0xbe1ceeffb0b60234 + .quad 0xc086279148685aa0 + .quad 0xbe1cf162204794a8 + .quad 0xc086279397279ce0 + .quad 0xbe1cf24cc8cb48ac + .quad 0xc0862795e53ca978 + .quad 0xbe1cf0c9be68d5c3 + .quad 0xc086279832a7e258 + .quad 0xbe1cf172cd3d7388 + .quad 0xc086279a7f69a930 + .quad 0xbe1ceea2465fbce5 + .quad 0xc086279ccb825f40 + .quad 0xbe1cf0a386d2500f + .quad 0xc086279f16f26590 + .quad 0xbe1cf1e338ddc18a + .quad 0xc08627a161ba1cd0 + .quad 0xbe1cef1f5049867f + .quad 0xc08627a3abd9e548 + .quad 0xbe1cef96c1ea8b1f + .quad 0xc08627a5f5521f00 + .quad 0xbe1cf138f6fd3c26 + .quad 0xc08627a83e2329b0 + .quad 0xbe1cf0d4fcbfdf3a + .quad 0xc08627aa864d64b0 + .quad 0xbe1cf24870c12c81 + .quad 0xc08627accdd12f18 + .quad 0xbe1cf0ae2a56348d + .quad 0xc08627af14aee7a0 + .quad 0xbe1cee8ca1a9b893 + .quad 
0xc08627b15ae6eca8 + .quad 0xbe1cf20414d637b0 + .quad 0xc08627b3a0799c60 + .quad 0xbe1cf0fc6b7b12d8 + .quad 0xc08627b5e5675488 + .quad 0xbe1cf152d93c4a00 + .quad 0xc08627b829b072a0 + .quad 0xbe1cf1073f9b77c2 + .quad 0xc08627ba6d5553d8 + .quad 0xbe1cee694f97d5a4 + .quad 0xc08627bcb0565500 + .quad 0xbe1cf0456b8239d7 + .quad 0xc08627bef2b3d2b0 + .quad 0xbe1cf211497127e3 + .quad 0xc08627c1346e2930 + .quad 0xbe1cf01856c0384d + .quad 0xc08627c37585b468 + .quad 0xbe1cefa7dd05479e + .quad 0xc08627c5b5fad000 + .quad 0xbe1cef3ae8e50b93 + .quad 0xc08627c7f5cdd750 + .quad 0xbe1ceea5f32fdd3a + .quad 0xc08627ca34ff2560 + .quad 0xbe1cef424caeb8d9 + .quad 0xc08627cc738f14f0 + .quad 0xbe1cf0194d07a81f + .quad 0xc08627ceb17e0070 + .quad 0xbe1cf20f452000c1 + .quad 0xc08627d0eecc4210 + .quad 0xbe1cf00e356218e4 + .quad 0xc08627d32b7a33a0 + .quad 0xbe1cef30484b4bcb + .quad 0xc08627d567882eb0 + .quad 0xbe1ceeea11a6641b + .quad 0xc08627d7a2f68c80 + .quad 0xbe1cf13492d5bd7b + .quad 0xc08627d9ddc5a618 + .quad 0xbe1ceeb7048fad96 + .quad 0xc08627dc17f5d418 + .quad 0xbe1ceef0666f0477 + .quad 0xc08627de51876ee8 + .quad 0xbe1cf060d4b8b5c2 + .quad 0xc08627e08a7acea8 + .quad 0xbe1cf0b2a4b6ff8c + .quad 0xc08627e2c2d04b28 + .quad 0xbe1cf0e34809a875 + .quad 0xc08627e4fa883bf0 + .quad 0xbe1cf16bf74a3522 + .quad 0xc08627e731a2f848 + .quad 0xbe1cee6a24623d57 + .quad 0xc08627e96820d718 + .quad 0xbe1cefc7b4f1528e + .quad 0xc08627eb9e022f18 + .quad 0xbe1cf163051f3548 + .quad 0xc08627edd34756b8 + .quad 0xbe1cef36b3366305 + .quad 0xc08627f007f0a408 + .quad 0xbe1cf18134625550 + .quad 0xc08627f23bfe6cf0 + .quad 0xbe1cf0ec32ec1a11 + .quad 0xc08627f46f710700 + .quad 0xbe1ceeb3b64f3edc + .quad 0xc08627f6a248c778 + .quad 0xbe1cf0cd15805bc8 + .quad 0xc08627f8d4860368 + .quad 0xbe1cf20db3bddebe + .quad 0xc08627fb06290f90 + .quad 0xbe1cf25188430e25 + .quad 0xc08627fd37324070 + .quad 0xbe1ceea1713490f9 + .quad 0xc08627ff67a1ea28 + .quad 0xbe1cf159521d234c + .quad 0xc0862801977860b8 + .quad 0xbe1cf24dfe50783b + .quad 0xc0862803c6b5f7d0 + .quad 0xbe1ceef2ef89a60b + .quad 0xc0862805f55b02c8 + .quad 0xbe1cee7fc919d62c + .quad 0xc08628082367d4c0 + .quad 0xbe1cf215a7fb513a + .quad 0xc086280a50dcc0a8 + .quad 0xbe1cf0e4401c5ed4 + .quad 0xc086280c7dba1910 + .quad 0xbe1cf04ec734d256 + .quad 0xc086280eaa003050 + .quad 0xbe1cf010ad787fea + .quad 0xc0862810d5af5880 + .quad 0xbe1cee622478393d + .quad 0xc086281300c7e368 + .quad 0xbe1cf01c7482564f + .quad 0xc08628152b4a22a0 + .quad 0xbe1cf0de20d33536 + .quad 0xc086281755366778 + .quad 0xbe1cef2edae5837d + .quad 0xc08628197e8d02f0 + .quad 0xbe1cf0a345318cc9 + .quad 0xc086281ba74e45d8 + .quad 0xbe1cf20085aa34b8 + .quad 0xc086281dcf7a80c0 + .quad 0xbe1cef5fa845ad83 + .quad 0xc086281ff71203e0 + .quad 0xbe1cf050d1df69c4 + .quad 0xc08628221e151f48 + .quad 0xbe1ceffe43c035b9 + .quad 0xc0862824448422b8 + .quad 0xbe1cf14f3018d3c2 + .quad 0xc08628266a5f5dc0 + .quad 0xbe1cef0a5fbae83d + .quad 0xc08628288fa71f98 + .quad 0xbe1ceff8a95b72a1 + .quad 0xc086282ab45bb750 + .quad 0xbe1cef073aa9849b + .quad 0xc086282cd87d73a8 + .quad 0xbe1cef69b3835c02 + .quad 0xc086282efc0ca328 + .quad 0xbe1cf0bc139379a9 + .quad 0xc08628311f099420 + .quad 0xbe1cef247a9ec596 + .quad 0xc086283341749490 + .quad 0xbe1cef74bbcc488a + .quad 0xc0862835634df248 + .quad 0xbe1cef4bc42e7b8e + .quad 0xc08628378495fad0 + .quad 0xbe1cf136d4d5a810 + .quad 0xc0862839a54cfb80 + .quad 0xbe1cf0d290b24dd8 + .quad 0xc086283bc5734168 + .quad 0xbe1ceeebde8e0065 + .quad 0xc086283de5091950 + .quad 0xbe1cf1a09f60aa1e + .quad 0xc0862840040ecfe0 + .quad 
0xbe1cf0803947a234 + .quad 0xc08628422284b168 + .quad 0xbe1cf0abf7638127 + .quad 0xc0862844406b0a08 + .quad 0xbe1cf0f73ee12058 + .quad 0xc08628465dc225a0 + .quad 0xbe1cf2079971b26c + .quad 0xc08628487a8a4fe0 + .quad 0xbe1cee74957564b1 + .quad 0xc086284a96c3d420 + .quad 0xbe1ceee77c1b7d43 + .quad 0xc086284cb26efd90 + .quad 0xbe1cf23addba6e09 + .quad 0xc086284ecd8c1730 + .quad 0xbe1cf199f4a1da60 + .quad 0xc0862850e81b6bb0 + .quad 0xbe1cf09fdea81393 + .quad 0xc0862853021d4588 + .quad 0xbe1cf176adb417f7 + .quad 0xc08628551b91ef00 + .quad 0xbe1cf0f64f84a8da + .quad 0xc08628573479b220 + .quad 0xbe1ceec34cf49523 + .quad 0xc08628594cd4d8a8 + .quad 0xbe1cf16d60fbe0bb + .quad 0xc086285b64a3ac40 + .quad 0xbe1cee8de7acfc7b + .quad 0xc086285d7be67630 + .quad 0xbe1ceee6256cce8d + .quad 0xc086285f929d7fa0 + .quad 0xbe1cee7d66a3d8a5 + .quad 0xc0862861a8c91170 + .quad 0xbe1cf0bef8265792 + .quad 0xc0862863be697458 + .quad 0xbe1cf097f890c6f8 + .quad 0xc0862865d37ef0c8 + .quad 0xbe1cf09502d5c3fc + .quad 0xc0862867e809cf00 + .quad 0xbe1ceeffb239dac7 + .quad 0xc0862869fc0a56f8 + .quad 0xbe1cf1fbfff95c98 + .quad 0xc086286c0f80d090 + .quad 0xbe1cefa57ad3eef7 + .quad 0xc086286e226d8348 + .quad 0xbe1cf22c58b9183d + .quad 0xc086287034d0b690 + .quad 0xbe1ceff262d0a248 + .quad 0xc086287246aab180 + .quad 0xbe1cefa7bc194186 + .quad 0xc086287457fbbb08 + .quad 0xbe1cf06782d784d9 + .quad 0xc086287668c419e0 + .quad 0xbe1cf1d44d0eaa07 + .quad 0xc086287879041490 + .quad 0xbe1cf034803c8a48 + .quad 0xc086287a88bbf158 + .quad 0xbe1cf08e84916b6f + .quad 0xc086287c97ebf650 + .quad 0xbe1cf0c4d3dc1bc7 + .quad 0xc086287ea6946958 + .quad 0xbe1cefb1e4625943 + .quad 0xc0862880b4b59010 + .quad 0xbe1cf143efdd1fd0 + .quad 0xc0862882c24faff8 + .quad 0xbe1cee9896d016da + .quad 0xc0862884cf630e38 + .quad 0xbe1cf2186072f2cc + .quad 0xc0862886dbefeff0 + .quad 0xbe1cef9217633d34 + .quad 0xc0862888e7f699e0 + .quad 0xbe1cf05603549486 + .quad 0xc086288af37750b0 + .quad 0xbe1cef50fff513d3 + .quad 0xc086288cfe7258c0 + .quad 0xbe1cf127713b32d0 + .quad 0xc086288f08e7f650 + .quad 0xbe1cf05015520f3d + .quad 0xc086289112d86d58 + .quad 0xbe1cf12eb458b26f + .quad 0xc08628931c4401a8 + .quad 0xbe1cf22eae2887ed + .quad 0xc0862895252af6e0 + .quad 0xbe1cefdd6656dd2d + .quad 0xc08628972d8d9058 + .quad 0xbe1cf1048ea4e646 + .quad 0xc0862899356c1150 + .quad 0xbe1ceec4501167e9 + .quad 0xc086289b3cc6bcb8 + .quad 0xbe1cf0ad52becc3f + .quad 0xc086289d439dd568 + .quad 0xbe1cf0daa4e00e35 + .quad 0xc086289f49f19df8 + .quad 0xbe1cf00b80de8d6a + .quad 0xc08628a14fc258c8 + .quad 0xbe1cf1bcf2ea8464 + .quad 0xc08628a355104818 + .quad 0xbe1cf0435e2782b0 + .quad 0xc08628a559dbade0 + .quad 0xbe1cf0e3e1a5f56c + .quad 0xc08628a75e24cbf8 + .quad 0xbe1cefed9d5a721d + .quad 0xc08628a961ebe3f8 + .quad 0xbe1cf0d2d74321e2 + .quad 0xc08628ab65313750 + .quad 0xbe1cf24200eb55e9 + .quad 0xc08628ad67f50740 + .quad 0xbe1cf23e9d7cf979 + .quad 0xc08628af6a3794d0 + .quad 0xbe1cf23a088f421c + .quad 0xc08628b16bf920e0 + .quad 0xbe1cef2c1de1ab32 + .quad 0xc08628b36d39ec08 + .quad 0xbe1cf1abc231f7b2 + .quad 0xc08628b56dfa36d0 + .quad 0xbe1cf2074d5ba303 + .quad 0xc08628b76e3a4180 + .quad 0xbe1cf05cd5eed880 + .rept 48 + .byte 0 + .endr + +/* Lookup table with 9-bit index for + -log(mRcp), where mRcp is mantissa of 1/x 9-bit accurate reciprocal: + */ +.if .-__svml_dlog_data != _Log_LA_table +.err +.endif + .quad 0x8000000000000000 + .quad 0xbf5ff802a9ab10e6 + .quad 0xbf6ff00aa2b10bc0 + .quad 0xbf77ee11ebd82e94 + .quad 0xbf7fe02a6b106789 + .quad 0xbf83e7295d25a7d9 + .quad 0xbf87dc475f810a77 + .quad 
0xbf8bcf712c74384c + .quad 0xbf8fc0a8b0fc03e4 + .quad 0xbf91d7f7eb9eebe7 + .quad 0xbf93cea44346a575 + .quad 0xbf95c45a51b8d389 + .quad 0xbf97b91b07d5b11b + .quad 0xbf99ace7551cc514 + .quad 0xbf9b9fc027af9198 + .quad 0xbf9d91a66c543cc4 + .quad 0xbf9f829b0e783300 + .quad 0xbfa0b94f7c196176 + .quad 0xbfa1b0d98923d980 + .quad 0xbfa2a7ec2214e873 + .quad 0xbfa39e87b9febd60 + .quad 0xbfa494acc34d911c + .quad 0xbfa58a5bafc8e4d5 + .quad 0xbfa67f94f094bd98 + .quad 0xbfa77458f632dcfc + .quad 0xbfa868a83083f6cf + .quad 0xbfa95c830ec8e3eb + .quad 0xbfaa4fe9ffa3d235 + .quad 0xbfab42dd711971bf + .quad 0xbfac355dd0921f2d + .quad 0xbfad276b8adb0b52 + .quad 0xbfae19070c276016 + .quad 0xbfaf0a30c01162a6 + .quad 0xbfaffae9119b9303 + .quad 0xbfb075983598e471 + .quad 0xbfb0ed839b5526fe + .quad 0xbfb16536eea37ae1 + .quad 0xbfb1dcb263db1944 + .quad 0xbfb253f62f0a1417 + .quad 0xbfb2cb0283f5de1f + .quad 0xbfb341d7961bd1d1 + .quad 0xbfb3b87598b1b6ee + .quad 0xbfb42edcbea646f0 + .quad 0xbfb4a50d3aa1b040 + .quad 0xbfb51b073f06183f + .quad 0xbfb590cafdf01c28 + .quad 0xbfb60658a93750c4 + .quad 0xbfb67bb0726ec0fc + .quad 0xbfb6f0d28ae56b4c + .quad 0xbfb765bf23a6be13 + .quad 0xbfb7da766d7b12cd + .quad 0xbfb84ef898e8282a + .quad 0xbfb8c345d6319b21 + .quad 0xbfb9375e55595ede + .quad 0xbfb9ab42462033ad + .quad 0xbfba1ef1d8061cd4 + .quad 0xbfba926d3a4ad563 + .quad 0xbfbb05b49bee43fe + .quad 0xbfbb78c82bb0eda1 + .quad 0xbfbbeba818146765 + .quad 0xbfbc5e548f5bc743 + .quad 0xbfbcd0cdbf8c13e1 + .quad 0xbfbd4313d66cb35d + .quad 0xbfbdb5270187d927 + .quad 0xbfbe27076e2af2e6 + .quad 0xbfbe98b549671467 + .quad 0xbfbf0a30c01162a6 + .quad 0xbfbf7b79fec37ddf + .quad 0xbfbfec9131dbeabb + .quad 0xbfc02ebb42bf3d4b + .quad 0xbfc0671512ca596e + .quad 0xbfc09f561ee719c3 + .quad 0xbfc0d77e7cd08e59 + .quad 0xbfc10f8e422539b1 + .quad 0xbfc14785846742ac + .quad 0xbfc17f6458fca611 + .quad 0xbfc1b72ad52f67a0 + .quad 0xbfc1eed90e2dc2c3 + .quad 0xbfc2266f190a5acb + .quad 0xbfc25ded0abc6ad2 + .quad 0xbfc29552f81ff523 + .quad 0xbfc2cca0f5f5f251 + .quad 0xbfc303d718e47fd3 + .quad 0xbfc33af575770e4f + .quad 0xbfc371fc201e8f74 + .quad 0xbfc3a8eb2d31a376 + .quad 0xbfc3dfc2b0ecc62a + .quad 0xbfc41682bf727bc0 + .quad 0xbfc44d2b6ccb7d1e + .quad 0xbfc483bccce6e3dd + .quad 0xbfc4ba36f39a55e5 + .quad 0xbfc4f099f4a230b2 + .quad 0xbfc526e5e3a1b438 + .quad 0xbfc55d1ad4232d6f + .quad 0xbfc59338d9982086 + .quad 0xbfc5c940075972b9 + .quad 0xbfc5ff3070a793d4 + .quad 0xbfc6350a28aaa758 + .quad 0xbfc66acd4272ad51 + .quad 0xbfc6a079d0f7aad2 + .quad 0xbfc6d60fe719d21d + .quad 0xbfc70b8f97a1aa75 + .quad 0xbfc740f8f54037a5 + .quad 0xbfc7764c128f2127 + .quad 0xbfc7ab890210d909 + .quad 0xbfc7e0afd630c274 + .quad 0xbfc815c0a14357eb + .quad 0xbfc84abb75865139 + .quad 0xbfc87fa06520c911 + .quad 0xbfc8b46f8223625b + .quad 0xbfc8e928de886d41 + .quad 0xbfc91dcc8c340bde + .quad 0xbfc9525a9cf456b4 + .quad 0xbfc986d3228180ca + .quad 0xbfc9bb362e7dfb83 + .quad 0xbfc9ef83d2769a34 + .quad 0xbfca23bc1fe2b563 + .quad 0xbfca57df28244dcd + .quad 0xbfca8becfc882f19 + .quad 0xbfcabfe5ae46124c + .quad 0xbfcaf3c94e80bff3 + .quad 0xbfcb2797ee46320c + .quad 0xbfcb5b519e8fb5a4 + .quad 0xbfcb8ef670420c3b + .quad 0xbfcbc286742d8cd6 + .quad 0xbfcbf601bb0e44e2 + .quad 0xbfcc2968558c18c1 + .quad 0xbfcc5cba543ae425 + .quad 0xbfcc8ff7c79a9a22 + .quad 0xbfccc320c0176502 + .quad 0xbfccf6354e09c5dc + .quad 0xbfcd293581b6b3e7 + .quad 0xbfcd5c216b4fbb91 + .quad 0xbfcd8ef91af31d5e + .quad 0xbfcdc1bca0abec7d + .quad 0xbfcdf46c0c722d2f + .quad 0xbfce27076e2af2e6 + .quad 0xbfce598ed5a87e2f + .quad 
0xbfce8c0252aa5a60 + .quad 0xbfcebe61f4dd7b0b + .quad 0xbfcef0adcbdc5936 + .quad 0xbfcf22e5e72f105d + .quad 0xbfcf550a564b7b37 + .quad 0xbfcf871b28955045 + .quad 0xbfcfb9186d5e3e2b + .quad 0xbfcfeb0233e607cc + .quad 0xbfd00e6c45ad501d + .quad 0xbfd0274dc16c232f + .quad 0xbfd0402594b4d041 + .quad 0xbfd058f3c703ebc6 + .quad 0xbfd071b85fcd590d + .quad 0xbfd08a73667c57af + .quad 0xbfd0a324e27390e3 + .quad 0xbfd0bbccdb0d24bd + .quad 0xbfd0d46b579ab74b + .quad 0xbfd0ed005f657da4 + .quad 0xbfd1058bf9ae4ad5 + .quad 0xbfd11e0e2dad9cb7 + .quad 0xbfd136870293a8b0 + .quad 0xbfd14ef67f88685a + .quad 0xbfd1675cababa60e + .quad 0xbfd17fb98e15095d + .quad 0xbfd1980d2dd4236f + .quad 0xbfd1b05791f07b49 + .quad 0xbfd1c898c16999fb + .quad 0xbfd1e0d0c33716be + .quad 0xbfd1f8ff9e48a2f3 + .quad 0xbfd211255986160c + .quad 0xbfd22941fbcf7966 + .quad 0xbfd241558bfd1404 + .quad 0xbfd2596010df763a + .quad 0xbfd27161913f853d + .quad 0xbfd2895a13de86a3 + .quad 0xbfd2a1499f762bc9 + .quad 0xbfd2b9303ab89d25 + .quad 0xbfd2d10dec508583 + .quad 0xbfd2e8e2bae11d31 + .quad 0xbfd300aead06350c + .quad 0xbfd31871c9544185 + .quad 0xbfd3302c16586588 + .quad 0xbfd347dd9a987d55 + .quad 0xbfd35f865c93293e + .quad 0xbfd3772662bfd85b + .quad 0xbfd38ebdb38ed321 + .quad 0xbfd3a64c556945ea + .quad 0xbfd3bdd24eb14b6a + .quad 0xbfd3d54fa5c1f710 + .quad 0xbfd3ecc460ef5f50 + .quad 0xbfd404308686a7e4 + .quad 0xbfd41b941cce0bee + .quad 0xbfd432ef2a04e814 + .quad 0xbfd44a41b463c47c + .quad 0xbfd4618bc21c5ec2 + .quad 0xbfd478cd5959b3d9 + .quad 0xbfd49006804009d1 + .quad 0xbfd4a7373cecf997 + .quad 0xbfd4be5f957778a1 + .quad 0xbfd4d57f8fefe27f + .quad 0xbfd4ec973260026a + .quad 0xbfd503a682cb1cb3 + .quad 0xbfd51aad872df82d + .quad 0xbfd531ac457ee77e + .quad 0xbfd548a2c3add263 + .quad 0xbfd55f9107a43ee2 + .quad 0xbfd5767717455a6c + .quad 0xbfd58d54f86e02f2 + .quad 0xbfd5a42ab0f4cfe2 + .quad 0xbfd5baf846aa1b19 + .quad 0xbfd5d1bdbf5809ca + .quad 0xbfd5e87b20c2954a + .quad 0xbfd5ff3070a793d4 + .quad 0xbfd615ddb4bec13c + .quad 0xbfd62c82f2b9c795 + .quad 0x3fd61965cdb02c1f + .quad 0x3fd602d08af091ec + .quad 0x3fd5ec433d5c35ae + .quad 0x3fd5d5bddf595f30 + .quad 0x3fd5bf406b543db2 + .quad 0x3fd5a8cadbbedfa1 + .quad 0x3fd5925d2b112a59 + .quad 0x3fd57bf753c8d1fb + .quad 0x3fd565995069514c + .quad 0x3fd54f431b7be1a9 + .quad 0x3fd538f4af8f72fe + .quad 0x3fd522ae0738a3d8 + .quad 0x3fd50c6f1d11b97c + .quad 0x3fd4f637ebba9810 + .quad 0x3fd4e0086dd8baca + .quad 0x3fd4c9e09e172c3c + .quad 0x3fd4b3c077267e9a + .quad 0x3fd49da7f3bcc41f + .quad 0x3fd487970e958770 + .quad 0x3fd4718dc271c41b + .quad 0x3fd45b8c0a17df13 + .quad 0x3fd44591e0539f49 + .quad 0x3fd42f9f3ff62642 + .quad 0x3fd419b423d5e8c7 + .quad 0x3fd403d086cea79c + .quad 0x3fd3edf463c1683e + .quad 0x3fd3d81fb5946dba + .quad 0x3fd3c25277333184 + .quad 0x3fd3ac8ca38e5c5f + .quad 0x3fd396ce359bbf54 + .quad 0x3fd3811728564cb2 + .quad 0x3fd36b6776be1117 + .quad 0x3fd355bf1bd82c8b + .quad 0x3fd3401e12aecba1 + .quad 0x3fd32a84565120a8 + .quad 0x3fd314f1e1d35ce4 + .quad 0x3fd2ff66b04ea9d4 + .quad 0x3fd2e9e2bce12286 + .quad 0x3fd2d46602adccee + .quad 0x3fd2bef07cdc9354 + .quad 0x3fd2a982269a3dbf + .quad 0x3fd2941afb186b7c + .quad 0x3fd27ebaf58d8c9d + .quad 0x3fd269621134db92 + .quad 0x3fd25410494e56c7 + .quad 0x3fd23ec5991eba49 + .quad 0x3fd22981fbef797b + .quad 0x3fd214456d0eb8d4 + .quad 0x3fd1ff0fe7cf47a7 + .quad 0x3fd1e9e1678899f4 + .quad 0x3fd1d4b9e796c245 + .quad 0x3fd1bf99635a6b95 + .quad 0x3fd1aa7fd638d33f + .quad 0x3fd1956d3b9bc2fa + .quad 0x3fd180618ef18adf + .quad 0x3fd16b5ccbacfb73 + .quad 
0x3fd1565eed455fc3 + .quad 0x3fd14167ef367783 + .quad 0x3fd12c77cd00713b + .quad 0x3fd1178e8227e47c + .quad 0x3fd102ac0a35cc1c + .quad 0x3fd0edd060b78081 + .quad 0x3fd0d8fb813eb1ef + .quad 0x3fd0c42d676162e3 + .quad 0x3fd0af660eb9e279 + .quad 0x3fd09aa572e6c6d4 + .quad 0x3fd085eb8f8ae797 + .quad 0x3fd07138604d5862 + .quad 0x3fd05c8be0d9635a + .quad 0x3fd047e60cde83b8 + .quad 0x3fd03346e0106062 + .quad 0x3fd01eae5626c691 + .quad 0x3fd00a1c6adda473 + .quad 0x3fcfeb2233ea07cd + .quad 0x3fcfc218be620a5e + .quad 0x3fcf991c6cb3b379 + .quad 0x3fcf702d36777df0 + .quad 0x3fcf474b134df229 + .quad 0x3fcf1e75fadf9bde + .quad 0x3fcef5ade4dcffe6 + .quad 0x3fceccf2c8fe920a + .quad 0x3fcea4449f04aaf5 + .quad 0x3fce7ba35eb77e2a + .quad 0x3fce530effe71012 + .quad 0x3fce2a877a6b2c12 + .quad 0x3fce020cc6235ab5 + .quad 0x3fcdd99edaf6d7e9 + .quad 0x3fcdb13db0d48940 + .quad 0x3fcd88e93fb2f450 + .quad 0x3fcd60a17f903515 + .quad 0x3fcd38666871f465 + .quad 0x3fcd1037f2655e7b + .quad 0x3fcce816157f1988 + .quad 0x3fccc000c9db3c52 + .quad 0x3fcc97f8079d44ec + .quad 0x3fcc6ffbc6f00f71 + .quad 0x3fcc480c0005ccd1 + .quad 0x3fcc2028ab17f9b4 + .quad 0x3fcbf851c067555f + .quad 0x3fcbd087383bd8ad + .quad 0x3fcba8c90ae4ad19 + .quad 0x3fcb811730b823d2 + .quad 0x3fcb5971a213acdb + .quad 0x3fcb31d8575bce3d + .quad 0x3fcb0a4b48fc1b46 + .quad 0x3fcae2ca6f672bd4 + .quad 0x3fcabb55c31693ad + .quad 0x3fca93ed3c8ad9e3 + .quad 0x3fca6c90d44b704e + .quad 0x3fca454082e6ab05 + .quad 0x3fca1dfc40f1b7f1 + .quad 0x3fc9f6c407089664 + .quad 0x3fc9cf97cdce0ec3 + .quad 0x3fc9a8778debaa38 + .quad 0x3fc981634011aa75 + .quad 0x3fc95a5adcf7017f + .quad 0x3fc9335e5d594989 + .quad 0x3fc90c6db9fcbcd9 + .quad 0x3fc8e588ebac2dbf + .quad 0x3fc8beafeb38fe8c + .quad 0x3fc897e2b17b19a5 + .quad 0x3fc871213750e994 + .quad 0x3fc84a6b759f512f + .quad 0x3fc823c16551a3c2 + .quad 0x3fc7fd22ff599d4f + .quad 0x3fc7d6903caf5ad0 + .quad 0x3fc7b0091651528c + .quad 0x3fc7898d85444c73 + .quad 0x3fc7631d82935a86 + .quad 0x3fc73cb9074fd14d + .quad 0x3fc716600c914054 + .quad 0x3fc6f0128b756abc + .quad 0x3fc6c9d07d203fc7 + .quad 0x3fc6a399dabbd383 + .quad 0x3fc67d6e9d785771 + .quad 0x3fc6574ebe8c133a + .quad 0x3fc6313a37335d76 + .quad 0x3fc60b3100b09476 + .quad 0x3fc5e533144c1719 + .quad 0x3fc5bf406b543db2 + .quad 0x3fc59958ff1d52f1 + .quad 0x3fc5737cc9018cdd + .quad 0x3fc54dabc26105d2 + .quad 0x3fc527e5e4a1b58d + .quad 0x3fc5022b292f6a45 + .quad 0x3fc4dc7b897bc1c8 + .quad 0x3fc4b6d6fefe22a4 + .quad 0x3fc4913d8333b561 + .quad 0x3fc46baf0f9f5db7 + .quad 0x3fc4462b9dc9b3dc + .quad 0x3fc420b32740fdd4 + .quad 0x3fc3fb45a59928cc + .quad 0x3fc3d5e3126bc27f + .quad 0x3fc3b08b6757f2a9 + .quad 0x3fc38b3e9e027479 + .quad 0x3fc365fcb0159016 + .quad 0x3fc340c59741142e + .quad 0x3fc31b994d3a4f85 + .quad 0x3fc2f677cbbc0a96 + .quad 0x3fc2d1610c86813a + .quad 0x3fc2ac55095f5c59 + .quad 0x3fc28753bc11aba5 + .quad 0x3fc2625d1e6ddf57 + .quad 0x3fc23d712a49c202 + .quad 0x3fc2188fd9807263 + .quad 0x3fc1f3b925f25d41 + .quad 0x3fc1ceed09853752 + .quad 0x3fc1aa2b7e23f72a + .quad 0x3fc185747dbecf34 + .quad 0x3fc160c8024b27b1 + .quad 0x3fc13c2605c398c3 + .quad 0x3fc1178e8227e47c + .quad 0x3fc0f301717cf0fb + .quad 0x3fc0ce7ecdccc28d + .quad 0x3fc0aa06912675d5 + .quad 0x3fc08598b59e3a07 + .quad 0x3fc06135354d4b18 + .quad 0x3fc03cdc0a51ec0d + .quad 0x3fc0188d2ecf6140 + .quad 0x3fbfe89139dbd566 + .quad 0x3fbfa01c9db57ce2 + .quad 0x3fbf57bc7d9005db + .quad 0x3fbf0f70cdd992e3 + .quad 0x3fbec739830a1120 + .quad 0x3fbe7f1691a32d3e + .quad 0x3fbe3707ee30487b + .quad 0x3fbdef0d8d466db9 + .quad 
0x3fbda727638446a2 + .quad 0x3fbd5f55659210e2 + .quad 0x3fbd179788219364 + .quad 0x3fbccfedbfee13a8 + .quad 0x3fbc885801bc4b23 + .quad 0x3fbc40d6425a5cb1 + .quad 0x3fbbf968769fca11 + .quad 0x3fbbb20e936d6974 + .quad 0x3fbb6ac88dad5b1c + .quad 0x3fbb23965a52ff00 + .quad 0x3fbadc77ee5aea8c + .quad 0x3fba956d3ecade63 + .quad 0x3fba4e7640b1bc38 + .quad 0x3fba0792e9277cac + .quad 0x3fb9c0c32d4d2548 + .quad 0x3fb97a07024cbe74 + .quad 0x3fb9335e5d594989 + .quad 0x3fb8ecc933aeb6e8 + .quad 0x3fb8a6477a91dc29 + .quad 0x3fb85fd927506a48 + .quad 0x3fb8197e2f40e3f0 + .quad 0x3fb7d33687c293c9 + .quad 0x3fb78d02263d82d3 + .quad 0x3fb746e100226ed9 + .quad 0x3fb700d30aeac0e1 + .quad 0x3fb6bad83c1883b6 + .quad 0x3fb674f089365a7a + .quad 0x3fb62f1be7d77743 + .quad 0x3fb5e95a4d9791cb + .quad 0x3fb5a3abb01ade25 + .quad 0x3fb55e10050e0384 + .quad 0x3fb518874226130a + .quad 0x3fb4d3115d207eac + .quad 0x3fb48dae4bc31018 + .quad 0x3fb4485e03dbdfad + .quad 0x3fb403207b414b7f + .quad 0x3fb3bdf5a7d1ee64 + .quad 0x3fb378dd7f749714 + .quad 0x3fb333d7f8183f4b + .quad 0x3fb2eee507b40301 + .quad 0x3fb2aa04a44717a5 + .quad 0x3fb26536c3d8c369 + .quad 0x3fb2207b5c78549e + .quad 0x3fb1dbd2643d190b + .quad 0x3fb1973bd1465567 + .quad 0x3fb152b799bb3cc9 + .quad 0x3fb10e45b3cae831 + .quad 0x3fb0c9e615ac4e17 + .quad 0x3fb08598b59e3a07 + .quad 0x3fb0415d89e74444 + .quad 0x3faffa6911ab9301 + .quad 0x3faf723b517fc523 + .quad 0x3faeea31c006b87c + .quad 0x3fae624c4a0b5e1b + .quad 0x3fadda8adc67ee4e + .quad 0x3fad52ed6405d86f + .quad 0x3faccb73cdddb2cc + .quad 0x3fac441e06f72a9e + .quad 0x3fabbcebfc68f420 + .quad 0x3fab35dd9b58baad + .quad 0x3faaaef2d0fb10fc + .quad 0x3faa282b8a936171 + .quad 0x3fa9a187b573de7c + .quad 0x3fa91b073efd7314 + .quad 0x3fa894aa149fb343 + .quad 0x3fa80e7023d8ccc4 + .quad 0x3fa788595a3577ba + .quad 0x3fa70265a550e777 + .quad 0x3fa67c94f2d4bb58 + .quad 0x3fa5f6e73078efb8 + .quad 0x3fa5715c4c03ceef + .quad 0x3fa4ebf43349e26f + .quad 0x3fa466aed42de3ea + .quad 0x3fa3e18c1ca0ae92 + .quad 0x3fa35c8bfaa1306b + .quad 0x3fa2d7ae5c3c5bae + .quad 0x3fa252f32f8d183f + .quad 0x3fa1ce5a62bc353a + .quad 0x3fa149e3e4005a8d + .quad 0x3fa0c58fa19dfaaa + .quad 0x3fa0415d89e74444 + .quad 0x3f9f7a9b16782856 + .quad 0x3f9e72bf2813ce51 + .quad 0x3f9d6b2725979802 + .quad 0x3f9c63d2ec14aaf2 + .quad 0x3f9b5cc258b718e6 + .quad 0x3f9a55f548c5c43f + .quad 0x3f994f6b99a24475 + .quad 0x3f98492528c8cabf + .quad 0x3f974321d3d006d3 + .quad 0x3f963d6178690bd6 + .quad 0x3f9537e3f45f3565 + .quad 0x3f9432a925980cc1 + .quad 0x3f932db0ea132e22 + .quad 0x3f9228fb1fea2e28 + .quad 0x3f912487a5507f70 + .quad 0x3f90205658935847 + .quad 0x3f8e38ce3033310c + .quad 0x3f8c317384c75f06 + .quad 0x3f8a2a9c6c170462 + .quad 0x3f882448a388a2aa + .quad 0x3f861e77e8b53fc6 + .quad 0x3f841929f96832f0 + .quad 0x3f82145e939ef1e9 + .quad 0x3f8010157588de71 + .quad 0x3f7c189cbb0e27fb + .quad 0x3f78121214586b54 + .quad 0x3f740c8a747878e2 + .quad 0x3f70080559588b35 + .quad 0x3f680904828985c0 + .quad 0x3f60040155d5889e + .quad 0x3f50020055655889 + .quad 0x0000000000000000 + .rept 56 + .byte 0 + .endr + +/* Polynomial coefficients: */ +double_vector _poly_coeff_1 0x3fc9999cacdb4d0a +double_vector _poly_coeff_2 0xbfd0000148058ee1 +double_vector _poly_coeff_3 0x3fd55555555543c5 +double_vector _poly_coeff_4 0xbfdffffffffff81f + +/* Exponent mask */ +double_vector _ExpMask 0x000fffffffffffff + +/* 2^10 */ +double_vector _Two10 0x3f50000000000000 + +/* Minimum normal number */ +double_vector _MinNorm 0x0010000000000000 + +/* Maximum normal number */ +double_vector _MaxNorm 
0x7fefffffffffffff + +/* Half of mantissa mask */ +double_vector _HalfMask 0xfffffffffc000000 + +/* 1.0 */ +double_vector _One 0x3ff0000000000000 + +/* log(2) high part */ +double_vector _L2H 0x3fe62e42fefa0000 + +/* log(2) low part */ +double_vector _L2L 0x3d7cf79abc9e0000 + +/* Work range threshold = 724 */ +double_vector _Threshold 0x4086a00000000000 + +/* Bias */ +double_vector _Bias 0x408ff80000000000 + +/* Bias (-1 bit) */ +double_vector _Bias1 0x408ff00000000000 + +/* log(2) */ +double_vector _L2 0x3fe62e42fefa39ef + +/* General purpose constants: + DP infinities, +/- */ +.if .-__svml_dlog_data != _dInfs +.err +.endif + .quad 0x7ff0000000000000 + .quad 0xfff0000000000000 + .rept 48 + .byte 0 + .endr + +/* DP 1.0, +/- */ +.if .-__svml_dlog_data != _dOnes +.err +.endif + .quad 0x3ff0000000000000 + .quad 0xbff0000000000000 + .rept 48 + .byte 0 + .endr + +/* DP 0.0, +/- */ +.if .-__svml_dlog_data != _dZeros +.err +.endif + .quad 0x0000000000000000 + .quad 0x8000000000000000 + .rept 48 + .byte 0 + .endr + .type __svml_dlog_data,@object + .size __svml_dlog_data,.-__svml_dlog_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.h new file mode 100644 index 0000000000..84d65db95d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_log_data.h @@ -0,0 +1,54 @@ +/* Offsets for data table for function log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef D_LOG_DATA_H +#define D_LOG_DATA_H + +#define _LogRcp_lookup -4218816 +#define _Log_HA_table 0 +#define _Log_LA_table 8256 +#define _poly_coeff_1 12416 +#define _poly_coeff_2 12480 +#define _poly_coeff_3 12544 +#define _poly_coeff_4 12608 +#define _ExpMask 12672 +#define _Two10 12736 +#define _MinNorm 12800 +#define _MaxNorm 12864 +#define _HalfMask 12928 +#define _One 12992 +#define _L2H 13056 +#define _L2L 13120 +#define _Threshold 13184 +#define _Bias 13248 +#define _Bias1 13312 +#define _L2 13376 +#define _dInfs 13440 +#define _dOnes 13504 +#define _dZeros 13568 + +.macro double_vector offset value +.if .-__svml_dlog_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow2_core.S new file mode 100644 index 0000000000..ccdb592135 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow2_core.S @@ -0,0 +1,29 @@ +/* Function pow vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2vv_pow) +WRAPPER_IMPL_SSE2_ff __pow_finite +END (_ZGVbN2vv_pow) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2vv_pow) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core.S new file mode 100644 index 0000000000..30ae0f5a2f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core.S @@ -0,0 +1,29 @@ +/* Function pow vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4vv_pow) +WRAPPER_IMPL_AVX_ff _ZGVbN2vv_pow +END (_ZGVdN4vv_pow) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4vv_pow) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S new file mode 100644 index 0000000000..bcea225c4d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S @@ -0,0 +1,25 @@ +/* Function pow vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4vv_pow) +WRAPPER_IMPL_AVX_ff _ZGVbN2vv_pow +END (_ZGVcN4vv_pow) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow8_core.S new file mode 100644 index 0000000000..06b3a81124 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow8_core.S @@ -0,0 +1,25 @@ +/* Function pow vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8vv_pow) +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +END (_ZGVeN8vv_pow) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.S new file mode 100644 index 0000000000..2f05f7becb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.S @@ -0,0 +1,4863 @@ +/* Data for function pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_pow_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function pow. + The table holds polynomial, reduction, and lookup coefficients, + together with other constants obtained through analytical + derivation and experimental tuning.
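+ + Broadly (a sketch of the standard decomposition, not of the exact + kernels): pow is computed as pow(x, y) = 2^(y * log2(x)), so the data + below serves two subproblems, an extra-precise log2(x) and a fast 2^t. + The _hsw_log2_table that opens the object stores log2 lookup values as + (high, low) pairs; the HSW tag suggests it feeds the Haswell (FMA/AVX2) + code path selected at run time.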
*/ + + .globl __svml_dpow_data +__svml_dpow_data: + +/* Lookup log(2) table (for HSW): */ +.if .-__svml_dpow_data != _hsw_log2_table +.err +.endif + .quad 0xc08ff00000000000 + .quad 0x0000000000000000 + .quad 0xc08ff005c3e0ffc2 + .quad 0xbd33ab2631d4676d + .quad 0xc08ff00b84e236bc + .quad 0xbd4563ba56cde925 + .quad 0xc08ff01143068126 + .quad 0x3d11790209e88471 + .quad 0xc08ff016fe50b6ee + .quad 0xbd408517f8e37b00 + .quad 0xc08ff01cb6c3abd0 + .quad 0xbd44558b51cada94 + .quad 0xc08ff0226c622f52 + .quad 0xbd3ec312ed069b24 + .quad 0xc08ff0281f2f0cd0 + .quad 0xbd374a4cb0be9e8a + .quad 0xc08ff02dcf2d0b86 + .quad 0x3d26eb3ac8ec0ef7 + .quad 0xc08ff0337c5eee92 + .quad 0xbd45984a60ff3d2f + .quad 0xc08ff03926c7750a + .quad 0xbd0f0cccdd01ee2f + .quad 0xc08ff03ece6959f0 + .quad 0xbd3a5671e1bd4ae8 + .quad 0xc08ff0447347544c + .quad 0xbd3a0976c0a2827d + .quad 0xc08ff04a1564172a + .quad 0x3d1e14ebaf30c95e + .quad 0xc08ff04fb4c251a0 + .quad 0xbd46898809d2dc10 + .quad 0xc08ff0555164aee2 + .quad 0xbd4355e6ecb8e0f1 + .quad 0xc08ff05aeb4dd63c + .quad 0x3cf3c6764fc87b4a + .quad 0xc08ff06082806b1c + .quad 0xbd4532c412ba94db + .quad 0xc08ff06616ff0d24 + .quad 0xbd4465182838ed44 + .quad 0xc08ff06ba8cc5824 + .quad 0xbd47dc6d46384b31 + .quad 0xc08ff07137eae42a + .quad 0xbd35af7a7c7c34f3 + .quad 0xc08ff076c45d4584 + .quad 0x3d18a0e14f76d994 + .quad 0xc08ff07c4e260cc8 + .quad 0xbd44e7e87341aeee + .quad 0xc08ff081d547c6e4 + .quad 0xbd153121e9af5428 + .quad 0xc08ff08759c4fd14 + .quad 0xbd3f9ab3cf74baba + .quad 0xc08ff08cdba034fa + .quad 0xbd3f09941811b2ee + .quad 0xc08ff0925adbf09a + .quad 0xbd3a3c89a2cf3516 + .quad 0xc08ff097d77aae66 + .quad 0x3d291b415eeb24ed + .quad 0xc08ff09d517ee940 + .quad 0x3d2c7a4ff65ddbc9 + .quad 0xc08ff0a2c8eb1886 + .quad 0xbd385a047f97bb3e + .quad 0xc08ff0a83dc1b01a + .quad 0x3d1124ac34b21259 + .quad 0xc08ff0adb005205e + .quad 0xbd34f286d207e2c8 + .quad 0xc08ff0b31fb7d648 + .quad 0xbd33167ccc538261 + .quad 0xc08ff0b88cdc3b5e + .quad 0xbd4542fe4ce30d63 + .quad 0xc08ff0bdf774b5c4 + .quad 0xbd41409e20d7191b + .quad 0xc08ff0c35f83a83c + .quad 0xbd40638b5ff73edf + .quad 0xc08ff0c8c50b7232 + .quad 0x3d294aa31b9b6d65 + .quad 0xc08ff0ce280e6fba + .quad 0xbd38723279ebfab6 + .quad 0xc08ff0d3888ef9a4 + .quad 0xbd124fad116078ef + .quad 0xc08ff0d8e68f6572 + .quad 0xbd437350d69ea580 + .quad 0xc08ff0de4212056c + .quad 0xbd45dd31d962d373 + .quad 0xc08ff0e39b19289e + .quad 0x3d058b34834a501e + .quad 0xc08ff0e8f1a71adc + .quad 0xbd06d26859c7991e + .quad 0xc08ff0ee45be24d0 + .quad 0xbd3ddb7886f88587 + .quad 0xc08ff0f397608bfc + .quad 0xbd42d90e5edaecee + .quad 0xc08ff0f8e69092be + .quad 0xbd40c5eacb577b4a + .quad 0xc08ff0fe33507858 + .quad 0xbce49209a68c72a1 + .quad 0xc08ff1037da278f2 + .quad 0xbd30e0f9c896007d + .quad 0xc08ff108c588cda8 + .quad 0x3d2871a7610e40bd + .quad 0xc08ff10e0b05ac84 + .quad 0xbd31da156756faad + .quad 0xc08ff1134e1b4890 + .quad 0xbd28b7fcd690403e + .quad 0xc08ff1188ecbd1d0 + .quad 0xbd46be4a29c44115 + .quad 0xc08ff11dcd197552 + .quad 0xbd36f6bd48a860f0 + .quad 0xc08ff12309065d28 + .quad 0xbd47913e788c5887 + .quad 0xc08ff1284294b07a + .quad 0xbd28fe35da2ab291 + .quad 0xc08ff12d79c6937e + .quad 0xbd3fb9b1aaf54bcc + .quad 0xc08ff132ae9e278a + .quad 0xbd3c343ea3e580eb + .quad 0xc08ff137e11d8b10 + .quad 0xbd3f1140264356b8 + .quad 0xc08ff13d1146d9a8 + .quad 0xbd34c7e0166e1f56 + .quad 0xc08ff1423f1c2c12 + .quad 0xbd3d449e80431d92 + .quad 0xc08ff1476a9f983e + .quad 0xbd474d3138e94164 + .quad 0xc08ff14c93d33152 + .quad 0x3d2370693afbcdb1 + .quad 0xc08ff151bab907a6 + .quad 0x3d1badba7fbb3d20 + 
.quad 0xc08ff156df5328d6 + .quad 0x3d2cea9347cb6655 + .quad 0xc08ff15c01a39fbc + .quad 0xbd46879fa00b120a + .quad 0xc08ff16121ac7480 + .quad 0xbd43cf0ff16ff990 + .quad 0xc08ff1663f6fac90 + .quad 0xbd43167ccc538261 + .quad 0xc08ff16b5aef4aae + .quad 0xbd2f7081b8e33aad + .quad 0xc08ff170742d4ef0 + .quad 0xbd13f94e00e7d6bc + .quad 0xc08ff1758b2bb6c8 + .quad 0x3d22280434bda911 + .quad 0xc08ff17a9fec7d06 + .quad 0x3d1108740d92f890 + .quad 0xc08ff17fb27199de + .quad 0xbd416d18135d3266 + .quad 0xc08ff184c2bd02f0 + .quad 0xbd1d97ee9124773b + .quad 0xc08ff189d0d0ab42 + .quad 0xbd40ccd0edd00e4c + .quad 0xc08ff18edcae8352 + .quad 0xbd36d76b9a843329 + .quad 0xc08ff193e6587910 + .quad 0xbd210f7ac89c6f2d + .quad 0xc08ff198edd077e6 + .quad 0xbd40df02face8ca9 + .quad 0xc08ff19df31868c0 + .quad 0xbd41d4cc2f68b868 + .quad 0xc08ff1a2f632320c + .quad 0x3d2e54d71deb636a + .quad 0xc08ff1a7f71fb7ba + .quad 0xbd373af6b5487f35 + .quad 0xc08ff1acf5e2db4e + .quad 0xbd3927dfc23d9780 + .quad 0xc08ff1b1f27d7bd8 + .quad 0x3d2601ccfac2b557 + .quad 0xc08ff1b6ecf175f8 + .quad 0xbd45e96bed8cce30 + .quad 0xc08ff1bbe540a3f0 + .quad 0xbd1b76a46f31880a + .quad 0xc08ff1c0db6cdd94 + .quad 0xbd3bdc81c4db3134 + .quad 0xc08ff1c5cf77f860 + .quad 0xbd304cc6600a133e + .quad 0xc08ff1cac163c770 + .quad 0xbd3b912d8994b162 + .quad 0xc08ff1cfb1321b8c + .quad 0xbd20009770ea1465 + .quad 0xc08ff1d49ee4c326 + .quad 0x3d2a40dc2d2a6bf7 + .quad 0xc08ff1d98a7d8a60 + .quad 0xbd269affffe47644 + .quad 0xc08ff1de73fe3b14 + .quad 0xbd301dc37c84e79a + .quad 0xc08ff1e35b689cd2 + .quad 0xbd2953e61f15bd9b + .quad 0xc08ff1e840be74e6 + .quad 0xbd34998f93e7aa3c + .quad 0xc08ff1ed2401865e + .quad 0x3cf5c14e55f57802 + .quad 0xc08ff1f205339208 + .quad 0xbd3e4e8eea54ce63 + .quad 0xc08ff1f6e4565680 + .quad 0x3d0aaa72ba2c6ba2 + .quad 0xc08ff1fbc16b9026 + .quad 0xbd30144751b3314f + .quad 0xc08ff2009c74f930 + .quad 0x3d2a15a5b343a140 + .quad 0xc08ff205757449a0 + .quad 0xbd398eec5e85b29f + .quad 0xc08ff20a4c6b3756 + .quad 0xbd1b361c7dddadb6 + .quad 0xc08ff20f215b7606 + .quad 0xbcc2de0634d33aa9 + .quad 0xc08ff213f446b744 + .quad 0xbce024b5b4e89254 + .quad 0xc08ff218c52eaa84 + .quad 0xbd451d49f63f4830 + .quad 0xc08ff21d9414fd24 + .quad 0x3d1f4c2417f39394 + .quad 0xc08ff22260fb5a60 + .quad 0xbd46eb9612e0b4f3 + .quad 0xc08ff2272be36b6c + .quad 0xbd1a5bd9bcda22fd + .quad 0xc08ff22bf4ced760 + .quad 0xbd41feb2fc708a78 + .quad 0xc08ff230bbbf4350 + .quad 0x3d13045428f88499 + .quad 0xc08ff23580b6523e + .quad 0xbcfc14a31ce1b7e3 + .quad 0xc08ff23a43b5a52a + .quad 0xbd38c9a2f2dbcaf9 + .quad 0xc08ff23f04bedb12 + .quad 0x3d1ecd417972c083 + .quad 0xc08ff243c3d390ee + .quad 0xbd38e36471414f76 + .quad 0xc08ff24880f561c0 + .quad 0xbd3ce60916e52e91 + .quad 0xc08ff24d3c25e68e + .quad 0x3d1d406db502402d + .quad 0xc08ff251f566b664 + .quad 0xbd3a0d8c0e85a909 + .quad 0xc08ff256acb96662 + .quad 0xbd2dafbfd96d5335 + .quad 0xc08ff25b621f89b2 + .quad 0xbd455ede26f47b19 + .quad 0xc08ff260159ab196 + .quad 0xbd461f2e47488cf1 + .quad 0xc08ff264c72c6d64 + .quad 0xbd406b35c7c781db + .quad 0xc08ff26976d64a8c + .quad 0xbd20c369fc5a3d9b + .quad 0xc08ff26e2499d49a + .quad 0x3d20993376649b50 + .quad 0xc08ff272d078953a + .quad 0x3d1664deafdbfed5 + .quad 0xc08ff2777a74143c + .quad 0x3d282b53e791792d + .quad 0xc08ff27c228dd794 + .quad 0x3ccc79237996a42b + .quad 0xc08ff280c8c76360 + .quad 0xbd3125d6cbcd1095 + .quad 0xc08ff2856d2239ea + .quad 0xbd3194cfcc6c23cf + .quad 0xc08ff28a0f9fdbaa + .quad 0x3cee35952fb0019c + .quad 0xc08ff28eb041c748 + .quad 0xbd2286fbc7f749ff + .quad 0xc08ff2934f0979a2 + .quad 
0xbd4715fc9257edff + .quad 0xc08ff297ebf86dd0 + .quad 0xbd35dcccaf649933 + .quad 0xc08ff29c87101d1e + .quad 0xbd46d3f77ae3858b + .quad 0xc08ff2a12051ff1c + .quad 0xbd0432648cfc8738 + .quad 0xc08ff2a5b7bf8992 + .quad 0xbd3acdf73d83987f + .quad 0xc08ff2aa4d5a3092 + .quad 0xbd2e6c522ceda3fb + .quad 0xc08ff2aee123666e + .quad 0xbd4195620f0359d8 + .quad 0xc08ff2b3731c9bc4 + .quad 0xbd3c70f15d3ebabd + .quad 0xc08ff2b803473f7a + .quad 0xbd3a1e7e802c4828 + .quad 0xc08ff2bc91a4bec4 + .quad 0xbd4572ca23a96c48 + .quad 0xc08ff2c11e368528 + .quad 0xbd415b2de01cea41 + .quad 0xc08ff2c5a8fdfc7c + .quad 0xbd47dc11ebf92a98 + .quad 0xc08ff2ca31fc8cee + .quad 0xbd474dca44f1db91 + .quad 0xc08ff2ceb9339d04 + .quad 0x3cfb88755d6ca189 + .quad 0xc08ff2d33ea4919a + .quad 0xbd32e1a3152150d3 + .quad 0xc08ff2d7c250cdf0 + .quad 0xbd206adfcaa4bcf5 + .quad 0xc08ff2dc4439b3a2 + .quad 0x3d290d43956fa5d8 + .quad 0xc08ff2e0c460a2ae + .quad 0x3d27158a37417c3a + .quad 0xc08ff2e542c6f978 + .quad 0xbd1829434d994a2a + .quad 0xc08ff2e9bf6e14cc + .quad 0xbd2c3e1e30d370ea + .quad 0xc08ff2ee3a574fde + .quad 0xbd4677c8dfd9aa24 + .quad 0xc08ff2f2b3840452 + .quad 0xbd2788eba5c173ee + .quad 0xc08ff2f72af58a34 + .quad 0xbd4588aec6dfa7dc + .quad 0xc08ff2fba0ad3808 + .quad 0xbd47fe42f19c5879 + .quad 0xc08ff30014ac62c4 + .quad 0x3d2d5e6a8a4fb059 + .quad 0xc08ff30486f45dce + .quad 0xbd0edb9d09608783 + .quad 0xc08ff308f7867b0c + .quad 0xbd18dc7c094eee51 + .quad 0xc08ff30d66640ada + .quad 0xbd46028f37225746 + .quad 0xc08ff311d38e5c16 + .quad 0xbd212d25b3252647 + .quad 0xc08ff3163f06bc16 + .quad 0xbd3906944ba567f4 + .quad 0xc08ff31aa8ce76b8 + .quad 0xbd2b8d59e8492d6e + .quad 0xc08ff31f10e6d65a + .quad 0xbd339eec34ce3ce3 + .quad 0xc08ff323775123e2 + .quad 0xbd3c22d2cad415ae + .quad 0xc08ff327dc0ea6be + .quad 0xbd42ce2af5839ab8 + .quad 0xc08ff32c3f20a4e8 + .quad 0xbd03719eb3af5b8d + .quad 0xc08ff330a08862e2 + .quad 0xbd3feed12980ee19 + .quad 0xc08ff335004723c4 + .quad 0xbd2979a5db68721d + .quad 0xc08ff3395e5e2932 + .quad 0x3cf7159b944f7fd7 + .quad 0xc08ff33dbaceb364 + .quad 0xbd377e236c73e71b + .quad 0xc08ff342159a012a + .quad 0xbd4568bb43ac99bb + .quad 0xc08ff3466ec14fec + .quad 0xbcf4275f1035e5e8 + .quad 0xc08ff34ac645dba6 + .quad 0xbd3cc58a505d117a + .quad 0xc08ff34f1c28def8 + .quad 0x3d10bad7dfa568f7 + .quad 0xc08ff353706b9318 + .quad 0xbd3c27e675df639d + .quad 0xc08ff357c30f2fe4 + .quad 0x3d06e3cb71b554e7 + .quad 0xc08ff35c1414ebd4 + .quad 0xbd40c353cb7112a5 + .quad 0xc08ff360637dfc0c + .quad 0xbd30d199805b0aec + .quad 0xc08ff364b14b9450 + .quad 0xbd381e2a51761f86 + .quad 0xc08ff368fd7ee710 + .quad 0xbd250520a377c7ec + .quad 0xc08ff36d48192564 + .quad 0xbcef941453836236 + .quad 0xc08ff371911b7f10 + .quad 0xbd39e65cd77582e2 + .quad 0xc08ff375d887228a + .quad 0x3d201640f615fa5c + .quad 0xc08ff37a1e5d3cf2 + .quad 0xbce855a216719009 + .quad 0xc08ff37e629efa1e + .quad 0xbd3ae66b65d78df9 + .quad 0xc08ff382a54d8498 + .quad 0xbd45cb804b949696 + .quad 0xc08ff386e66a05a0 + .quad 0xbd33de15e265b5d9 + .quad 0xc08ff38b25f5a52a + .quad 0xbd46acfcfdca95de + .quad 0xc08ff38f63f189ea + .quad 0xbd1a3f6c066ebdd4 + .quad 0xc08ff393a05ed948 + .quad 0xbd3ecf4dff1e8ea2 + .quad 0xc08ff397db3eb770 + .quad 0xbd40d40bb2010158 + .quad 0xc08ff39c1492474a + .quad 0xbd40f992ba145dcf + .quad 0xc08ff3a04c5aaa80 + .quad 0xbd346fab3fa1a144 + .quad 0xc08ff3a48299017e + .quad 0xbd23ea90adf6a54a + .quad 0xc08ff3a8b74e6b74 + .quad 0xbd449e1389f86468 + .quad 0xc08ff3acea7c065c + .quad 0xbd441dfc7d7c3321 + .quad 0xc08ff3b11c22eef6 + .quad 0xbd148ad9b560f3b7 + .quad 
0xc08ff3b54c4440ca + .quad 0x3cf1bfb62d6a3aa8 + .quad 0xc08ff3b97ae1162e + .quad 0xbd2ac444ea257ffa + .quad 0xc08ff3bda7fa8846 + .quad 0xbd39313aec658458 + .quad 0xc08ff3c1d391af06 + .quad 0x3d2a140de4db9aae + .quad 0xc08ff3c5fda7a12e + .quad 0xbd24c06f912ab9d1 + .quad 0xc08ff3ca263d7456 + .quad 0xbd426152c271eb36 + .quad 0xc08ff3ce4d543cea + .quad 0xbd33483146784bd2 + .quad 0xc08ff3d272ed0e28 + .quad 0xbd44640a8fec6a2e + .quad 0xc08ff3d69708fa2a + .quad 0xbd479ca7cb93cc08 + .quad 0xc08ff3dab9a911e2 + .quad 0xbd3cc65b96825ec6 + .quad 0xc08ff3dedace651c + .quad 0xbd2103e8f00d41c8 + .quad 0xc08ff3e2fa7a0280 + .quad 0xbd3ebdb1bbaf9ab0 + .quad 0xc08ff3e718acf798 + .quad 0xbd350343f8df4b43 + .quad 0xc08ff3eb356850ca + .quad 0xbd3db11aa6a7cdea + .quad 0xc08ff3ef50ad1960 + .quad 0xbd3b3b3864c60011 + .quad 0xc08ff3f36a7c5b86 + .quad 0xbd3310f9839f068a + .quad 0xc08ff3f782d7204c + .quad 0xbd40144751b3314f + .quad 0xc08ff3fb99be6faa + .quad 0xbd429875b0e43fd8 + .quad 0xc08ff3ffaf335080 + .quad 0x3cf9518ce032f41d + .quad 0xc08ff403c336c894 + .quad 0x3d29ab66b62c5ca8 + .quad 0xc08ff407d5c9dc98 + .quad 0xbd437fc8cafdef46 + .quad 0xc08ff40be6ed9030 + .quad 0xbd2515e1cacac36e + .quad 0xc08ff40ff6a2e5e6 + .quad 0xbd27f33943464056 + .quad 0xc08ff41404eadf38 + .quad 0xbd1cb6f70109b0f1 + .quad 0xc08ff41811c67c94 + .quad 0x3d24dc166e0e0c68 + .quad 0xc08ff41c1d36bd58 + .quad 0xbd3d990d1e0f6657 + .quad 0xc08ff420273c9fdc + .quad 0xbcfea92d9e0e8ac2 + .quad 0xc08ff4242fd92166 + .quad 0xbd303cf98ab4e537 + .quad 0xc08ff428370d3e38 + .quad 0xbd2fbc00d8d6cbcf + .quad 0xc08ff42c3cd9f18a + .quad 0xbd2fd3fe3499ea9f + .quad 0xc08ff4304140358e + .quad 0xbd3532c412ba94db + .quad 0xc08ff43444410372 + .quad 0xbd1f5ab329b483ec + .quad 0xc08ff43845dd535e + .quad 0xbd40444ebaaf2894 + .quad 0xc08ff43c46161c7c + .quad 0xbd35897d184aaac4 + .quad 0xc08ff44044ec54f2 + .quad 0xbd1d4f639bb5cdf6 + .quad 0xc08ff4444260f1e6 + .quad 0xbd467d28344c2ff0 + .quad 0xc08ff4483e74e786 + .quad 0xbcccb52b4581174d + .quad 0xc08ff44c392928fa + .quad 0xbd449eb852b25382 + .quad 0xc08ff450327ea878 + .quad 0xbd450e785694a8c6 + .quad 0xc08ff4542a765738 + .quad 0xbd2410f5d3161a62 + .quad 0xc08ff45821112578 + .quad 0xbcc81e2b378ff59d + .quad 0xc08ff45c16500280 + .quad 0xbd3e6009faee4be8 + .quad 0xc08ff4600a33dca6 + .quad 0x3d12b628e2d05d76 + .quad 0xc08ff463fcbda144 + .quad 0xbd3cbb828084fcb1 + .quad 0xc08ff467edee3cc8 + .quad 0xbd4085c5870d5301 + .quad 0xc08ff46bddc69aaa + .quad 0xbd4475780e47156b + .quad 0xc08ff46fcc47a574 + .quad 0xbcdbc76a2753b99b + .quad 0xc08ff473b97246bc + .quad 0xbd2012f1593ee62a + .quad 0xc08ff477a547672e + .quad 0xbd3d30c3d2643639 + .quad 0xc08ff47b8fc7ee8a + .quad 0xbd062c45c4bc31c9 + .quad 0xc08ff47f78f4c3a0 + .quad 0xbd22642415d47384 + .quad 0xc08ff48360cecc5a + .quad 0x3d2372fd3ff3197b + .quad 0xc08ff4874756edb4 + .quad 0xbd4668c543d0b42b + .quad 0xc08ff48b2c8e0bca + .quad 0xbd33f65cadbe0d26 + .quad 0xc08ff48f107509ca + .quad 0x3cfbfbf899cf2b3c + .quad 0xc08ff492f30cc9fe + .quad 0xbd307470f69809cc + .quad 0xc08ff496d4562dce + .quad 0xbd44115a1a340462 + .quad 0xc08ff49ab45215c0 + .quad 0xbcff5369fdf426cf + .quad 0xc08ff49e93016172 + .quad 0xbd3fc02bc277071d + .quad 0xc08ff4a27064efa8 + .quad 0xbd4728da988cc139 + .quad 0xc08ff4a64c7d9e44 + .quad 0xbd458147cf67745e + .quad 0xc08ff4aa274c4a4a + .quad 0xbd22100986691daa + .quad 0xc08ff4ae00d1cfde + .quad 0xbd36879fa00b120a + .quad 0xc08ff4b1d90f0a4c + .quad 0xbd40b68fc634db41 + .quad 0xc08ff4b5b004d404 + .quad 0xbd3c03254a7145e3 + .quad 0xc08ff4b985b4069c + .quad 
0xbcf4f144da6e4533 + .quad 0xc08ff4bd5a1d7ad0 + .quad 0x3d1b3d7b0e65d2ce + .quad 0xc08ff4c12d420886 + .quad 0x3d0dd3d30f5deaa7 + .quad 0xc08ff4c4ff2286ce + .quad 0x3d20dc60dc5befec + .quad 0xc08ff4c8cfbfcbe0 + .quad 0xbd47f6a1ab3efbbe + .quad 0xc08ff4cc9f1aad26 + .quad 0xbd429b21ae4817e9 + .quad 0xc08ff4d06d33ff32 + .quad 0x3d256a9ae5dca5a3 + .quad 0xc08ff4d43a0c95c2 + .quad 0x3cf38bc99b3611ce + .quad 0xc08ff4d805a543c8 + .quad 0xbd0c6d2c37daf317 + .quad 0xc08ff4dbcffedb64 + .quad 0xbd262404772a151d + .quad 0xc08ff4df991a2de8 + .quad 0xbd11c0de7b779cb3 + .quad 0xc08ff4e360f80bd6 + .quad 0xbd4424a06f870b9e + .quad 0xc08ff4e7279944e8 + .quad 0xbd3a69393bab4fd0 + .quad 0xc08ff4eaecfea808 + .quad 0xbd266cccab240e90 + .quad 0xc08ff4eeb1290356 + .quad 0xbd38e9b57298d22f + .quad 0xc08ff4f27419242c + .quad 0x3d2eddd33ea4d6f1 + .quad 0xc08ff4f635cfd714 + .quad 0xbd476e0ed8a042be + .quad 0xc08ff4f9f64de7dc + .quad 0xbce66ae2a7ada553 + .quad 0xc08ff4fdb5942180 + .quad 0xbd0cd57d9d86514e + .quad 0xc08ff50173a34e3c + .quad 0xbd42efafb4bec72b + .quad 0xc08ff505307c378a + .quad 0xbd1a46dbdcc762d3 + .quad 0xc08ff508ec1fa61a + .quad 0xbd354b383b0e8a55 + .quad 0xc08ff50ca68e61e0 + .quad 0x3d2c7d469ea019ad + .quad 0xc08ff5105fc93208 + .quad 0xbd264adb1adca9a8 + .quad 0xc08ff51417d0dd04 + .quad 0x3ce5c601f0626dc8 + .quad 0xc08ff517cea62882 + .quad 0x3d18eb650003fb32 + .quad 0xc08ff51b8449d972 + .quad 0xbd326baaf0b591f8 + .quad 0xc08ff51f38bcb408 + .quad 0xbd461b8d0e43a37f + .quad 0xc08ff522ebff7bbc + .quad 0xbd33859a74f0d148 + .quad 0xc08ff5269e12f346 + .quad 0xbd3c57f2495fb7fa + .quad 0xc08ff52a4ef7dca8 + .quad 0xbcd5dc21a39bf974 + .quad 0xc08ff52dfeaef926 + .quad 0x3d0aa0e9e6bca777 + .quad 0xc08ff531ad39094c + .quad 0xbd47d0fa4fa0c208 + .quad 0xc08ff5355a96ccf4 + .quad 0x3d23bb5921006679 + .quad 0xc08ff53906c90336 + .quad 0xbd21f3e0c466e8f9 + .quad 0xc08ff53cb1d06a7c + .quad 0xbd39f3ba83f85c08 + .quad 0xc08ff5405badc07a + .quad 0x3d2e77ad7a4b71c0 + .quad 0xc08ff5440461c22a + .quad 0xbd1f1bbd2926f164 + .quad 0xc08ff547abed2bd8 + .quad 0xbd44479667bb79bf + .quad 0xc08ff54b5250b91e + .quad 0xbd2094ef49b8484b + .quad 0xc08ff54ef78d24de + .quad 0xbd41fb87566dd18c + .quad 0xc08ff5529ba32950 + .quad 0xbd3c6d8d86531d56 + .quad 0xc08ff5563e937ff8 + .quad 0xbd323e7492de8d74 + .quad 0xc08ff559e05ee1ac + .quad 0xbcf63d8bd35fdc18 + .quad 0xc08ff55d81060692 + .quad 0xbd3cc78dae939320 + .quad 0xc08ff5612089a626 + .quad 0xbd44cf0e362f4a36 + .quad 0xc08ff564beea7736 + .quad 0xbd3a96d7a36f1545 + .quad 0xc08ff5685c292fe2 + .quad 0xbd4570af1a0bc9f4 + .quad 0xc08ff56bf84685a4 + .quad 0x3d1bdc90791aef03 + .quad 0xc08ff56f93432d44 + .quad 0xbd40d2abacfc0489 + .quad 0xc08ff5732d1fdaea + .quad 0xbd39e35c1aa7693f + .quad 0xc08ff576c5dd4210 + .quad 0xbd23c49c247ab6af + .quad 0xc08ff57a5d7c1588 + .quad 0xbd4374da167aead5 + .quad 0xc08ff57df3fd0782 + .quad 0xbd2aeb8cb1ac05cd + .quad 0xc08ff5818960c982 + .quad 0xbd3b1b8ae4633046 + .quad 0xc08ff5851da80c6c + .quad 0xbd20899cee46ebe4 + .quad 0xc08ff588b0d3807c + .quad 0xbcfc4413fd83dec1 + .quad 0xc08ff58c42e3d54c + .quad 0xbd02101a9685c779 + .quad 0xc08ff58fd3d9b9d2 + .quad 0xbd45c074c957d037 + .quad 0xc08ff59363b5dc66 + .quad 0xbd3f7cc3df8803d1 + .quad 0xc08ff596f278eaba + .quad 0xbd3961ecab44052e + .quad 0xc08ff59a802391e2 + .quad 0xbd1979a5db68721d + .quad 0xc08ff59e0cb67e50 + .quad 0xbd3e4ce321e589a9 + .quad 0xc08ff5a198325bdc + .quad 0x3d0e321d11f8a0ce + .quad 0xc08ff5a52297d5ba + .quad 0x3d227ae8037b21bf + .quad 0xc08ff5a8abe79684 + .quad 0x3d1ebefecd51a1be + .quad 
0xc08ff5ac34224836 + .quad 0xbd372c2fed3f759f + .quad 0xc08ff5afbb489432 + .quad 0xbd46b82e2a9e810c + .quad 0xc08ff5b3415b2340 + .quad 0x3d2e59ad84a6a593 + .quad 0xc08ff5b6c65a9d86 + .quad 0xbd249d97df07e357 + .quad 0xc08ff5ba4a47aa98 + .quad 0xbd46d25a5b8a19b2 + .quad 0xc08ff5bdcd22f172 + .quad 0x3d2e859780f0cdc7 + .quad 0xc08ff5c14eed186e + .quad 0xbd4171cf05a99915 + .quad 0xc08ff5c4cfa6c55a + .quad 0xbd41ef9459fef720 + .quad 0xc08ff5c84f509d68 + .quad 0x3d145ccfb66fabd2 + .quad 0xc08ff5cbcdeb4530 + .quad 0xbd46bf2e7459b97d + .quad 0xc08ff5cf4b7760be + .quad 0xbd36132520b9d027 + .quad 0xc08ff5d2c7f59382 + .quad 0x3d15872350f805d6 + .quad 0xc08ff5d643668058 + .quad 0xbd41835d469035a9 + .quad 0xc08ff5d9bdcac98e + .quad 0xbd47b7378ad99d2e + .quad 0xc08ff5dd372310dc + .quad 0xbd472d51ea7c162e + .quad 0xc08ff5e0af6ff76a + .quad 0x3d2a8843781eda15 + .quad 0xc08ff5e426b21dc8 + .quad 0xbd44ea36d76b0bd8 + .quad 0xc08ff5e79cea2402 + .quad 0x3d2e03b336c24b74 + .quad 0xc08ff5eb1218a986 + .quad 0xbd45a7bfdb3c98b0 + .quad 0xc08ff5ee863e4d40 + .quad 0xbd37204f55bbf90d + .quad 0xc08ff5f1f95bad84 + .quad 0xbd41b72e122257f1 + .quad 0xc08ff5f56b71681e + .quad 0xbd1488084776534a + .quad 0xc08ff5f8dc801a48 + .quad 0xbd2866405210e49e + .quad 0xc08ff5fc4c8860b4 + .quad 0x3d1d45da26510032 + .quad 0xc08ff5ffbb8ad784 + .quad 0xbd2f386200388584 + .quad 0xc08ff60329881a52 + .quad 0xbd47e32446892fb9 + .quad 0xc08ff6069680c42e + .quad 0xbd4330c4c4a27e40 + .quad 0xc08ff60a02756f9c + .quad 0xbd0cb6f70109b0f1 + .quad 0xc08ff60d6d66b694 + .quad 0xbd4777531ab1b43f + .quad 0xc08ff610d755328e + .quad 0x3d118906313e79cf + .quad 0xc08ff61440417c70 + .quad 0x3d0a5b363a6f499c + .quad 0xc08ff617a82c2c9e + .quad 0xbd39308437e74325 + .quad 0xc08ff61b0f15daf6 + .quad 0xbd3fef5f3fc61899 + .quad 0xc08ff61e74ff1ece + .quad 0xbd3b85f3204507b9 + .quad 0xc08ff621d9e88ef6 + .quad 0xbd42fc8ea3276ba0 + .quad 0xc08ff6253dd2c1bc + .quad 0x3d0d2fe4574e09b9 + .quad 0xc08ff628a0be4ce4 + .quad 0xbd3245829ca653e6 + .quad 0xc08ff62c02abc5b4 + .quad 0xbd42a385b236e315 + .quad 0xc08ff62f639bc0ee + .quad 0xbd301f1e98d8979c + .quad 0xc08ff632c38ed2ce + .quad 0xbd3ded9b44542fd9 + .quad 0xc08ff63622858f12 + .quad 0xbd3d400fd651da9a + .quad 0xc08ff639808088f6 + .quad 0x3d29f78153fcfec0 + .quad 0xc08ff63cdd805330 + .quad 0xbd46af859d47a29a + .quad 0xc08ff64039858000 + .quad 0xbd3667f21fa8423f + .quad 0xc08ff6439490a11e + .quad 0xbd1b254cabaa042b + .quad 0xc08ff646eea247c6 + .quad 0x3d1ee969a95f528f + .quad 0xc08ff64a47bb04b4 + .quad 0xbd3821d36e0b7548 + .quad 0xc08ff64d9fdb682a + .quad 0xbd3974e6432d9ee8 + .quad 0xc08ff650f70401ea + .quad 0xbd1d74d044558154 + .quad 0xc08ff6544d356138 + .quad 0xbd371b3a63cddadf + .quad 0xc08ff657a27014e0 + .quad 0x3d17b6aad08dc210 + .quad 0xc08ff65af6b4ab2c + .quad 0xbd47d7bfb12454c5 + .quad 0xc08ff65e4a03b1f4 + .quad 0xbd373647bf25fa5f + .quad 0xc08ff6619c5db68e + .quad 0xbcf742a6b2827cf0 + .quad 0xc08ff664edc345d8 + .quad 0xbd02d3bbd925734c + .quad 0xc08ff6683e34ec38 + .quad 0xbd03f7a55cd2af4c + .quad 0xc08ff66b8db3359a + .quad 0xbd308364fa508035 + .quad 0xc08ff66edc3ead74 + .quad 0x3d2b37bd36337985 + .quad 0xc08ff67229d7dec0 + .quad 0x3d22a424c693063d + .quad 0xc08ff675767f5404 + .quad 0xbd166cccab240e90 + .quad 0xc08ff678c2359750 + .quad 0x3d2bce65acc07927 + .quad 0xc08ff67c0cfb323a + .quad 0xbd25651ccd0e0880 + .quad 0xc08ff67f56d0ade6 + .quad 0xbd4533d5b4542c99 + .quad 0xc08ff6829fb69304 + .quad 0xbd22ce6312ebb81d + .quad 0xc08ff685e7ad69ca + .quad 0xbd2b6967f02b01d8 + .quad 0xc08ff6892eb5b9fe + .quad 
0xbd3bb55730409355 + .quad 0xc08ff68c74d00af2 + .quad 0xbd4352b18e47fcd2 + .quad 0xc08ff68fb9fce386 + .quad 0xbceed0798d1aa216 + .quad 0xc08ff692fe3cca22 + .quad 0xbd464b702b56565e + .quad 0xc08ff696419044c4 + .quad 0xbd45909799f95e23 + .quad 0xc08ff69983f7d8f4 + .quad 0xbd2bebde1ac6e983 + .quad 0xc08ff69cc5740bc8 + .quad 0xbd18f7aac147fdc1 + .quad 0xc08ff6a0060561e8 + .quad 0x3d2653a2eb403f26 + .quad 0xc08ff6a345ac5f8a + .quad 0x3d1769a8e6b40f5e + .quad 0xc08ff6a684698876 + .quad 0xbd1770535b322bbf + .quad 0xc08ff6a9c23d6004 + .quad 0xbd434df378df21ad + .quad 0xc08ff6acff286920 + .quad 0xbd398cc3b5d08e15 + .quad 0xc08ff6b03b2b2644 + .quad 0xbd39d941e9e746a4 + .quad 0xc08ff6b376461980 + .quad 0x3d2fd2e802de76ad + .quad 0xc08ff6b6b079c472 + .quad 0xbcf968ab16b0d7ba + .quad 0xc08ff6b9e9c6a850 + .quad 0xbd3fa4a9eb6b8621 + .quad 0xc08ff6bd222d45e4 + .quad 0xbd36ad5bac74b87f + .quad 0xc08ff6c059ae1d8a + .quad 0x3d057c1b79ee9964 + .quad 0xc08ff6c39049af32 + .quad 0xbd0af5e9bb5386c2 + .quad 0xc08ff6c6c6007a64 + .quad 0xbce8467191344d58 + .quad 0xc08ff6c9fad2fe3c + .quad 0xbd1148dad646cb9d + .quad 0xc08ff6cd2ec1b96c + .quad 0xbd4149540d5fceb9 + .quad 0xc08ff6d061cd2a40 + .quad 0xbd117b2f1731efbe + .quad 0xc08ff6d393f5ce96 + .quad 0x3d25005be8c5610b + .quad 0xc08ff6d6c53c23e6 + .quad 0x3d29a1979619fe2f + .quad 0xc08ff6d9f5a0a740 + .quad 0x3d15ebe99c4f6416 + .quad 0xc08ff6dd2523d54c + .quad 0xbd36d25a5b8a19b2 + .quad 0xc08ff6e053c62a4c + .quad 0xbd47f3f2612caf97 + .quad 0xc08ff6e38188221c + .quad 0xbd3848e9d1d92d88 + .quad 0xc08ff6e6ae6a382e + .quad 0xbd3b4aada7453897 + .quad 0xc08ff6e9da6ce792 + .quad 0xbd2640ef87ede14b + .quad 0xc08ff6ed0590aaf0 + .quad 0xbd2da89e835cc3d2 + .quad 0xc08ff6f02fd5fc8e + .quad 0x3d2fa6e2ac948d1a + .quad 0xc08ff6f3593d5648 + .quad 0xbd44bf3775fde250 + .quad 0xc08ff6f681c731a0 + .quad 0x3d2924ae921f7eca + .quad 0xc08ff6f9a97407a8 + .quad 0xbd32994b351f388c + .quad 0xc08ff6fcd0445118 + .quad 0xbd429af37d1edf2f + .quad 0xc08ff6fff6388644 + .quad 0x3d2ed5a8a2de89da + .quad 0xc08ff7031b511f16 + .quad 0xbd474d8b66a69572 + .quad 0xc08ff7063f8e9322 + .quad 0xbd3b20d190c69cff + .quad 0xc08ff70962f15992 + .quad 0xbcf455bedf4083bc + .quad 0xc08ff70c8579e930 + .quad 0xbd215844900583de + .quad 0xc08ff70fa728b868 + .quad 0xbd054cda62d3926e + .quad 0xc08ff712c7fe3d44 + .quad 0x3d2143e9a0cbd481 + .quad 0xc08ff715e7faed6e + .quad 0x3d2a82ed66976b91 + .quad 0xc08ff719071f3e30 + .quad 0xbd318c64f0672cf9 + .quad 0xc08ff71c256ba478 + .quad 0xbd2c760bc9b188c4 + .quad 0xc08ff71f42e094d2 + .quad 0xbd2b88ca364674ac + .quad 0xc08ff7225f7e836c + .quad 0xbd46361ccd8974a5 + .quad 0xc08ff7257b45e41a + .quad 0xbd24e3eb5884aae7 + .quad 0xc08ff72896372a4c + .quad 0xbd38b1aff71c8605 + .quad 0xc08ff72bb052c91a + .quad 0xbd429a0a140ddd8a + .quad 0xc08ff72ec999333e + .quad 0xbd43d6bb35ec114f + .quad 0xc08ff731e20adb16 + .quad 0xbd2bd849ce4dc635 + .quad 0xc08ff734f9a832a2 + .quad 0xbd206c243749114c + .quad 0xc08ff7381071ab88 + .quad 0xbd3595f2f68d91fd + .quad 0xc08ff73b2667b714 + .quad 0xbd3017eb15bb7de4 + .quad 0xc08ff73e3b8ac636 + .quad 0x3d1c28798c12cc39 + .quad 0xc08ff7414fdb4982 + .quad 0xbd12ce6312ebb81d + .quad 0xc08ff7446359b134 + .quad 0xbd4395510d1e3f81 + .quad 0xc08ff74776066d30 + .quad 0xbd3f86493917b407 + .quad 0xc08ff74a87e1ecfe + .quad 0xbd10be3a57487484 + .quad 0xc08ff74d98ec9fcc + .quad 0xbd2d5297837adb4b + .quad 0xc08ff750a926f472 + .quad 0xbd43ae4d308b33a5 + .quad 0xc08ff753b8915972 + .quad 0x3d2d54d244e2aaee + .quad 0xc08ff756c72c3cee + .quad 0xbd35f097b0fe80a3 + .quad 
0xc08ff759d4f80cba + .quad 0xbd3077f1f5f0cc83 + .quad 0xc08ff75ce1f5364e + .quad 0x3d19367107b8e917 + .quad 0xc08ff75fee2426ca + .quad 0xbd33623c81400bcf + .quad 0xc08ff762f9854afc + .quad 0xbd33b55bcb161bac + .quad 0xc08ff76604190f5a + .quad 0x3d2eb3c3bf914b9c + .quad 0xc08ff7690ddfe000 + .quad 0xbd45a6a7f43f6ec0 + .quad 0xc08ff76c16da28be + .quad 0xbd3b253dff5e0495 + .quad 0xc08ff76f1f085508 + .quad 0x3d1b08127eec65d2 + .quad 0xc08ff772266acffc + .quad 0xbd45b1799ceaeb51 + .quad 0xc08ff7752d02046c + .quad 0xbd2e63bd0fcda210 + .quad 0xc08ff77832ce5cce + .quad 0xbd148cd0a7bb24b2 + .quad 0xc08ff77b37d04348 + .quad 0x3d11ef56fa3d37b4 + .quad 0xc08ff77e3c0821ac + .quad 0x3d1a768216f872eb + .quad 0xc08ff7813f766178 + .quad 0xbd44b4a15a96316e + .quad 0xc08ff784421b6bdc + .quad 0xbd4258a7b2336919 + .quad 0xc08ff78743f7a9b2 + .quad 0x3d03f659faac5a20 + .quad 0xc08ff78a450b8380 + .quad 0xbd2401fbaaa67e3c + .quad 0xc08ff78d4557617e + .quad 0xbd476fa81cf6a494 + .quad 0xc08ff79044dbab94 + .quad 0xbd44f46b93eece0a + .quad 0xc08ff7934398c956 + .quad 0xbd3c91f073716495 + .quad 0xc08ff796418f2208 + .quad 0xbd3672b0c88d4dd6 + .quad 0xc08ff7993ebf1c9e + .quad 0xbd3fb554647678d1 + .quad 0xc08ff79c3b291fbe + .quad 0xbd0bb98afdf33295 + .quad 0xc08ff79f36cd91ba + .quad 0xbd3a1c40753a869f + .quad 0xc08ff7a231acd89a + .quad 0xbd3395510d1e3f81 + .quad 0xc08ff7a52bc75a14 + .quad 0xbcf98fd2dca61c14 + .quad 0xc08ff7a8251d7b8e + .quad 0xbd40e7b8e7574248 + .quad 0xc08ff7ab1dafa224 + .quad 0xbd43f88ff2576e98 + .quad 0xc08ff7ae157e32a2 + .quad 0xbd1f61a96b8ce776 + .quad 0xc08ff7b10c899184 + .quad 0x3cde66be73b9da04 + .quad 0xc08ff7b402d222fa + .quad 0xbd408d5c3f1d5c0d + .quad 0xc08ff7b6f8584aea + .quad 0xbd3cbebea25ecd9e + .quad 0xc08ff7b9ed1c6cea + .quad 0xbd2507d6dc1f27ef + .quad 0xc08ff7bce11eec44 + .quad 0x3d2794d4c6c8f327 + .quad 0xc08ff7bfd4602bf4 + .quad 0xbd3f1e32799da52d + .quad 0xc08ff7c2c6e08eb0 + .quad 0xbd35c01818adf4af + .quad 0xc08ff7c5b8a076de + .quad 0x3d2cfc4de6d73dea + .quad 0xc08ff7c8a9a04696 + .quad 0xbd4227264a17d460 + .quad 0xc08ff7cb99e05fae + .quad 0xbd0142b08bb672e8 + .quad 0xc08ff7ce896123a8 + .quad 0xbd2564fcfaea5fb3 + .quad 0xc08ff7d17822f3c2 + .quad 0x3d2aab1b2a41b090 + .quad 0xc08ff7d4662630ea + .quad 0xbd46ac3b83ef359a + .quad 0xc08ff7d7536b3bce + .quad 0x3d241a2f220ccf53 + .quad 0xc08ff7da3ff274c6 + .quad 0xbd38f5d37680fd7c + .quad 0xc08ff7dd2bbc3bec + .quad 0x3d048a179268271d + .quad 0xc08ff7e016c8f108 + .quad 0xbd471e548b69f12a + .quad 0xc08ff7e30118f3a2 + .quad 0xbd41a23946dfa58c + .quad 0xc08ff7e5eaaca2f4 + .quad 0xbd25330d5605f2a6 + .quad 0xc08ff7e8d3845df0 + .quad 0xbd319b14945cf6ba + .quad 0xc08ff7ebbba08342 + .quad 0xbd4702e1863f7c92 + .quad 0xc08ff7eea3017150 + .quad 0xbd437cfeba9ff979 + .quad 0xc08ff7f189a78636 + .quad 0xbd3df6e958e938b0 + .quad 0xc08ff7f46f931fca + .quad 0xbd37ca15910e7069 + .quad 0xc08ff7f754c49b9c + .quad 0xbd15cfd00d77e6ec + .quad 0xc08ff7fa393c56f4 + .quad 0xbd2a025d9e2442e6 + .quad 0xc08ff7fd1cfaaed6 + .quad 0xbd3258e9a821b7cc + .quad 0xc08ff80000000000 + .quad 0x0000000000000000 + .rept 48 + .byte 0 + .endr + +/* Lookup exp(2) table (for HSW): */ +.if .-__svml_dpow_data != _hsw_dTe +.err +.endif + .quad 0x3ff0000000000000 + .quad 0x3ff00b1afa5abcbf + .quad 0x3ff0163da9fb3335 + .quad 0x3ff02168143b0281 + .quad 0x3ff02c9a3e778061 + .quad 0x3ff037d42e11bbcc + .quad 0x3ff04315e86e7f85 + .quad 0x3ff04e5f72f654b1 + .quad 0x3ff059b0d3158574 + .quad 0x3ff0650a0e3c1f89 + .quad 0x3ff0706b29ddf6de + .quad 0x3ff07bd42b72a836 + .quad 0x3ff0874518759bc8 + .quad 
0x3ff092bdf66607e0 + .quad 0x3ff09e3ecac6f383 + .quad 0x3ff0a9c79b1f3919 + .quad 0x3ff0b5586cf9890f + .quad 0x3ff0c0f145e46c85 + .quad 0x3ff0cc922b7247f7 + .quad 0x3ff0d83b23395dec + .quad 0x3ff0e3ec32d3d1a2 + .quad 0x3ff0efa55fdfa9c5 + .quad 0x3ff0fb66affed31b + .quad 0x3ff1073028d7233e + .quad 0x3ff11301d0125b51 + .quad 0x3ff11edbab5e2ab6 + .quad 0x3ff12abdc06c31cc + .quad 0x3ff136a814f204ab + .quad 0x3ff1429aaea92de0 + .quad 0x3ff14e95934f312e + .quad 0x3ff15a98c8a58e51 + .quad 0x3ff166a45471c3c2 + .quad 0x3ff172b83c7d517b + .quad 0x3ff17ed48695bbc0 + .quad 0x3ff18af9388c8dea + .quad 0x3ff1972658375d2f + .quad 0x3ff1a35beb6fcb75 + .quad 0x3ff1af99f8138a1c + .quad 0x3ff1bbe084045cd4 + .quad 0x3ff1c82f95281c6b + .quad 0x3ff1d4873168b9aa + .quad 0x3ff1e0e75eb44027 + .quad 0x3ff1ed5022fcd91d + .quad 0x3ff1f9c18438ce4d + .quad 0x3ff2063b88628cd6 + .quad 0x3ff212be3578a819 + .quad 0x3ff21f49917ddc96 + .quad 0x3ff22bdda27912d1 + .quad 0x3ff2387a6e756238 + .quad 0x3ff2451ffb82140a + .quad 0x3ff251ce4fb2a63f + .quad 0x3ff25e85711ece75 + .quad 0x3ff26b4565e27cdd + .quad 0x3ff2780e341ddf29 + .quad 0x3ff284dfe1f56381 + .quad 0x3ff291ba7591bb70 + .quad 0x3ff29e9df51fdee1 + .quad 0x3ff2ab8a66d10f13 + .quad 0x3ff2b87fd0dad990 + .quad 0x3ff2c57e39771b2f + .quad 0x3ff2d285a6e4030b + .quad 0x3ff2df961f641589 + .quad 0x3ff2ecafa93e2f56 + .quad 0x3ff2f9d24abd886b + .quad 0x3ff306fe0a31b715 + .quad 0x3ff31432edeeb2fd + .quad 0x3ff32170fc4cd831 + .quad 0x3ff32eb83ba8ea32 + .quad 0x3ff33c08b26416ff + .quad 0x3ff3496266e3fa2d + .quad 0x3ff356c55f929ff1 + .quad 0x3ff36431a2de883b + .quad 0x3ff371a7373aa9cb + .quad 0x3ff37f26231e754a + .quad 0x3ff38cae6d05d866 + .quad 0x3ff39a401b7140ef + .quad 0x3ff3a7db34e59ff7 + .quad 0x3ff3b57fbfec6cf4 + .quad 0x3ff3c32dc313a8e5 + .quad 0x3ff3d0e544ede173 + .quad 0x3ff3dea64c123422 + .quad 0x3ff3ec70df1c5175 + .quad 0x3ff3fa4504ac801c + .quad 0x3ff40822c367a024 + .quad 0x3ff4160a21f72e2a + .quad 0x3ff423fb2709468a + .quad 0x3ff431f5d950a897 + .quad 0x3ff43ffa3f84b9d4 + .quad 0x3ff44e086061892d + .quad 0x3ff45c2042a7d232 + .quad 0x3ff46a41ed1d0057 + .quad 0x3ff4786d668b3237 + .quad 0x3ff486a2b5c13cd0 + .quad 0x3ff494e1e192aed2 + .quad 0x3ff4a32af0d7d3de + .quad 0x3ff4b17dea6db7d7 + .quad 0x3ff4bfdad5362a27 + .quad 0x3ff4ce41b817c114 + .quad 0x3ff4dcb299fddd0d + .quad 0x3ff4eb2d81d8abff + .quad 0x3ff4f9b2769d2ca7 + .quad 0x3ff508417f4531ee + .quad 0x3ff516daa2cf6642 + .quad 0x3ff5257de83f4eef + .quad 0x3ff5342b569d4f82 + .quad 0x3ff542e2f4f6ad27 + .quad 0x3ff551a4ca5d920f + .quad 0x3ff56070dde910d2 + .quad 0x3ff56f4736b527da + .quad 0x3ff57e27dbe2c4cf + .quad 0x3ff58d12d497c7fd + .quad 0x3ff59c0827ff07cc + .quad 0x3ff5ab07dd485429 + .quad 0x3ff5ba11fba87a03 + .quad 0x3ff5c9268a5946b7 + .quad 0x3ff5d84590998b93 + .quad 0x3ff5e76f15ad2148 + .quad 0x3ff5f6a320dceb71 + .quad 0x3ff605e1b976dc09 + .quad 0x3ff6152ae6cdf6f4 + .quad 0x3ff6247eb03a5585 + .quad 0x3ff633dd1d1929fd + .quad 0x3ff6434634ccc320 + .quad 0x3ff652b9febc8fb7 + .quad 0x3ff6623882552225 + .quad 0x3ff671c1c70833f6 + .quad 0x3ff68155d44ca973 + .quad 0x3ff690f4b19e9538 + .quad 0x3ff6a09e667f3bcd + .quad 0x3ff6b052fa75173e + .quad 0x3ff6c012750bdabf + .quad 0x3ff6cfdcddd47645 + .quad 0x3ff6dfb23c651a2f + .quad 0x3ff6ef9298593ae5 + .quad 0x3ff6ff7df9519484 + .quad 0x3ff70f7466f42e87 + .quad 0x3ff71f75e8ec5f74 + .quad 0x3ff72f8286ead08a + .quad 0x3ff73f9a48a58174 + .quad 0x3ff74fbd35d7cbfd + .quad 0x3ff75feb564267c9 + .quad 0x3ff77024b1ab6e09 + .quad 0x3ff780694fde5d3f + .quad 0x3ff790b938ac1cf6 + .quad 
0x3ff7a11473eb0187 + .quad 0x3ff7b17b0976cfdb + .quad 0x3ff7c1ed0130c132 + .quad 0x3ff7d26a62ff86f0 + .quad 0x3ff7e2f336cf4e62 + .quad 0x3ff7f3878491c491 + .quad 0x3ff80427543e1a12 + .quad 0x3ff814d2add106d9 + .quad 0x3ff82589994cce13 + .quad 0x3ff8364c1eb941f7 + .quad 0x3ff8471a4623c7ad + .quad 0x3ff857f4179f5b21 + .quad 0x3ff868d99b4492ed + .quad 0x3ff879cad931a436 + .quad 0x3ff88ac7d98a6699 + .quad 0x3ff89bd0a478580f + .quad 0x3ff8ace5422aa0db + .quad 0x3ff8be05bad61778 + .quad 0x3ff8cf3216b5448c + .quad 0x3ff8e06a5e0866d9 + .quad 0x3ff8f1ae99157736 + .quad 0x3ff902fed0282c8a + .quad 0x3ff9145b0b91ffc6 + .quad 0x3ff925c353aa2fe2 + .quad 0x3ff93737b0cdc5e5 + .quad 0x3ff948b82b5f98e5 + .quad 0x3ff95a44cbc8520f + .quad 0x3ff96bdd9a7670b3 + .quad 0x3ff97d829fde4e50 + .quad 0x3ff98f33e47a22a2 + .quad 0x3ff9a0f170ca07ba + .quad 0x3ff9b2bb4d53fe0d + .quad 0x3ff9c49182a3f090 + .quad 0x3ff9d674194bb8d5 + .quad 0x3ff9e86319e32323 + .quad 0x3ff9fa5e8d07f29e + .quad 0x3ffa0c667b5de565 + .quad 0x3ffa1e7aed8eb8bb + .quad 0x3ffa309bec4a2d33 + .quad 0x3ffa42c980460ad8 + .quad 0x3ffa5503b23e255d + .quad 0x3ffa674a8af46052 + .quad 0x3ffa799e1330b358 + .quad 0x3ffa8bfe53c12e59 + .quad 0x3ffa9e6b5579fdbf + .quad 0x3ffab0e521356eba + .quad 0x3ffac36bbfd3f37a + .quad 0x3ffad5ff3a3c2774 + .quad 0x3ffae89f995ad3ad + .quad 0x3ffafb4ce622f2ff + .quad 0x3ffb0e07298db666 + .quad 0x3ffb20ce6c9a8952 + .quad 0x3ffb33a2b84f15fb + .quad 0x3ffb468415b749b1 + .quad 0x3ffb59728de5593a + .quad 0x3ffb6c6e29f1c52a + .quad 0x3ffb7f76f2fb5e47 + .quad 0x3ffb928cf22749e4 + .quad 0x3ffba5b030a1064a + .quad 0x3ffbb8e0b79a6f1f + .quad 0x3ffbcc1e904bc1d2 + .quad 0x3ffbdf69c3f3a207 + .quad 0x3ffbf2c25bd71e09 + .quad 0x3ffc06286141b33d + .quad 0x3ffc199bdd85529c + .quad 0x3ffc2d1cd9fa652c + .quad 0x3ffc40ab5fffd07a + .quad 0x3ffc544778fafb22 + .quad 0x3ffc67f12e57d14b + .quad 0x3ffc7ba88988c933 + .quad 0x3ffc8f6d9406e7b5 + .quad 0x3ffca3405751c4db + .quad 0x3ffcb720dcef9069 + .quad 0x3ffccb0f2e6d1675 + .quad 0x3ffcdf0b555dc3fa + .quad 0x3ffcf3155b5bab74 + .quad 0x3ffd072d4a07897c + .quad 0x3ffd1b532b08c968 + .quad 0x3ffd2f87080d89f2 + .quad 0x3ffd43c8eacaa1d6 + .quad 0x3ffd5818dcfba487 + .quad 0x3ffd6c76e862e6d3 + .quad 0x3ffd80e316c98398 + .quad 0x3ffd955d71ff6075 + .quad 0x3ffda9e603db3285 + .quad 0x3ffdbe7cd63a8315 + .quad 0x3ffdd321f301b460 + .quad 0x3ffde7d5641c0658 + .quad 0x3ffdfc97337b9b5f + .quad 0x3ffe11676b197d17 + .quad 0x3ffe264614f5a129 + .quad 0x3ffe3b333b16ee12 + .quad 0x3ffe502ee78b3ff6 + .quad 0x3ffe653924676d76 + .quad 0x3ffe7a51fbc74c83 + .quad 0x3ffe8f7977cdb740 + .quad 0x3ffea4afa2a490da + .quad 0x3ffeb9f4867cca6e + .quad 0x3ffecf482d8e67f1 + .quad 0x3ffee4aaa2188510 + .quad 0x3ffefa1bee615a27 + .quad 0x3fff0f9c1cb6412a + .quad 0x3fff252b376bba97 + .quad 0x3fff3ac948dd7274 + .quad 0x3fff50765b6e4540 + .quad 0x3fff6632798844f8 + .quad 0x3fff7bfdad9cbe14 + .quad 0x3fff91d802243c89 + .quad 0x3fffa7c1819e90d8 + .quad 0x3fffbdba3692d514 + .quad 0x3fffd3c22b8f71f1 + .quad 0x3fffe9d96b2a23d9 + +/* General purpose constants: + * hsw_dMantMask */ +double_vector _hsw_dMantMask 0x000fffffffffffff + +/* hsw_dOne */ +double_vector _hsw_dOne 0x3ff0000000000000 + +/* hsw_dCvtMask */ +double_vector _hsw_dCvtMask 0x4338000000000000 + +/* hsw_dMinNorm */ +double_vector _hsw_dMinNorm 0x0010000000000000 + +/* hsw_dMaxNorm */ +double_vector _hsw_dMaxNorm 0x7fefffffffffffff + +/* hsw_lRndBit */ +double_vector _hsw_lRndBit 0x0000040000000000 + +/* hsw_lRndMask */ +double_vector _hsw_lRndMask 0xfffff80000000000 + +/* Log polynomial: + * 
hsw_dc6 */ +double_vector _hsw_dc6 0xbfcec1cfbbc5c90c + +/* hsw_dc5 */ +double_vector _hsw_dc5 0x3fd2776da3d26e6a + +/* hsw_dc4 */ +double_vector _hsw_dc4 0xbfd71547655d37e0 + +/* hsw_dc3 */ +double_vector _hsw_dc3 0x3fdec709dc39fb02 + +/* hsw_dc1 */ +double_vector _hsw_dc1 0x3c777a3a2c24613d + +/* hsw_dc1h */ +double_vector _hsw_dc1h 0x3ff71547652b82fe + +/* hsw_dc2 */ +double_vector _hsw_dc2 0xbfe71547652b82fe + +/* Additional constants: + * hsw_AbsMask */ +double_vector _hsw_dAbsMask 0x7fffffffffffffff + +/* hsw_dDomainRange */ +double_vector _hsw_dDomainRange 0x408fec0000000000 + +/* hsw_dShifter */ +double_vector _hsw_dShifter 0x42b800000003ff00 + +/* hsw_dIndexMask */ +double_vector _hsw_dIndexMask 0x00000000000007f8 + +/* Exp polynomial: + * hsw_dce4 */ +double_vector _hsw_dce4 0x3f83b2ab930f15f9 + +/* hsw_dce3 */ +double_vector _hsw_dce3 0x3fac6b090da1e0a9 + +/* hsw_dce2 */ +double_vector _hsw_dce2 0x3fcebfbdff82c54d + +/* hsw_dce1 */ +double_vector _hsw_dce1 0x3fe62e42fefa39b9 + +/* Reciprocal lookup table for log part (non HSW): */ +.if .-__svml_dpow_data != _rcp_t1 +.err +.endif + .quad 0x3ff7154740000000 + .quad 0x3ff70f8340000000 + .quad 0x3ff709c240000000 + .quad 0x3ff7040440000000 + .quad 0x3ff6fe4900000000 + .quad 0x3ff6f89080000000 + .quad 0x3ff6f2db00000000 + .quad 0x3ff6ed2840000000 + .quad 0x3ff6e77840000000 + .quad 0x3ff6e1cb40000000 + .quad 0x3ff6dc2100000000 + .quad 0x3ff6d67980000000 + .quad 0x3ff6d0d4c0000000 + .quad 0x3ff6cb32c0000000 + .quad 0x3ff6c593c0000000 + .quad 0x3ff6bff780000000 + .quad 0x3ff6ba5dc0000000 + .quad 0x3ff6b4c700000000 + .quad 0x3ff6af32c0000000 + .quad 0x3ff6a9a180000000 + .quad 0x3ff6a41300000000 + .quad 0x3ff69e8700000000 + .quad 0x3ff698fdc0000000 + .quad 0x3ff6937740000000 + .quad 0x3ff68df380000000 + .quad 0x3ff6887280000000 + .quad 0x3ff682f400000000 + .quad 0x3ff67d7840000000 + .quad 0x3ff677ff40000000 + .quad 0x3ff67288c0000000 + .quad 0x3ff66d1540000000 + .quad 0x3ff667a400000000 + .quad 0x3ff6623580000000 + .quad 0x3ff65cc9c0000000 + .quad 0x3ff6576080000000 + .quad 0x3ff651fa00000000 + .quad 0x3ff64c9600000000 + .quad 0x3ff6473480000000 + .quad 0x3ff641d5c0000000 + .quad 0x3ff63c7980000000 + .quad 0x3ff6372000000000 + .quad 0x3ff631c900000000 + .quad 0x3ff62c7480000000 + .quad 0x3ff6272280000000 + .quad 0x3ff621d340000000 + .quad 0x3ff61c8640000000 + .quad 0x3ff6173c00000000 + .quad 0x3ff611f440000000 + .quad 0x3ff60caf00000000 + .quad 0x3ff6076c40000000 + .quad 0x3ff6022c00000000 + .quad 0x3ff5fcee80000000 + .quad 0x3ff5f7b340000000 + .quad 0x3ff5f27a80000000 + .quad 0x3ff5ed4440000000 + .quad 0x3ff5e81040000000 + .quad 0x3ff5e2df00000000 + .quad 0x3ff5ddb040000000 + .quad 0x3ff5d883c0000000 + .quad 0x3ff5d359c0000000 + .quad 0x3ff5ce3240000000 + .quad 0x3ff5c90d40000000 + .quad 0x3ff5c3ea80000000 + .quad 0x3ff5beca40000000 + .quad 0x3ff5b9ac80000000 + .quad 0x3ff5b49100000000 + .quad 0x3ff5af7800000000 + .quad 0x3ff5aa6180000000 + .quad 0x3ff5a54d40000000 + .quad 0x3ff5a03b40000000 + .quad 0x3ff59b2bc0000000 + .quad 0x3ff5961ec0000000 + .quad 0x3ff59113c0000000 + .quad 0x3ff58c0b80000000 + .quad 0x3ff5870540000000 + .quad 0x3ff58201c0000000 + .quad 0x3ff57d0040000000 + .quad 0x3ff5780140000000 + .quad 0x3ff5730480000000 + .quad 0x3ff56e0a00000000 + .quad 0x3ff56911c0000000 + .quad 0x3ff5641c00000000 + .quad 0x3ff55f2880000000 + .quad 0x3ff55a3740000000 + .quad 0x3ff5554840000000 + .quad 0x3ff5505bc0000000 + .quad 0x3ff54b7140000000 + .quad 0x3ff5468900000000 + .quad 0x3ff541a340000000 + .quad 0x3ff53cbf80000000 + .quad 
0x3ff537de40000000 + .quad 0x3ff532ff00000000 + .quad 0x3ff52e2240000000 + .quad 0x3ff5294780000000 + .quad 0x3ff5246f00000000 + .quad 0x3ff51f98c0000000 + .quad 0x3ff51ac4c0000000 + .quad 0x3ff515f300000000 + .quad 0x3ff5112340000000 + .quad 0x3ff50c5600000000 + .quad 0x3ff5078ac0000000 + .quad 0x3ff502c1c0000000 + .quad 0x3ff4fdfac0000000 + .quad 0x3ff4f93600000000 + .quad 0x3ff4f47380000000 + .quad 0x3ff4efb340000000 + .quad 0x3ff4eaf500000000 + .quad 0x3ff4e638c0000000 + .quad 0x3ff4e17ec0000000 + .quad 0x3ff4dcc700000000 + .quad 0x3ff4d81180000000 + .quad 0x3ff4d35dc0000000 + .quad 0x3ff4ceac80000000 + .quad 0x3ff4c9fd00000000 + .quad 0x3ff4c54fc0000000 + .quad 0x3ff4c0a4c0000000 + .quad 0x3ff4bbfbc0000000 + .quad 0x3ff4b754c0000000 + .quad 0x3ff4b2b000000000 + .quad 0x3ff4ae0d40000000 + .quad 0x3ff4a96c80000000 + .quad 0x3ff4a4ce00000000 + .quad 0x3ff4a03140000000 + .quad 0x3ff49b9700000000 + .quad 0x3ff496fe80000000 + .quad 0x3ff4926800000000 + .quad 0x3ff48dd3c0000000 + .quad 0x3ff4894180000000 + .quad 0x3ff484b100000000 + .quad 0x3ff48022c0000000 + .quad 0x3ff47b96c0000000 + .quad 0x3ff4770c80000000 + .quad 0x3ff4728440000000 + .quad 0x3ff46dfe00000000 + .quad 0x3ff46979c0000000 + .quad 0x3ff464f780000000 + .quad 0x3ff4607780000000 + .quad 0x3ff45bf940000000 + .quad 0x3ff4577d00000000 + .quad 0x3ff45302c0000000 + .quad 0x3ff44e8a40000000 + .quad 0x3ff44a1400000000 + .quad 0x3ff4459f80000000 + .quad 0x3ff4412d40000000 + .quad 0x3ff43cbcc0000000 + .quad 0x3ff4384e40000000 + .quad 0x3ff433e180000000 + .quad 0x3ff42f7700000000 + .quad 0x3ff42b0e40000000 + .quad 0x3ff426a780000000 + .quad 0x3ff4224280000000 + .quad 0x3ff41ddf80000000 + .quad 0x3ff4197e80000000 + .quad 0x3ff4151f40000000 + .quad 0x3ff410c200000000 + .quad 0x3ff40c66c0000000 + .quad 0x3ff4080d40000000 + .quad 0x3ff403b5c0000000 + .quad 0x3ff3ff6000000000 + .quad 0x3ff3fb0c00000000 + .quad 0x3ff3f6ba40000000 + .quad 0x3ff3f26a00000000 + .quad 0x3ff3ee1bc0000000 + .quad 0x3ff3e9cf80000000 + .quad 0x3ff3e58500000000 + .quad 0x3ff3e13c40000000 + .quad 0x3ff3dcf580000000 + .quad 0x3ff3d8b080000000 + .quad 0x3ff3d46d40000000 + .quad 0x3ff3d02c00000000 + .quad 0x3ff3cbec80000000 + .quad 0x3ff3c7aec0000000 + .quad 0x3ff3c37300000000 + .quad 0x3ff3bf3900000000 + .quad 0x3ff3bb00c0000000 + .quad 0x3ff3b6ca40000000 + .quad 0x3ff3b29580000000 + .quad 0x3ff3ae62c0000000 + .quad 0x3ff3aa3180000000 + .quad 0x3ff3a60240000000 + .quad 0x3ff3a1d4c0000000 + .quad 0x3ff39da900000000 + .quad 0x3ff3997f40000000 + .quad 0x3ff3955700000000 + .quad 0x3ff3913080000000 + .quad 0x3ff38d0bc0000000 + .quad 0x3ff388e900000000 + .quad 0x3ff384c7c0000000 + .quad 0x3ff380a840000000 + .quad 0x3ff37c8ac0000000 + .quad 0x3ff3786ec0000000 + .quad 0x3ff3745480000000 + .quad 0x3ff3703c00000000 + .quad 0x3ff36c2540000000 + .quad 0x3ff3681040000000 + .quad 0x3ff363fcc0000000 + .quad 0x3ff35feb40000000 + .quad 0x3ff35bdb40000000 + .quad 0x3ff357cd00000000 + .quad 0x3ff353c080000000 + .quad 0x3ff34fb5c0000000 + .quad 0x3ff34bac80000000 + .quad 0x3ff347a540000000 + .quad 0x3ff3439f80000000 + .quad 0x3ff33f9b40000000 + .quad 0x3ff33b9900000000 + .quad 0x3ff3379840000000 + .quad 0x3ff3339900000000 + .quad 0x3ff32f9bc0000000 + .quad 0x3ff32b9fc0000000 + .quad 0x3ff327a5c0000000 + .quad 0x3ff323ad40000000 + .quad 0x3ff31fb680000000 + .quad 0x3ff31bc140000000 + .quad 0x3ff317cdc0000000 + .quad 0x3ff313dbc0000000 + .quad 0x3ff30feb80000000 + .quad 0x3ff30bfd00000000 + .quad 0x3ff3080fc0000000 + .quad 0x3ff3042480000000 + .quad 0x3ff3003ac0000000 + .quad 
0x3ff2fc5280000000 + .quad 0x3ff2f86bc0000000 + .quad 0x3ff2f48700000000 + .quad 0x3ff2f0a380000000 + .quad 0x3ff2ecc1c0000000 + .quad 0x3ff2e8e180000000 + .quad 0x3ff2e502c0000000 + .quad 0x3ff2e125c0000000 + .quad 0x3ff2dd4a40000000 + .quad 0x3ff2d97080000000 + .quad 0x3ff2d59840000000 + .quad 0x3ff2d1c180000000 + .quad 0x3ff2cdec40000000 + .quad 0x3ff2ca1880000000 + .quad 0x3ff2c64680000000 + .quad 0x3ff2c27600000000 + .quad 0x3ff2bea700000000 + .quad 0x3ff2bad9c0000000 + .quad 0x3ff2b70dc0000000 + .quad 0x3ff2b34380000000 + .quad 0x3ff2af7ac0000000 + .quad 0x3ff2abb340000000 + .quad 0x3ff2a7ed80000000 + .quad 0x3ff2a42980000000 + .quad 0x3ff2a066c0000000 + .quad 0x3ff29ca580000000 + .quad 0x3ff298e5c0000000 + .quad 0x3ff29527c0000000 + .quad 0x3ff2916b00000000 + .quad 0x3ff28dafc0000000 + .quad 0x3ff289f640000000 + .quad 0x3ff2863e00000000 + .quad 0x3ff2828740000000 + .quad 0x3ff27ed240000000 + .quad 0x3ff27b1e80000000 + .quad 0x3ff2776c40000000 + .quad 0x3ff273bb80000000 + .quad 0x3ff2700c40000000 + .quad 0x3ff26c5e80000000 + .quad 0x3ff268b200000000 + .quad 0x3ff2650740000000 + .quad 0x3ff2615dc0000000 + .quad 0x3ff25db5c0000000 + .quad 0x3ff25a0f40000000 + .quad 0x3ff2566a40000000 + .quad 0x3ff252c6c0000000 + .quad 0x3ff24f2480000000 + .quad 0x3ff24b83c0000000 + .quad 0x3ff247e480000000 + .quad 0x3ff24446c0000000 + .quad 0x3ff240aa40000000 + .quad 0x3ff23d0f40000000 + .quad 0x3ff23975c0000000 + .quad 0x3ff235dd80000000 + .quad 0x3ff23246c0000000 + .quad 0x3ff22eb180000000 + .quad 0x3ff22b1d80000000 + .quad 0x3ff2278b00000000 + .quad 0x3ff223fa00000000 + .quad 0x3ff2206a40000000 + .quad 0x3ff21cdc00000000 + .quad 0x3ff2194f00000000 + .quad 0x3ff215c380000000 + .quad 0x3ff2123940000000 + .quad 0x3ff20eb080000000 + .quad 0x3ff20b2940000000 + .quad 0x3ff207a340000000 + .quad 0x3ff2041ec0000000 + .quad 0x3ff2009b80000000 + .quad 0x3ff1fd1980000000 + .quad 0x3ff1f99900000000 + .quad 0x3ff1f619c0000000 + .quad 0x3ff1f29c00000000 + .quad 0x3ff1ef1fc0000000 + .quad 0x3ff1eba480000000 + .quad 0x3ff1e82ac0000000 + .quad 0x3ff1e4b280000000 + .quad 0x3ff1e13b80000000 + .quad 0x3ff1ddc5c0000000 + .quad 0x3ff1da5180000000 + .quad 0x3ff1d6de80000000 + .quad 0x3ff1d36cc0000000 + .quad 0x3ff1cffc40000000 + .quad 0x3ff1cc8d40000000 + .quad 0x3ff1c91f80000000 + .quad 0x3ff1c5b340000000 + .quad 0x3ff1c24840000000 + .quad 0x3ff1bede40000000 + .quad 0x3ff1bb7600000000 + .quad 0x3ff1b80ec0000000 + .quad 0x3ff1b4a900000000 + .quad 0x3ff1b14480000000 + .quad 0x3ff1ade140000000 + .quad 0x3ff1aa7f40000000 + .quad 0x3ff1a71e80000000 + .quad 0x3ff1a3bf40000000 + .quad 0x3ff1a06140000000 + .quad 0x3ff19d0480000000 + .quad 0x3ff199a900000000 + .quad 0x3ff1964ec0000000 + .quad 0x3ff192f5c0000000 + .quad 0x3ff18f9e00000000 + .quad 0x3ff18c47c0000000 + .quad 0x3ff188f280000000 + .quad 0x3ff1859ec0000000 + .quad 0x3ff1824c00000000 + .quad 0x3ff17efac0000000 + .quad 0x3ff17baa80000000 + .quad 0x3ff1785bc0000000 + .quad 0x3ff1750e40000000 + .quad 0x3ff171c1c0000000 + .quad 0x3ff16e76c0000000 + .quad 0x3ff16b2d00000000 + .quad 0x3ff167e440000000 + .quad 0x3ff1649d00000000 + .quad 0x3ff16156c0000000 + .quad 0x3ff15e11c0000000 + .quad 0x3ff15ace40000000 + .quad 0x3ff1578bc0000000 + .quad 0x3ff1544a80000000 + .quad 0x3ff1510a80000000 + .quad 0x3ff14dcbc0000000 + .quad 0x3ff14a8e40000000 + .quad 0x3ff14751c0000000 + .quad 0x3ff14416c0000000 + .quad 0x3ff140dcc0000000 + .quad 0x3ff13da400000000 + .quad 0x3ff13a6c80000000 + .quad 0x3ff1373600000000 + .quad 0x3ff1340100000000 + .quad 0x3ff130cd00000000 + .quad 
0x3ff12d9a40000000 + .quad 0x3ff12a68c0000000 + .quad 0x3ff1273840000000 + .quad 0x3ff1240900000000 + .quad 0x3ff120db00000000 + .quad 0x3ff11dae40000000 + .quad 0x3ff11a8280000000 + .quad 0x3ff1175800000000 + .quad 0x3ff1142ec0000000 + .quad 0x3ff11106c0000000 + .quad 0x3ff10ddfc0000000 + .quad 0x3ff10ab9c0000000 + .quad 0x3ff1079540000000 + .quad 0x3ff10471c0000000 + .quad 0x3ff1014f80000000 + .quad 0x3ff0fe2e40000000 + .quad 0x3ff0fb0e40000000 + .quad 0x3ff0f7ef40000000 + .quad 0x3ff0f4d180000000 + .quad 0x3ff0f1b500000000 + .quad 0x3ff0ee9980000000 + .quad 0x3ff0eb7f40000000 + .quad 0x3ff0e86600000000 + .quad 0x3ff0e54e00000000 + .quad 0x3ff0e23700000000 + .quad 0x3ff0df2140000000 + .quad 0x3ff0dc0c80000000 + .quad 0x3ff0d8f900000000 + .quad 0x3ff0d5e6c0000000 + .quad 0x3ff0d2d540000000 + .quad 0x3ff0cfc540000000 + .quad 0x3ff0ccb640000000 + .quad 0x3ff0c9a840000000 + .quad 0x3ff0c69b40000000 + .quad 0x3ff0c38f80000000 + .quad 0x3ff0c08500000000 + .quad 0x3ff0bd7b80000000 + .quad 0x3ff0ba7300000000 + .quad 0x3ff0b76bc0000000 + .quad 0x3ff0b46580000000 + .quad 0x3ff0b16040000000 + .quad 0x3ff0ae5c40000000 + .quad 0x3ff0ab5940000000 + .quad 0x3ff0a85780000000 + .quad 0x3ff0a556c0000000 + .quad 0x3ff0a25700000000 + .quad 0x3ff09f5880000000 + .quad 0x3ff09c5ac0000000 + .quad 0x3ff0995e80000000 + .quad 0x3ff0966300000000 + .quad 0x3ff09368c0000000 + .quad 0x3ff0906f80000000 + .quad 0x3ff08d7740000000 + .quad 0x3ff08a8000000000 + .quad 0x3ff0878a00000000 + .quad 0x3ff0849500000000 + .quad 0x3ff081a100000000 + .quad 0x3ff07eae40000000 + .quad 0x3ff07bbc40000000 + .quad 0x3ff078cb80000000 + .quad 0x3ff075dbc0000000 + .quad 0x3ff072ed00000000 + .quad 0x3ff06fff80000000 + .quad 0x3ff06d12c0000000 + .quad 0x3ff06a2740000000 + .quad 0x3ff0673cc0000000 + .quad 0x3ff0645340000000 + .quad 0x3ff0616ac0000000 + .quad 0x3ff05e8340000000 + .quad 0x3ff05b9d00000000 + .quad 0x3ff058b780000000 + .quad 0x3ff055d340000000 + .quad 0x3ff052f000000000 + .quad 0x3ff0500d80000000 + .quad 0x3ff04d2c40000000 + .quad 0x3ff04a4c00000000 + .quad 0x3ff0476cc0000000 + .quad 0x3ff0448e80000000 + .quad 0x3ff041b140000000 + .quad 0x3ff03ed500000000 + .quad 0x3ff03bf9c0000000 + .quad 0x3ff0391fc0000000 + .quad 0x3ff0364680000000 + .quad 0x3ff0336e40000000 + .quad 0x3ff0309700000000 + .quad 0x3ff02dc0c0000000 + .quad 0x3ff02aeb80000000 + .quad 0x3ff0281740000000 + .quad 0x3ff0254400000000 + .quad 0x3ff02271c0000000 + .quad 0x3ff01fa080000000 + .quad 0x3ff01cd040000000 + .quad 0x3ff01a00c0000000 + .quad 0x3ff0173280000000 + .quad 0x3ff0146540000000 + .quad 0x3ff01198c0000000 + .quad 0x3ff00ecd80000000 + .quad 0x3ff00c0300000000 + .quad 0x3ff0093980000000 + .quad 0x3ff0067100000000 + .quad 0x3ff003a980000000 + .quad 0x3ff000e300000000 + .quad 0x3feffc3a80000000 + .quad 0x3feff6b140000000 + .quad 0x3feff129c0000000 + .quad 0x3fefeba480000000 + .quad 0x3fefe620c0000000 + .quad 0x3fefe09f40000000 + .quad 0x3fefdb1f80000000 + .quad 0x3fefd5a180000000 + .quad 0x3fefd02580000000 + .quad 0x3fefcaab80000000 + .quad 0x3fefc53340000000 + .quad 0x3fefbfbd00000000 + .quad 0x3fefba4880000000 + .quad 0x3fefb4d600000000 + .quad 0x3fefaf6540000000 + .quad 0x3fefa9f680000000 + .quad 0x3fefa48980000000 + .quad 0x3fef9f1e40000000 + .quad 0x3fef99b500000000 + .quad 0x3fef944dc0000000 + .quad 0x3fef8ee800000000 + .quad 0x3fef898440000000 + .quad 0x3fef842280000000 + .quad 0x3fef7ec280000000 + .quad 0x3fef796440000000 + .quad 0x3fef7407c0000000 + .quad 0x3fef6ead40000000 + .quad 0x3fef695480000000 + .quad 0x3fef63fd80000000 + .quad 
0x3fef5ea880000000 + .quad 0x3fef595540000000 + .quad 0x3fef5403c0000000 + .quad 0x3fef4eb400000000 + .quad 0x3fef496640000000 + .quad 0x3fef441a00000000 + .quad 0x3fef3ecfc0000000 + .quad 0x3fef398740000000 + .quad 0x3fef344080000000 + .quad 0x3fef2efb80000000 + .quad 0x3fef29b880000000 + .quad 0x3fef247700000000 + .quad 0x3fef1f3780000000 + .quad 0x3fef19f980000000 + .quad 0x3fef14bd80000000 + .quad 0x3fef0f8340000000 + .quad 0x3fef0a4ac0000000 + .quad 0x3fef0513c0000000 + .quad 0x3feeffdec0000000 + .quad 0x3feefaab80000000 + .quad 0x3feef57a00000000 + .quad 0x3feef04a00000000 + .quad 0x3feeeb1c00000000 + .quad 0x3feee5ef80000000 + .quad 0x3feee0c500000000 + .quad 0x3feedb9c00000000 + .quad 0x3feed67500000000 + .quad 0x3feed14f80000000 + .quad 0x3feecc2bc0000000 + .quad 0x3feec709c0000000 + .quad 0x3feec1e940000000 + .quad 0x3feebccac0000000 + .quad 0x3feeb7adc0000000 + .quad 0x3feeb29280000000 + .quad 0x3feead7900000000 + .quad 0x3feea86140000000 + .quad 0x3feea34b40000000 + .quad 0x3fee9e36c0000000 + .quad 0x3fee992400000000 + .quad 0x3fee941300000000 + .quad 0x3fee8f0380000000 + .quad 0x3fee89f5c0000000 + .quad 0x3fee84e9c0000000 + .quad 0x3fee7fdf40000000 + .quad 0x3fee7ad680000000 + .quad 0x3fee75cf80000000 + .quad 0x3fee70ca00000000 + .quad 0x3fee6bc640000000 + .quad 0x3fee66c440000000 + .quad 0x3fee61c3c0000000 + .quad 0x3fee5cc500000000 + .quad 0x3fee57c7c0000000 + .quad 0x3fee52cc40000000 + .quad 0x3fee4dd280000000 + .quad 0x3fee48da00000000 + .quad 0x3fee43e380000000 + .quad 0x3fee3eee80000000 + .quad 0x3fee39fb00000000 + .quad 0x3fee350940000000 + .quad 0x3fee301940000000 + .quad 0x3fee2b2ac0000000 + .quad 0x3fee263dc0000000 + .quad 0x3fee215280000000 + .quad 0x3fee1c68c0000000 + .quad 0x3fee178080000000 + .quad 0x3fee129a00000000 + .quad 0x3fee0db540000000 + .quad 0x3fee08d1c0000000 + .quad 0x3fee03f000000000 + .quad 0x3fedff1000000000 + .quad 0x3fedfa3140000000 + .quad 0x3fedf55440000000 + .quad 0x3fedf07900000000 + .quad 0x3fedeb9f00000000 + .quad 0x3fede6c6c0000000 + .quad 0x3fede1f040000000 + .quad 0x3feddd1b00000000 + .quad 0x3fedd84780000000 + .quad 0x3fedd37580000000 + .quad 0x3fedcea500000000 + .quad 0x3fedc9d600000000 + .quad 0x3fedc508c0000000 + .quad 0x3fedc03d00000000 + .quad 0x3fedbb72c0000000 + .quad 0x3fedb6aa00000000 + .quad 0x3fedb1e2c0000000 + .quad 0x3fedad1d00000000 + .quad 0x3feda85900000000 + .quad 0x3feda39680000000 + .quad 0x3fed9ed540000000 + .quad 0x3fed9a15c0000000 + .quad 0x3fed9557c0000000 + .quad 0x3fed909b40000000 + .quad 0x3fed8be040000000 + .quad 0x3fed8726c0000000 + .quad 0x3fed826f00000000 + .quad 0x3fed7db880000000 + .quad 0x3fed790380000000 + .quad 0x3fed745000000000 + .quad 0x3fed6f9e40000000 + .quad 0x3fed6aedc0000000 + .quad 0x3fed663ec0000000 + .quad 0x3fed619140000000 + .quad 0x3fed5ce540000000 + .quad 0x3fed583ac0000000 + .quad 0x3fed5391c0000000 + .quad 0x3fed4eea40000000 + .quad 0x3fed4a4440000000 + .quad 0x3fed459f80000000 + .quad 0x3fed40fc80000000 + .quad 0x3fed3c5ac0000000 + .quad 0x3fed37bac0000000 + .quad 0x3fed331c00000000 + .quad 0x3fed2e7ec0000000 + .quad 0x3fed29e300000000 + .quad 0x3fed254880000000 + .quad 0x3fed20afc0000000 + .quad 0x3fed1c1840000000 + .quad 0x3fed178240000000 + .quad 0x3fed12edc0000000 + .quad 0x3fed0e5ac0000000 + .quad 0x3fed09c900000000 + .quad 0x3fed0538c0000000 + .quad 0x3fed00aa00000000 + .quad 0x3fecfc1c80000000 + .quad 0x3fecf790c0000000 + .quad 0x3fecf30600000000 + .quad 0x3fecee7d00000000 + .quad 0x3fece9f540000000 + .quad 0x3fece56f00000000 + .quad 0x3fece0ea40000000 + .quad 
0x3fecdc66c0000000 + .quad 0x3fecd7e4c0000000 + .quad 0x3fecd36440000000 + .quad 0x3feccee500000000 + .quad 0x3fecca6740000000 + .quad 0x3fecc5eac0000000 + .quad 0x3fecc16fc0000000 + .quad 0x3fecbcf640000000 + .quad 0x3fecb87e00000000 + .quad 0x3fecb40740000000 + .quad 0x3fecaf91c0000000 + .quad 0x3fecab1dc0000000 + .quad 0x3feca6ab00000000 + .quad 0x3feca239c0000000 + .quad 0x3fec9dc9c0000000 + .quad 0x3fec995b40000000 + .quad 0x3fec94ee00000000 + .quad 0x3fec908240000000 + .quad 0x3fec8c17c0000000 + .quad 0x3fec87aec0000000 + .quad 0x3fec834700000000 + .quad 0x3fec7ee0c0000000 + .quad 0x3fec7a7bc0000000 + .quad 0x3fec761800000000 + .quad 0x3fec71b5c0000000 + .quad 0x3fec6d54c0000000 + .quad 0x3fec68f540000000 + .quad 0x3fec649700000000 + .quad 0x3fec603a00000000 + .quad 0x3fec5bde80000000 + .quad 0x3fec578440000000 + .quad 0x3fec532b80000000 + .quad 0x3fec4ed3c0000000 + .quad 0x3fec4a7dc0000000 + .quad 0x3fec4628c0000000 + .quad 0x3fec41d540000000 + .quad 0x3fec3d8300000000 + .quad 0x3fec393200000000 + .quad 0x3fec34e240000000 + .quad 0x3fec309400000000 + .quad 0x3fec2c4700000000 + .quad 0x3fec27fb80000000 + .quad 0x3fec23b100000000 + .quad 0x3fec1f6800000000 + .quad 0x3fec1b2040000000 + .quad 0x3fec16d9c0000000 + .quad 0x3fec1294c0000000 + .quad 0x3fec0e50c0000000 + .quad 0x3fec0a0e40000000 + .quad 0x3fec05cd00000000 + .quad 0x3fec018d00000000 + .quad 0x3febfd4e40000000 + .quad 0x3febf91100000000 + .quad 0x3febf4d4c0000000 + .quad 0x3febf09a00000000 + .quad 0x3febec6080000000 + .quad 0x3febe82840000000 + .quad 0x3febe3f140000000 + .quad 0x3febdfbb80000000 + .quad 0x3febdb8700000000 + .quad 0x3febd753c0000000 + .quad 0x3febd32200000000 + .quad 0x3febcef140000000 + .quad 0x3febcac1c0000000 + .quad 0x3febc693c0000000 + .quad 0x3febc266c0000000 + .quad 0x3febbe3b40000000 + .quad 0x3febba10c0000000 + .quad 0x3febb5e7c0000000 + .quad 0x3febb1bfc0000000 + .quad 0x3febad9940000000 + .quad 0x3feba973c0000000 + .quad 0x3feba54fc0000000 + .quad 0x3feba12cc0000000 + .quad 0x3feb9d0b00000000 + .quad 0x3feb98eac0000000 + .quad 0x3feb94cb80000000 + .quad 0x3feb90ad80000000 + .quad 0x3feb8c90c0000000 + .quad 0x3feb887540000000 + .quad 0x3feb845b00000000 + .quad 0x3feb8041c0000000 + .quad 0x3feb7c2a00000000 + .quad 0x3feb781340000000 + .quad 0x3feb73fe00000000 + .quad 0x3feb6fe9c0000000 + .quad 0x3feb6bd6c0000000 + .quad 0x3feb67c500000000 + .quad 0x3feb63b440000000 + .quad 0x3feb5fa500000000 + .quad 0x3feb5b96c0000000 + .quad 0x3feb5789c0000000 + .quad 0x3feb537e00000000 + .quad 0x3feb4f7380000000 + .quad 0x3feb4b6a00000000 + .quad 0x3feb476200000000 + .quad 0x3feb435b00000000 + .quad 0x3feb3f5540000000 + .quad 0x3feb3b5080000000 + .quad 0x3feb374d00000000 + .quad 0x3feb334ac0000000 + .quad 0x3feb2f49c0000000 + .quad 0x3feb2b49c0000000 + .quad 0x3feb274b40000000 + .quad 0x3feb234d80000000 + .quad 0x3feb1f5140000000 + .quad 0x3feb1b5600000000 + .quad 0x3feb175c00000000 + .quad 0x3feb136300000000 + .quad 0x3feb0f6b80000000 + .quad 0x3feb0b74c0000000 + .quad 0x3feb077f80000000 + .quad 0x3feb038b40000000 + .quad 0x3feaff9840000000 + .quad 0x3feafba640000000 + .quad 0x3feaf7b580000000 + .quad 0x3feaf3c600000000 + .quad 0x3feaefd780000000 + .quad 0x3feaebea40000000 + .quad 0x3feae7fe00000000 + .quad 0x3feae41300000000 + .quad 0x3feae02900000000 + .quad 0x3feadc4040000000 + .quad 0x3fead858c0000000 + .quad 0x3fead47240000000 + .quad 0x3fead08cc0000000 + .quad 0x3feacca8c0000000 + .quad 0x3feac8c580000000 + .quad 0x3feac4e380000000 + .quad 0x3feac102c0000000 + .quad 0x3feabd2300000000 + .quad 
0x3feab94480000000 + .quad 0x3feab56700000000 + .quad 0x3feab18a80000000 + .quad 0x3feaadaf80000000 + .quad 0x3feaa9d540000000 + .quad 0x3feaa5fc40000000 + .quad 0x3feaa22440000000 + .quad 0x3fea9e4d80000000 + .quad 0x3fea9a77c0000000 + .quad 0x3fea96a340000000 + .quad 0x3fea92cfc0000000 + .quad 0x3fea8efd80000000 + .quad 0x3fea8b2c40000000 + .quad 0x3fea875c00000000 + .quad 0x3fea838cc0000000 + .quad 0x3fea7fbec0000000 + .quad 0x3fea7bf200000000 + .quad 0x3fea782640000000 + .quad 0x3fea745b80000000 + .quad 0x3fea7091c0000000 + .quad 0x3fea6cc940000000 + .quad 0x3fea6901c0000000 + .quad 0x3fea653b40000000 + .quad 0x3fea617600000000 + .quad 0x3fea5db1c0000000 + .quad 0x3fea59ee80000000 + .quad 0x3fea562c80000000 + .quad 0x3fea526b80000000 + .quad 0x3fea4eab80000000 + .quad 0x3fea4aecc0000000 + .quad 0x3fea472ec0000000 + .quad 0x3fea437200000000 + .quad 0x3fea3fb640000000 + .quad 0x3fea3bfbc0000000 + .quad 0x3fea384240000000 + .quad 0x3fea3489c0000000 + .quad 0x3fea30d240000000 + .quad 0x3fea2d1bc0000000 + .quad 0x3fea296680000000 + .quad 0x3fea25b200000000 + .quad 0x3fea21fec0000000 + .quad 0x3fea1e4cc0000000 + .quad 0x3fea1a9b80000000 + .quad 0x3fea16eb40000000 + .quad 0x3fea133c40000000 + .quad 0x3fea0f8e40000000 + .quad 0x3fea0be140000000 + .quad 0x3fea083540000000 + .quad 0x3fea048a40000000 + .quad 0x3fea00e080000000 + .quad 0x3fe9fd3780000000 + .quad 0x3fe9f98fc0000000 + .quad 0x3fe9f5e900000000 + .quad 0x3fe9f24340000000 + .quad 0x3fe9ee9e80000000 + .quad 0x3fe9eafac0000000 + .quad 0x3fe9e75800000000 + .quad 0x3fe9e3b640000000 + .quad 0x3fe9e01580000000 + .quad 0x3fe9dc7600000000 + .quad 0x3fe9d8d740000000 + .quad 0x3fe9d539c0000000 + .quad 0x3fe9d19d00000000 + .quad 0x3fe9ce0180000000 + .quad 0x3fe9ca66c0000000 + .quad 0x3fe9c6cd40000000 + .quad 0x3fe9c33480000000 + .quad 0x3fe9bf9d00000000 + .quad 0x3fe9bc0680000000 + .quad 0x3fe9b870c0000000 + .quad 0x3fe9b4dc40000000 + .quad 0x3fe9b148c0000000 + .quad 0x3fe9adb600000000 + .quad 0x3fe9aa2480000000 + .quad 0x3fe9a693c0000000 + .quad 0x3fe9a30440000000 + .quad 0x3fe99f7580000000 + .quad 0x3fe99be7c0000000 + .quad 0x3fe9985b40000000 + .quad 0x3fe994cf80000000 + .quad 0x3fe99144c0000000 + .quad 0x3fe98dbb00000000 + .quad 0x3fe98a3240000000 + .quad 0x3fe986aa80000000 + .quad 0x3fe98323c0000000 + .quad 0x3fe97f9e00000000 + .quad 0x3fe97c1900000000 + .quad 0x3fe9789540000000 + .quad 0x3fe9751240000000 + .quad 0x3fe9719080000000 + .quad 0x3fe96e0f80000000 + .quad 0x3fe96a8f80000000 + .quad 0x3fe9671040000000 + .quad 0x3fe9639240000000 + .quad 0x3fe9601540000000 + .quad 0x3fe95c9900000000 + .quad 0x3fe9591dc0000000 + .quad 0x3fe955a380000000 + .quad 0x3fe9522a40000000 + .quad 0x3fe94eb200000000 + .quad 0x3fe94b3a80000000 + .quad 0x3fe947c400000000 + .quad 0x3fe9444e80000000 + .quad 0x3fe940da00000000 + .quad 0x3fe93d6640000000 + .quad 0x3fe939f3c0000000 + .quad 0x3fe9368200000000 + .quad 0x3fe9331140000000 + .quad 0x3fe92fa140000000 + .quad 0x3fe92c3280000000 + .quad 0x3fe928c480000000 + .quad 0x3fe9255780000000 + .quad 0x3fe921eb40000000 + .quad 0x3fe91e8040000000 + .quad 0x3fe91b1600000000 + .quad 0x3fe917ac80000000 + .quad 0x3fe9144440000000 + .quad 0x3fe910dcc0000000 + .quad 0x3fe90d7640000000 + .quad 0x3fe90a1080000000 + .quad 0x3fe906abc0000000 + .quad 0x3fe9034800000000 + .quad 0x3fe8ffe540000000 + .quad 0x3fe8fc8340000000 + .quad 0x3fe8f92240000000 + .quad 0x3fe8f5c200000000 + .quad 0x3fe8f26300000000 + .quad 0x3fe8ef0480000000 + .quad 0x3fe8eba740000000 + .quad 0x3fe8e84ac0000000 + .quad 0x3fe8e4ef40000000 + .quad 
0x3fe8e19480000000 + .quad 0x3fe8de3ac0000000 + .quad 0x3fe8dae1c0000000 + .quad 0x3fe8d78a00000000 + .quad 0x3fe8d432c0000000 + .quad 0x3fe8d0dcc0000000 + .quad 0x3fe8cd8780000000 + .quad 0x3fe8ca3300000000 + .quad 0x3fe8c6df80000000 + .quad 0x3fe8c38d00000000 + .quad 0x3fe8c03b40000000 + .quad 0x3fe8bcea80000000 + .quad 0x3fe8b99a80000000 + .quad 0x3fe8b64b80000000 + .quad 0x3fe8b2fd40000000 + .quad 0x3fe8afb000000000 + .quad 0x3fe8ac63c0000000 + .quad 0x3fe8a91840000000 + .quad 0x3fe8a5cd80000000 + .quad 0x3fe8a283c0000000 + .quad 0x3fe89f3b00000000 + .quad 0x3fe89bf300000000 + .quad 0x3fe898abc0000000 + .quad 0x3fe8956580000000 + .quad 0x3fe8922040000000 + .quad 0x3fe88edbc0000000 + .quad 0x3fe88b9800000000 + .quad 0x3fe8885540000000 + .quad 0x3fe8851380000000 + .quad 0x3fe881d240000000 + .quad 0x3fe87e9240000000 + .quad 0x3fe87b52c0000000 + .quad 0x3fe8781480000000 + .quad 0x3fe874d6c0000000 + .quad 0x3fe8719a00000000 + .quad 0x3fe86e5e40000000 + .quad 0x3fe86b2340000000 + .quad 0x3fe867e900000000 + .quad 0x3fe864afc0000000 + .quad 0x3fe8617740000000 + .quad 0x3fe85e3f80000000 + .quad 0x3fe85b08c0000000 + .quad 0x3fe857d300000000 + .quad 0x3fe8549dc0000000 + .quad 0x3fe8516980000000 + .quad 0x3fe84e3640000000 + .quad 0x3fe84b03c0000000 + .quad 0x3fe847d200000000 + .quad 0x3fe844a100000000 + .quad 0x3fe8417100000000 + .quad 0x3fe83e4200000000 + .quad 0x3fe83b1380000000 + .quad 0x3fe837e600000000 + .quad 0x3fe834b940000000 + .quad 0x3fe8318d80000000 + .quad 0x3fe82e6280000000 + .quad 0x3fe82b3840000000 + .quad 0x3fe8280f00000000 + .quad 0x3fe824e640000000 + .quad 0x3fe821bec0000000 + .quad 0x3fe81e97c0000000 + .quad 0x3fe81b71c0000000 + .quad 0x3fe8184c80000000 + .quad 0x3fe8152800000000 + .quad 0x3fe8120480000000 + .quad 0x3fe80ee1c0000000 + .quad 0x3fe80bbfc0000000 + .quad 0x3fe8089e80000000 + .quad 0x3fe8057e40000000 + .quad 0x3fe8025ec0000000 + .quad 0x3fe7ff4000000000 + .quad 0x3fe7fc2200000000 + .quad 0x3fe7f90500000000 + .quad 0x3fe7f5e8c0000000 + .quad 0x3fe7f2cd40000000 + .quad 0x3fe7efb280000000 + .quad 0x3fe7ec9880000000 + .quad 0x3fe7e97f80000000 + .quad 0x3fe7e66740000000 + .quad 0x3fe7e34fc0000000 + .quad 0x3fe7e03940000000 + .quad 0x3fe7dd2340000000 + .quad 0x3fe7da0e40000000 + .quad 0x3fe7d6fa00000000 + .quad 0x3fe7d3e680000000 + .quad 0x3fe7d0d3c0000000 + .quad 0x3fe7cdc1c0000000 + .quad 0x3fe7cab0c0000000 + .quad 0x3fe7c7a080000000 + .quad 0x3fe7c49100000000 + .quad 0x3fe7c18240000000 + .quad 0x3fe7be7440000000 + .quad 0x3fe7bb6700000000 + .quad 0x3fe7b85ac0000000 + .quad 0x3fe7b54f00000000 + .quad 0x3fe7b24440000000 + .quad 0x3fe7af3a40000000 + .quad 0x3fe7ac3100000000 + .quad 0x3fe7a92880000000 + .quad 0x3fe7a620c0000000 + .quad 0x3fe7a319c0000000 + .quad 0x3fe7a013c0000000 + .quad 0x3fe79d0e40000000 + .quad 0x3fe79a09c0000000 + .quad 0x3fe7970600000000 + .quad 0x3fe79402c0000000 + .quad 0x3fe7910080000000 + .quad 0x3fe78dff00000000 + .quad 0x3fe78afe40000000 + .quad 0x3fe787fe40000000 + .quad 0x3fe784ff00000000 + .quad 0x3fe7820080000000 + .quad 0x3fe77f02c0000000 + .quad 0x3fe77c05c0000000 + .quad 0x3fe77909c0000000 + .quad 0x3fe7760e40000000 + .quad 0x3fe7731380000000 + .quad 0x3fe77019c0000000 + .quad 0x3fe76d2080000000 + .quad 0x3fe76a2800000000 + .quad 0x3fe7673080000000 + .quad 0x3fe7643980000000 + .quad 0x3fe7614340000000 + .quad 0x3fe75e4e00000000 + .quad 0x3fe75b5940000000 + .quad 0x3fe7586580000000 + .quad 0x3fe7557240000000 + .quad 0x3fe7527fc0000000 + .quad 0x3fe74f8e40000000 + .quad 0x3fe74c9d40000000 + .quad 0x3fe749ad00000000 + .quad 
0x3fe746bd80000000 + .quad 0x3fe743cec0000000 + .quad 0x3fe740e100000000 + .quad 0x3fe73df3c0000000 + .quad 0x3fe73b0740000000 + .quad 0x3fe7381b80000000 + .quad 0x3fe7353080000000 + .quad 0x3fe7324600000000 + .quad 0x3fe72f5c80000000 + .quad 0x3fe72c73c0000000 + .quad 0x3fe7298b80000000 + .quad 0x3fe726a440000000 + .quad 0x3fe723bd80000000 + .quad 0x3fe720d7c0000000 + .quad 0x3fe71df280000000 + .quad 0x3fe71b0e00000000 + .quad 0x3fe7182a40000000 + .quad 0x3fe7154740000000 + .quad 0x0000000000000000 + .rept 48 + .byte 0 + .endr + +/* Log(2) lookup table for log part (non HSW): */ +.if .-__svml_dpow_data != _log2_t1 +.err +.endif + .rept 2 + .quad 0x0000000000000000 + .endr + .quad 0x3f5712e100000000 + .quad 0x3e0ee8a22f7c5987 + .quad 0x3f670fc100000000 + .quad 0x3e17e16043fd7529 + .quad 0x3f71497700000000 + .quad 0x3e239efb866b119c + .quad 0x3f7709bb00000000 + .quad 0x3e1b5ea7ee997dc0 + .quad 0x3f7cc8aa00000000 + .quad 0x3e2efad156451e8d + .quad 0x3f81430200000000 + .quad 0x3e204975bf955ee8 + .quad 0x3f84210300000000 + .quad 0x3e2e526353333f9a + .quad 0x3f86fe5800000000 + .quad 0x3e2dbbc5d9986525 + .quad 0x3f89dae000000000 + .quad 0x3e211ae127d370f8 + .quad 0x3f8cb6ba00000000 + .quad 0x3e2af44e8a20fe77 + .quad 0x3f8f91e600000000 + .quad 0x3e1f77bd1cd9fbc7 + .quad 0x3f91363100000000 + .quad 0x3e40f52f789c83a3 + .quad 0x3f92a31800000000 + .quad 0x3e172308c2064b24 + .quad 0x3f940f9600000000 + .quad 0x3e2f342d9eb8aeed + .quad 0x3f957bbb00000000 + .quad 0x3e4abb9a144866b7 + .quad 0x3f96e79800000000 + .quad 0x3e48b85ac72b0200 + .quad 0x3f98530c00000000 + .quad 0x3e2d1e01fbc85d86 + .quad 0x3f99be3600000000 + .quad 0x3e37d26f00cda0dd + .quad 0x3f9b28f600000000 + .quad 0x3e3433218e840f16 + .quad 0x3f9c935b00000000 + .quad 0x3e4f50a107fb8c37 + .quad 0x3f9dfd7700000000 + .quad 0x3e3604e609a9e948 + .quad 0x3f9f673700000000 + .quad 0x3e489f0de52d1118 + .quad 0x3fa0684e00000000 + .quad 0x3e4d127bd17abd42 + .quad 0x3fa11cd300000000 + .quad 0x3e3a899b4ece6057 + .quad 0x3fa1d12900000000 + .quad 0x3e5f0d0f99858cfa + .quad 0x3fa2855a00000000 + .quad 0x3e58b94e89d977a4 + .quad 0x3fa3395d00000000 + .quad 0x3e402a7f6bf76796 + .quad 0x3fa3ed3100000000 + .quad 0x3e3e342da3e0aab6 + .quad 0x3fa4a0de00000000 + .quad 0x3e58cae94cd5496b + .quad 0x3fa5545500000000 + .quad 0x3e3fdc64d89d4032 + .quad 0x3fa607ad00000000 + .quad 0x3e37dfd30f154124 + .quad 0x3fa6bad500000000 + .quad 0x3e5eb1e05460b0e3 + .quad 0x3fa76dcf00000000 + .quad 0x3e490ead14c7109d + .quad 0x3fa820a100000000 + .quad 0x3e5258eaf10715e3 + .quad 0x3fa8d34400000000 + .quad 0x3e242a28e25fb4d0 + .quad 0x3fa985bf00000000 + .quad 0x3dfa4a83c146ec0f + .quad 0x3faa381200000000 + .quad 0x3e3c7de45fe856f6 + .quad 0x3faaea3500000000 + .quad 0x3e408258f0914a28 + .quad 0x3fab9c3000000000 + .quad 0x3e3f9589c628dfe0 + .quad 0x3fac4dfa00000000 + .quad 0x3e5721556bde9f1f + .quad 0x3facff9c00000000 + .quad 0x3e5a8867f80f2a46 + .quad 0x3fadb11600000000 + .quad 0x3e4a583c979a598e + .quad 0x3fae626700000000 + .quad 0x3e443847800c1405 + .quad 0x3faf138700000000 + .quad 0x3e1664a168a10688 + .quad 0x3fafc48600000000 + .quad 0x3e2eb49173242e2e + .quad 0x3fb03aa900000000 + .quad 0x3e6b1b90df1d2899 + .quad 0x3fb092fb00000000 + .quad 0x3e6f4828dce8ef96 + .quad 0x3fb0eb3900000000 + .quad 0x3e57e8a84071ed7c + .quad 0x3fb1436100000000 + .quad 0x3e6ea26e46fc50e3 + .quad 0x3fb19b7500000000 + .quad 0x3e64d3ec52377554 + .quad 0x3fb1f37000000000 + .quad 0x3e46a5728109990d + .quad 0x3fb24b5900000000 + .quad 0x3e6b426b10e12ca0 + .quad 0x3fb2a32e00000000 + .quad 0x3e59bbba7c1b46c7 + .quad 
0x3fb2faed00000000 + .quad 0x3e67f99638784faf + .quad 0x3fb3529c00000000 + .quad 0x3e1e52f196858161 + .quad 0x3fb3aa3000000000 + .quad 0x3e67a4fe6def19e6 + .quad 0x3fb401b000000000 + .quad 0x3e0302a326e6a3dc + .quad 0x3fb4591d00000000 + .quad 0x3e6fa21b2e435f49 + .quad 0x3fb4b07600000000 + .quad 0x3e58415e51626967 + .quad 0x3fb507b900000000 + .quad 0x3e3a033d6c5941c4 + .quad 0x3fb55ee600000000 + .quad 0x3e33c8467c54296b + .quad 0x3fb5b60100000000 + .quad 0x3e5e02f5a12fe65d + .quad 0x3fb60d0600000000 + .quad 0x3e6ecfc86d9ed70d + .quad 0x3fb663f600000000 + .quad 0x3e5eb24497a376b8 + .quad 0x3fb6bad400000000 + .quad 0x3e48c77f72e2b40f + .quad 0x3fb7119b00000000 + .quad 0x3e68ed7d5e52d89e + .quad 0x3fb7684d00000000 + .quad 0x3e43fa7ea9d3799b + .quad 0x3fb7beec00000000 + .quad 0x3e60571414f770db + .quad 0x3fb8157900000000 + .quad 0x3e68c7d07f316ee3 + .quad 0x3fb86bf000000000 + .quad 0x3e6360f420c77bec + .quad 0x3fb8c25000000000 + .quad 0x3e6d91c947d50fa1 + .quad 0x3fb918a300000000 + .quad 0x3e4b231ba93bd154 + .quad 0x3fb96eda00000000 + .quad 0x3e61d38c8099fddd + .quad 0x3fb9c50300000000 + .quad 0x3e677eeb9b0174ac + .quad 0x3fba1b1100000000 + .quad 0x3e69d6ddd016014c + .quad 0x3fba711100000000 + .quad 0x3e626690842b7789 + .quad 0x3fbac6fa00000000 + .quad 0x3e5830b93095c531 + .quad 0x3fbb1cd000000000 + .quad 0x3e5c2b99518e0d2c + .quad 0x3fbb729300000000 + .quad 0x3e66279b91823620 + .quad 0x3fbbc84400000000 + .quad 0x3e30adafc9057ecc + .quad 0x3fbc1ddd00000000 + .quad 0x3e461ce45269682a + .quad 0x3fbc736300000000 + .quad 0x3e5044ef5f2fe276 + .quad 0x3fbcc8d600000000 + .quad 0x3e4eb3dbd5234ce7 + .quad 0x3fbd1e3600000000 + .quad 0x3e2eb70a6e724019 + .quad 0x3fbd737e00000000 + .quad 0x3e5403a5977b9a51 + .quad 0x3fbdc8b700000000 + .quad 0x3e62d343b2886c33 + .quad 0x3fbe1ddd00000000 + .quad 0x3e5f443cfbd572a9 + .quad 0x3fbe72eb00000000 + .quad 0x3e632ff4a08c00ad + .quad 0x3fbec7ea00000000 + .quad 0x3e611d934f5c870b + .quad 0x3fbf1cd100000000 + .quad 0x3e610afc18ecc7fd + .quad 0x3fbf71a900000000 + .quad 0x3e4c5db9d4383f15 + .quad 0x3fbfc66800000000 + .quad 0x3e6a615fe5dcf50a + .quad 0x3fc00d8c00000000 + .quad 0x3e6f8684b8524b4d + .quad 0x3fc037da00000000 + .quad 0x3e7471e52c396096 + .quad 0x3fc0621e00000000 + .quad 0x3e7a1aad94d3758a + .quad 0x3fc08c5800000000 + .quad 0x3e7f9b4f573cd19d + .quad 0x3fc0b68900000000 + .quad 0x3e4e88e925a98afd + .quad 0x3fc0e0b100000000 + .quad 0x3e677212d0eeb433 + .quad 0x3fc10acd00000000 + .quad 0x3e63ff48e459228f + .quad 0x3fc134e100000000 + .quad 0x3e63a241697adc33 + .quad 0x3fc15eeb00000000 + .quad 0x3e4f4a7ae82699a0 + .quad 0x3fc188ec00000000 + .quad 0x3e7d83a2e1fe8196 + .quad 0x3fc1b2e400000000 + .quad 0x3e6e765c52c5b577 + .quad 0x3fc1dcd100000000 + .quad 0x3e77eaa5780399be + .quad 0x3fc206b400000000 + .quad 0x3e766c5ef95ab1fc + .quad 0x3fc2308f00000000 + .quad 0x3e703a52d5db6084 + .quad 0x3fc25a6200000000 + .quad 0x3e51786d7d82f6f1 + .quad 0x3fc2842a00000000 + .quad 0x3e6641ea2ded60b8 + .quad 0x3fc2ade800000000 + .quad 0x3e4addfbeaa772f7 + .quad 0x3fc2d79b00000000 + .quad 0x3e67cdfbbc061e04 + .quad 0x3fc3014800000000 + .quad 0x3e717ad775a7481b + .quad 0x3fc32ae800000000 + .quad 0x3e7e4f15a673baf4 + .quad 0x3fc3548300000000 + .quad 0x3e58eca1813fa934 + .quad 0x3fc37e1200000000 + .quad 0x3e7a3622382e96fb + .quad 0x3fc3a79700000000 + .quad 0x3e7916bb2a2cea0a + .quad 0x3fc3d11400000000 + .quad 0x3e61e6a28aaa11cb + .quad 0x3fc3fa8800000000 + .quad 0x3e61a3ceca68f920 + .quad 0x3fc423f100000000 + .quad 0x3e705825c8caf8ed + .quad 0x3fc44d5200000000 + .quad 
0x3e572d6f71f4b037 + .quad 0x3fc476aa00000000 + .quad 0x3e6060fdf3cabb49 + .quad 0x3fc49ff700000000 + .quad 0x3e6df855c48e67aa + .quad 0x3fc4c93e00000000 + .quad 0x3e60854767c83d89 + .quad 0x3fc4f27700000000 + .quad 0x3e7c27d2adfa3cf1 + .quad 0x3fc51bab00000000 + .quad 0x3e21e96f77a9b8ff + .quad 0x3fc544d500000000 + .quad 0x3e69b89066da0127 + .quad 0x3fc56df400000000 + .quad 0x3e7831ab063f0639 + .quad 0x3fc5970b00000000 + .quad 0x3e62a3ff97f4402e + .quad 0x3fc5c01b00000000 + .quad 0x3e5cfdec6aa61224 + .quad 0x3fc5e92000000000 + .quad 0x3e30bf99a341739b + .quad 0x3fc6121900000000 + .quad 0x3e7589025c069af7 + .quad 0x3fc63b0c00000000 + .quad 0x3e73e7c70dc28176 + .quad 0x3fc663f600000000 + .quad 0x3e7319225255ed92 + .quad 0x3fc68cd700000000 + .quad 0x3e721d999e92e626 + .quad 0x3fc6b5af00000000 + .quad 0x3e6feaba3c111c8a + .quad 0x3fc6de7e00000000 + .quad 0x3e67408ffba276e0 + .quad 0x3fc7074100000000 + .quad 0x3e7b9de032cb0fd0 + .quad 0x3fc72ffe00000000 + .quad 0x3e6fbab18df0f78e + .quad 0x3fc758b100000000 + .quad 0x3e7eed8f544cc58a + .quad 0x3fc7815c00000000 + .quad 0x3e5f34382f992a55 + .quad 0x3fc7a9ff00000000 + .quad 0x3e723a0bf2565894 + .quad 0x3fc7d29700000000 + .quad 0x3e6784d72660bf64 + .quad 0x3fc7fb2800000000 + .quad 0x3e53cef9f2a00fda + .quad 0x3fc823ad00000000 + .quad 0x3e6636827e73660e + .quad 0x3fc84c2b00000000 + .quad 0x3e6e0bc0ce905e5f + .quad 0x3fc874a000000000 + .quad 0x3e5b40d32ca21b4f + .quad 0x3fc89d0d00000000 + .quad 0x3e7a968650124684 + .quad 0x3fc8c56f00000000 + .quad 0x3e7724c9f4c54dc2 + .quad 0x3fc8edca00000000 + .quad 0x3e6b8d4ab3e3b13c + .quad 0x3fc9161b00000000 + .quad 0x3e74576bcfdafe5e + .quad 0x3fc93e6500000000 + .quad 0x3e7332208c376c3f + .quad 0x3fc966a600000000 + .quad 0x3df175e083c82deb + .quad 0x3fc98edc00000000 + .quad 0x3e79efce11aa7d30 + .quad 0x3fc9b70c00000000 + .quad 0x3e62ae7840b35985 + .quad 0x3fc9df3200000000 + .quad 0x3e4e8c13081d57dc + .quad 0x3fca074e00000000 + .quad 0x3e60b028bf61097b + .quad 0x3fca2f6200000000 + .quad 0x3e7fa41706304e8f + .quad 0x3fca576d00000000 + .quad 0x3e7f0e5f94377493 + .quad 0x3fca7f7100000000 + .quad 0x3e6edeeabeeeab1a + .quad 0x3fcaa76d00000000 + .quad 0x3e6fdf22f0ca6c0d + .quad 0x3fcacf5d00000000 + .quad 0x3e676d3aee892f9c + .quad 0x3fcaf74700000000 + .quad 0x3e7fbc37f3121ab7 + .quad 0x3fcb1f2800000000 + .quad 0x3e7717af8e5dd5b2 + .quad 0x3fcb46ff00000000 + .quad 0x3e70c006784d6d72 + .quad 0x3fcb6ece00000000 + .quad 0x3e75ebf2abe7a8f0 + .quad 0x3fcb969600000000 + .quad 0x3e570772e1aa6f94 + .quad 0x3fcbbe5300000000 + .quad 0x3e7507e05d60e5c4 + .quad 0x3fcbe60900000000 + .quad 0x3e6a479c1c7622d5 + .quad 0x3fcc0db700000000 + .quad 0x3e6a7653cad63a6a + .quad 0x3fcc355b00000000 + .quad 0x3e63c6576ac08e77 + .quad 0x3fcc5cf700000000 + .quad 0x3e696181ff9674a7 + .quad 0x3fcc848b00000000 + .quad 0x3e74c88b88cb08d4 + .quad 0x3fccac1500000000 + .quad 0x3e768ee1a3f58613 + .quad 0x3fccd39700000000 + .quad 0x3e7bc7d00e53901c + .quad 0x3fccfb1200000000 + .quad 0x3e4cb8c314503175 + .quad 0x3fcd228400000000 + .quad 0x3e6a40646984129b + .quad 0x3fcd49ee00000000 + .quad 0x3e77864b48c32b3c + .quad 0x3fcd714e00000000 + .quad 0x3e76dc470f22f1ee + .quad 0x3fcd98a900000000 + .quad 0x3e153043b87205ac + .quad 0x3fcdbff800000000 + .quad 0x3e7ce2096f5baed1 + .quad 0x3fcde74000000000 + .quad 0x3e76b6293b0e2ea0 + .quad 0x3fce0e8000000000 + .quad 0x3e69e5c03298a8d0 + .quad 0x3fce35b500000000 + .quad 0x3e7359a4add9086c + .quad 0x3fce5ce400000000 + .quad 0x3e7fbba6e4320b0b + .quad 0x3fce840c00000000 + .quad 0x3e57a7356760bf17 + .quad 
0x3fceab2b00000000 + .quad 0x3e5412dd4c71d4aa + .quad 0x3fced23f00000000 + .quad 0x3e708cbbd3de4f64 + .quad 0x3fcef94d00000000 + .quad 0x3e7ed1ec6fb9ef8f + .quad 0x3fcf205400000000 + .quad 0x3e4b20911d7e37db + .quad 0x3fcf474f00000000 + .quad 0x3e7192aee74aaf85 + .quad 0x3fcf6e4500000000 + .quad 0x3de9ff7395251cf5 + .quad 0x3fcf953200000000 + .quad 0x3e418fcf45710fc3 + .quad 0x3fcfbc1600000000 + .quad 0x3e77204d0144751b + .quad 0x3fcfe2f200000000 + .quad 0x3e7df662b4d59d8e + .quad 0x3fd004e300000000 + .quad 0x3e75d25f17b09d21 + .quad 0x3fd0184a00000000 + .quad 0x3e64044284485ca5 + .quad 0x3fd02bab00000000 + .quad 0x3e80a9a0c732cb2c + .quad 0x3fd03f0900000000 + .quad 0x3e89a98ad1490635 + .quad 0x3fd0526300000000 + .quad 0x3e897756562a827f + .quad 0x3fd065b900000000 + .quad 0x3e7f42d1cecd3768 + .quad 0x3fd0790a00000000 + .quad 0x3e8bb6060195a070 + .quad 0x3fd08c5900000000 + .quad 0x3e5c5a7b3a2bd335 + .quad 0x3fd09fa100000000 + .quad 0x3e8a2743f6a4cd20 + .quad 0x3fd0b2e700000000 + .quad 0x3e775f83f99025b0 + .quad 0x3fd0c62900000000 + .quad 0x3e87ca856421a674 + .quad 0x3fd0d96600000000 + .quad 0x3e814d2830ef12fd + .quad 0x3fd0eca000000000 + .quad 0x3e62348eca90f220 + .quad 0x3fd0ffd600000000 + .quad 0x3e812fcf75d18b23 + .quad 0x3fd1130700000000 + .quad 0x3e73b4c2bf9f9dd3 + .quad 0x3fd1263600000000 + .quad 0x3e499ef30070a508 + .quad 0x3fd1395f00000000 + .quad 0x3e61edb0d9e8da9b + .quad 0x3fd14c8400000000 + .quad 0x3e8f23ac3152c264 + .quad 0x3fd15fa600000000 + .quad 0x3e752ec233b712ad + .quad 0x3fd172c400000000 + .quad 0x3e7a163986a7b84c + .quad 0x3fd185dd00000000 + .quad 0x3e8f734fda450672 + .quad 0x3fd198f400000000 + .quad 0x3e7028962c15f52b + .quad 0x3fd1ac0500000000 + .quad 0x3e8fd23e213f6416 + .quad 0x3fd1bf1300000000 + .quad 0x3e68e4e3166c3339 + .quad 0x3fd1d21e00000000 + .quad 0x3e70ea55e7da3fec + .quad 0x3fd1e52300000000 + .quad 0x3e81b9e3403df05d + .quad 0x3fd1f82500000000 + .quad 0x3e7e762367a00f4a + .quad 0x3fd20b2400000000 + .quad 0x3e3388b4dd9d8704 + .quad 0x3fd21e1f00000000 + .quad 0x3e6603bbc7b763e4 + .quad 0x3fd2311400000000 + .quad 0x3e7f38b9f767e1c9 + .quad 0x3fd2440700000000 + .quad 0x3e8361c0e424306b + .quad 0x3fd256f600000000 + .quad 0x3e53e15a0763e5f5 + .quad 0x3fd269e100000000 + .quad 0x3e5c346e0f5542ab + .quad 0x3fd27cc800000000 + .quad 0x3e8623bac0f6e8e5 + .quad 0x3fd28fab00000000 + .quad 0x3e82d664ea511964 + .quad 0x3fd2a28b00000000 + .quad 0x3e244827751649e1 + .quad 0x3fd2b56500000000 + .quad 0x3e870662732a8325 + .quad 0x3fd2c83c00000000 + .quad 0x3e8db880f0396c05 + .quad 0x3fd2db1000000000 + .quad 0x3e8409b34923f5d0 + .quad 0x3fd2ede000000000 + .quad 0x3e899c121e8496e6 + .quad 0x3fd300ad00000000 + .quad 0x3e7c232f22d20f20 + .quad 0x3fd3137500000000 + .quad 0x3e73683d6c58ca0d + .quad 0x3fd3263900000000 + .quad 0x3e836d65141862cf + .quad 0x3fd338fa00000000 + .quad 0x3e75be12efc2f601 + .quad 0x3fd34bb600000000 + .quad 0x3e70751869f3b7a6 + .quad 0x3fd35e6f00000000 + .quad 0x3e89f95043bbfc91 + .quad 0x3fd3712400000000 + .quad 0x3e80d499b29f7615 + .quad 0x3fd383d500000000 + .quad 0x3e83dd8f4de52902 + .quad 0x3fd3968400000000 + .quad 0x3e748a73fa7e46e2 + .quad 0x3fd3a92e00000000 + .quad 0x3e6252112c0e2155 + .quad 0x3fd3bbd300000000 + .quad 0x3e52a1dc831e5ad7 + .quad 0x3fd3ce7500000000 + .quad 0x3e825d1013e78284 + .quad 0x3fd3e11400000000 + .quad 0x3e796f27f8ed6ab1 + .quad 0x3fd3f3af00000000 + .quad 0x3e81043c4e106f6a + .quad 0x3fd4064500000000 + .quad 0x3e8723607a748d45 + .quad 0x3fd418d900000000 + .quad 0x3e7c5a76f3c6b991 + .quad 0x3fd42b6900000000 + .quad 
0x3e7c13d54b6ede12 + .quad 0x3fd43df400000000 + .quad 0x3e7d02dc433313ae + .quad 0x3fd4507c00000000 + .quad 0x3e8edba9f6e1776c + .quad 0x3fd4630100000000 + .quad 0x3e86e864bf1d1aaa + .quad 0x3fd4758100000000 + .quad 0x3e7cae90765abc31 + .quad 0x3fd487fe00000000 + .quad 0x3e849fe23646e5a5 + .quad 0x3fd49a7800000000 + .quad 0x3e479a36743be41d + .quad 0x3fd4aced00000000 + .quad 0x3e8483e03299b840 + .quad 0x3fd4bf5f00000000 + .quad 0x3e7abba144c6b22b + .quad 0x3fd4d1cd00000000 + .quad 0x3e774d20fdd9f23b + .quad 0x3fd4e43800000000 + .quad 0x3e871d1f7aa47e01 + .quad 0x3fd4f69e00000000 + .quad 0x3e8f2860ba3b3db5 + .quad 0x3fd5090200000000 + .quad 0x3e83af1c17099bfe + .quad 0x3fd51b6200000000 + .quad 0x3e785ff9de74a1b4 + .quad 0x3fd52dbe00000000 + .quad 0x3e709325cfafa80f + .quad 0x3fd5401600000000 + .quad 0x3e6e6947ccf73d7a + .quad 0x3fd5526a00000000 + .quad 0x3e738124d5db9ad7 + .quad 0x3fd564bb00000000 + .quad 0x3e86b2911c62b3a2 + .quad 0x3fd5770900000000 + .quad 0x3e6719bc759ee891 + .quad 0x3fd5895200000000 + .quad 0x3e869a322d9370bc + .quad 0x3fd59b9800000000 + .quad 0x3e719789a94340e2 + .quad 0x3fd5addb00000000 + .quad 0x3e61c3d9786a1c1a + .quad 0x3fd5c01a00000000 + .quad 0x3e37ef590a213419 + .quad 0x3fd5d25400000000 + .quad 0x3e8d54eb1103130f + .quad 0x3fd5e48d00000000 + .quad 0x3e52f62a9cc12fd0 + .quad 0x3fd5f6c100000000 + .quad 0x3e6be9b244784641 + .quad 0x3fd608f100000000 + .quad 0x3e758a521184b277 + .quad 0x3fd61b1e00000000 + .quad 0x3e86042873323471 + .quad 0x3fd62d4700000000 + .quad 0x3e8fbc7d80b47bcf + .quad 0x3fd63f6d00000000 + .quad 0x3e6e2c82077ea756 + .quad 0x3fd6518f00000000 + .quad 0x3e85ccef6bf767f4 + .quad 0x3fd663ae00000000 + .quad 0x3e46ead81df81e8f + .quad 0x3fd675c900000000 + .quad 0x3e82dd03f10cd685 + .quad 0x3fd687e100000000 + .quad 0x3e3e902c6dbc1f0c + .quad 0x3fd699f500000000 + .quad 0x3e84319abac9c4b2 + .quad 0x3fd6ac0600000000 + .quad 0x3e5b055166c24b15 + .quad 0x3fd6be1200000000 + .quad 0x3e7c3be07b4f7882 + .quad 0x3fd6d01b00000000 + .quad 0x3e8cfd93dd847e5d + .quad 0x3fd6e22100000000 + .quad 0x3e6ace863358e8d7 + .quad 0x3fd6f42300000000 + .quad 0x3e83e40c6242bfe9 + .quad 0x3fd7062300000000 + .quad 0x3e610ab6a8479b5d + .quad 0x3fd7181e00000000 + .quad 0x3e7cd689bcfd9cf6 + .quad 0x3fd72a1600000000 + .quad 0x3e8b1978624662cc + .quad 0x3fd73c0b00000000 + .quad 0x3e3b1a8d9a80c213 + .quad 0x3fd74dfa00000000 + .quad 0x3e8f44cc629fadc5 + .quad 0x3fd75fe900000000 + .quad 0x3e70d17562376005 + .quad 0x3fd771d300000000 + .quad 0x3e731fbf269b0088 + .quad 0x3fd783b900000000 + .quad 0x3e52ab13f0273736 + .quad 0x3fd7959b00000000 + .quad 0x3e8ba45253b127d6 + .quad 0x3fd7a77b00000000 + .quad 0x3e852fa4783a4dfd + .quad 0x3fd7b95700000000 + .quad 0x3e6528d527430d54 + .quad 0x3fd7cb2f00000000 + .quad 0x3e84f6c8a8c54418 + .quad 0x3fd7dd0500000000 + .quad 0x3e5f404ba538c133 + .quad 0x3fd7eed700000000 + .quad 0x3e81d08a084632f9 + .quad 0x3fd800a500000000 + .quad 0x3e84e2c39b578d96 + .quad 0x3fd8127000000000 + .quad 0x3e8641178f2c2b02 + .quad 0x3fd8243700000000 + .quad 0x3e781b9c28ee919e + .quad 0x3fd835fa00000000 + .quad 0x3e8f7b17b6d5775c + .quad 0x3fd847bc00000000 + .quad 0x3e89db0c612f1b2e + .quad 0x3fd8597800000000 + .quad 0x3e8dffaae2cbad0f + .quad 0x3fd86b3300000000 + .quad 0x3e70f5b6d0513247 + .quad 0x3fd87ce900000000 + .quad 0x3e6699b2d0c42cca + .quad 0x3fd88e9b00000000 + .quad 0x3e8edc16362782b3 + .quad 0x3fd8a04b00000000 + .quad 0x3e83cd771d49fb4b + .quad 0x3fd8b1f800000000 + .quad 0x3e60b05b11747e4c + .quad 0x3fd8c3a100000000 + .quad 0x3e7f52c9816db2c1 + .quad 
0x3fd8d54600000000 + .quad 0x3e782d70d541d6c1 + .quad 0x3fd8e6e800000000 + .quad 0x3e57aa91cc153dde + .quad 0x3fd8f88600000000 + .quad 0x3e83f65a8e01affc + .quad 0x3fd90a2100000000 + .quad 0x3e8ecae2475966df + .quad 0x3fd91bba00000000 + .quad 0x3e591f169848d269 + .quad 0x3fd92d4f00000000 + .quad 0x3e3647c7943a8d23 + .quad 0x3fd93ee000000000 + .quad 0x3e8726bf3db3e718 + .quad 0x3fd9506d00000000 + .quad 0x3e8c1a18fafa10d5 + .quad 0x3fd961f900000000 + .quad 0x3e5b2740c198f220 + .quad 0x3fd9737f00000000 + .quad 0x3e887fb1536242b8 + .quad 0x3fd9850400000000 + .quad 0x3e7ec5c619b71f3e + .quad 0x3fd9968400000000 + .quad 0x3e8366d3eb0e5d24 + .quad 0x3fd9a80200000000 + .quad 0x3e88a3c48f5901ad + .quad 0x3fd9b97c00000000 + .quad 0x3e74a3bb2d70054b + .quad 0x3fd9caf200000000 + .quad 0x3e825931e77b3ed9 + .quad 0x3fd9dc6600000000 + .quad 0x3e8ac1bd72bb6920 + .quad 0x3fd9edd600000000 + .quad 0x3e7d26c9777b80e6 + .quad 0x3fd9ff4200000000 + .quad 0x3e87cdf6b003fe44 + .quad 0x3fda10ad00000000 + .quad 0x3e32256c5f5257da + .quad 0x3fda221200000000 + .quad 0x3e83b4a3ff1466d0 + .quad 0x3fda337600000000 + .quad 0x3e673fb048cd2b2f + .quad 0x3fda44d600000000 + .quad 0x3e7844f0a7da3c13 + .quad 0x3fda563100000000 + .quad 0x3e8bcba6da5b37e1 + .quad 0x3fda678b00000000 + .quad 0x3e7325816e447b2d + .quad 0x3fda78e100000000 + .quad 0x3e753defc2fb5aa0 + .quad 0x3fda8a3300000000 + .quad 0x3e8e9f688620242e + .quad 0x3fda9b8300000000 + .quad 0x3e650c63633bbec2 + .quad 0x3fdaacce00000000 + .quad 0x3e8e38f926facedd + .quad 0x3fdabe1800000000 + .quad 0x3e83efe3f1bc83ea + .quad 0x3fdacf5d00000000 + .quad 0x3e809e9d83cd28e8 + .quad 0x3fdae0a000000000 + .quad 0x3e72f7a9feea5b2a + .quad 0x3fdaf1df00000000 + .quad 0x3e83762377a3c900 + .quad 0x3fdb031b00000000 + .quad 0x3e7c7818efde9c0a + .quad 0x3fdb145500000000 + .quad 0x3e618ff8ce39a19e + .quad 0x3fdb258900000000 + .quad 0x3e8fd450b400cdc5 + .quad 0x3fdb36bc00000000 + .quad 0x3e861347926aa708 + .quad 0x3fdb47eb00000000 + .quad 0x3e8be7104fa3a380 + .quad 0x3fdb591700000000 + .quad 0x3e80fdc35b90ee8d + .quad 0x3fdb6a4100000000 + .quad 0x3e056415269e9adc + .quad 0x3fdb7b6600000000 + .quad 0x3e8ddbe05932e271 + .quad 0x3fdb8c8900000000 + .quad 0x3e73fe21df4fea38 + .quad 0x3fdb9da800000000 + .quad 0x3e60b2e6d80d2ce6 + .quad 0x3fdbaec400000000 + .quad 0x3e874289e4e1d49c + .quad 0x3fdbbfdd00000000 + .quad 0x3e87ce1b050aa700 + .quad 0x3fdbd0f300000000 + .quad 0x3e65f3c859448338 + .quad 0x3fdbe20400000000 + .quad 0x3e8ffc7f79678a39 + .quad 0x3fdbf31400000000 + .quad 0x3e824a1ec9be7496 + .quad 0x3fdc042100000000 + .quad 0x3e8c2b16ec00f182 + .quad 0x3fdc152a00000000 + .quad 0x3e6a92654ec891d7 + .quad 0x3fdc263000000000 + .quad 0x3e7037888b90c7f8 + .quad 0x3fdc373200000000 + .quad 0x3e84e5a090419bc8 + .quad 0x3fdc483200000000 + .quad 0x3e882722e066f64d + .quad 0x3fdc592f00000000 + .quad 0x3e6894ad710aef0c + .quad 0x3fdc6a2900000000 + .quad 0x3e74290c06a50919 + .quad 0x3fdc7b1f00000000 + .quad 0x3e8829ea41109e48 + .quad 0x3fdc8c1200000000 + .quad 0x3e8011fb6ad70668 + .quad 0x3fdc9d0200000000 + .quad 0x3e8d1948f3cb0098 + .quad 0x3fdcadef00000000 + .quad 0x3e835c4dc117de0d + .quad 0x3fdcbed900000000 + .quad 0x3e8e37710c7563b4 + .quad 0x3fdccfc000000000 + .quad 0x3e81b705b8191331 + .quad 0x3fdce0a400000000 + .quad 0x3e89474b1cfe31f4 + .quad 0x3fdcf18500000000 + .quad 0x3e71c8d86ee32d3b + .quad 0x3fdd026300000000 + .quad 0x3e7815019917c831 + .quad 0x3fdd133d00000000 + .quad 0x3e86a58c1d40a370 + .quad 0x3fdd241400000000 + .quad 0x3e70c2fc81bc79c2 + .quad 0x3fdd34e900000000 + .quad 
0x3e88ba3405adb567 + .quad 0x3fdd45ba00000000 + .quad 0x3e5ddba9ecf26bb9 + .quad 0x3fdd568800000000 + .quad 0x3e3d1ef9e850540f + .quad 0x3fdd675300000000 + .quad 0x3e80065d34ca0dce + .quad 0x3fdd781c00000000 + .quad 0x3e80d733e02d0dd1 + .quad 0x3fdd88e100000000 + .quad 0x3e870ef65b098f9c + .quad 0x3fdd99a300000000 + .quad 0x3e52c86102e26030 + .quad 0x3fddaa6100000000 + .quad 0x3e8e80c9ef4c81d3 + .quad 0x3fddbb1e00000000 + .quad 0x3e7692e19cb2b425 + .quad 0x3fddcbd600000000 + .quad 0x3e8c462e64521547 + .quad 0x3fdddc8c00000000 + .quad 0x3e8d5a1dd411035e + .quad 0x3fdded4000000000 + .quad 0x3e7c908df47a8f92 + .quad 0x3fddfdf000000000 + .quad 0x3e545cf17f40aa9d + .quad 0x3fde0e9d00000000 + .quad 0x3e687c172ac42c55 + .quad 0x3fde1f4600000000 + .quad 0x3e78da98936314cf + .quad 0x3fde2fed00000000 + .quad 0x3e4812e4ac4e8487 + .quad 0x3fde409100000000 + .quad 0x3e64755453322906 + .quad 0x3fde513100000000 + .quad 0x3e7528ae2e3ef4fa + .quad 0x3fde61d000000000 + .quad 0x3e7501716cf4be90 + .quad 0x3fde726900000000 + .quad 0x3e8f3cea8b8b9869 + .quad 0x3fde830200000000 + .quad 0x3e7be69828149b31 + .quad 0x3fde939600000000 + .quad 0x3e8d5e2937a72435 + .quad 0x3fdea42800000000 + .quad 0x3e89bfbbe2698141 + .quad 0x3fdeb4b800000000 + .quad 0x3e56d15b8c6d35e8 + .quad 0x3fdec54400000000 + .quad 0x3e886f8d094b9a13 + .quad 0x3fded5cd00000000 + .quad 0x3e7b23c5dca4eff0 + .quad 0x3fdee65300000000 + .quad 0x3e7d463bf0218027 + .quad 0x3fdef6d600000000 + .quad 0x3e8b651c6050e055 + .quad 0x3fdf075600000000 + .quad 0x3e6b46a793b8e626 + .quad 0x3fdf17d400000000 + .quad 0x3e74650236b11f5f + .quad 0x3fdf284e00000000 + .quad 0x3e77629298efa0ad + .quad 0x3fdf38c500000000 + .quad 0x3e87d798bebcb6ab + .quad 0x3fdf493a00000000 + .quad 0x3e7ccde6d2f4c9f7 + .quad 0x3fdf59ab00000000 + .quad 0x3e5186572a5ff9c8 + .quad 0x3fdf6a1a00000000 + .quad 0x3e799d006591c907 + .quad 0x3fdf7a8500000000 + .quad 0x3e841960e73ec979 + .quad 0x3fdf8aee00000000 + .quad 0x3e630aa8521479fd + .quad 0x3fdf9b5300000000 + .quad 0x3e8e8b869c429d94 + .quad 0x3fdfabb700000000 + .quad 0x3e4350fc25c8a13b + .quad 0x3fdfbc1700000000 + .quad 0x3e79009a6ef5d48a + .quad 0x3fdfcc7300000000 + .quad 0x3e8306349a8abfef + .quad 0x3fdfdcce00000000 + .quad 0x3e7d9f569f06bc1e + .quad 0x3fdfed2500000000 + .quad 0x3e65160ec1d12919 + .quad 0x3fdffd7900000000 + .quad 0x3e5a83ff2555a494 + .quad 0x3fe006e500000000 + .quad 0x3e9afca83644de26 + .quad 0x3fe00f0d00000000 + .quad 0x3e53c49d9079d468 + .quad 0x3fe0173200000000 + .quad 0x3e9ae76be763882e + .quad 0x3fe01f5700000000 + .quad 0x3e7f793285e25c81 + .quad 0x3fe0277a00000000 + .quad 0x3e800243639826ee + .quad 0x3fe02f9b00000000 + .quad 0x3e9b301832f2c8a9 + .quad 0x3fe037bc00000000 + .quad 0x3e54b54b5457ab7c + .quad 0x3fe03fda00000000 + .quad 0x3e9a32f3449fa7a6 + .quad 0x3fe047f700000000 + .quad 0x3e8e060e91d41da5 + .quad 0x3fe0501300000000 + .quad 0x3e8a3f382aa1e82b + .quad 0x3fe0582d00000000 + .quad 0x3e9da8b4318c1dd2 + .quad 0x3fe0604700000000 + .quad 0x3e3f9274a07c17a6 + .quad 0x3fe0685e00000000 + .quad 0x3e95804ec5f0fe6d + .quad 0x3fe0707400000000 + .quad 0x3e9c8eac786d0112 + .quad 0x3fe0788900000000 + .quad 0x3e958943fb66416b + .quad 0x3fe0809d00000000 + .quad 0x3e33fb82cede51e0 + .quad 0x3fe088ae00000000 + .quad 0x3e9cc27b15563034 + .quad 0x3fe090bf00000000 + .quad 0x3e8581667ca3348d + .quad 0x3fe098ce00000000 + .quad 0x3e8454acd057fbfa + .quad 0x3fe0a0dc00000000 + .quad 0x3e91cf1c5c53f37d + .quad 0x3fe0a8e800000000 + .quad 0x3e93b2b423f481d0 + .quad 0x3fe0b0f300000000 + .quad 0x3e7a8314e3b62474 + .quad 
0x3fe0b8fd00000000 + .quad 0x3e574eeba208d495 + .quad 0x3fe0c10400000000 + .quad 0x3e961ac74d5ada6a + .quad 0x3fe0c90b00000000 + .quad 0x3e926ddde7aa78b1 + .quad 0x3fe0d11000000000 + .quad 0x3e9f51b91d907509 + .quad 0x3fe0d91400000000 + .quad 0x3e9ca5d77a3bf837 + .quad 0x3fe0e11700000000 + .quad 0x3e84935ef97f078e + .quad 0x3fe0e91800000000 + .quad 0x3e80395f3d5449d6 + .quad 0x3fe0f11800000000 + .quad 0x3e8a2c7cb38d9ed1 + .quad 0x3fe0f91600000000 + .quad 0x3e9677ba0152cbb4 + .quad 0x3fe1011300000000 + .quad 0x3e9b3a7927aec2fd + .quad 0x3fe1090f00000000 + .quad 0x3e707f2889e8b7a9 + .quad 0x3fe1110900000000 + .quad 0x3e93bcf3ba17fb1f + .quad 0x3fe1190200000000 + .quad 0x3e7cecd182c0b1e4 + .quad 0x3fe120f900000000 + .quad 0x3e95a3c2fb2785b2 + .quad 0x3fe128ef00000000 + .quad 0x3e9edbce6a636a11 + .quad 0x3fe130e400000000 + .quad 0x3e972c7da9b832d3 + .quad 0x3fe138d700000000 + .quad 0x3e9e74efeb672a03 + .quad 0x3fe140ca00000000 + .quad 0x3e2a1e54f6b89e31 + .quad 0x3fe148ba00000000 + .quad 0x3e90ad737019fd24 + .quad 0x3fe150a900000000 + .quad 0x3e9b639c287d2824 + .quad 0x3fe1589700000000 + .quad 0x3e9495b6dd3ddabd + .quad 0x3fe1608400000000 + .quad 0x3e7f2aeffe31b5d0 + .quad 0x3fe1686f00000000 + .quad 0x3e827b385c52cc9f + .quad 0x3fe1705900000000 + .quad 0x3e71e501d3944026 + .quad 0x3fe1784100000000 + .quad 0x3e99628a2c0e2602 + .quad 0x3fe1802800000000 + .quad 0x3e9c2e52f159a4bf + .quad 0x3fe1880e00000000 + .quad 0x3e8976d9b0f3dfdd + .quad 0x3fe18ff300000000 + .quad 0x3e628513cd04695c + .quad 0x3fe197d600000000 + .quad 0x3e75b2da605bddf8 + .quad 0x3fe19fb700000000 + .quad 0x3e95ee648263ee18 + .quad 0x3fe1a79700000000 + .quad 0x3e9f6e601ac91256 + .quad 0x3fe1af7700000000 + .quad 0x3e5d155a178b90cd + .quad 0x3fe1b75400000000 + .quad 0x3e9cfbe9de667b41 + .quad 0x3fe1bf3100000000 + .quad 0x3e744ae80f899fbd + .quad 0x3fe1c70c00000000 + .quad 0x3e76d96ff1c879c9 + .quad 0x3fe1cee500000000 + .quad 0x3e9ecb5e2c072eb0 + .quad 0x3fe1d6be00000000 + .quad 0x3e71c11dbe1db818 + .quad 0x3fe1de9500000000 + .quad 0x3e625cbb9559d10f + .quad 0x3fe1e66a00000000 + .quad 0x3e9841c66176bdde + .quad 0x3fe1ee3f00000000 + .quad 0x3e78dd143c97c211 + .quad 0x3fe1f61200000000 + .quad 0x3e309f38f10515b8 + .quad 0x3fe1fde300000000 + .quad 0x3e9de1d02b7acb55 + .quad 0x3fe205b400000000 + .quad 0x3e7d6e666f069f9f + .quad 0x3fe20d8300000000 + .quad 0x3e80c459b58a9a68 + .quad 0x3fe2155100000000 + .quad 0x3e4b3ac6c4422b43 + .quad 0x3fe21d1d00000000 + .quad 0x3e90a6dabdf57c13 + .quad 0x3fe224e800000000 + .quad 0x3e87a6f05e2e66b4 + .quad 0x3fe22cb200000000 + .quad 0x3e83ebcaaaa786ff + .quad 0x3fe2347a00000000 + .quad 0x3e933c5177ae38be + .quad 0x3fe23c4100000000 + .quad 0x3e9f44e5029b8b1d + .quad 0x3fe2440700000000 + .quad 0x3e9635c0e894df30 + .quad 0x3fe24bcc00000000 + .quad 0x3e6e87f9f1f3590c + .quad 0x3fe2538f00000000 + .quad 0x3e7feacb86a3b429 + .quad 0x3fe25b5100000000 + .quad 0x3e8cfdcf4e10a41a + .quad 0x3fe2631100000000 + .quad 0x3e9f73a21fdde641 + .quad 0x3fe26ad100000000 + .quad 0x3e7a8b8011d56d3b + .quad 0x3fe2728f00000000 + .quad 0x3e6f84bf7d5b34d0 + .quad 0x3fe27a4c00000000 + .quad 0x3e6985cc1c8f11b0 + .quad 0x3fe2820700000000 + .quad 0x3e88d25a6a02c803 + .quad 0x3fe289c100000000 + .quad 0x3e975fd4c3433e76 + .quad 0x3fe2917a00000000 + .quad 0x3e8825154781d2c2 + .quad 0x3fe2993200000000 + .quad 0x3e62791595e60d25 + .quad 0x3fe2a0e800000000 + .quad 0x3e605b4c41d5635b + .quad 0x3fe2a89d00000000 + .quad 0x3e68e92900528496 + .quad 0x3fe2b05000000000 + .quad 0x3e9970145df6a281 + .quad 0xbfda8ff900000000 + .quad 
0xbe86302155df0de3 + .quad 0xbfda809600000000 + .quad 0xbe8d2b316176fad0 + .quad 0xbfda713700000000 + .quad 0xbe824db2f6aceb96 + .quad 0xbfda61da00000000 + .quad 0xbe67117a804da234 + .quad 0xbfda527f00000000 + .quad 0xbe7f97f60ff5807b + .quad 0xbfda432700000000 + .quad 0xbe809d5c44adaa28 + .quad 0xbfda33d200000000 + .quad 0xbe70e2c7de9ac83b + .quad 0xbfda247f00000000 + .quad 0xbe8781011952fb40 + .quad 0xbfda152f00000000 + .quad 0xbe6794c0edaf9f16 + .quad 0xbfda05e100000000 + .quad 0xbe77ddf6e9895b08 + .quad 0xbfd9f69600000000 + .quad 0xbe73aef455ae3da8 + .quad 0xbfd9e74d00000000 + .quad 0xbe6eaf442c7ba9be + .quad 0xbfd9d80600000000 + .quad 0xbe8dc93243f14070 + .quad 0xbfd9c8c300000000 + .quad 0xbe78d1ba7956f02d + .quad 0xbfd9b98100000000 + .quad 0xbe8b8c1e78260310 + .quad 0xbfd9aa4300000000 + .quad 0xbe5ce27fc9d31391 + .quad 0xbfd99b0700000000 + .quad 0xbe634b6355f4087a + .quad 0xbfd98bcd00000000 + .quad 0xbe6c94b4572fef43 + .quad 0xbfd97c9600000000 + .quad 0xbe5846721de94267 + .quad 0xbfd96d6100000000 + .quad 0xbe88b74acdde1f6a + .quad 0xbfd95e2f00000000 + .quad 0xbe801a3e03f6b280 + .quad 0xbfd94f0000000000 + .quad 0xbe4b35095482043f + .quad 0xbfd93fd200000000 + .quad 0xbe856437d9bb4a5c + .quad 0xbfd930a800000000 + .quad 0xbe5db5b388b06a65 + .quad 0xbfd9218000000000 + .quad 0xbe79c93768c0e5d4 + .quad 0xbfd9125a00000000 + .quad 0xbe27f0e9d0aaf77a + .quad 0xbfd9033700000000 + .quad 0xbe6e085f7c5942f1 + .quad 0xbfd8f41600000000 + .quad 0xbe81b98df5f47569 + .quad 0xbfd8e4f700000000 + .quad 0xbe8f3428ac4ddeec + .quad 0xbfd8d5dc00000000 + .quad 0xbe7127ef6092650e + .quad 0xbfd8c6c300000000 + .quad 0xbe7c262e6c66cdb8 + .quad 0xbfd8b7ac00000000 + .quad 0xbe876faffff4af15 + .quad 0xbfd8a89800000000 + .quad 0xbe635fdead9ef9a2 + .quad 0xbfd8998600000000 + .quad 0xbe7dfc6109e45ceb + .quad 0xbfd88a7600000000 + .quad 0xbe8d94a9416e4721 + .quad 0xbfd87b6900000000 + .quad 0xbe80c9bd35322fa9 + .quad 0xbfd86c5f00000000 + .quad 0xbe45bd4714c8ffcf + .quad 0xbfd85d5700000000 + .quad 0xbe7f0ac6abba5180 + .quad 0xbfd84e5100000000 + .quad 0xbe74a1d4fc76c4e2 + .quad 0xbfd83f4e00000000 + .quad 0xbe58c7bbd43ea059 + .quad 0xbfd8304d00000000 + .quad 0xbe8a18240481523a + .quad 0xbfd8214e00000000 + .quad 0xbe8e4115e0e87309 + .quad 0xbfd8125300000000 + .quad 0xbe4067fcc9c54454 + .quad 0xbfd8035a00000000 + .quad 0xbe5519044060b3ca + .quad 0xbfd7f46200000000 + .quad 0xbe81f1c2bab3efa5 + .quad 0xbfd7e56e00000000 + .quad 0xbe2f4f8116a92f1f + .quad 0xbfd7d67c00000000 + .quad 0xbe7d00ebaf755412 + .quad 0xbfd7c78c00000000 + .quad 0xbe757cb332aa9b04 + .quad 0xbfd7b89f00000000 + .quad 0xbe6b67957924a221 + .quad 0xbfd7a9b400000000 + .quad 0xbe749441f289397f + .quad 0xbfd79acb00000000 + .quad 0xbe853e207739b243 + .quad 0xbfd78be500000000 + .quad 0xbe6f940fb688810d + .quad 0xbfd77d0100000000 + .quad 0xbe8b3df7ad1f744b + .quad 0xbfd76e2000000000 + .quad 0xbe86b033ad082bc9 + .quad 0xbfd75f4100000000 + .quad 0xbe8a6afc121884da + .quad 0xbfd7506500000000 + .quad 0xbe6a7683b47c1884 + .quad 0xbfd7418a00000000 + .quad 0xbe8b777e34575fd6 + .quad 0xbfd732b200000000 + .quad 0xbe8927fbbcb9ee5d + .quad 0xbfd723dd00000000 + .quad 0xbe88c68d7090566b + .quad 0xbfd7150b00000000 + .quad 0xbe4a2b2a2a0eb191 + .quad 0xbfd7063900000000 + .quad 0xbe8afbf68de6383b + .quad 0xbfd6f76b00000000 + .quad 0xbe86ddf093045ea8 + .quad 0xbfd6e89f00000000 + .quad 0xbe8c8c435cc0756e + .quad 0xbfd6d9d600000000 + .quad 0xbe786d3ae8f9661f + .quad 0xbfd6cb0f00000000 + .quad 0xbe6832e43f6d9d88 + .quad 0xbfd6bc4a00000000 + .quad 0xbe747cb81361877f + .quad 
0xbfd6ad8800000000 + .quad 0xbe82035808f1c0f3 + .quad 0xbfd69ec800000000 + .quad 0xbe76ff1399db6922 + .quad 0xbfd6900a00000000 + .quad 0xbe7fcdb431863dd3 + .quad 0xbfd6814e00000000 + .quad 0xbe8f693d13fbb8d9 + .quad 0xbfd6729600000000 + .quad 0xbe834eb29036fad3 + .quad 0xbfd663df00000000 + .quad 0xbe899b456a12ce2e + .quad 0xbfd6552b00000000 + .quad 0xbe772618a503c189 + .quad 0xbfd6467900000000 + .quad 0xbe72cc529275c5a3 + .quad 0xbfd637c900000000 + .quad 0xbe8344c9b19a2513 + .quad 0xbfd6291c00000000 + .quad 0xbe72be4c963d47b8 + .quad 0xbfd61a7100000000 + .quad 0xbe77cb0653b68de6 + .quad 0xbfd60bc800000000 + .quad 0xbe8b082faedc50d1 + .quad 0xbfd5fd2200000000 + .quad 0xbe86f7868080f7bc + .quad 0xbfd5ee7e00000000 + .quad 0xbe6a9fb569e79a60 + .quad 0xbfd5dfdc00000000 + .quad 0xbe8cbdd5bf453a04 + .quad 0xbfd5d13d00000000 + .quad 0xbe6bb6ee545183dc + .quad 0xbfd5c2a000000000 + .quad 0xbe87ec26c29aa221 + .quad 0xbfd5b40500000000 + .quad 0xbe8d5da983e3cbed + .quad 0xbfd5a56d00000000 + .quad 0xbe80b6e1bfe5ec04 + .quad 0xbfd596d700000000 + .quad 0xbe8228784608b2df + .quad 0xbfd5884300000000 + .quad 0xbe7116419622027e + .quad 0xbfd579b200000000 + .quad 0xbe6aee6a38f29592 + .quad 0xbfd56b2200000000 + .quad 0xbe8a36af180d0f15 + .quad 0xbfd55c9500000000 + .quad 0xbe8c853372ca57cc + .quad 0xbfd54e0b00000000 + .quad 0xbe7bb00ee04486c4 + .quad 0xbfd53f8300000000 + .quad 0xbe7cc02b891628da + .quad 0xbfd530fd00000000 + .quad 0xbe63794fe93c7f63 + .quad 0xbfd5227900000000 + .quad 0xbe75d7854e0de2c5 + .quad 0xbfd513f800000000 + .quad 0xbe372da45519dce0 + .quad 0xbfd5057800000000 + .quad 0xbe79f8d2da727bf4 + .quad 0xbfd4f6fc00000000 + .quad 0xbe56cec60358c3fd + .quad 0xbfd4e88000000000 + .quad 0xbe8602e65c350140 + .quad 0xbfd4da0800000000 + .quad 0xbe8328c92737a9b0 + .quad 0xbfd4cb9200000000 + .quad 0xbe6dc3078767b5b5 + .quad 0xbfd4bd1e00000000 + .quad 0xbe79203927cd12cc + .quad 0xbfd4aead00000000 + .quad 0xbe55c17da1b07b42 + .quad 0xbfd4a03d00000000 + .quad 0xbe80825c25cbdda8 + .quad 0xbfd491d000000000 + .quad 0xbe7f601ba1cb823b + .quad 0xbfd4836600000000 + .quad 0xbe2caebe06773e1b + .quad 0xbfd474fd00000000 + .quad 0xbe72afc887224809 + .quad 0xbfd4669700000000 + .quad 0xbe60b454dababfee + .quad 0xbfd4583200000000 + .quad 0xbe8777e382ef584f + .quad 0xbfd449d000000000 + .quad 0xbe8d0defa65e43f7 + .quad 0xbfd43b7100000000 + .quad 0xbe8520e465f01125 + .quad 0xbfd42d1400000000 + .quad 0xbe68a9db3066f3ad + .quad 0xbfd41eb900000000 + .quad 0xbe7418cd285c77e6 + .quad 0xbfd4106000000000 + .quad 0xbe6ce1f66985cea7 + .quad 0xbfd4020900000000 + .quad 0xbe8798904973ef89 + .quad 0xbfd3f3b500000000 + .quad 0xbe4967d2ab8251d8 + .quad 0xbfd3e56200000000 + .quad 0xbe8f242d496e3d08 + .quad 0xbfd3d71200000000 + .quad 0xbe86a393bba964c4 + .quad 0xbfd3c8c500000000 + .quad 0xbe507570cacef7bf + .quad 0xbfd3ba7900000000 + .quad 0xbe6efe0fa4f69a96 + .quad 0xbfd3ac3000000000 + .quad 0xbe4b827373e0a286 + .quad 0xbfd39de800000000 + .quad 0xbe864ab3e2fb43d9 + .quad 0xbfd38fa300000000 + .quad 0xbe8f81504eb31318 + .quad 0xbfd3816100000000 + .quad 0xbe5d3164fb917590 + .quad 0xbfd3732000000000 + .quad 0xbe8ccb836b329f7f + .quad 0xbfd364e200000000 + .quad 0xbe8133990d5010c8 + .quad 0xbfd356a600000000 + .quad 0xbe404bc113420455 + .quad 0xbfd3486c00000000 + .quad 0xbe697514cf0a57dc + .quad 0xbfd33a3400000000 + .quad 0xbe6dce5b769a0eb8 + .quad 0xbfd32bfe00000000 + .quad 0xbe8e6e1dd018cc95 + .quad 0xbfd31dcb00000000 + .quad 0xbe817b505f20e7f3 + .quad 0xbfd30f9a00000000 + .quad 0xbe3835df86199ab1 + .quad 0xbfd3016b00000000 + .quad 
0xbe69cf10d769bddb + .quad 0xbfd2f33e00000000 + .quad 0xbe7168482a60bb7c + .quad 0xbfd2e51400000000 + .quad 0xbe4bd6cdf5bcf5c4 + .quad 0xbfd2d6ea00000000 + .quad 0xbe8d924633fff084 + .quad 0xbfd2c8c500000000 + .quad 0xbe7542c49a05ee8f + .quad 0xbfd2baa000000000 + .quad 0xbe8ac97c411279db + .quad 0xbfd2ac7f00000000 + .quad 0xbe536acce9910bf7 + .quad 0xbfd29e5f00000000 + .quad 0xbe6e5f25492f16f4 + .quad 0xbfd2904100000000 + .quad 0xbe74df4847fe96f4 + .quad 0xbfd2822500000000 + .quad 0xbe763798f43090eb + .quad 0xbfd2740c00000000 + .quad 0xbe5fb975ad3295a5 + .quad 0xbfd265f400000000 + .quad 0xbe8afcc065467993 + .quad 0xbfd257e000000000 + .quad 0xbe751f024a4452fe + .quad 0xbfd249cc00000000 + .quad 0xbe8e6279a0249a31 + .quad 0xbfd23bbc00000000 + .quad 0xbe7631798bcda203 + .quad 0xbfd22dad00000000 + .quad 0xbe869d668ff512cd + .quad 0xbfd21fa100000000 + .quad 0xbe4179cae9beee0d + .quad 0xbfd2119700000000 + .quad 0xbe63fa3a108ec52d + .quad 0xbfd2038e00000000 + .quad 0xbe7bbae8d6fb8a1c + .quad 0xbfd1f58800000000 + .quad 0xbe807f90e4c2ec69 + .quad 0xbfd1e78400000000 + .quad 0xbe82bc2f5babe119 + .quad 0xbfd1d98200000000 + .quad 0xbe84baa4d8e71f1c + .quad 0xbfd1cb8200000000 + .quad 0xbe86a24fc7020b2b + .quad 0xbfd1bd8500000000 + .quad 0xbe8302982dfe3735 + .quad 0xbfd1af8900000000 + .quad 0xbe8536eece3209fa + .quad 0xbfd1a19000000000 + .quad 0xbe823ace8fc13621 + .quad 0xbfd1939900000000 + .quad 0xbe7f9b761181cc82 + .quad 0xbfd185a400000000 + .quad 0xbe7c2e82add30fbf + .quad 0xbfd177b100000000 + .quad 0xbe7a7defb44845fc + .quad 0xbfd169c000000000 + .quad 0xbe7ad8fc5efe4b5f + .quad 0xbfd15bd100000000 + .quad 0xbe7d8efa5836733a + .quad 0xbfd14de400000000 + .quad 0xbe8177a6d8101fb8 + .quad 0xbfd13ffa00000000 + .quad 0xbe8030b69ab39bd3 + .quad 0xbfd1321100000000 + .quad 0xbe86067085d42483 + .quad 0xbfd1242a00000000 + .quad 0xbe8da8a239a3d693 + .quad 0xbfd1164700000000 + .quad 0xbe4d72294066a603 + .quad 0xbfd1086400000000 + .quad 0xbe7b1ba1dc449b96 + .quad 0xbfd0fa8400000000 + .quad 0xbe862896725de3dd + .quad 0xbfd0eca600000000 + .quad 0xbe6a4d928a11e457 + .quad 0xbfd0deca00000000 + .quad 0xbe843a36b9d55575 + .quad 0xbfd0d0f000000000 + .quad 0xbe73f2208d19fe75 + .quad 0xbfd0c31800000000 + .quad 0xbe8d4bfe81a344c0 + .quad 0xbfd0b54200000000 + .quad 0xbe88ff16f1f6621d + .quad 0xbfd0a76f00000000 + .quad 0xbe829e78b22b06aa + .quad 0xbfd0999d00000000 + .quad 0xbe84e64b365fec9a + .quad 0xbfd08bcd00000000 + .quad 0xbe8ab2bf39987eff + .quad 0xbfd07e0000000000 + .quad 0xbe8ef00e6f310240 + .quad 0xbfd0703500000000 + .quad 0xbe7884f5dd34e44b + .quad 0xbfd0626b00000000 + .quad 0xbe8d92500f14b471 + .quad 0xbfd054a400000000 + .quad 0xbe8307e1dd3ad028 + .quad 0xbfd046df00000000 + .quad 0xbe79971a63342c6a + .quad 0xbfd0391c00000000 + .quad 0xbe760b6f55e8db61 + .quad 0xbfd02b5a00000000 + .quad 0xbe8302cf89e64237 + .quad 0xbfd01d9b00000000 + .quad 0xbe8a9f4c3efc935a + .quad 0xbfd00fde00000000 + .quad 0xbe788f5a8dc51cdf + .quad 0xbfd0022300000000 + .quad 0xbe8de87b8de45c1c + .quad 0xbfcfe8d500000000 + .quad 0xbe73bc8feab63684 + .quad 0xbfcfcd6700000000 + .quad 0xbe766b590d531889 + .quad 0xbfcfb1fe00000000 + .quad 0xbe50ba5e451bff1a + .quad 0xbfcf969700000000 + .quad 0xbe5d9e85a4fc1ce1 + .quad 0xbfcf7b3600000000 + .quad 0xbe687fbdab298db0 + .quad 0xbfcf5fd800000000 + .quad 0xbe5c831eaf201561 + .quad 0xbfcf447e00000000 + .quad 0xbe6c97cc28a0c985 + .quad 0xbfcf292900000000 + .quad 0xbe4096a784f160c8 + .quad 0xbfcf0dd800000000 + .quad 0xbe463a00e430058b + .quad 0xbfcef28900000000 + .quad 0xbe7a9ae40adf8036 + .quad 
0xbfced74100000000 + .quad 0xbe76178f7389c2b3 + .quad 0xbfcebbfc00000000 + .quad 0xbe628e408a6030db + .quad 0xbfcea0bb00000000 + .quad 0xbe65370cfca139e2 + .quad 0xbfce857d00000000 + .quad 0xbe509b099c44098a + .quad 0xbfce6a4300000000 + .quad 0xbe68d5caf2faef74 + .quad 0xbfce4f0e00000000 + .quad 0xbe4dd08f036b132f + .quad 0xbfce33dd00000000 + .quad 0xbe64ccf4cb32e460 + .quad 0xbfce18af00000000 + .quad 0xbe64c4c42c4e4661 + .quad 0xbfcdfd8700000000 + .quad 0xbe70b81de05729de + .quad 0xbfcde26000000000 + .quad 0xbe7a821176a0fe0e + .quad 0xbfcdc74000000000 + .quad 0xbe669566643c24c3 + .quad 0xbfcdac2200000000 + .quad 0xbe767c88339625fc + .quad 0xbfcd910900000000 + .quad 0xbe72da2735aa6c86 + .quad 0xbfcd75f300000000 + .quad 0xbe644c6d4a5f5ad6 + .quad 0xbfcd5ae300000000 + .quad 0xbe6396dd21fe2514 + .quad 0xbfcd3fd400000000 + .quad 0xbe6ca92ae56a4fcf + .quad 0xbfcd24cb00000000 + .quad 0xbe7bdc846e0ed386 + .quad 0xbfcd09c600000000 + .quad 0xbe55b88be3ae865a + .quad 0xbfcceec500000000 + .quad 0xbe3fc6a072116830 + .quad 0xbfccd3c600000000 + .quad 0xbe7b1a6214562c52 + .quad 0xbfccb8cd00000000 + .quad 0xbe5f2c91c96636d8 + .quad 0xbfcc9dd800000000 + .quad 0xbe60c3b48651cf97 + .quad 0xbfcc82e600000000 + .quad 0xbe5966f235766ddb + .quad 0xbfcc67f800000000 + .quad 0xbe78ce14eae5dca8 + .quad 0xbfcc4d0e00000000 + .quad 0xbe625479353b5c4a + .quad 0xbfcc322800000000 + .quad 0xbe6d333a7b285ac2 + .quad 0xbfcc174500000000 + .quad 0xbe7277affe5d329a + .quad 0xbfcbfc6700000000 + .quad 0xbe67fffd12834efc + .quad 0xbfcbe18d00000000 + .quad 0xbe7b862223583bcf + .quad 0xbfcbc6b700000000 + .quad 0xbe649b874647b1f2 + .quad 0xbfcbabe300000000 + .quad 0xbe78929bf1c864a7 + .quad 0xbfcb911600000000 + .quad 0xbe74d074968f73d7 + .quad 0xbfcb764a00000000 + .quad 0xbe79fb251b935310 + .quad 0xbfcb5b8300000000 + .quad 0xbe769696568e41b9 + .quad 0xbfcb40c100000000 + .quad 0xbe65ed80b7eb91e0 + .quad 0xbfcb260200000000 + .quad 0xbe07d52c3932a2e4 + .quad 0xbfcb0b4700000000 + .quad 0xbe6b8ad7d7a99fe6 + .quad 0xbfcaf08f00000000 + .quad 0xbe7cbc2b9155b770 + .quad 0xbfcad5db00000000 + .quad 0xbe6aa03f2514a52b + .quad 0xbfcabb2d00000000 + .quad 0xbe6cfb1d524b6daf + .quad 0xbfcaa08000000000 + .quad 0xbe7a78cd1fbb1e99 + .quad 0xbfca85d900000000 + .quad 0xbe119017e37d4667 + .quad 0xbfca6b3400000000 + .quad 0xbe6184b897951f46 + .quad 0xbfca509400000000 + .quad 0xbe675349e1651fc0 + .quad 0xbfca35f700000000 + .quad 0xbe71c8acc30679dd + .quad 0xbfca1b5f00000000 + .quad 0xbe72ec1682bf9837 + .quad 0xbfca00ca00000000 + .quad 0xbe77d09336233c90 + .quad 0xbfc9e63a00000000 + .quad 0xbe7852e40017e39c + .quad 0xbfc9cbad00000000 + .quad 0xbe7d1fd8802fb817 + .quad 0xbfc9b12400000000 + .quad 0xbe59d13fae79743c + .quad 0xbfc9969d00000000 + .quad 0xbe748d385e0277cf + .quad 0xbfc97c1b00000000 + .quad 0xbe7f678fa8388a68 + .quad 0xbfc9619f00000000 + .quad 0xbe5d6188e89480ec + .quad 0xbfc9472500000000 + .quad 0xbe74e4cb139c1e95 + .quad 0xbfc92caf00000000 + .quad 0xbe6093e9a4239741 + .quad 0xbfc9123c00000000 + .quad 0xbe3c518d850f7ba8 + .quad 0xbfc8f7cd00000000 + .quad 0xbe797b7fc86f1c0c + .quad 0xbfc8dd6200000000 + .quad 0xbe77d280a0117cfd + .quad 0xbfc8c2fa00000000 + .quad 0xbe7d078174c6928f + .quad 0xbfc8a89800000000 + .quad 0xbe357f7a64ccd537 + .quad 0xbfc88e3800000000 + .quad 0xbe6a22cd1f2e8f29 + .quad 0xbfc873dc00000000 + .quad 0xbe1c582d297ff644 + .quad 0xbfc8598400000000 + .quad 0xbe73cd87ce24f758 + .quad 0xbfc83f3000000000 + .quad 0xbe6eb716bac42623 + .quad 0xbfc824df00000000 + .quad 0xbe73592a0f410400 + .quad 0xbfc80a9300000000 + .quad 
0xbe78343174876ba5 + .quad 0xbfc7f04900000000 + .quad 0xbe6ba4f9b930430e + .quad 0xbfc7d60400000000 + .quad 0xbe5367dd3b0b6b0b + .quad 0xbfc7bbc200000000 + .quad 0xbe556265a1dc7a8e + .quad 0xbfc7a18500000000 + .quad 0xbe5f71aca38241c4 + .quad 0xbfc7874b00000000 + .quad 0xbe746381f987646b + .quad 0xbfc76d1500000000 + .quad 0xbe665804bc056069 + .quad 0xbfc752e200000000 + .quad 0xbe68e83e5955bbc6 + .quad 0xbfc738b200000000 + .quad 0xbe787a19887d1e81 + .quad 0xbfc71e8800000000 + .quad 0xbe5fd1054d6e1895 + .quad 0xbfc7045f00000000 + .quad 0xbe6471e7650be845 + .quad 0xbfc6ea3b00000000 + .quad 0xbe707e9d9296377f + .quad 0xbfc6d01c00000000 + .quad 0xbe7b1bb94e9cc3b2 + .quad 0xbfc6b5ff00000000 + .quad 0xbe7936ceca9afdc8 + .quad 0xbfc69be600000000 + .quad 0xbe4cb3a881abfdf7 + .quad 0xbfc681d100000000 + .quad 0xbe732151a8286c6f + .quad 0xbfc667c000000000 + .quad 0xbe6efc2e3e9ced23 + .quad 0xbfc64db200000000 + .quad 0xbe78eb86ac9ef252 + .quad 0xbfc633a800000000 + .quad 0xbe6f50df1abe0fc9 + .quad 0xbfc619a100000000 + .quad 0xbe73f3aefe930c8f + .quad 0xbfc5ff9f00000000 + .quad 0xbe7edc30c01b141d + .quad 0xbfc5e59f00000000 + .quad 0xbe7f08ed31fe1628 + .quad 0xbfc5cba500000000 + .quad 0xbe5983b170e6c68f + .quad 0xbfc5b1ad00000000 + .quad 0xbe7c5342ddbb7371 + .quad 0xbfc597ba00000000 + .quad 0xbe31f13b9ecb2da6 + .quad 0xbfc57dc900000000 + .quad 0xbe75038fc82fbc24 + .quad 0xbfc563dc00000000 + .quad 0xbe783ff5ad081783 + .quad 0xbfc549f300000000 + .quad 0xbe662723a6715875 + .quad 0xbfc5300d00000000 + .quad 0xbe6b7b7cc9af768a + .quad 0xbfc5162b00000000 + .quad 0xbe1f78d1162b410d + .quad 0xbfc4fc4d00000000 + .quad 0xbe7cb37679326801 + .quad 0xbfc4e27200000000 + .quad 0xbe7065fa9470590b + .quad 0xbfc4c89c00000000 + .quad 0xbe6c3a0233eda037 + .quad 0xbfc4aec800000000 + .quad 0xbe4e014055897901 + .quad 0xbfc494f900000000 + .quad 0xbe4fb8e003c2f3b1 + .quad 0xbfc47b2b00000000 + .quad 0xbe7c8996199d6eea + .quad 0xbfc4616400000000 + .quad 0xbe0faf0bc81e4b94 + .quad 0xbfc4479d00000000 + .quad 0xbe7cc047f1f25c83 + .quad 0xbfc42ddd00000000 + .quad 0xbe53d0da516b147f + .quad 0xbfc4141f00000000 + .quad 0xbe7fcb190acb1c29 + .quad 0xbfc3fa6400000000 + .quad 0xbe7414ec0c60bad1 + .quad 0xbfc3e0ae00000000 + .quad 0xbe74e9ba984a9a60 + .quad 0xbfc3c6fc00000000 + .quad 0xbe624337ccc1362d + .quad 0xbfc3ad4b00000000 + .quad 0xbe7774b4cc0ec2a8 + .quad 0xbfc393a000000000 + .quad 0xbe732b380b7efc7c + .quad 0xbfc379f700000000 + .quad 0xbe62dac931c2e190 + .quad 0xbfc3605300000000 + .quad 0xbe6b470fa43dc529 + .quad 0xbfc346b100000000 + .quad 0xbe69abf6162bfc32 + .quad 0xbfc32d1300000000 + .quad 0xbe2ba4b334a02879 + .quad 0xbfc3137a00000000 + .quad 0xbe4d8be297e30d03 + .quad 0xbfc2f9e300000000 + .quad 0xbe415bfda1644c22 + .quad 0xbfc2e04f00000000 + .quad 0xbe763bbe948b1ac0 + .quad 0xbfc2c6c000000000 + .quad 0xbe016a3f42b0e0f2 + .quad 0xbfc2ad3400000000 + .quad 0xbe00b500d8b4466e + .quad 0xbfc293ab00000000 + .quad 0xbe767834aad3c38f + .quad 0xbfc27a2700000000 + .quad 0xbe4b3fb7ded60421 + .quad 0xbfc260a600000000 + .quad 0xbe5cc6018f3bcd49 + .quad 0xbfc2472700000000 + .quad 0xbe603b59bc184860 + .quad 0xbfc22dad00000000 + .quad 0xbe7a556695fca0d7 + .quad 0xbfc2143600000000 + .quad 0xbe64434576d52cb7 + .quad 0xbfc1fac400000000 + .quad 0xbe6796ca377ea74e + .quad 0xbfc1e15400000000 + .quad 0xbe66f7798c85559d + .quad 0xbfc1c7e800000000 + .quad 0xbe4bde34965f6984 + .quad 0xbfc1ae7d00000000 + .quad 0xbe79e4ab7003a0e6 + .quad 0xbfc1951900000000 + .quad 0xbe49fd11e39abaac + .quad 0xbfc17bb800000000 + .quad 0xbe56b7b48b95c15b + .quad 
0xbfc1625900000000 + .quad 0xbe5cc36d3e3cca65 + .quad 0xbfc148fe00000000 + .quad 0xbe41ce485761f69c + .quad 0xbfc12fa600000000 + .quad 0xbe770a1f05316811 + .quad 0xbfc1165300000000 + .quad 0xbe578d49dc1afe94 + .quad 0xbfc0fd0300000000 + .quad 0xbe6e0dca31cd9e54 + .quad 0xbfc0e3b500000000 + .quad 0xbe784e650e0a2fd5 + .quad 0xbfc0ca6b00000000 + .quad 0xbe7c536d57d9dab9 + .quad 0xbfc0b12500000000 + .quad 0xbe7b57a5578d01fd + .quad 0xbfc097e300000000 + .quad 0xbe759cc0cf3da52a + .quad 0xbfc07ea300000000 + .quad 0xbe70dc7f7c36aab7 + .quad 0xbfc0656900000000 + .quad 0xbe43057726eea6f9 + .quad 0xbfc04c3000000000 + .quad 0xbe75532713b0b555 + .quad 0xbfc032fc00000000 + .quad 0xbe51f736f8234297 + .quad 0xbfc019c900000000 + .quad 0xbe757a9427127e28 + .quad 0xbfc0009c00000000 + .quad 0xbe7dd37909d634e1 + .quad 0xbfbfcee400000000 + .quad 0xbe60e50b92227f37 + .quad 0xbfbf9c9700000000 + .quad 0xbe10744b2bbd5c34 + .quad 0xbfbf6a4d00000000 + .quad 0xbe6576fb1ab66ad7 + .quad 0xbfbf380f00000000 + .quad 0xbe6b5374d31a91ee + .quad 0xbfbf05d600000000 + .quad 0xbe4db610eee1b81b + .quad 0xbfbed3a000000000 + .quad 0xbe6a19b7978e8bb8 + .quad 0xbfbea17600000000 + .quad 0xbe6f4cb6bf56f18e + .quad 0xbfbe6f5100000000 + .quad 0xbe57f67e0bd3b63f + .quad 0xbfbe3d3300000000 + .quad 0xbe666a27d6a83d6c + .quad 0xbfbe0b1a00000000 + .quad 0xbe523cbf0c85fa27 + .quad 0xbfbdd90800000000 + .quad 0xbe6a7ced811f7da6 + .quad 0xbfbda6ff00000000 + .quad 0xbe5615e1bd550182 + .quad 0xbfbd74fd00000000 + .quad 0xbe6b4da043725d03 + .quad 0xbfbd430000000000 + .quad 0xbe658a49aa2dca64 + .quad 0xbfbd110b00000000 + .quad 0xbe6066543ad84ef1 + .quad 0xbfbcdf1a00000000 + .quad 0xbe66073d700e9f19 + .quad 0xbfbcad3500000000 + .quad 0xbe63a29cd758d759 + .quad 0xbfbc7b5100000000 + .quad 0xbe49b8777d6bbc9d + .quad 0xbfbc497800000000 + .quad 0xbe623f87f4487fe4 + .quad 0xbfbc17a400000000 + .quad 0xbe55196cb4c66620 + .quad 0xbfbbe5d800000000 + .quad 0xbe496e785a0317a3 + .quad 0xbfbbb41000000000 + .quad 0xbe5ee49501957b40 + .quad 0xbfbb825000000000 + .quad 0xbe6cf6df4849748b + .quad 0xbfbb509500000000 + .quad 0xbe688f964bd70c8f + .quad 0xbfbb1ee600000000 + .quad 0xbe6072c317519bb4 + .quad 0xbfbaed3800000000 + .quad 0xbe05b3290a662bd0 + .quad 0xbfbabb9500000000 + .quad 0xbe5b246ad0582c09 + .quad 0xbfba89f700000000 + .quad 0xbe55372721811f66 + .quad 0xbfba585d00000000 + .quad 0xbe67c995fe88bce3 + .quad 0xbfba26cc00000000 + .quad 0xbe596605e161e768 + .quad 0xbfb9f54300000000 + .quad 0xbe53bd6ea8cdcabf + .quad 0xbfb9c3be00000000 + .quad 0xbe6873a6488f239e + .quad 0xbfb9924200000000 + .quad 0xbe6038db2539e54e + .quad 0xbfb960ca00000000 + .quad 0xbe6a3576f0eb47ea + .quad 0xbfb92f5b00000000 + .quad 0xbe5ca16578e782d8 + .quad 0xbfb8fdf000000000 + .quad 0xbe6571dd058c9404 + .quad 0xbfb8cc8e00000000 + .quad 0xbe4e8172926b3912 + .quad 0xbfb89b3400000000 + .quad 0xbe458eb8a49a1ed9 + .quad 0xbfb869de00000000 + .quad 0xbe67736434037b3e + .quad 0xbfb8388d00000000 + .quad 0xbe6e2728b7069e85 + .quad 0xbfb8074500000000 + .quad 0xbe61c6bcd5b504de + .quad 0xbfb7d60500000000 + .quad 0xbe62d9f791fd12f7 + .quad 0xbfb7a4ca00000000 + .quad 0xbe53b18b476f88bf + .quad 0xbfb7739300000000 + .quad 0xbe671b2ad71bba2e + .quad 0xbfb7426500000000 + .quad 0xbe6329422bbd68e8 + .quad 0xbfb7113f00000000 + .quad 0xbe6e8b3c2fe4ecae + .quad 0xbfb6e01f00000000 + .quad 0xbe2795edd5ed58e9 + .quad 0xbfb6af0200000000 + .quad 0xbe6c4c07447a13fa + .quad 0xbfb67def00000000 + .quad 0xbe4f2ea58340e81e + .quad 0xbfb64ce400000000 + .quad 0xbe4203398a8ffda4 + .quad 0xbfb61bda00000000 + .quad 
0xbe2d4147ad124eaa + .quad 0xbfb5eadc00000000 + .quad 0xbe539c66835b9867 + .quad 0xbfb5b9df00000000 + .quad 0xbe6317f3d15a9860 + .quad 0xbfb588ef00000000 + .quad 0xbe503474104b244e + .quad 0xbfb557ff00000000 + .quad 0xbe6f1dfae0bd2e94 + .quad 0xbfb5271900000000 + .quad 0xbe541889ef09d7c8 + .quad 0xbfb4f63b00000000 + .quad 0xbe52dc76d475d4d1 + .quad 0xbfb4c56200000000 + .quad 0xbe433458770a1735 + .quad 0xbfb4948d00000000 + .quad 0xbe6c8223b5c8b49b + .quad 0xbfb463c200000000 + .quad 0xbe540d91e2302042 + .quad 0xbfb432fb00000000 + .quad 0xbe64b47f064d986f + .quad 0xbfb4023900000000 + .quad 0xbe6ce4d526c81e43 + .quad 0xbfb3d18000000000 + .quad 0xbe6c41714a091d46 + .quad 0xbfb3a0d000000000 + .quad 0xbe63540db8c80703 + .quad 0xbfb3702100000000 + .quad 0xbe5f8cf1a845a25c + .quad 0xbfb33f7b00000000 + .quad 0xbe430a65c7a2686f + .quad 0xbfb30edd00000000 + .quad 0xbe62d26a7215665c + .quad 0xbfb2de4500000000 + .quad 0xbe1bff57e3bab991 + .quad 0xbfb2adb100000000 + .quad 0xbe5e8adfc156e82d + .quad 0xbfb27d2200000000 + .quad 0xbe6e5d041c5f1a05 + .quad 0xbfb24c9d00000000 + .quad 0xbe50a21095df344c + .quad 0xbfb21c2000000000 + .quad 0xbe5b57c218054e22 + .quad 0xbfb1eba400000000 + .quad 0xbe6b1806f4988888 + .quad 0xbfb1bb3200000000 + .quad 0xbe430029dc60a716 + .quad 0xbfb18ac400000000 + .quad 0xbe611e8ed29c4bea + .quad 0xbfb15a5f00000000 + .quad 0xbe6aae4e1e1cd7e9 + .quad 0xbfb12a0000000000 + .quad 0xbe4f2855166a96d5 + .quad 0xbfb0f9a500000000 + .quad 0xbe68ccc743692647 + .quad 0xbfb0c95400000000 + .quad 0xbe50c2b8ff93eea0 + .quad 0xbfb0990400000000 + .quad 0xbe329700306849f4 + .quad 0xbfb068c000000000 + .quad 0xbe661c7597dfa0cf + .quad 0xbfb0387e00000000 + .quad 0xbe64f950c199fdd6 + .quad 0xbfb0084500000000 + .quad 0xbe6434bda55a11e5 + .quad 0xbfafb02300000000 + .quad 0xbe537435dba745c1 + .quad 0xbfaf4fc600000000 + .quad 0xbe4793720209c664 + .quad 0xbfaeef7b00000000 + .quad 0xbe3e845c9d0173b4 + .quad 0xbfae8f3a00000000 + .quad 0xbe527188bd53b8bf + .quad 0xbfae2f0400000000 + .quad 0xbe49e4e1f2d00cb9 + .quad 0xbfadced800000000 + .quad 0xbe57db5b6132809a + .quad 0xbfad6ebf00000000 + .quad 0xbe43c7fbabdf571f + .quad 0xbfad0eb000000000 + .quad 0xbe4c086873f1531f + .quad 0xbfacaeac00000000 + .quad 0xbe33d01264312288 + .quad 0xbfac4eb200000000 + .quad 0xbe4ed73a1b11c287 + .quad 0xbfabeecb00000000 + .quad 0xbe328d5761ea48d2 + .quad 0xbfab8eee00000000 + .quad 0xbe4e2759579ac08a + .quad 0xbfab2f1c00000000 + .quad 0xbe4eea927b8de26e + .quad 0xbfaacf5500000000 + .quad 0xbe3a03ec4341a4ac + .quad 0xbfaa6f9800000000 + .quad 0xbe54efb9656181bf + .quad 0xbfaa0fee00000000 + .quad 0xbe529aa680456564 + .quad 0xbfa9b04f00000000 + .quad 0xbe42b60fbbf05015 + .quad 0xbfa950ba00000000 + .quad 0xbe59ea4d388956ac + .quad 0xbfa8f13800000000 + .quad 0xbe5c820f8ddadcd6 + .quad 0xbfa891ba00000000 + .quad 0xbe27e05a334c58f7 + .quad 0xbfa8324d00000000 + .quad 0xbe5d3229b2ba0376 + .quad 0xbfa7d2ec00000000 + .quad 0xbe545e77c08ed94c + .quad 0xbfa7739600000000 + .quad 0xbe427656b6f95551 + .quad 0xbfa7144a00000000 + .quad 0xbe5c82a193d30405 + .quad 0xbfa6b50a00000000 + .quad 0xbe4ddebd1f3c284a + .quad 0xbfa655dc00000000 + .quad 0xbe599c108199cfd8 + .quad 0xbfa5f6ba00000000 + .quad 0xbe348e1f3828f0d8 + .quad 0xbfa597a200000000 + .quad 0xbe5240beb8df56ca + .quad 0xbfa5389600000000 + .quad 0xbe1aed65370b9099 + .quad 0xbfa4d99400000000 + .quad 0xbe5429166d091c5d + .quad 0xbfa47a9e00000000 + .quad 0xbe44d5db06b75692 + .quad 0xbfa41bba00000000 + .quad 0xbe5e4ff2e670387a + .quad 0xbfa3bcda00000000 + .quad 0xbe5e73df6e675ed2 + .quad 
0xbfa35e0d00000000 + .quad 0xbe5df2994af6bbf0 + .quad 0xbfa2ff4c00000000 + .quad 0xbe31a09f65bfdef1 + .quad 0xbfa2a09500000000 + .quad 0xbe5290bafe6a7061 + .quad 0xbfa241ea00000000 + .quad 0xbe425151c43b4181 + .quad 0xbfa1e34a00000000 + .quad 0xbe41d8dbc0646431 + .quad 0xbfa184b500000000 + .quad 0xbe5298ac777c8c9d + .quad 0xbfa1263400000000 + .quad 0xbe10a2f9d7e8035a + .quad 0xbfa0c7b600000000 + .quad 0xbe0bbc4c660fd088 + .quad 0xbfa0694b00000000 + .quad 0xbe3cc374b7950d13 + .quad 0xbfa00aeb00000000 + .quad 0xbe5aa058acdc0265 + .quad 0xbf9f592000000000 + .quad 0xbe149b4d7e5df2c0 + .quad 0xbf9e9c8f00000000 + .quad 0xbe10a7a7e78bdba3 + .quad 0xbf9de01500000000 + .quad 0xbde02a1d978db2f1 + .quad 0xbf9d23b100000000 + .quad 0xbe4e9227a287068e + .quad 0xbf9c676500000000 + .quad 0xbe4e8561096793f8 + .quad 0xbf9bab3100000000 + .quad 0xbe0968e122179f22 + .quad 0xbf9aef1300000000 + .quad 0xbe328465c0dba24f + .quad 0xbf9a330c00000000 + .quad 0xbe47051e31e0d70b + .quad 0xbf99771d00000000 + .quad 0xbe38b8d275ff3a9a + .quad 0xbf98bb5500000000 + .quad 0xbe122bdb89883925 + .quad 0xbf97ff9400000000 + .quad 0xbe36fbf85d50fecb + .quad 0xbf9743eb00000000 + .quad 0xbdf87cba8eccac44 + .quad 0xbf96886800000000 + .quad 0xbe4bd57d800c1470 + .quad 0xbf95ccee00000000 + .quad 0xbe3be2933856d62e + .quad 0xbf95118b00000000 + .quad 0xbe409620e0f1be7b + .quad 0xbf94564f00000000 + .quad 0xbe4e4325cf62b811 + .quad 0xbf939b1c00000000 + .quad 0xbe2adee9af6a25c0 + .quad 0xbf92e00000000000 + .quad 0xbe20ce46d28f63c9 + .quad 0xbf92250b00000000 + .quad 0xbe41f6aa9fb6fe0b + .quad 0xbf916a1e00000000 + .quad 0xbe4e41409957601b + .quad 0xbf90af5900000000 + .quad 0xbe4e53e5a63658ad + .quad 0xbf8fe93900000000 + .quad 0xbe3eded24d629d7d + .quad 0xbf8e73ef00000000 + .quad 0xbe3a29d2ea7d362b + .quad 0xbf8cfef500000000 + .quad 0xbe1e2e79fe4aa765 + .quad 0xbf8b8a0a00000000 + .quad 0xbe3e8785027a216b + .quad 0xbf8a155000000000 + .quad 0xbe37a174d5a8bded + .quad 0xbf88a0c600000000 + .quad 0xbe35dde88f39d7ce + .quad 0xbf872c6c00000000 + .quad 0xbe3c41ea3f44a785 + .quad 0xbf85b86300000000 + .quad 0xbe194c69ffd7f42d + .quad 0xbf84446a00000000 + .quad 0xbe1a5e4e0d24af39 + .quad 0xbf82d0a100000000 + .quad 0xbe381611eb6c3818 + .quad 0xbf815d0900000000 + .quad 0xbe3dd5da9cc5f987 + .quad 0xbf7fd34500000000 + .quad 0xbe25bd80e0b0590e + .quad 0xbf7cec9900000000 + .quad 0xbe1ce47bb0eea510 + .quad 0xbf7a068e00000000 + .quad 0xbe26dbe100877575 + .quad 0xbf7720e600000000 + .quad 0xbd9aa4f614b9e1ac + .quad 0xbf743b5f00000000 + .quad 0xbe271a96b1eb7842 + .quad 0xbf71567b00000000 + .quad 0xbe2318f60005710d + .quad 0xbf6ce37400000000 + .quad 0xbe0c7a4e122b1762 + .quad 0xbf671b3600000000 + .quad 0xbe1c85d1e3d214d1 + .quad 0xbf61533f00000000 + .quad 0xbe0e793b61aa1f54 + .quad 0xbf57181c00000000 + .quad 0xbe01296a4555af78 + .quad 0xbf47168e00000000 + .quad 0xbdf30d6f34ebfa1c + .rept 2 + .quad 0x0000000000000000 + .endr + .rept 48 + .byte 0 + .endr + +/* Exp(2) lookup table for exp part (non HSW) */ +.if .-__svml_dpow_data != _exp2_tbl +.err +.endif + .quad 0x3ff0000000000000 + .quad 0x0000000000000000 + .quad 0x3ff0163da9fb3335 + .quad 0x3c9b61299ab8cdb7 + .quad 0x3ff02c9a3e778061 + .quad 0xbc719083535b085d + .quad 0x3ff04315e86e7f85 + .quad 0xbc90a31c1977c96e + .quad 0x3ff059b0d3158574 + .quad 0x3c8d73e2a475b465 + .quad 0x3ff0706b29ddf6de + .quad 0xbc8c91dfe2b13c26 + .quad 0x3ff0874518759bc8 + .quad 0x3c6186be4bb284ff + .quad 0x3ff09e3ecac6f383 + .quad 0x3c91487818316135 + .quad 0x3ff0b5586cf9890f + .quad 0x3c98a62e4adc610a + .quad 0x3ff0cc922b7247f7 + .quad 
0x3c901edc16e24f71 + .quad 0x3ff0e3ec32d3d1a2 + .quad 0x3c403a1727c57b52 + .quad 0x3ff0fb66affed31b + .quad 0xbc6b9bedc44ebd7b + .quad 0x3ff11301d0125b51 + .quad 0xbc96c51039449b39 + .quad 0x3ff12abdc06c31cc + .quad 0xbc51b514b36ca5c7 + .quad 0x3ff1429aaea92de0 + .quad 0xbc932fbf9af1369e + .quad 0x3ff15a98c8a58e51 + .quad 0x3c82406ab9eeab09 + .quad 0x3ff172b83c7d517b + .quad 0xbc819041b9d78a75 + .quad 0x3ff18af9388c8dea + .quad 0xbc911023d1970f6b + .quad 0x3ff1a35beb6fcb75 + .quad 0x3c8e5b4c7b4968e4 + .quad 0x3ff1bbe084045cd4 + .quad 0xbc995386352ef607 + .quad 0x3ff1d4873168b9aa + .quad 0x3c9e016e00a2643c + .quad 0x3ff1ed5022fcd91d + .quad 0xbc91df98027bb78b + .quad 0x3ff2063b88628cd6 + .quad 0x3c8dc775814a8494 + .quad 0x3ff21f49917ddc96 + .quad 0x3c82a97e9494a5ed + .quad 0x3ff2387a6e756238 + .quad 0x3c99b07eb6c70572 + .quad 0x3ff251ce4fb2a63f + .quad 0x3c8ac155bef4f4a4 + .quad 0x3ff26b4565e27cdd + .quad 0x3c82bd339940e9d9 + .quad 0x3ff284dfe1f56381 + .quad 0xbc9a4c3a8c3f0d7d + .quad 0x3ff29e9df51fdee1 + .quad 0x3c8612e8afad1255 + .quad 0x3ff2b87fd0dad990 + .quad 0xbc410adcd6381aa3 + .quad 0x3ff2d285a6e4030b + .quad 0x3c90024754db41d4 + .quad 0x3ff2ecafa93e2f56 + .quad 0x3c71ca0f45d52383 + .quad 0x3ff306fe0a31b715 + .quad 0x3c86f46ad23182e4 + .quad 0x3ff32170fc4cd831 + .quad 0x3c8a9ce78e18047c + .quad 0x3ff33c08b26416ff + .quad 0x3c932721843659a5 + .quad 0x3ff356c55f929ff1 + .quad 0xbc8b5cee5c4e4628 + .quad 0x3ff371a7373aa9cb + .quad 0xbc963aeabf42eae1 + .quad 0x3ff38cae6d05d866 + .quad 0xbc9e958d3c9904bc + .quad 0x3ff3a7db34e59ff7 + .quad 0xbc75e436d661f5e2 + .quad 0x3ff3c32dc313a8e5 + .quad 0xbc9efff8375d29c3 + .quad 0x3ff3dea64c123422 + .quad 0x3c8ada0911f09ebb + .quad 0x3ff3fa4504ac801c + .quad 0xbc97d023f956f9f3 + .quad 0x3ff4160a21f72e2a + .quad 0xbc5ef3691c309278 + .quad 0x3ff431f5d950a897 + .quad 0xbc81c7dde35f7998 + .quad 0x3ff44e086061892d + .quad 0x3c489b7a04ef80cf + .quad 0x3ff46a41ed1d0057 + .quad 0x3c9c944bd1648a76 + .quad 0x3ff486a2b5c13cd0 + .quad 0x3c73c1a3b69062f0 + .quad 0x3ff4a32af0d7d3de + .quad 0x3c99cb62f3d1be56 + .quad 0x3ff4bfdad5362a27 + .quad 0x3c7d4397afec42e2 + .quad 0x3ff4dcb299fddd0d + .quad 0x3c98ecdbbc6a7833 + .quad 0x3ff4f9b2769d2ca7 + .quad 0xbc94b309d25957e3 + .quad 0x3ff516daa2cf6642 + .quad 0xbc8f768569bd93ee + .quad 0x3ff5342b569d4f82 + .quad 0xbc807abe1db13cac + .quad 0x3ff551a4ca5d920f + .quad 0xbc8d689cefede59a + .quad 0x3ff56f4736b527da + .quad 0x3c99bb2c011d93ac + .quad 0x3ff58d12d497c7fd + .quad 0x3c8295e15b9a1de7 + .quad 0x3ff5ab07dd485429 + .quad 0x3c96324c054647ac + .quad 0x3ff5c9268a5946b7 + .quad 0x3c3c4b1b816986a2 + .quad 0x3ff5e76f15ad2148 + .quad 0x3c9ba6f93080e65d + .quad 0x3ff605e1b976dc09 + .quad 0xbc93e2429b56de47 + .quad 0x3ff6247eb03a5585 + .quad 0xbc9383c17e40b496 + .quad 0x3ff6434634ccc320 + .quad 0xbc8c483c759d8932 + .quad 0x3ff6623882552225 + .quad 0xbc9bb60987591c33 + .quad 0x3ff68155d44ca973 + .quad 0x3c6038ae44f73e64 + .quad 0x3ff6a09e667f3bcd + .quad 0xbc9bdd3413b26455 + .quad 0x3ff6c012750bdabf + .quad 0xbc72895667ff0b0c + .quad 0x3ff6dfb23c651a2f + .quad 0xbc6bbe3a683c88aa + .quad 0x3ff6ff7df9519484 + .quad 0xbc883c0f25860ef6 + .quad 0x3ff71f75e8ec5f74 + .quad 0xbc816e4786887a99 + .quad 0x3ff73f9a48a58174 + .quad 0xbc90a8d96c65d53b + .quad 0x3ff75feb564267c9 + .quad 0xbc90245957316dd3 + .quad 0x3ff780694fde5d3f + .quad 0x3c9866b80a02162c + .quad 0x3ff7a11473eb0187 + .quad 0xbc841577ee04992f + .quad 0x3ff7c1ed0130c132 + .quad 0x3c9f124cd1164dd5 + .quad 0x3ff7e2f336cf4e62 + .quad 0x3c705d02ba15797e + .quad 
0x3ff80427543e1a12 + .quad 0xbc927c86626d972a + .quad 0x3ff82589994cce13 + .quad 0xbc9d4c1dd41532d7 + .quad 0x3ff8471a4623c7ad + .quad 0xbc88d684a341cdfb + .quad 0x3ff868d99b4492ed + .quad 0xbc9fc6f89bd4f6ba + .quad 0x3ff88ac7d98a6699 + .quad 0x3c9994c2f37cb53a + .quad 0x3ff8ace5422aa0db + .quad 0x3c96e9f156864b26 + .quad 0x3ff8cf3216b5448c + .quad 0xbc70d55e32e9e3aa + .quad 0x3ff8f1ae99157736 + .quad 0x3c85cc13a2e3976c + .quad 0x3ff9145b0b91ffc6 + .quad 0xbc9dd6792e582523 + .quad 0x3ff93737b0cdc5e5 + .quad 0xbc675fc781b57ebb + .quad 0x3ff95a44cbc8520f + .quad 0xbc764b7c96a5f039 + .quad 0x3ff97d829fde4e50 + .quad 0xbc9d185b7c1b85d0 + .quad 0x3ff9a0f170ca07ba + .quad 0xbc9173bd91cee632 + .quad 0x3ff9c49182a3f090 + .quad 0x3c7c7c46b071f2be + .quad 0x3ff9e86319e32323 + .quad 0x3c7824ca78e64c6e + .quad 0x3ffa0c667b5de565 + .quad 0xbc9359495d1cd532 + .quad 0x3ffa309bec4a2d33 + .quad 0x3c96305c7ddc36ab + .quad 0x3ffa5503b23e255d + .quad 0xbc9d2f6edb8d41e1 + .quad 0x3ffa799e1330b358 + .quad 0x3c9bcb7ecac563c6 + .quad 0x3ffa9e6b5579fdbf + .quad 0x3c90fac90ef7fd31 + .quad 0x3ffac36bbfd3f37a + .quad 0xbc8f9234cae76cd0 + .quad 0x3ffae89f995ad3ad + .quad 0x3c97a1cd345dcc81 + .quad 0x3ffb0e07298db666 + .quad 0xbc9bdef54c80e424 + .quad 0x3ffb33a2b84f15fb + .quad 0xbc62805e3084d707 + .quad 0x3ffb59728de5593a + .quad 0xbc9c71dfbbba6de3 + .quad 0x3ffb7f76f2fb5e47 + .quad 0xbc75584f7e54ac3a + .quad 0x3ffba5b030a1064a + .quad 0xbc9efcd30e54292e + .quad 0x3ffbcc1e904bc1d2 + .quad 0x3c823dd07a2d9e84 + .quad 0x3ffbf2c25bd71e09 + .quad 0xbc9efdca3f6b9c72 + .quad 0x3ffc199bdd85529c + .quad 0x3c811065895048dd + .quad 0x3ffc40ab5fffd07a + .quad 0x3c9b4537e083c60a + .quad 0x3ffc67f12e57d14b + .quad 0x3c92884dff483cac + .quad 0x3ffc8f6d9406e7b5 + .quad 0x3c71acbc48805c44 + .quad 0x3ffcb720dcef9069 + .quad 0x3c7503cbd1e949db + .quad 0x3ffcdf0b555dc3fa + .quad 0xbc8dd83b53829d72 + .quad 0x3ffd072d4a07897c + .quad 0xbc9cbc3743797a9c + .quad 0x3ffd2f87080d89f2 + .quad 0xbc9d487b719d8577 + .quad 0x3ffd5818dcfba487 + .quad 0x3c82ed02d75b3706 + .quad 0x3ffd80e316c98398 + .quad 0xbc911ec18beddfe8 + .quad 0x3ffda9e603db3285 + .quad 0x3c9c2300696db532 + .quad 0x3ffdd321f301b460 + .quad 0x3c92da5778f018c2 + .quad 0x3ffdfc97337b9b5f + .quad 0xbc91a5cd4f184b5b + .quad 0x3ffe264614f5a129 + .quad 0xbc97b627817a1496 + .quad 0x3ffe502ee78b3ff6 + .quad 0x3c839e8980a9cc8f + .quad 0x3ffe7a51fbc74c83 + .quad 0x3c92d522ca0c8de1 + .quad 0x3ffea4afa2a490da + .quad 0xbc9e9c23179c2893 + .quad 0x3ffecf482d8e67f1 + .quad 0xbc9c93f3b411ad8c + .quad 0x3ffefa1bee615a27 + .quad 0x3c9dc7f486a4b6b0 + .quad 0x3fff252b376bba97 + .quad 0x3c93a1a5bf0d8e43 + .quad 0x3fff50765b6e4540 + .quad 0x3c99d3e12dd8a18a + .quad 0x3fff7bfdad9cbe14 + .quad 0xbc9dbb12d0063509 + .quad 0x3fffa7c1819e90d8 + .quad 0x3c874853f3a5931e + .quad 0x3fffd3c22b8f71f1 + .quad 0x3c62eb74966579e7 + +/* log2 polynomial coefficients: + * clv7 */ +double_vector _clv_1 0x3f903950cf599c56 + +/* clv6 */ +double_vector _clv_2 0xbf9b4ea0e9419f52 + +/* clv5 */ +double_vector _clv_3 0x3fa7a334ddfc9f86 + +/* clv4 */ +double_vector _clv_4 0xbfb550472a8bb463 + +/* clv3 */ +double_vector _clv_5 0x3fc47fd462b3b816 + +/* clv2 */ +double_vector _clv_6 0xbfd62e4346694107 + +/* clv1 */ +double_vector _clv_7 0x3e79c3a6966457ee + +/* exponential polynomial coefficients: + * cev5 */ +double_vector _cev_1 0x3f55d87fe78a6731 + +/* cev4 */ +double_vector _cev_2 0x3f83b2ab6fba4e77 + +/* cev3 */ +double_vector _cev_3 0x3fac6b08d704a0bf + +/* cev2 */ +double_vector _cev_4 0x3fcebfbdff82c58e + +/* cev1 */ 
+double_vector _cev_5 0x3fe62e42fefa39ef + +/* General purpose constants: + * iMantissaMask */ +double_vector _iMantissaMask 0x000fffffffffffff + +/* i3fe7fe0000000000 */ +double_vector _i3fe7fe0000000000 0x3fe7fe0000000000 + +/* dbOne */ +double_vector _dbOne 0x3ff0000000000000 + +/* iffffffff00000000 */ +double_vector _iffffffff00000000 0xffffffff00000000 + +/* db2p20_2p19 = 2^20+2^19 */ +double_vector _db2p20_2p19 0x4138000000000000 + +/* iHighMask */ +double_vector _iHighMask 0xfffffffff8000000 + +/* LHN = -log2(e) truncated to 22 bits */ +double_vector _LHN 0xbff7154740000000 + +/* ifff0000000000000 */ +double_vector _ifff0000000000000 0xfff0000000000000 + +/* db2p45_2p44 */ +double_vector _db2p45_2p44 0x42c8000000000000 + +/* NEG_INF */ +double_vector _NEG_INF 0xfff0000000000000 + +/* NEG_ZERO */ +double_vector _NEG_ZERO 0x8000000000000000 + +/* 2pow52 */ +double_vector _d2pow52 0x4330000000000000 + +/* 1div2pow111 */ +double_vector _d1div2pow111 0x3900000000000000 + +/* HIDELTA */ +float_vector _HIDELTA 0x00100000 + +/* LORANGE */ +float_vector _LORANGE 0x00200000 + +/* ABSMASK */ +float_vector _ABSMASK 0x7fffffff + +/* INF */ +float_vector _INF 0x7f800000 + +/* DOMAINRANGE */ +float_vector _DOMAINRANGE 0x408f3fff + +/* iIndexMask */ +float_vector _iIndexMask 0x000ffe00 + +/* iIndexAdd */ +float_vector _iIndexAdd 0x00000200 + +/* i3fe7fe00 */ +float_vector _i3fe7fe00 0x3fe7fe00 + +/* i2p20_2p19 */ +float_vector _i2p20_2p19 0x41380000 + +/* iOne */ +float_vector _iOne 0x3ff00000 + +/* jIndexMask */ +float_vector _jIndexMask 0x0000007f + .type __svml_dpow_data,@object + .size __svml_dpow_data,.-__svml_dpow_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.h new file mode 100644 index 0000000000..ce90d8546b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_pow_data.h @@ -0,0 +1,104 @@ +/* Offsets for data table for function pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#ifndef D_POW_DATA_H +#define D_POW_DATA_H + +#define _hsw_log2_table 0 +#define _hsw_dTe 8256 +#define _hsw_dMantMask 10304 +#define _hsw_dOne 10368 +#define _hsw_dCvtMask 10432 +#define _hsw_dMinNorm 10496 +#define _hsw_dMaxNorm 10560 +#define _hsw_lRndBit 10624 +#define _hsw_lRndMask 10688 +#define _hsw_dc6 10752 +#define _hsw_dc5 10816 +#define _hsw_dc4 10880 +#define _hsw_dc3 10944 +#define _hsw_dc1 11008 +#define _hsw_dc1h 11072 +#define _hsw_dc2 11136 +#define _hsw_dAbsMask 11200 +#define _hsw_dDomainRange 11264 +#define _hsw_dShifter 11328 +#define _hsw_dIndexMask 11392 +#define _hsw_dce4 11456 +#define _hsw_dce3 11520 +#define _hsw_dce2 11584 +#define _hsw_dce1 11648 +#define _rcp_t1 11712 +#define _log2_t1 19968 +#define _exp2_tbl 36416 +#define _clv_1 38464 +#define _clv_2 38528 +#define _clv_3 38592 +#define _clv_4 38656 +#define _clv_5 38720 +#define _clv_6 38784 +#define _clv_7 38848 +#define _cev_1 38912 +#define _cev_2 38976 +#define _cev_3 39040 +#define _cev_4 39104 +#define _cev_5 39168 +#define _iMantissaMask 39232 +#define _i3fe7fe0000000000 39296 +#define _dbOne 39360 +#define _iffffffff00000000 39424 +#define _db2p20_2p19 39488 +#define _iHighMask 39552 +#define _LHN 39616 +#define _ifff0000000000000 39680 +#define _db2p45_2p44 39744 +#define _NEG_INF 39808 +#define _NEG_ZERO 39872 +#define _d2pow52 39936 +#define _d1div2pow111 40000 +#define _HIDELTA 40064 +#define _LORANGE 40128 +#define _ABSMASK 40192 +#define _INF 40256 +#define _DOMAINRANGE 40320 +#define _iIndexMask 40384 +#define _iIndexAdd 40448 +#define _i3fe7fe00 40512 +#define _i2p20_2p19 40576 +#define _iOne 40640 +#define _jIndexMask 40704 + +.macro double_vector offset value +.if .-__svml_dpow_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +.macro float_vector offset value +.if .-__svml_dpow_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin2_core.S new file mode 100644 index 0000000000..85990833be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin2_core.S @@ -0,0 +1,29 @@ +/* Function sin vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_sin) +WRAPPER_IMPL_SSE2 sin +END (_ZGVbN2v_sin) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_sin) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core.S new file mode 100644 index 0000000000..7b9211d8c7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core.S @@ -0,0 +1,29 @@ +/* Function sin vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_sin) +WRAPPER_IMPL_AVX _ZGVbN2v_sin +END (_ZGVdN4v_sin) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_sin) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S new file mode 100644 index 0000000000..3edf88a047 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S @@ -0,0 +1,25 @@ +/* Function sin vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_sin) +WRAPPER_IMPL_AVX _ZGVbN2v_sin +END (_ZGVcN4v_sin) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin8_core.S new file mode 100644 index 0000000000..8e67f3cbbe --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sin8_core.S @@ -0,0 +1,25 @@ +/* Function sin vectorized with AVX-512, wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_sin) +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +END (_ZGVeN8v_sin) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos2_core.S new file mode 100644 index 0000000000..e8023e8e8e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos2_core.S @@ -0,0 +1,110 @@ +/* Function sincos vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2vl8l8_sincos) +WRAPPER_IMPL_SSE2_fFF sincos +END (_ZGVbN2vl8l8_sincos) +libmvec_hidden_def (_ZGVbN2vl8l8_sincos) + +/* SSE2 ISA version as wrapper to scalar (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_SSE2_fFF_vvv callee +#ifndef __ILP32__ + subq $88, %rsp + cfi_adjust_cfa_offset(88) + movaps %xmm0, 64(%rsp) + lea (%rsp), %rdi + movdqa %xmm1, 32(%rdi) + lea 16(%rsp), %rsi + movdqa %xmm2, 32(%rsi) + call JUMPTARGET(\callee) + movsd 72(%rsp), %xmm0 + lea 8(%rsp), %rdi + lea 24(%rsp), %rsi + call JUMPTARGET(\callee) + movq 32(%rsp), %rdx + movq 48(%rsp), %rsi + movq 40(%rsp), %r8 + movq 56(%rsp), %r10 + movq (%rsp), %rax + movq 16(%rsp), %rcx + movq 8(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq %r9, (%r10) + addq $88, %rsp + cfi_adjust_cfa_offset(-88) + ret +#else + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + pushq %rbx + .cfi_def_cfa_offset 24 + .cfi_offset 3, -24 + subl $88, %esp + .cfi_def_cfa_offset 112 + leal 64(%rsp), %esi + movaps %xmm1, 32(%esp) + leal 48(%rsp), %edi + movaps %xmm2, 16(%esp) + movq %rsi, %rbp + movq %rdi, %rbx + movaps %xmm0, (%esp) + call JUMPTARGET(\callee) + movupd 8(%esp), %xmm0 + leal 8(%rbp), %esi + leal 8(%rbx), %edi + call JUMPTARGET(\callee) + movdqa 32(%esp), %xmm1 + movsd 48(%esp), %xmm0 + movq %xmm1, %rax + movdqa 16(%esp), %xmm2 + movsd %xmm0, (%eax) + movsd 56(%esp), %xmm0 + pextrd $1, %xmm1, %eax + movsd %xmm0, (%eax) + movsd 64(%esp), %xmm0 + movq %xmm2, %rax + movsd %xmm0, (%eax) + movsd 72(%esp), %xmm0 + pextrd $1, %xmm2, %eax + movsd %xmm0, (%eax) + addl $88, %esp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + ret +#endif +.endm + +ENTRY (_ZGVbN2vvv_sincos) +WRAPPER_IMPL_SSE2_fFF_vvv sincos +END (_ZGVbN2vvv_sincos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2vvv_sincos) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core.S new file mode 100644 index 0000000000..3bcd09b62d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core.S @@ -0,0 +1,152 @@ +/* Function sincos vectorized with AVX2, wrapper 
version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4vl8l8_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos +END (_ZGVdN4vl8l8_sincos) +libmvec_hidden_def (_ZGVdN4vl8l8_sincos) + +/* AVX2 ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX2_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $160, %rsp + vmovupd %ymm0, 128(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovupd 144(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 64(%rsp), %rdx + movq 96(%rsp), %rsi + movq 72(%rsp), %r8 + movq 104(%rsp), %r10 + movq (%rsp), %rax + movq 32(%rsp), %rcx + movq 8(%rsp), %rdi + movq 40(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 80(%rsp), %rax + movq 112(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 88(%rsp), %rdi + movq 120(%rsp), %r9 + movq 16(%rsp), %r11 + movq 48(%rsp), %rdx + movq 24(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $152, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovapd %ymm0, -176(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovapd -160(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovsd -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm5 + vmovdqa -144(%ebp), %xmm1 + vmovsd %xmm0, (%eax) + vmovsd -104(%ebp), %xmm0 + vpextrd $1, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -120(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -88(%ebp), %xmm0 + vpextrd $3, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -144(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -72(%ebp), %xmm0 + vpextrd $1, %xmm1, %eax + vmovsd %xmm0, (%eax) + movq -136(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -56(%ebp), %xmm0 + vpextrd $3, %xmm1, %eax + vmovsd %xmm0, (%eax) + addl $152, %esp + popq %rbx + popq %r10 
+ .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVdN4vvv_sincos) +WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN2vl8l8_sincos +END (_ZGVdN4vvv_sincos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4vvv_sincos) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S new file mode 100644 index 0000000000..1164ae7a74 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S @@ -0,0 +1,143 @@ +/* Function sincos vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4vl8l8_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos +END (_ZGVcN4vl8l8_sincos) + +/* AVX ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + movq %rsp, %rbp + andq $-32, %rsp + subq $160, %rsp + vmovupd %ymm0, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %xmm1, 96(%rdi) + vmovdqu %xmm2, 112(%rdi) + vmovdqu %xmm3, 128(%rdi) + vmovdqu %xmm4, 144(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 80(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 96(%rsp), %rdx + movq 104(%rsp), %rsi + movq 112(%rsp), %r8 + movq 120(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 128(%rsp), %rax + movq 136(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 144(%rsp), %rdi + movq 152(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + popq %rbp + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $152, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovapd %ymm0, -176(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovupd -160(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovsd -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm5 + vmovdqa -144(%ebp), %xmm1 + vmovsd %xmm0, (%eax) + vmovsd -104(%ebp), %xmm0 + vpextrd $1, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -120(%ebp), 
%rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -88(%ebp), %xmm0 + vpextrd $3, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -144(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -72(%ebp), %xmm0 + vpextrd $1, %xmm1, %eax + vmovsd %xmm0, (%eax) + movq -136(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -56(%ebp), %xmm0 + vpextrd $3, %xmm1, %eax + vmovsd %xmm0, (%eax) + addl $152, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVcN4vvv_sincos) +WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN2vl8l8_sincos +END (_ZGVcN4vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos8_core.S new file mode 100644 index 0000000000..c104539821 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_sincos8_core.S @@ -0,0 +1,224 @@ +/* Function sincos vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8vl8l8_sincos) +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +END (_ZGVeN8vl8l8_sincos) + +/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX512_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + /* Encoding for vmovups %zmm0, 256(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x44 + .byte 0x24 + .byte 0x04 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm1, 128(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4f + .byte 0x02 + /* Encoding for vmovups %zmm2, 192(%rdi). 
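+      These stores are emitted as raw .byte sequences, presumably so
+      that the file still assembles with binutils versions that lack
+      AVX-512 support: 0x62 is the EVEX prefix, 0x11 the vmovups
+      store opcode, and the final byte the displacement compressed
+      in units of the 64-byte vector length (0x03 * 64 = 192 here).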
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 288(%rsp), %ymm0 + lea 32(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 192(%rsp), %rsi + movq 136(%rsp), %r8 + movq 200(%rsp), %r10 + movq (%rsp), %rax + movq 64(%rsp), %rcx + movq 8(%rsp), %rdi + movq 72(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 144(%rsp), %rax + movq 208(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 152(%rsp), %rdi + movq 216(%rsp), %r9 + movq 16(%rsp), %r11 + movq 80(%rsp), %rdx + movq 24(%rsp), %rsi + movq 88(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 160(%rsp), %r11 + movq 224(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 168(%rsp), %rsi + movq 232(%rsp), %r8 + movq 32(%rsp), %r10 + movq 96(%rsp), %rax + movq 40(%rsp), %rcx + movq 104(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 176(%rsp), %r10 + movq 240(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 184(%rsp), %rcx + movq 248(%rsp), %rdi + movq 48(%rsp), %r9 + movq 112(%rsp), %r11 + movq 56(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -112(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -176(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $280, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + /* Encoding for vmovapd %zmm0, -304(%ebp). 
*/ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x29 + .byte 0x85 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + leal 32(%r12), %esi + vmovupd -272(%ebp), %ymm0 + leal 32(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -208(%ebp), %eax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -204(%ebp), %eax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -200(%ebp), %eax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -196(%ebp), %eax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -192(%ebp), %eax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -188(%ebp), %eax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -184(%ebp), %eax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -180(%ebp), %eax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -240(%ebp), %eax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -236(%ebp), %eax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -232(%ebp), %eax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -228(%ebp), %eax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -224(%ebp), %eax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -220(%ebp), %eax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -216(%ebp), %eax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -212(%ebp), %eax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $280, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN8vvv_sincos) +WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN4vl8l8_sincos +END (_ZGVeN8vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.S new file mode 100644 index 0000000000..f7cf6c0a08 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.S @@ -0,0 +1,130 @@ +/* Data for vectorized sin, cos, sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_trig_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations. + The table may contain polynomial, reduction, lookup + coefficients and other constants obtained through different + methods of research and experimental work. 
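+
+   Every entry below is one scalar constant replicated eight times by
+   the double_vector macro, so each field is a 64-byte block that a
+   kernel can load whole, at any vector width, from its fixed offset.
+   The values are raw IEEE-754 bit patterns; a small, self-contained
+   C program (illustrative only, not part of the table) shows how to
+   read one back as a double:
+
+     #include <stdio.h>
+     #include <string.h>
+
+     int main (void)
+     {
+       unsigned long long bits = 0x3ff921fb54442d18ULL; /* __dHalfPI */
+       double d;
+       memcpy (&d, &bits, sizeof d);      /* bit pattern -> double */
+       printf ("%.17g\n", d);             /* 1.5707963267948966, pi/2 */
+       return 0;
+     }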
+ */ + .globl __svml_d_trig_data +__svml_d_trig_data: + +/* General purpose constants: + absolute value mask + */ +double_vector __dAbsMask 0x7fffffffffffffff + +/* working range threshold */ +double_vector __dRangeVal 0x4160000000000000 + +/* working range threshold */ +double_vector __dRangeVal_sin 0x4170000000000000 + +/* PI/2 */ +double_vector __dHalfPI 0x3ff921fb54442d18 + +/* 1/PI */ +double_vector __dInvPI 0x3fd45f306dc9c883 + +/* right-shifter constant */ +double_vector __dRShifter 0x4338000000000000 + +/* 0.0 */ +double_vector __dZero 0x0000000000000000 + +/* -0.0 */ +double_vector __lNZero 0x8000000000000000 + +/* 0.5 */ +double_vector __dOneHalf 0x3fe0000000000000 + +/* Range reduction PI-based constants: + PI high part + */ +double_vector __dPI1 0x400921fb40000000 + +/* PI mid part 1 */ +double_vector __dPI2 0x3e84442d00000000 + +/* PI mid part 2 */ +double_vector __dPI3 0x3d08469880000000 + +/* PI low part */ +double_vector __dPI4 0x3b88cc51701b839a + +/* Range reduction PI-based constants if FMA available: + PI high part (FMA available) + */ +double_vector __dPI1_FMA 0x400921fb54442d18 + +/* PI mid part (FMA available) */ +double_vector __dPI2_FMA 0x3ca1a62633145c06 + +/* PI low part (FMA available) */ +double_vector __dPI3_FMA 0x395c1cd129024e09 + +/* HalfPI1 */ +double_vector __dHalfPI1 0x3ff921fc00000000 + +/* HalfPI2 */ +double_vector __dHalfPI2 0xbea5777a00000000 + +/* HalfPI3 */ +double_vector __dHalfPI3 0xbd473dcc00000000 + +/* HalfPI4 */ +double_vector __dHalfPI4 0x3bf898cc51701b84 + +/* Polynomial coefficients (relative error 2^(-52.115)): */ +double_vector __dC1 0xbfc55555555554a7 +double_vector __dC2 0x3f8111111110a4a8 +double_vector __dC3 0xbf2a01a019a5b86d +double_vector __dC4 0x3ec71de38030fea0 +double_vector __dC5 0xbe5ae63546002231 +double_vector __dC6 0x3de60e6857a2f220 +double_vector __dC7 0xbd69f0d60811aac8 + +/* Polynomial coefficients (relative error 2^(-52.115)): */ +double_vector __dC1_sin 0xbfc55555555554a8 +double_vector __dC2_sin 0x3f8111111110a573 +double_vector __dC3_sin 0xbf2a01a019a659dd +double_vector __dC4_sin 0x3ec71de3806add1a +double_vector __dC5_sin 0xbe5ae6355aaa4a53 +double_vector __dC6_sin 0x3de60e6bee01d83e +double_vector __dC7_sin 0xbd69f1517e9f65f0 + +/* + Additional constants for the low accuracy version: + */ +/* right-shifter for low accuracy version */ +double_vector __dRShifter_la 0x4330000000000000 + +/* right-shifter-1.0 for low accuracy version */ +double_vector __dRShifterm5_la 0x432fffffffffffff + +/* right-shifter with low mask for low accuracy version */ +double_vector __dRXmax_la 0x43300000007ffffe + + .type __svml_d_trig_data,@object + .size __svml_d_trig_data,.-__svml_d_trig_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.h new file mode 100644 index 0000000000..ccdff7edb8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_trig_data.h @@ -0,0 +1,72 @@ +/* Offsets for data table for vectorized sin, cos, sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef D_TRIG_DATA_H +#define D_TRIG_DATA_H + +#define __dAbsMask 0 +#define __dRangeVal 64 +#define __dRangeVal_sin 64*2 +#define __dHalfPI 64*3 +#define __dInvPI 64*4 +#define __dRShifter 64*5 +#define __dZero 64*6 +#define __lNZero 64*7 +#define __dOneHalf 64*8 +#define __dPI1 64*9 +#define __dPI2 64*10 +#define __dPI3 64*11 +#define __dPI4 64*12 +#define __dPI1_FMA 64*13 +#define __dPI2_FMA 64*14 +#define __dPI3_FMA 64*15 +#define __dHalfPI1 64*16 +#define __dHalfPI2 64*17 +#define __dHalfPI3 64*18 +#define __dHalfPI4 64*19 +#define __dC1 64*20 +#define __dC2 64*21 +#define __dC3 64*22 +#define __dC4 64*23 +#define __dC5 64*24 +#define __dC6 64*25 +#define __dC7 64*26 +#define __dC1_sin 64*27 +#define __dC2_sin 64*28 +#define __dC3_sin 64*29 +#define __dC4_sin 64*30 +#define __dC5_sin 64*31 +#define __dC6_sin 64*32 +#define __dC7_sin 64*33 +#define __dRShifter_la 64*34 +#define __dRShifterm5_la 64*35 +#define __dRXmax_la 64*36 +#define __dAbsMask_la __dAbsMask +#define __dInvPI_la __dInvPI +#define __dSignMask __lNZero + +.macro double_vector offset value +.if .-__svml_d_trig_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h new file mode 100644 index 0000000000..625eb6642b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h @@ -0,0 +1,335 @@ +/* Wrapper implementations of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2 callee + subq $40, %rsp + cfi_adjust_cfa_offset(40) + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + movsd %xmm0, 16(%rsp) + movsd 8(%rsp), %xmm0 + call JUMPTARGET(\callee) + movsd 16(%rsp), %xmm1 + movsd %xmm0, 24(%rsp) + unpcklpd %xmm0, %xmm1 + movaps %xmm1, %xmm0 + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + ret +.endm + +/* 2 argument SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2_ff callee + subq $56, %rsp + cfi_adjust_cfa_offset(56) + movaps %xmm0, (%rsp) + movaps %xmm1, 16(%rsp) + call JUMPTARGET(\callee) + movsd %xmm0, 32(%rsp) + movsd 8(%rsp), %xmm0 + movsd 24(%rsp), %xmm1 + call JUMPTARGET(\callee) + movsd 32(%rsp), %xmm1 + movsd %xmm0, 40(%rsp) + unpcklpd %xmm0, %xmm1 + movaps %xmm1, %xmm0 + addq $56, %rsp + cfi_adjust_cfa_offset(-56) + ret +.endm + +/* 3 argument SSE2 ISA version as wrapper to scalar. 
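+   The "fFF" wrappers implement the sincos-style contract: one input
+   vector plus two pointers through which the sine and cosine result
+   vectors are stored.  At the C level the SSE2 variant corresponds
+   roughly to the prototype below (a sketch only; the real binding
+   comes from the SIMD declarations in math.h, and the "l8l8" part of
+   the mangled name encodes the two linear, stride-8 pointer
+   parameters per the x86 vector ABI):
+
+     #include <emmintrin.h>
+     void _ZGVbN2vl8l8_sincos (__m128d x, double *sinp, double *cosp);
+
+   The macro saves the two pointer arguments, calls the scalar callee
+   once per lane with stack temporaries, then copies each lane's pair
+   of results out through the saved pointers.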
*/ +.macro WRAPPER_IMPL_SSE2_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + pushq %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + movq %rdi, %rbp + movq %rsi, %rbx + subq $40, %rsp + cfi_adjust_cfa_offset(40) + leaq 16(%rsp), %rsi + leaq 24(%rsp), %rdi + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + leaq 16(%rsp), %rsi + leaq 24(%rsp), %rdi + movsd 24(%rsp), %xmm0 + movapd (%rsp), %xmm1 + movsd %xmm0, 0(%rbp) + unpckhpd %xmm1, %xmm1 + movsd 16(%rsp), %xmm0 + movsd %xmm0, (%rbx) + movapd %xmm1, %xmm0 + call JUMPTARGET(\callee) + movsd 24(%rsp), %xmm0 + movsd %xmm0, 8(%rbp) + movsd 16(%rsp), %xmm0 + movsd %xmm0, 8(%rbx) + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + popq %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $32, %rsp + vextractf128 $1, %ymm0, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovapd %xmm0, 16(%rsp) + vmovaps (%rsp), %xmm0 + call HIDDEN_JUMPTARGET(\callee) + vmovapd %xmm0, %xmm1 + vmovapd 16(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $64, %rsp + vextractf128 $1, %ymm0, 16(%rsp) + vextractf128 $1, %ymm1, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, 32(%rsp) + vmovaps 16(%rsp), %xmm0 + vmovaps (%rsp), %xmm1 + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, %xmm1 + vmovaps 32(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) + pushq %r14 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r14, 0) + subq $48, %rsp + movq %rsi, %r14 + movq %rdi, %r13 + vextractf128 $1, %ymm0, 32(%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps 32(%rsp), %xmm0 + lea (%rsp), %rdi + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovapd (%rsp), %xmm0 + vmovapd 16(%rsp), %xmm1 + vmovapd %xmm0, 16(%r13) + vmovapd %xmm1, 16(%r14) + addq $48, %rsp + popq %r14 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r14) + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512 callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $128, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). 
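+   The 512-bit argument is spilled to the aligned stack slot so that
+   the AVX2 callee can be run once on each 256-bit half: the halves
+   are reloaded with vmovupd from (%rsp) and 32(%rsp), the two ymm
+   results are parked at 64(%rsp) and 96(%rsp), and the byte-encoded
+   vmovups at the end reassembles them into zmm0.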
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 64(%rsp) + vmovupd 32(%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 96(%rsp) +/* Below is encoding for vmovups 64(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x01 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $192, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovups %zmm1, 64(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x01 + vmovupd (%rsp), %ymm0 + vmovupd 64(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 128(%rsp) + vmovupd 32(%rsp), %ymm0 + vmovupd 96(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 160(%rsp) +/* Below is encoding for vmovups 128(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x02 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + pushq %r12 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r12, 0) + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) + subq $176, %rsp + movq %rsi, %r13 +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + movq %rdi, %r12 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd 32(%rsp), %ymm0 + lea 64(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovupd 64(%rsp), %ymm0 + vmovupd 96(%rsp), %ymm1 + vmovupd %ymm0, 32(%r12) + vmovupd %ymm1, 32(%r13) + vzeroupper + addq $176, %rsp + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + popq %r12 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r12) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_finite_alias.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_finite_alias.S new file mode 100644 index 0000000000..7e39e7801d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_finite_alias.S @@ -0,0 +1,58 @@ +/* These aliases added as workaround to exclude unnecessary symbol + aliases in libmvec.so while compiler creates the vector names + based on scalar asm name. Corresponding discussion is at + <https://gcc.gnu.org/ml/gcc/2015-06/msg00173.html>. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define ALIAS_IMPL(alias, target) \ +ENTRY (alias); \ + jmp *target@GOTPCREL(%rip); \ +END (alias) + + .text +ALIAS_IMPL (_ZGVbN2v___log_finite, _ZGVbN2v_log) +ALIAS_IMPL (_ZGVcN4v___log_finite, _ZGVcN4v_log) +ALIAS_IMPL (_ZGVdN4v___log_finite, _ZGVdN4v_log) +ALIAS_IMPL (_ZGVeN8v___log_finite, _ZGVeN8v_log) + +ALIAS_IMPL (_ZGVbN4v___logf_finite, _ZGVbN4v_logf) +ALIAS_IMPL (_ZGVcN8v___logf_finite, _ZGVcN8v_logf) +ALIAS_IMPL (_ZGVdN8v___logf_finite, _ZGVdN8v_logf) +ALIAS_IMPL (_ZGVeN16v___logf_finite, _ZGVeN16v_logf) + +ALIAS_IMPL (_ZGVbN2v___exp_finite, _ZGVbN2v_exp) +ALIAS_IMPL (_ZGVcN4v___exp_finite, _ZGVcN4v_exp) +ALIAS_IMPL (_ZGVdN4v___exp_finite, _ZGVdN4v_exp) +ALIAS_IMPL (_ZGVeN8v___exp_finite, _ZGVeN8v_exp) + +ALIAS_IMPL (_ZGVbN4v___expf_finite, _ZGVbN4v_expf) +ALIAS_IMPL (_ZGVcN8v___expf_finite, _ZGVcN8v_expf) +ALIAS_IMPL (_ZGVdN8v___expf_finite, _ZGVdN8v_expf) +ALIAS_IMPL (_ZGVeN16v___expf_finite, _ZGVeN16v_expf) + +ALIAS_IMPL (_ZGVbN2vv___pow_finite, _ZGVbN2vv_pow) +ALIAS_IMPL (_ZGVcN4vv___pow_finite, _ZGVcN4vv_pow) +ALIAS_IMPL (_ZGVdN4vv___pow_finite, _ZGVdN4vv_pow) +ALIAS_IMPL (_ZGVeN8vv___pow_finite, _ZGVeN8vv_pow) + +ALIAS_IMPL (_ZGVbN4vv___powf_finite, _ZGVbN4vv_powf) +ALIAS_IMPL (_ZGVcN8vv___powf_finite, _ZGVcN8vv_powf) +ALIAS_IMPL (_ZGVdN8vv___powf_finite, _ZGVdN8vv_powf) +ALIAS_IMPL (_ZGVeN16vv___powf_finite, _ZGVeN16vv_powf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf16_core.S new file mode 100644 index 0000000000..127eb82ae0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf16_core.S @@ -0,0 +1,25 @@ +/* Function cosf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_cosf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +END (_ZGVeN16v_cosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf4_core.S new file mode 100644 index 0000000000..800766cc4e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf4_core.S @@ -0,0 +1,29 @@ +/* Function cosf vectorized with SSE2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_cosf) +WRAPPER_IMPL_SSE2 cosf +END (_ZGVbN4v_cosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_cosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core.S new file mode 100644 index 0000000000..46c588074c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core.S @@ -0,0 +1,29 @@ +/* Function cosf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_cosf) +WRAPPER_IMPL_AVX _ZGVbN4v_cosf +END (_ZGVdN8v_cosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_cosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S new file mode 100644 index 0000000000..459685ee6a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function cosf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVcN8v_cosf) +WRAPPER_IMPL_AVX _ZGVbN4v_cosf +END (_ZGVcN8v_cosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf16_core.S new file mode 100644 index 0000000000..a32f03e1a7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf16_core.S @@ -0,0 +1,25 @@ +/* Function expf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_expf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +END (_ZGVeN16v_expf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf4_core.S new file mode 100644 index 0000000000..c8ec8f97b7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf4_core.S @@ -0,0 +1,30 @@ +/* Function expf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_expf) +WRAPPER_IMPL_SSE2 __expf_finite +END (_ZGVbN4v_expf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_expf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core.S new file mode 100644 index 0000000000..f5e1be62eb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core.S @@ -0,0 +1,29 @@ +/* Function expf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_expf) +WRAPPER_IMPL_AVX _ZGVbN4v_expf +END (_ZGVdN8v_expf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_expf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S new file mode 100644 index 0000000000..f3557f8c19 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function expf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8v_expf) +WRAPPER_IMPL_AVX _ZGVbN4v_expf +END(_ZGVcN8v_expf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.S new file mode 100644 index 0000000000..226104f5f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.S @@ -0,0 +1,63 @@ +/* Data for function expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_expf_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function expf. + The table may contain polynomial, reduction, lookup coefficients and + other coefficients obtained through different methods of research and + experimental work. 
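+
+   The constants that follow implement the usual reduction
+   x = n * ln(2) + r: n is obtained with the right-shifter trick
+   (adding __sShifter, 2^23 + 2^22, forces the rounded quotient into
+   the low mantissa bits), r is computed with a two-part ln(2) to
+   limit rounding error, the polynomial __sPC0..__sPC5 approximates
+   e^r on roughly [-ln(2)/2, ln(2)/2], and the result is rescaled by
+   2^n through the exponent field (__iBias).  A minimal scalar sketch
+   of the same scheme in C, using the bit patterns below as hex-float
+   literals (an illustration, not the vector code, which also handles
+   the __iDomainRange special cases):
+
+     #include <math.h>
+
+     static float
+     expf_sketch (float x)
+     {
+       float n = nearbyintf (x * 0x1.715476p+0f);   /* __sInvLn2 */
+       float r = (x - n * 0x1.62e4p-1f)             /* __sLn2hi  */
+                 - n * 0x1.7f7d1cp-20f;             /* __sLn2lo  */
+       float p = 1.0f + r * (0x1.fffffcp-1f         /* __sPC0, 1 */
+                 + r * (0x1.fffe68p-2f              /* __sPC2    */
+                 + r * (0x1.555958p-3f              /* __sPC3    */
+                 + r * (0x1.570724p-5f              /* __sPC4    */
+                 + r * 0x1.0fb3fcp-7f))));          /* __sPC5    */
+       return ldexpf (p, (int) n);                  /* scale 2^n */
+     }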
*/ + + .globl __svml_sexp_data +__svml_sexp_data: + +/* Range reduction coefficients: + * log(2) inverted */ +float_vector __sInvLn2 0x3fb8aa3b + +/* right shifter constant */ +float_vector __sShifter 0x4b400000 + +/* log(2) high part */ +float_vector __sLn2hi 0x3f317200 + +/* log(2) low part */ +float_vector __sLn2lo 0x35bfbe8e + +/* bias */ +float_vector __iBias 0x0000007f + +/* Polynomial coefficients: + * Here we approximate 2^x on [-0.5, 0.5] */ +float_vector __sPC0 0x3f800000 +float_vector __sPC1 0x3f7ffffe +float_vector __sPC2 0x3effff34 +float_vector __sPC3 0x3e2aacac +float_vector __sPC4 0x3d2b8392 +float_vector __sPC5 0x3c07d9fe + +/* absolute value mask */ +float_vector __iAbsMask 0x7fffffff + +/* working domain range */ +float_vector __iDomainRange 0x42aeac4f + .type __svml_sexp_data,@object + .size __svml_sexp_data,.-__svml_sexp_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.h new file mode 100644 index 0000000000..5badb84b14 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_expf_data.h @@ -0,0 +1,45 @@ +/* Offsets for data table for vector function expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef S_EXPF_DATA_H +#define S_EXPF_DATA_H + +#define __sInvLn2 0 +#define __sShifter 64 +#define __sLn2hi 128 +#define __sLn2lo 192 +#define __iBias 256 +#define __sPC0 320 +#define __sPC1 384 +#define __sPC2 448 +#define __sPC3 512 +#define __sPC4 576 +#define __sPC5 640 +#define __iAbsMask 704 +#define __iDomainRange 768 + +.macro float_vector offset value +.if .-__svml_sexp_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf16_core.S new file mode 100644 index 0000000000..081c449f42 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf16_core.S @@ -0,0 +1,25 @@ +/* Function logf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_logf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +END (_ZGVeN16v_logf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf4_core.S new file mode 100644 index 0000000000..fab301db1e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf4_core.S @@ -0,0 +1,30 @@ +/* Function logf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_logf) +WRAPPER_IMPL_SSE2 __logf_finite +END (_ZGVbN4v_logf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_logf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core.S new file mode 100644 index 0000000000..e1aa2f363c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core.S @@ -0,0 +1,29 @@ +/* Function logf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_logf) +WRAPPER_IMPL_AVX _ZGVbN4v_logf +END (_ZGVdN8v_logf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_logf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S new file mode 100644 index 0000000000..e74e47c152 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function logf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
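These wrapper stanzas all follow one pattern, and the symbol names encode it per the x86_64 vector-function ABI: _ZGV, an ISA letter (b = SSE, c = AVX, d = AVX2, e = AVX-512), N for the unmasked variant, the lane count, then one v per vector argument. User code never spells these names out; the compiler emits them when it vectorizes a loop over the scalar function. A minimal usage sketch, with the flags and the exact symbol chosen being illustrative rather than guaranteed:

#include <math.h>

/* Built with something like
     gcc -O2 -ffast-math -fopenmp-simd -mavx2 -lmvec
   this loop may be compiled into calls to _ZGVdN8v_logf; with
   -mavx512f the 16-lane _ZGVeN16v_logf wrapper above can be
   selected instead.  */
void
vlogf (float *restrict out, const float *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = logf (in[i]);
}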
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8v_logf) +WRAPPER_IMPL_AVX _ZGVbN4v_logf +END(_ZGVcN8v_logf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.S new file mode 100644 index 0000000000..487c439120 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.S @@ -0,0 +1,102 @@ +/* Data for vector function logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_logf_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function logf. + The table may contain polynomial, reduction, lookup coefficients and + other coefficients obtained through different methods of research and + experimental work. 
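The fields that follow (_iBrkValue, _iOffExpoMask, _sOne, _sLn2, _sPoly_1 .. _sPoly_7) support the standard logf reduction: renormalize x = 2^k * m with the mantissa break point at 2/3 so that m lands in [2/3, 4/3) and r = m - 1 stays small, then log(x) = k*ln2 + r + r^2*P(r), where P's leading coefficients (-1/2, ~1/3, ~-1/4, ...) are visibly the log(1+r) series. A hedged scalar reconstruction from those named fields, assuming an arithmetic right shift and ignoring the special cases the _iHiDelta/_iLoRange check filters out:

#include <stdint.h>
#include <string.h>

static float
logf_sketch (float x)
{
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint32_t i = ix - 0x3f2aaaabu;          /* _iBrkValue: ~2/3 */
  uint32_t iexp = i & ~0x007fffffu;       /* complement of _iOffExpoMask */
  int k = (int32_t) iexp >> 23;           /* unbiased exponent */
  uint32_t im = ix - iexp;                /* mantissa rescaled into [2/3, 4/3) */
  float m;
  memcpy (&m, &im, sizeof m);
  float r = m - 1.0f;                     /* _sOne */
  float p = -0.15177205f;                 /* _sPoly_7 */
  p = p * r + 0.16964881f;                /* _sPoly_6 */
  p = p * r - 0.16462457f;                /* _sPoly_5 */
  p = p * r + 0.19822504f;                /* _sPoly_4 */
  p = p * r - 0.25004664f;                /* _sPoly_3 */
  p = p * r + 0.33336565f;                /* _sPoly_2 */
  p = p * r - 0.5f;                       /* _sPoly_1 */
  return (r + (r * r) * p) + (float) k * 0.69314718f;  /* _sLn2 */
}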
*/ + + .globl __svml_slog_data +__svml_slog_data: + +/* Polynomial sPoly[] coefficients: + * -5.0000000000000000000000000e-01 */ +float_vector _sPoly_1 0xbf000000 + +/* 3.3336564898490905761718750e-01 */ +float_vector _sPoly_2 0x3eaaaee7 + +/* -2.5004664063453674316406250e-01 */ +float_vector _sPoly_3 0xbe80061d + +/* 1.9822503626346588134765625e-01 */ +float_vector _sPoly_4 0x3e4afb81 + +/* -1.6462457180023193359375000e-01 */ +float_vector _sPoly_5 0xbe289358 + +/* 1.6964881122112274169921875e-01 */ +float_vector _sPoly_6 0x3e2db86b + +/* -1.5177205204963684082031250e-01 */ +float_vector _sPoly_7 0xbe1b6a22 + +/* Constant for work range check: Delta 80000000-7f800000 */ +float_vector _iHiDelta 0x00800000 + +/* Constant for work range check: 00800000 + Delta */ +float_vector _iLoRange 0x01000000 + +/* Mantissa break point SP 2/3 */ +float_vector _iBrkValue 0x3f2aaaab + +/* SP significand mask */ +float_vector _iOffExpoMask 0x007fffff + +/* 1.0f */ +float_vector _sOne 0x3f800000 + +/* SP log(2) */ +float_vector _sLn2 0x3f317218 + +/* SP infinity, +/- */ +.if .-__svml_slog_data != _sInfs +.err +.endif + .long 0x7f800000 + .long 0xff800000 + .rept 56 + .byte 0 + .endr + +/* SP one, +/- */ +.if .-__svml_slog_data != _sOnes +.err +.endif + .long 0x3f800000 + .long 0xbf800000 + .rept 56 + .byte 0 + .endr + +/* SP zero +/- */ +.if .-__svml_slog_data != _sZeros +.err +.endif + .long 0x00000000 + .long 0x80000000 + .rept 56 + .byte 0 + .endr + .type __svml_slog_data,@object + .size __svml_slog_data,.-__svml_slog_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.h new file mode 100644 index 0000000000..52612e3ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_logf_data.h @@ -0,0 +1,48 @@ +/* Offsets for data table for vectorized function logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef S_LOGF_DATA_H +#define S_LOGF_DATA_H + +#define _sPoly_1 0 +#define _sPoly_2 64 +#define _sPoly_3 128 +#define _sPoly_4 192 +#define _sPoly_5 256 +#define _sPoly_6 320 +#define _sPoly_7 384 +#define _iHiDelta 448 +#define _iLoRange 512 +#define _iBrkValue 576 +#define _iOffExpoMask 640 +#define _sOne 704 +#define _sLn2 768 +#define _sInfs 832 +#define _sOnes 896 +#define _sZeros 960 + +.macro float_vector offset value +.if .-__svml_slog_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf16_core.S new file mode 100644 index 0000000000..ac041df507 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf16_core.S @@ -0,0 +1,25 @@ +/* Function powf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16vv_powf) +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +END (_ZGVeN16vv_powf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf4_core.S new file mode 100644 index 0000000000..61d336e160 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf4_core.S @@ -0,0 +1,29 @@ +/* Function powf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4vv_powf) +WRAPPER_IMPL_SSE2_ff __powf_finite +END (_ZGVbN4vv_powf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4vv_powf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core.S new file mode 100644 index 0000000000..2ae28051c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core.S @@ -0,0 +1,29 @@ +/* Function powf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
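The doubled v in _ZGVbN4vv_powf records the two vector arguments, and the _ff flavors of the wrapper macros exist for exactly that shape. What WRAPPER_IMPL_SSE2_ff amounts to, sketched in C rather than the literal register-level macro expansion: feed each (x, y) lane pair to the scalar finite-only kernel and repack the results.

#include <xmmintrin.h>

extern float __powf_finite (float, float);  /* the scalar kernel named above */

__m128
powf4_sketch (__m128 x, __m128 y)
{
  float xs[4], ys[4], rs[4];
  _mm_storeu_ps (xs, x);
  _mm_storeu_ps (ys, y);
  for (int i = 0; i < 4; i++)               /* one scalar call per lane */
    rs[i] = __powf_finite (xs[i], ys[i]);
  return _mm_loadu_ps (rs);
}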
*/ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8vv_powf) +WRAPPER_IMPL_AVX_ff _ZGVbN4vv_powf +END (_ZGVdN8vv_powf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8vv_powf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S new file mode 100644 index 0000000000..0522865ef1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function powf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8vv_powf) +WRAPPER_IMPL_AVX_ff _ZGVbN4vv_powf +END(_ZGVcN8vv_powf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.S new file mode 100644 index 0000000000..630baa62a8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.S @@ -0,0 +1,3759 @@ +/* Data for function powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_powf_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function powf. + The table may contain polynomial, reduction, lookup coefficients and + other coefficients obtained through different methods of research + and experimental work. 
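A useful reading of the table that follows: powf goes through the logarithm, essentially 2^(y * log2(x)), and any error in the logarithm is multiplied by y before the final exponentiation, which is why the logarithmic part is tabulated as High+Low pairs (_Log_HA_table) carrying extra precision. A scalar sketch of that overall shape, with double-precision library calls standing in for the table lookups and polynomials (illustrative only, not the kernel's actual arithmetic):

#include <math.h>

static float
powf_sketch (float x, float y)
{
  /* The H+L tables deliver log2(x) to well beyond single precision;
     log2() is only a stand-in for that machinery.  */
  double l = log2 ((double) x);
  /* An absolute error e in l becomes a relative error of roughly
     ln(2) * |y| * e in the result, hence the extra table precision.  */
  return (float) exp2 ((double) y * l);
}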
*/ + + .globl __svml_spow_data +__svml_spow_data: + +/* General purpose constants for H+L multiplication: + * NMINNORM */ +float_vector _NMINNORM 0x80800000 + +/* NMAXVAL */ +float_vector _NMAXVAL 0xfeffffff + +/* INF */ +float_vector _INF 0x7f800000 + +/* ABSMASK */ +float_vector _ABSMASK 0x7fffffff + +/* DOMAINRANGE */ +float_vector _DOMAINRANGE 0x42ae9a00 + +/* Log(2) lookup High+Low table for logarithmic part */ +.if .-__svml_spow_data != _Log_HA_table +.err +.endif + .quad 0xc086232bdd7a8300 + .quad 0xbe1ce91eef3fb100 + .quad 0xc086232fdc7ad828 + .quad 0xbe1cefcffda73b6a + .quad 0xc0862333d97d2ba0 + .quad 0xbe1cef406748f1ff + .quad 0xc0862337d48378e0 + .quad 0xbe1cef2a9429925a + .quad 0xc086233bcd8fb878 + .quad 0xbe1cf138d17ebecb + .quad 0xc086233fc4a3e018 + .quad 0xbe1ceff2dbbbb29e + .quad 0xc0862343b9c1e270 + .quad 0xbe1cf1a42aae437b + .quad 0xc0862347acebaf68 + .quad 0xbe1cef3b152048af + .quad 0xc086234b9e2333f0 + .quad 0xbe1cef20e127805e + .quad 0xc086234f8d6a5a30 + .quad 0xbe1cf00ad6052cf4 + .quad 0xc08623537ac30980 + .quad 0xbe1cefc4642ee597 + .quad 0xc0862357662f2660 + .quad 0xbe1cf1f277d36e16 + .quad 0xc086235b4fb092a0 + .quad 0xbe1ceed009e8d8e6 + .quad 0xc086235f37492d28 + .quad 0xbe1cf1e4038cb362 + .quad 0xc08623631cfad250 + .quad 0xbe1cf0b0873b8557 + .quad 0xc086236700c75b98 + .quad 0xbe1cf15bb3227c0b + .quad 0xc086236ae2b09fe0 + .quad 0xbe1cf151ef8ca9ed + .quad 0xc086236ec2b87358 + .quad 0xbe1cefe1dc2cd2ed + .quad 0xc0862372a0e0a780 + .quad 0xbe1cf0d1eec5454f + .quad 0xc08623767d2b0b48 + .quad 0xbe1ceeefd570bbce + .quad 0xc086237a57996af0 + .quad 0xbe1cee99ae91b3a7 + .quad 0xc086237e302d9028 + .quad 0xbe1cf0412830fbd1 + .quad 0xc086238206e94218 + .quad 0xbe1ceee898588610 + .quad 0xc0862385dbce4548 + .quad 0xbe1cee9a1fbcaaea + .quad 0xc0862389aede5bc0 + .quad 0xbe1ceed8e7cc1ad6 + .quad 0xc086238d801b4500 + .quad 0xbe1cf10c8d059da6 + .quad 0xc08623914f86be18 + .quad 0xbe1ceee6c63a8165 + .quad 0xc08623951d228180 + .quad 0xbe1cf0c3592d2ff1 + .quad 0xc0862398e8f04758 + .quad 0xbe1cf0026cc4cb1b + .quad 0xc086239cb2f1c538 + .quad 0xbe1cf15d48d8e670 + .quad 0xc08623a07b28ae60 + .quad 0xbe1cef359363787c + .quad 0xc08623a44196b390 + .quad 0xbe1cefdf1ab2e82c + .quad 0xc08623a8063d8338 + .quad 0xbe1cefe43c02aa84 + .quad 0xc08623abc91ec960 + .quad 0xbe1cf044f5ae35b7 + .quad 0xc08623af8a3c2fb8 + .quad 0xbe1cf0b0b4001e1b + .quad 0xc08623b349975d98 + .quad 0xbe1cf1bae76dfbcf + .quad 0xc08623b70731f810 + .quad 0xbe1cef0a72e13a62 + .quad 0xc08623bac30da1c8 + .quad 0xbe1cf184007d2b6b + .quad 0xc08623be7d2bfb40 + .quad 0xbe1cf16f4b239e98 + .quad 0xc08623c2358ea2a0 + .quad 0xbe1cf0976acada87 + .quad 0xc08623c5ec3733d0 + .quad 0xbe1cf066318a16ff + .quad 0xc08623c9a1274880 + .quad 0xbe1ceffaa7148798 + .quad 0xc08623cd54607820 + .quad 0xbe1cf23ab02e9b6e + .quad 0xc08623d105e45800 + .quad 0xbe1cefdfef7d4fde + .quad 0xc08623d4b5b47b20 + .quad 0xbe1cf17fece44f2b + .quad 0xc08623d863d27270 + .quad 0xbe1cf18f907d0d7c + .quad 0xc08623dc103fccb0 + .quad 0xbe1cee61fe072c98 + .quad 0xc08623dfbafe1668 + .quad 0xbe1cf022dd891e2f + .quad 0xc08623e3640eda20 + .quad 0xbe1ceecc1daf4358 + .quad 0xc08623e70b73a028 + .quad 0xbe1cf0173c4fa380 + .quad 0xc08623eab12deec8 + .quad 0xbe1cf16a2150c2f4 + .quad 0xc08623ee553f4a30 + .quad 0xbe1cf1bf980b1f4b + .quad 0xc08623f1f7a93480 + .quad 0xbe1cef8b731663c2 + .quad 0xc08623f5986d2dc0 + .quad 0xbe1cee9a664d7ef4 + .quad 0xc08623f9378cb3f0 + .quad 0xbe1cf1eda2af6400 + .quad 0xc08623fcd5094320 + .quad 0xbe1cf1923f9d68d7 + .quad 0xc086240070e45548 + .quad 0xbe1cf0747cd3e03a + 
.quad 0xc08624040b1f6260 + .quad 0xbe1cf22ee855bd6d + .quad 0xc0862407a3bbe078 + .quad 0xbe1cf0d57360c00b + .quad 0xc086240b3abb4398 + .quad 0xbe1ceebc815cd575 + .quad 0xc086240ed01efdd0 + .quad 0xbe1cf03bfb970951 + .quad 0xc086241263e87f50 + .quad 0xbe1cf16e74768529 + .quad 0xc0862415f6193658 + .quad 0xbe1cefec64b8becb + .quad 0xc086241986b28f30 + .quad 0xbe1cf0838d210baa + .quad 0xc086241d15b5f448 + .quad 0xbe1cf0ea86e75b11 + .quad 0xc0862420a324ce28 + .quad 0xbe1cf1708d11d805 + .quad 0xc08624242f008380 + .quad 0xbe1ceea988c5a417 + .quad 0xc0862427b94a7910 + .quad 0xbe1cef166a7bbca5 + .quad 0xc086242b420411d0 + .quad 0xbe1cf0c9d9e86a38 + .quad 0xc086242ec92eaee8 + .quad 0xbe1cef0946455411 + .quad 0xc08624324ecbaf98 + .quad 0xbe1cefea60907739 + .quad 0xc0862435d2dc7160 + .quad 0xbe1cf1ed0934ce42 + .quad 0xc086243955624ff8 + .quad 0xbe1cf191ba746c7d + .quad 0xc086243cd65ea548 + .quad 0xbe1ceeec78cf2a7e + .quad 0xc086244055d2c968 + .quad 0xbe1cef345284c119 + .quad 0xc0862443d3c012b8 + .quad 0xbe1cf24f77355219 + .quad 0xc08624475027d5e8 + .quad 0xbe1cf05bf087e114 + .quad 0xc086244acb0b65d0 + .quad 0xbe1cef3504a32189 + .quad 0xc086244e446c1398 + .quad 0xbe1ceff54b2a406f + .quad 0xc0862451bc4b2eb8 + .quad 0xbe1cf0757d54ed4f + .quad 0xc086245532aa04f0 + .quad 0xbe1cf0c8099fdfd5 + .quad 0xc0862458a789e250 + .quad 0xbe1cf0b173796a31 + .quad 0xc086245c1aec1138 + .quad 0xbe1cf11d8734540d + .quad 0xc086245f8cd1da60 + .quad 0xbe1cf1916a723ceb + .quad 0xc0862462fd3c84d8 + .quad 0xbe1cf19a911e1da7 + .quad 0xc08624666c2d5608 + .quad 0xbe1cf23a9ef72e4f + .quad 0xc0862469d9a591c0 + .quad 0xbe1cef503d947663 + .quad 0xc086246d45a67a18 + .quad 0xbe1cf0fceeb1a0b2 + .quad 0xc0862470b0314fa8 + .quad 0xbe1cf107e27e4fbc + .quad 0xc086247419475160 + .quad 0xbe1cf03dd9922331 + .quad 0xc086247780e9bc98 + .quad 0xbe1cefce1a10e129 + .quad 0xc086247ae719cd18 + .quad 0xbe1ceea47f73c4f6 + .quad 0xc086247e4bd8bd10 + .quad 0xbe1ceec0ac56d100 + .quad 0xc0862481af27c528 + .quad 0xbe1cee8a6593278a + .quad 0xc086248511081c70 + .quad 0xbe1cf2231dd9dec7 + .quad 0xc0862488717af888 + .quad 0xbe1cf0b4b8ed7da8 + .quad 0xc086248bd0818d68 + .quad 0xbe1cf1bd8d835002 + .quad 0xc086248f2e1d0d98 + .quad 0xbe1cf259acc107f4 + .quad 0xc08624928a4eaa20 + .quad 0xbe1cee897636b00c + .quad 0xc0862495e5179270 + .quad 0xbe1cee757f20c326 + .quad 0xc08624993e78f490 + .quad 0xbe1cefafd3aa54a4 + .quad 0xc086249c9673fd10 + .quad 0xbe1cee7298d38b97 + .quad 0xc086249fed09d6f8 + .quad 0xbe1ceedc158d4ceb + .quad 0xc08624a3423babe0 + .quad 0xbe1cf2282987cb2e + .quad 0xc08624a6960aa400 + .quad 0xbe1cefe7381ecc4b + .quad 0xc08624a9e877e600 + .quad 0xbe1cef328dbbce80 + .quad 0xc08624ad39849728 + .quad 0xbe1cefde45f3cc71 + .quad 0xc08624b08931db58 + .quad 0xbe1cefa8b89433b9 + .quad 0xc08624b3d780d500 + .quad 0xbe1cef6773c0b139 + .quad 0xc08624b72472a528 + .quad 0xbe1cf031c931c11f + .quad 0xc08624ba70086b78 + .quad 0xbe1cf088f49275e7 + .quad 0xc08624bdba434630 + .quad 0xbe1cf17de0eaa86d + .quad 0xc08624c103245238 + .quad 0xbe1cefd492f1ba75 + .quad 0xc08624c44aacab08 + .quad 0xbe1cf1253e154466 + .quad 0xc08624c790dd6ad0 + .quad 0xbe1cf0fb09ee6d55 + .quad 0xc08624cad5b7aa58 + .quad 0xbe1cf1f08dd048fe + .quad 0xc08624ce193c8120 + .quad 0xbe1ceeca0809697f + .quad 0xc08624d15b6d0538 + .quad 0xbe1cef8d5662d968 + .quad 0xc08624d49c4a4b78 + .quad 0xbe1cee97b556ed78 + .quad 0xc08624d7dbd56750 + .quad 0xbe1cf1b14b6acb75 + .quad 0xc08624db1a0f6b00 + .quad 0xbe1cef1e860623f2 + .quad 0xc08624de56f96758 + .quad 0xbe1ceeaf4d156f3d + .quad 0xc08624e192946bf0 + .quad 
0xbe1ceecc12b400ed + .quad 0xc08624e4cce18710 + .quad 0xbe1cf180c40c794f + .quad 0xc08624e805e1c5c8 + .quad 0xbe1cf185a08f7f65 + .quad 0xc08624eb3d9633d8 + .quad 0xbe1cef45fc924078 + .quad 0xc08624ee73ffdbb0 + .quad 0xbe1cf1e4f457f32a + .quad 0xc08624f1a91fc6a0 + .quad 0xbe1cf040147b8a5a + .quad 0xc08624f4dcf6fc98 + .quad 0xbe1cf1effca0dfb2 + .quad 0xc08624f80f868468 + .quad 0xbe1cf0470146e5bc + .quad 0xc08624fb40cf6390 + .quad 0xbe1cef4dd186e501 + .quad 0xc08624fe70d29e60 + .quad 0xbe1ceebe257f66c7 + .quad 0xc08625019f9137f0 + .quad 0xbe1ceefb7a1c395c + .quad 0xc0862504cd0c3220 + .quad 0xbe1cf209dedfed8c + .quad 0xc0862507f9448db0 + .quad 0xbe1cf082da464994 + .quad 0xc086250b243b4a18 + .quad 0xbe1cee88694a73cf + .quad 0xc086250e4df165a0 + .quad 0xbe1cf0b61e8f0531 + .quad 0xc08625117667dd78 + .quad 0xbe1cf1106599c962 + .quad 0xc08625149d9fad98 + .quad 0xbe1ceff1ee88af1f + .quad 0xc0862517c399d0c8 + .quad 0xbe1cf0f746994ef6 + .quad 0xc086251ae85740b8 + .quad 0xbe1cefe8a1d077e4 + .quad 0xc086251e0bd8f5e0 + .quad 0xbe1cf1a1da036092 + .quad 0xc08625212e1fe7a8 + .quad 0xbe1cf0f8a7786fcd + .quad 0xc08625244f2d0c48 + .quad 0xbe1cefa1174a07a7 + .quad 0xc08625276f0158d8 + .quad 0xbe1cef1043aa5b25 + .quad 0xc086252a8d9dc150 + .quad 0xbe1cf15d521c169d + .quad 0xc086252dab033898 + .quad 0xbe1cf220bba8861f + .quad 0xc0862530c732b078 + .quad 0xbe1cef51e310eae2 + .quad 0xc0862533e22d1988 + .quad 0xbe1cf222fcedd8ae + .quad 0xc0862536fbf36370 + .quad 0xbe1cefdb4da4bda8 + .quad 0xc086253a14867ca0 + .quad 0xbe1ceeafc1112171 + .quad 0xc086253d2be75280 + .quad 0xbe1cee99dfb4b408 + .quad 0xc08625404216d160 + .quad 0xbe1cf22d2536f06b + .quad 0xc08625435715e498 + .quad 0xbe1cef6abbf2e268 + .quad 0xc08625466ae57648 + .quad 0xbe1cf093a14789f5 + .quad 0xc08625497d866fa0 + .quad 0xbe1cf0f93655603c + .quad 0xc086254c8ef9b8b8 + .quad 0xbe1cf1cc40c9aafc + .quad 0xc086254f9f4038a8 + .quad 0xbe1ceeea5f4e9157 + .quad 0xc0862552ae5ad568 + .quad 0xbe1cefa9f52d4997 + .quad 0xc0862555bc4a7400 + .quad 0xbe1cefa490a638ff + .quad 0xc0862558c90ff868 + .quad 0xbe1cef7fcf797d6f + .quad 0xc086255bd4ac4590 + .quad 0xbe1cf1b4c51113c9 + .quad 0xc086255edf203d78 + .quad 0xbe1cef55e5b4a55d + .quad 0xc0862561e86cc100 + .quad 0xbe1cf0d37a25f9dc + .quad 0xc0862564f092b028 + .quad 0xbe1ceebe9efc19d9 + .quad 0xc0862567f792e9d8 + .quad 0xbe1cee8ad30a57b5 + .quad 0xc086256afd6e4c08 + .quad 0xbe1cef4e1817b90b + .quad 0xc086256e0225b3b8 + .quad 0xbe1cee7fa9229996 + .quad 0xc086257105b9fce0 + .quad 0xbe1cf0b54963d945 + .quad 0xc0862574082c0298 + .quad 0xbe1cee5f2f3c7995 + .quad 0xc0862577097c9ee0 + .quad 0xbe1cf0828e303a2c + .quad 0xc086257a09acaae0 + .quad 0xbe1cf172c3078947 + .quad 0xc086257d08bcfec0 + .quad 0xbe1cf189252afa22 + .quad 0xc086258006ae71b8 + .quad 0xbe1cefdb80426923 + .quad 0xc08625830381da08 + .quad 0xbe1ceef1391a0372 + .quad 0xc0862585ff380d00 + .quad 0xbe1cf17720c78d13 + .quad 0xc0862588f9d1df18 + .quad 0xbe1ceef1f9027d83 + .quad 0xc086258bf35023b8 + .quad 0xbe1cf06fac99dec9 + .quad 0xc086258eebb3ad78 + .quad 0xbe1cf1373eeb45c0 + .quad 0xc0862591e2fd4e00 + .quad 0xbe1cef777536bb81 + .quad 0xc0862594d92dd600 + .quad 0xbe1cf0f43ca40766 + .quad 0xc0862597ce461558 + .quad 0xbe1cefb2cfc6766b + .quad 0xc086259ac246daf0 + .quad 0xbe1ceea49e64ffa2 + .quad 0xc086259db530f4c8 + .quad 0xbe1cf250fa457dec + .quad 0xc08625a0a7053018 + .quad 0xbe1cf17d8bb2a44e + .quad 0xc08625a397c45918 + .quad 0xbe1cf1d5906d54b7 + .quad 0xc08625a6876f3b30 + .quad 0xbe1cf08fe7b31780 + .quad 0xc08625a97606a0e0 + .quad 0xbe1cef13edfc9d11 + .quad 
0xc08625ac638b53c8 + .quad 0xbe1cef9d2b107219 + .quad 0xc08625af4ffe1cb0 + .quad 0xbe1cf1ddd4ff6160 + .quad 0xc08625b23b5fc390 + .quad 0xbe1cefa02a996495 + .quad 0xc08625b525b10f68 + .quad 0xbe1cf166a7e37ee5 + .quad 0xc08625b80ef2c680 + .quad 0xbe1cef0b171068a5 + .quad 0xc08625baf725ae28 + .quad 0xbe1cf05c80779283 + .quad 0xc08625bdde4a8af0 + .quad 0xbe1cf1bbfbffb889 + .quad 0xc08625c0c4622090 + .quad 0xbe1cf0b8666c0124 + .quad 0xc08625c3a96d31e0 + .quad 0xbe1cf0a8fcf47a86 + .quad 0xc08625c68d6c80f0 + .quad 0xbe1cef46e18cb092 + .quad 0xc08625c97060cef0 + .quad 0xbe1cf1458a350efb + .quad 0xc08625cc524adc58 + .quad 0xbe1ceeea1dadce12 + .quad 0xc08625cf332b68b0 + .quad 0xbe1cf0a1bfdc44c7 + .quad 0xc08625d2130332d0 + .quad 0xbe1cef96d02da73e + .quad 0xc08625d4f1d2f8a8 + .quad 0xbe1cf2451c3c7701 + .quad 0xc08625d7cf9b7778 + .quad 0xbe1cf10d08f83812 + .quad 0xc08625daac5d6ba0 + .quad 0xbe1ceec5b4895c5e + .quad 0xc08625dd881990b0 + .quad 0xbe1cf14e1325c5e4 + .quad 0xc08625e062d0a188 + .quad 0xbe1cf21d0904be12 + .quad 0xc08625e33c835838 + .quad 0xbe1ceed0839bcf21 + .quad 0xc08625e615326df0 + .quad 0xbe1cf1bb944889d2 + .quad 0xc08625e8ecde9b48 + .quad 0xbe1cee738e85eece + .quad 0xc08625ebc38897e0 + .quad 0xbe1cf25c2bc6ef12 + .quad 0xc08625ee99311ac8 + .quad 0xbe1cf132b70a41ad + .quad 0xc08625f16dd8da28 + .quad 0xbe1cf1984236a6e3 + .quad 0xc08625f441808b78 + .quad 0xbe1cf19ae74998f9 + .quad 0xc08625f71428e370 + .quad 0xbe1cef3e175d61a1 + .quad 0xc08625f9e5d295f8 + .quad 0xbe1cf101f9868fd9 + .quad 0xc08625fcb67e5658 + .quad 0xbe1cee69db83dcd2 + .quad 0xc08625ff862cd6f8 + .quad 0xbe1cf081b636af51 + .quad 0xc086260254dec9a8 + .quad 0xbe1cee62c7d59b3e + .quad 0xc08626052294df58 + .quad 0xbe1cf1b745c57716 + .quad 0xc0862607ef4fc868 + .quad 0xbe1cef3d2800ea23 + .quad 0xc086260abb103458 + .quad 0xbe1cef480ff1acd2 + .quad 0xc086260d85d6d200 + .quad 0xbe1cf2424c9a17ef + .quad 0xc08626104fa44f90 + .quad 0xbe1cf12cfde90fd5 + .quad 0xc086261318795a68 + .quad 0xbe1cf21f590dd5b6 + .quad 0xc0862615e0569f48 + .quad 0xbe1cf0c50f9cd28a + .quad 0xc0862618a73cca30 + .quad 0xbe1ceedbdb520545 + .quad 0xc086261b6d2c8668 + .quad 0xbe1cf0b030396011 + .quad 0xc086261e32267e98 + .quad 0xbe1cf19917010e96 + .quad 0xc0862620f62b5cb0 + .quad 0xbe1cf07331355985 + .quad 0xc0862623b93bc9e8 + .quad 0xbe1cf01ae921a1c3 + .quad 0xc08626267b586ed0 + .quad 0xbe1cefe5cf0dbf0c + .quad 0xc08626293c81f348 + .quad 0xbe1cf01b258aeb50 + .quad 0xc086262bfcb8fe88 + .quad 0xbe1cee6b9e7f4c68 + .quad 0xc086262ebbfe3710 + .quad 0xbe1cee684a9b21c9 + .quad 0xc08626317a5242b8 + .quad 0xbe1cf1f8bcde9a8b + .quad 0xc086263437b5c6c0 + .quad 0xbe1cf1d063d36238 + .quad 0xc0862636f42967a8 + .quad 0xbe1cf1e31a19075e + .quad 0xc0862639afadc950 + .quad 0xbe1cf1d8efdf7e7d + .quad 0xc086263c6a438ef0 + .quad 0xbe1cf1812ee72dba + .quad 0xc086263f23eb5b18 + .quad 0xbe1cf1449a9a2279 + .quad 0xc0862641dca5cfb8 + .quad 0xbe1cee96edce5085 + .quad 0xc086264494738e08 + .quad 0xbe1cf06797bd03b2 + .quad 0xc08626474b5536b8 + .quad 0xbe1cef91b9b7ffc1 + .quad 0xc086264a014b69c0 + .quad 0xbe1cef4b6721278f + .quad 0xc086264cb656c678 + .quad 0xbe1cf1942925eb4a + .quad 0xc086264f6a77eba8 + .quad 0xbe1cefa2c7bc2e39 + .quad 0xc08626521daf7758 + .quad 0xbe1cf252595aceb3 + .quad 0xc0862654cffe0718 + .quad 0xbe1cee8e9ae47ec2 + .quad 0xc0862657816437a8 + .quad 0xbe1cf1bf913828fa + .quad 0xc086265a31e2a558 + .quad 0xbe1cf23475d6b366 + .quad 0xc086265ce179ebc8 + .quad 0xbe1cef8df00a922b + .quad 0xc086265f902aa5f0 + .quad 0xbe1cef279bfa43e0 + .quad 0xc08626623df56e38 + .quad 
0xbe1cf080e10b8365 + .quad 0xc0862664eadade70 + .quad 0xbe1cf1a518f9b544 + .quad 0xc086266796db8fd0 + .quad 0xbe1cef9308fed9e9 + .quad 0xc086266a41f81ae8 + .quad 0xbe1ceea3ae6b19c9 + .quad 0xc086266cec3117b8 + .quad 0xbe1ceef06003d4c2 + .quad 0xc086266f95871da8 + .quad 0xbe1cf0b8457ffb0c + .quad 0xc08626723dfac390 + .quad 0xbe1cf0c526745ad6 + .quad 0xc0862674e58c9fa8 + .quad 0xbe1cf0cf91ff7b5d + .quad 0xc08626778c3d4798 + .quad 0xbe1cefe260819380 + .quad 0xc086267a320d5070 + .quad 0xbe1ceebd90aa27a3 + .quad 0xc086267cd6fd4ea8 + .quad 0xbe1cf0388121dffa + .quad 0xc086267f7b0dd630 + .quad 0xbe1cf1a3881435f1 + .quad 0xc08626821e3f7a68 + .quad 0xbe1cef28e9d9ac52 + .quad 0xc0862684c092ce08 + .quad 0xbe1cf02d300062dd + .quad 0xc086268762086350 + .quad 0xbe1cefaee1edfa35 + .quad 0xc086268a02a0cbe0 + .quad 0xbe1cf0a5a052e936 + .quad 0xc086268ca25c98d8 + .quad 0xbe1cee60a4a497ed + .quad 0xc086268f413c5ab0 + .quad 0xbe1cf0e4a5d0cf49 + .quad 0xc0862691df40a170 + .quad 0xbe1cf149235a4e6e + .quad 0xc08626947c69fc80 + .quad 0xbe1cf215180b9fcc + .quad 0xc086269718b8fac8 + .quad 0xbe1cef9b156a9840 + .quad 0xc0862699b42e2a90 + .quad 0xbe1cf054c91441be + .quad 0xc086269c4eca19a8 + .quad 0xbe1cf13ded26512c + .quad 0xc086269ee88d5550 + .quad 0xbe1cf22ea4d8ac06 + .quad 0xc08626a181786a40 + .quad 0xbe1cf2354666ee2e + .quad 0xc08626a4198be4a8 + .quad 0xbe1cefef936752b3 + .quad 0xc08626a6b0c85020 + .quad 0xbe1cf1e360a9db68 + .quad 0xc08626a9472e37d8 + .quad 0xbe1ceed6aeb812c5 + .quad 0xc08626abdcbe2650 + .quad 0xbe1cf227340b4986 + .quad 0xc08626ae7178a5b0 + .quad 0xbe1cf0215a0cbe0d + .quad 0xc08626b1055e3f70 + .quad 0xbe1cf256adf0ae26 + .quad 0xc08626b3986f7ca8 + .quad 0xbe1ceff3c67aed06 + .quad 0xc08626b62aace5c8 + .quad 0xbe1cf2159fb93652 + .quad 0xc08626b8bc1702e0 + .quad 0xbe1cf01e6dbd1c7f + .quad 0xc08626bb4cae5b60 + .quad 0xbe1cf009e75d1c0c + .quad 0xc08626bddc737648 + .quad 0xbe1ceec10a020e73 + .quad 0xc08626c06b66da08 + .quad 0xbe1cf06d5783eee7 + .quad 0xc08626c2f9890ca0 + .quad 0xbe1cf0cb8f169ffe + .quad 0xc08626c586da9388 + .quad 0xbe1cef7de2452430 + .quad 0xc08626c8135bf3b0 + .quad 0xbe1cf05da6f783ae + .quad 0xc08626ca9f0db198 + .quad 0xbe1cefcc877d681d + .quad 0xc08626cd29f05138 + .quad 0xbe1cef0531954ab3 + .quad 0xc08626cfb4045608 + .quad 0xbe1cf06b8565ea3d + .quad 0xc08626d23d4a4310 + .quad 0xbe1cefdc455d9d7e + .quad 0xc08626d4c5c29ad0 + .quad 0xbe1ceefc47e8fa64 + .quad 0xc08626d74d6ddf48 + .quad 0xbe1cf1872bf033f2 + .quad 0xc08626d9d44c9210 + .quad 0xbe1cf19d91087f9d + .quad 0xc08626dc5a5f3438 + .quad 0xbe1cf012d444c6ab + .quad 0xc08626dedfa64650 + .quad 0xbe1cf0ba528ee153 + .quad 0xc08626e164224880 + .quad 0xbe1ceeb431709788 + .quad 0xc08626e3e7d3ba60 + .quad 0xbe1cf0b9af31a6a5 + .quad 0xc08626e66abb1b28 + .quad 0xbe1cf168fb2e135b + .quad 0xc08626e8ecd8e990 + .quad 0xbe1cef9097461c93 + .quad 0xc08626eb6e2da3d0 + .quad 0xbe1cee7a434735d8 + .quad 0xc08626edeeb9c7a8 + .quad 0xbe1cf235732b86f2 + .quad 0xc08626f06e7dd280 + .quad 0xbe1cefe1510b89e6 + .quad 0xc08626f2ed7a4120 + .quad 0xbe1cf1f64b9b80ef + .quad 0xc08626f56baf9000 + .quad 0xbe1cf08f320ca339 + .quad 0xc08626f7e91e3b08 + .quad 0xbe1cf1b1de2808a1 + .quad 0xc08626fa65c6bdc0 + .quad 0xbe1cf1976d778b28 + .quad 0xc08626fce1a99338 + .quad 0xbe1ceef40a4f076f + .quad 0xc08626ff5cc73600 + .quad 0xbe1cef3e45869ce3 + .quad 0xc0862701d7202048 + .quad 0xbe1ceef601b4c9d6 + .quad 0xc086270450b4cbc0 + .quad 0xbe1cf1eaf0b57fd6 + .quad 0xc0862706c985b1c0 + .quad 0xbe1cef82a44990f3 + .quad 0xc086270941934b10 + .quad 0xbe1ceefe32981f2c + .quad 
0xc086270bb8de1018 + .quad 0xbe1cefbf6f5a0445 + .quad 0xc086270e2f6678d0 + .quad 0xbe1cf18dba75792c + .quad 0xc0862710a52cfcc8 + .quad 0xbe1cf0da64ce995f + .quad 0xc08627131a321318 + .quad 0xbe1cef04ac0fb802 + .quad 0xc08627158e763268 + .quad 0xbe1cee9d4e2ad9bd + .quad 0xc086271801f9d0f8 + .quad 0xbe1cefa9b55407b5 + .quad 0xc086271a74bd64a0 + .quad 0xbe1cefe6bd329570 + .quad 0xc086271ce6c162c8 + .quad 0xbe1cef0b1205dc85 + .quad 0xc086271f58064068 + .quad 0xbe1cef092a785e3f + .quad 0xc0862721c88c7210 + .quad 0xbe1cf050dcdaac30 + .quad 0xc086272438546be8 + .quad 0xbe1cf210907ded8b + .quad 0xc0862726a75ea1b8 + .quad 0xbe1cee760be44f99 + .quad 0xc086272915ab86c0 + .quad 0xbe1ceeeee07c2bcc + .quad 0xc086272b833b8df0 + .quad 0xbe1cf06874992df5 + .quad 0xc086272df00f29d0 + .quad 0xbe1cef8fac5d4899 + .quad 0xc08627305c26cc70 + .quad 0xbe1cf1103241cc99 + .quad 0xc0862732c782e788 + .quad 0xbe1cf1d35fef83fe + .quad 0xc08627353223ec68 + .quad 0xbe1cef3ec8133e1d + .quad 0xc08627379c0a4be8 + .quad 0xbe1cef7261daccd8 + .quad 0xc086273a05367688 + .quad 0xbe1cf18656c50806 + .quad 0xc086273c6da8dc68 + .quad 0xbe1cf1c8736e049a + .quad 0xc086273ed561ed38 + .quad 0xbe1cf1f93bff4911 + .quad 0xc08627413c621848 + .quad 0xbe1cf188a4ea680c + .quad 0xc0862743a2a9cc80 + .quad 0xbe1cf1d270930c80 + .quad 0xc086274608397868 + .quad 0xbe1cf25a328c28e2 + .quad 0xc08627486d118a28 + .quad 0xbe1cf106f90aa3b8 + .quad 0xc086274ad1326f80 + .quad 0xbe1cee5e9d2e885a + .quad 0xc086274d349c95c0 + .quad 0xbe1cf1c0bac27228 + .quad 0xc086274f975069f8 + .quad 0xbe1cf1a1500f9b1c + .quad 0xc0862751f94e58c0 + .quad 0xbe1cefc30663ac44 + .quad 0xc08627545a96ce48 + .quad 0xbe1cf17123e427a2 + .quad 0xc0862756bb2a3678 + .quad 0xbe1cefb92749fea4 + .quad 0xc08627591b08fcc0 + .quad 0xbe1cefa40e1ea74a + .quad 0xc086275b7a338c40 + .quad 0xbe1cee6f4612c3e9 + .quad 0xc086275dd8aa4fa8 + .quad 0xbe1cf1c54a053627 + .quad 0xc0862760366db168 + .quad 0xbe1ceff5eb503d9e + .quad 0xc0862762937e1b70 + .quad 0xbe1cf02e47f10cee + .quad 0xc0862764efdbf768 + .quad 0xbe1ceeb06e1d0dad + .quad 0xc08627674b87ae88 + .quad 0xbe1cf10aadd6dba5 + .quad 0xc0862769a681a9c0 + .quad 0xbe1cf24e9913d30f + .quad 0xc086276c00ca51a0 + .quad 0xbe1cef47b301e312 + .quad 0xc086276e5a620e48 + .quad 0xbe1ceeb1cefc2e85 + .quad 0xc0862770b3494788 + .quad 0xbe1cf16f1fbbe011 + .quad 0xc08627730b8064e8 + .quad 0xbe1ceebdf75174c7 + .quad 0xc08627756307cd70 + .quad 0xbe1cf06e3871a0da + .quad 0xc0862777b9dfe7f0 + .quad 0xbe1cef16799fd554 + .quad 0xc086277a10091ac0 + .quad 0xbe1cf248dabf5377 + .quad 0xc086277c6583cc00 + .quad 0xbe1cf0c78d92a2cd + .quad 0xc086277eba506158 + .quad 0xbe1cf0b911b029f0 + .quad 0xc08627810e6f4028 + .quad 0xbe1cefdc24719766 + .quad 0xc086278361e0cd70 + .quad 0xbe1cefbb6562b7e7 + .quad 0xc0862785b4a56dd8 + .quad 0xbe1cf1e0afb349ec + .quad 0xc086278806bd85c0 + .quad 0xbe1cf008292e52fc + .quad 0xc086278a58297918 + .quad 0xbe1cf053073872bf + .quad 0xc086278ca8e9ab88 + .quad 0xbe1cf17a0a55a947 + .quad 0xc086278ef8fe8068 + .quad 0xbe1ceeffb0b60234 + .quad 0xc086279148685aa0 + .quad 0xbe1cf162204794a8 + .quad 0xc086279397279ce0 + .quad 0xbe1cf24cc8cb48ac + .quad 0xc0862795e53ca978 + .quad 0xbe1cf0c9be68d5c3 + .quad 0xc086279832a7e258 + .quad 0xbe1cf172cd3d7388 + .quad 0xc086279a7f69a930 + .quad 0xbe1ceea2465fbce5 + .quad 0xc086279ccb825f40 + .quad 0xbe1cf0a386d2500f + .quad 0xc086279f16f26590 + .quad 0xbe1cf1e338ddc18a + .quad 0xc08627a161ba1cd0 + .quad 0xbe1cef1f5049867f + .quad 0xc08627a3abd9e548 + .quad 0xbe1cef96c1ea8b1f + .quad 0xc08627a5f5521f00 + .quad 
0xbe1cf138f6fd3c26 + .quad 0xc08627a83e2329b0 + .quad 0xbe1cf0d4fcbfdf3a + .quad 0xc08627aa864d64b0 + .quad 0xbe1cf24870c12c81 + .quad 0xc08627accdd12f18 + .quad 0xbe1cf0ae2a56348d + .quad 0xc08627af14aee7a0 + .quad 0xbe1cee8ca1a9b893 + .quad 0xc08627b15ae6eca8 + .quad 0xbe1cf20414d637b0 + .quad 0xc08627b3a0799c60 + .quad 0xbe1cf0fc6b7b12d8 + .quad 0xc08627b5e5675488 + .quad 0xbe1cf152d93c4a00 + .quad 0xc08627b829b072a0 + .quad 0xbe1cf1073f9b77c2 + .quad 0xc08627ba6d5553d8 + .quad 0xbe1cee694f97d5a4 + .quad 0xc08627bcb0565500 + .quad 0xbe1cf0456b8239d7 + .quad 0xc08627bef2b3d2b0 + .quad 0xbe1cf211497127e3 + .quad 0xc08627c1346e2930 + .quad 0xbe1cf01856c0384d + .quad 0xc08627c37585b468 + .quad 0xbe1cefa7dd05479e + .quad 0xc08627c5b5fad000 + .quad 0xbe1cef3ae8e50b93 + .quad 0xc08627c7f5cdd750 + .quad 0xbe1ceea5f32fdd3a + .quad 0xc08627ca34ff2560 + .quad 0xbe1cef424caeb8d9 + .quad 0xc08627cc738f14f0 + .quad 0xbe1cf0194d07a81f + .quad 0xc08627ceb17e0070 + .quad 0xbe1cf20f452000c1 + .quad 0xc08627d0eecc4210 + .quad 0xbe1cf00e356218e4 + .quad 0xc08627d32b7a33a0 + .quad 0xbe1cef30484b4bcb + .quad 0xc08627d567882eb0 + .quad 0xbe1ceeea11a6641b + .quad 0xc08627d7a2f68c80 + .quad 0xbe1cf13492d5bd7b + .quad 0xc08627d9ddc5a618 + .quad 0xbe1ceeb7048fad96 + .quad 0xc08627dc17f5d418 + .quad 0xbe1ceef0666f0477 + .quad 0xc08627de51876ee8 + .quad 0xbe1cf060d4b8b5c2 + .quad 0xc08627e08a7acea8 + .quad 0xbe1cf0b2a4b6ff8c + .quad 0xc08627e2c2d04b28 + .quad 0xbe1cf0e34809a875 + .quad 0xc08627e4fa883bf0 + .quad 0xbe1cf16bf74a3522 + .quad 0xc08627e731a2f848 + .quad 0xbe1cee6a24623d57 + .quad 0xc08627e96820d718 + .quad 0xbe1cefc7b4f1528e + .quad 0xc08627eb9e022f18 + .quad 0xbe1cf163051f3548 + .quad 0xc08627edd34756b8 + .quad 0xbe1cef36b3366305 + .quad 0xc08627f007f0a408 + .quad 0xbe1cf18134625550 + .quad 0xc08627f23bfe6cf0 + .quad 0xbe1cf0ec32ec1a11 + .quad 0xc08627f46f710700 + .quad 0xbe1ceeb3b64f3edc + .quad 0xc08627f6a248c778 + .quad 0xbe1cf0cd15805bc8 + .quad 0xc08627f8d4860368 + .quad 0xbe1cf20db3bddebe + .quad 0xc08627fb06290f90 + .quad 0xbe1cf25188430e25 + .quad 0xc08627fd37324070 + .quad 0xbe1ceea1713490f9 + .quad 0xc08627ff67a1ea28 + .quad 0xbe1cf159521d234c + .quad 0xc0862801977860b8 + .quad 0xbe1cf24dfe50783b + .quad 0xc0862803c6b5f7d0 + .quad 0xbe1ceef2ef89a60b + .quad 0xc0862805f55b02c8 + .quad 0xbe1cee7fc919d62c + .quad 0xc08628082367d4c0 + .quad 0xbe1cf215a7fb513a + .quad 0xc086280a50dcc0a8 + .quad 0xbe1cf0e4401c5ed4 + .quad 0xc086280c7dba1910 + .quad 0xbe1cf04ec734d256 + .quad 0xc086280eaa003050 + .quad 0xbe1cf010ad787fea + .quad 0xc0862810d5af5880 + .quad 0xbe1cee622478393d + .quad 0xc086281300c7e368 + .quad 0xbe1cf01c7482564f + .quad 0xc08628152b4a22a0 + .quad 0xbe1cf0de20d33536 + .quad 0xc086281755366778 + .quad 0xbe1cef2edae5837d + .quad 0xc08628197e8d02f0 + .quad 0xbe1cf0a345318cc9 + .quad 0xc086281ba74e45d8 + .quad 0xbe1cf20085aa34b8 + .quad 0xc086281dcf7a80c0 + .quad 0xbe1cef5fa845ad83 + .quad 0xc086281ff71203e0 + .quad 0xbe1cf050d1df69c4 + .quad 0xc08628221e151f48 + .quad 0xbe1ceffe43c035b9 + .quad 0xc0862824448422b8 + .quad 0xbe1cf14f3018d3c2 + .quad 0xc08628266a5f5dc0 + .quad 0xbe1cef0a5fbae83d + .quad 0xc08628288fa71f98 + .quad 0xbe1ceff8a95b72a1 + .quad 0xc086282ab45bb750 + .quad 0xbe1cef073aa9849b + .quad 0xc086282cd87d73a8 + .quad 0xbe1cef69b3835c02 + .quad 0xc086282efc0ca328 + .quad 0xbe1cf0bc139379a9 + .quad 0xc08628311f099420 + .quad 0xbe1cef247a9ec596 + .quad 0xc086283341749490 + .quad 0xbe1cef74bbcc488a + .quad 0xc0862835634df248 + .quad 0xbe1cef4bc42e7b8e + .quad 
0xc08628378495fad0 + .quad 0xbe1cf136d4d5a810 + .quad 0xc0862839a54cfb80 + .quad 0xbe1cf0d290b24dd8 + .quad 0xc086283bc5734168 + .quad 0xbe1ceeebde8e0065 + .quad 0xc086283de5091950 + .quad 0xbe1cf1a09f60aa1e + .quad 0xc0862840040ecfe0 + .quad 0xbe1cf0803947a234 + .quad 0xc08628422284b168 + .quad 0xbe1cf0abf7638127 + .quad 0xc0862844406b0a08 + .quad 0xbe1cf0f73ee12058 + .quad 0xc08628465dc225a0 + .quad 0xbe1cf2079971b26c + .quad 0xc08628487a8a4fe0 + .quad 0xbe1cee74957564b1 + .quad 0xc086284a96c3d420 + .quad 0xbe1ceee77c1b7d43 + .quad 0xc086284cb26efd90 + .quad 0xbe1cf23addba6e09 + .quad 0xc086284ecd8c1730 + .quad 0xbe1cf199f4a1da60 + .quad 0xc0862850e81b6bb0 + .quad 0xbe1cf09fdea81393 + .quad 0xc0862853021d4588 + .quad 0xbe1cf176adb417f7 + .quad 0xc08628551b91ef00 + .quad 0xbe1cf0f64f84a8da + .quad 0xc08628573479b220 + .quad 0xbe1ceec34cf49523 + .quad 0xc08628594cd4d8a8 + .quad 0xbe1cf16d60fbe0bb + .quad 0xc086285b64a3ac40 + .quad 0xbe1cee8de7acfc7b + .quad 0xc086285d7be67630 + .quad 0xbe1ceee6256cce8d + .quad 0xc086285f929d7fa0 + .quad 0xbe1cee7d66a3d8a5 + .quad 0xc0862861a8c91170 + .quad 0xbe1cf0bef8265792 + .quad 0xc0862863be697458 + .quad 0xbe1cf097f890c6f8 + .quad 0xc0862865d37ef0c8 + .quad 0xbe1cf09502d5c3fc + .quad 0xc0862867e809cf00 + .quad 0xbe1ceeffb239dac7 + .quad 0xc0862869fc0a56f8 + .quad 0xbe1cf1fbfff95c98 + .quad 0xc086286c0f80d090 + .quad 0xbe1cefa57ad3eef7 + .quad 0xc086286e226d8348 + .quad 0xbe1cf22c58b9183d + .quad 0xc086287034d0b690 + .quad 0xbe1ceff262d0a248 + .quad 0xc086287246aab180 + .quad 0xbe1cefa7bc194186 + .quad 0xc086287457fbbb08 + .quad 0xbe1cf06782d784d9 + .quad 0xc086287668c419e0 + .quad 0xbe1cf1d44d0eaa07 + .quad 0xc086287879041490 + .quad 0xbe1cf034803c8a48 + .quad 0xc086287a88bbf158 + .quad 0xbe1cf08e84916b6f + .quad 0xc086287c97ebf650 + .quad 0xbe1cf0c4d3dc1bc7 + .quad 0xc086287ea6946958 + .quad 0xbe1cefb1e4625943 + .quad 0xc0862880b4b59010 + .quad 0xbe1cf143efdd1fd0 + .quad 0xc0862882c24faff8 + .quad 0xbe1cee9896d016da + .quad 0xc0862884cf630e38 + .quad 0xbe1cf2186072f2cc + .quad 0xc0862886dbefeff0 + .quad 0xbe1cef9217633d34 + .quad 0xc0862888e7f699e0 + .quad 0xbe1cf05603549486 + .quad 0xc086288af37750b0 + .quad 0xbe1cef50fff513d3 + .quad 0xc086288cfe7258c0 + .quad 0xbe1cf127713b32d0 + .quad 0xc086288f08e7f650 + .quad 0xbe1cf05015520f3d + .quad 0xc086289112d86d58 + .quad 0xbe1cf12eb458b26f + .quad 0xc08628931c4401a8 + .quad 0xbe1cf22eae2887ed + .quad 0xc0862895252af6e0 + .quad 0xbe1cefdd6656dd2d + .quad 0xc08628972d8d9058 + .quad 0xbe1cf1048ea4e646 + .quad 0xc0862899356c1150 + .quad 0xbe1ceec4501167e9 + .quad 0xc086289b3cc6bcb8 + .quad 0xbe1cf0ad52becc3f + .quad 0xc086289d439dd568 + .quad 0xbe1cf0daa4e00e35 + .quad 0xc086289f49f19df8 + .quad 0xbe1cf00b80de8d6a + .quad 0xc08628a14fc258c8 + .quad 0xbe1cf1bcf2ea8464 + .quad 0xc08628a355104818 + .quad 0xbe1cf0435e2782b0 + .quad 0xc08628a559dbade0 + .quad 0xbe1cf0e3e1a5f56c + .quad 0xc08628a75e24cbf8 + .quad 0xbe1cefed9d5a721d + .quad 0xc08628a961ebe3f8 + .quad 0xbe1cf0d2d74321e2 + .quad 0xc08628ab65313750 + .quad 0xbe1cf24200eb55e9 + .quad 0xc08628ad67f50740 + .quad 0xbe1cf23e9d7cf979 + .quad 0xc08628af6a3794d0 + .quad 0xbe1cf23a088f421c + .quad 0xc08628b16bf920e0 + .quad 0xbe1cef2c1de1ab32 + .quad 0xc08628b36d39ec08 + .quad 0xbe1cf1abc231f7b2 + .quad 0xc08628b56dfa36d0 + .quad 0xbe1cf2074d5ba303 + .quad 0xc08628b76e3a4180 + .quad 0xbe1cf05cd5eed880 + .rept 48 + .byte 0 + .endr + +/* Log(2) lookup table for logarithmic part */ +.if .-__svml_spow_data != _Log_LA_table +.err +.endif + .quad 0x8000000000000000 
+ .quad 0xbf5ff802a9ab10e6 + .quad 0xbf6ff00aa2b10bc0 + .quad 0xbf77ee11ebd82e94 + .quad 0xbf7fe02a6b106789 + .quad 0xbf83e7295d25a7d9 + .quad 0xbf87dc475f810a77 + .quad 0xbf8bcf712c74384c + .quad 0xbf8fc0a8b0fc03e4 + .quad 0xbf91d7f7eb9eebe7 + .quad 0xbf93cea44346a575 + .quad 0xbf95c45a51b8d389 + .quad 0xbf97b91b07d5b11b + .quad 0xbf99ace7551cc514 + .quad 0xbf9b9fc027af9198 + .quad 0xbf9d91a66c543cc4 + .quad 0xbf9f829b0e783300 + .quad 0xbfa0b94f7c196176 + .quad 0xbfa1b0d98923d980 + .quad 0xbfa2a7ec2214e873 + .quad 0xbfa39e87b9febd60 + .quad 0xbfa494acc34d911c + .quad 0xbfa58a5bafc8e4d5 + .quad 0xbfa67f94f094bd98 + .quad 0xbfa77458f632dcfc + .quad 0xbfa868a83083f6cf + .quad 0xbfa95c830ec8e3eb + .quad 0xbfaa4fe9ffa3d235 + .quad 0xbfab42dd711971bf + .quad 0xbfac355dd0921f2d + .quad 0xbfad276b8adb0b52 + .quad 0xbfae19070c276016 + .quad 0xbfaf0a30c01162a6 + .quad 0xbfaffae9119b9303 + .quad 0xbfb075983598e471 + .quad 0xbfb0ed839b5526fe + .quad 0xbfb16536eea37ae1 + .quad 0xbfb1dcb263db1944 + .quad 0xbfb253f62f0a1417 + .quad 0xbfb2cb0283f5de1f + .quad 0xbfb341d7961bd1d1 + .quad 0xbfb3b87598b1b6ee + .quad 0xbfb42edcbea646f0 + .quad 0xbfb4a50d3aa1b040 + .quad 0xbfb51b073f06183f + .quad 0xbfb590cafdf01c28 + .quad 0xbfb60658a93750c4 + .quad 0xbfb67bb0726ec0fc + .quad 0xbfb6f0d28ae56b4c + .quad 0xbfb765bf23a6be13 + .quad 0xbfb7da766d7b12cd + .quad 0xbfb84ef898e8282a + .quad 0xbfb8c345d6319b21 + .quad 0xbfb9375e55595ede + .quad 0xbfb9ab42462033ad + .quad 0xbfba1ef1d8061cd4 + .quad 0xbfba926d3a4ad563 + .quad 0xbfbb05b49bee43fe + .quad 0xbfbb78c82bb0eda1 + .quad 0xbfbbeba818146765 + .quad 0xbfbc5e548f5bc743 + .quad 0xbfbcd0cdbf8c13e1 + .quad 0xbfbd4313d66cb35d + .quad 0xbfbdb5270187d927 + .quad 0xbfbe27076e2af2e6 + .quad 0xbfbe98b549671467 + .quad 0xbfbf0a30c01162a6 + .quad 0xbfbf7b79fec37ddf + .quad 0xbfbfec9131dbeabb + .quad 0xbfc02ebb42bf3d4b + .quad 0xbfc0671512ca596e + .quad 0xbfc09f561ee719c3 + .quad 0xbfc0d77e7cd08e59 + .quad 0xbfc10f8e422539b1 + .quad 0xbfc14785846742ac + .quad 0xbfc17f6458fca611 + .quad 0xbfc1b72ad52f67a0 + .quad 0xbfc1eed90e2dc2c3 + .quad 0xbfc2266f190a5acb + .quad 0xbfc25ded0abc6ad2 + .quad 0xbfc29552f81ff523 + .quad 0xbfc2cca0f5f5f251 + .quad 0xbfc303d718e47fd3 + .quad 0xbfc33af575770e4f + .quad 0xbfc371fc201e8f74 + .quad 0xbfc3a8eb2d31a376 + .quad 0xbfc3dfc2b0ecc62a + .quad 0xbfc41682bf727bc0 + .quad 0xbfc44d2b6ccb7d1e + .quad 0xbfc483bccce6e3dd + .quad 0xbfc4ba36f39a55e5 + .quad 0xbfc4f099f4a230b2 + .quad 0xbfc526e5e3a1b438 + .quad 0xbfc55d1ad4232d6f + .quad 0xbfc59338d9982086 + .quad 0xbfc5c940075972b9 + .quad 0xbfc5ff3070a793d4 + .quad 0xbfc6350a28aaa758 + .quad 0xbfc66acd4272ad51 + .quad 0xbfc6a079d0f7aad2 + .quad 0xbfc6d60fe719d21d + .quad 0xbfc70b8f97a1aa75 + .quad 0xbfc740f8f54037a5 + .quad 0xbfc7764c128f2127 + .quad 0xbfc7ab890210d909 + .quad 0xbfc7e0afd630c274 + .quad 0xbfc815c0a14357eb + .quad 0xbfc84abb75865139 + .quad 0xbfc87fa06520c911 + .quad 0xbfc8b46f8223625b + .quad 0xbfc8e928de886d41 + .quad 0xbfc91dcc8c340bde + .quad 0xbfc9525a9cf456b4 + .quad 0xbfc986d3228180ca + .quad 0xbfc9bb362e7dfb83 + .quad 0xbfc9ef83d2769a34 + .quad 0xbfca23bc1fe2b563 + .quad 0xbfca57df28244dcd + .quad 0xbfca8becfc882f19 + .quad 0xbfcabfe5ae46124c + .quad 0xbfcaf3c94e80bff3 + .quad 0xbfcb2797ee46320c + .quad 0xbfcb5b519e8fb5a4 + .quad 0xbfcb8ef670420c3b + .quad 0xbfcbc286742d8cd6 + .quad 0xbfcbf601bb0e44e2 + .quad 0xbfcc2968558c18c1 + .quad 0xbfcc5cba543ae425 + .quad 0xbfcc8ff7c79a9a22 + .quad 0xbfccc320c0176502 + .quad 0xbfccf6354e09c5dc + .quad 0xbfcd293581b6b3e7 + .quad 
0xbfcd5c216b4fbb91 + .quad 0xbfcd8ef91af31d5e + .quad 0xbfcdc1bca0abec7d + .quad 0xbfcdf46c0c722d2f + .quad 0xbfce27076e2af2e6 + .quad 0xbfce598ed5a87e2f + .quad 0xbfce8c0252aa5a60 + .quad 0xbfcebe61f4dd7b0b + .quad 0xbfcef0adcbdc5936 + .quad 0xbfcf22e5e72f105d + .quad 0xbfcf550a564b7b37 + .quad 0xbfcf871b28955045 + .quad 0xbfcfb9186d5e3e2b + .quad 0xbfcfeb0233e607cc + .quad 0xbfd00e6c45ad501d + .quad 0xbfd0274dc16c232f + .quad 0xbfd0402594b4d041 + .quad 0xbfd058f3c703ebc6 + .quad 0xbfd071b85fcd590d + .quad 0xbfd08a73667c57af + .quad 0xbfd0a324e27390e3 + .quad 0xbfd0bbccdb0d24bd + .quad 0xbfd0d46b579ab74b + .quad 0xbfd0ed005f657da4 + .quad 0xbfd1058bf9ae4ad5 + .quad 0xbfd11e0e2dad9cb7 + .quad 0xbfd136870293a8b0 + .quad 0xbfd14ef67f88685a + .quad 0xbfd1675cababa60e + .quad 0xbfd17fb98e15095d + .quad 0xbfd1980d2dd4236f + .quad 0xbfd1b05791f07b49 + .quad 0xbfd1c898c16999fb + .quad 0xbfd1e0d0c33716be + .quad 0xbfd1f8ff9e48a2f3 + .quad 0xbfd211255986160c + .quad 0xbfd22941fbcf7966 + .quad 0xbfd241558bfd1404 + .quad 0xbfd2596010df763a + .quad 0xbfd27161913f853d + .quad 0xbfd2895a13de86a3 + .quad 0xbfd2a1499f762bc9 + .quad 0xbfd2b9303ab89d25 + .quad 0xbfd2d10dec508583 + .quad 0xbfd2e8e2bae11d31 + .quad 0xbfd300aead06350c + .quad 0xbfd31871c9544185 + .quad 0xbfd3302c16586588 + .quad 0xbfd347dd9a987d55 + .quad 0xbfd35f865c93293e + .quad 0xbfd3772662bfd85b + .quad 0xbfd38ebdb38ed321 + .quad 0xbfd3a64c556945ea + .quad 0xbfd3bdd24eb14b6a + .quad 0xbfd3d54fa5c1f710 + .quad 0xbfd3ecc460ef5f50 + .quad 0xbfd404308686a7e4 + .quad 0xbfd41b941cce0bee + .quad 0xbfd432ef2a04e814 + .quad 0xbfd44a41b463c47c + .quad 0xbfd4618bc21c5ec2 + .quad 0xbfd478cd5959b3d9 + .quad 0xbfd49006804009d1 + .quad 0xbfd4a7373cecf997 + .quad 0xbfd4be5f957778a1 + .quad 0xbfd4d57f8fefe27f + .quad 0xbfd4ec973260026a + .quad 0xbfd503a682cb1cb3 + .quad 0xbfd51aad872df82d + .quad 0xbfd531ac457ee77e + .quad 0xbfd548a2c3add263 + .quad 0xbfd55f9107a43ee2 + .quad 0xbfd5767717455a6c + .quad 0xbfd58d54f86e02f2 + .quad 0xbfd5a42ab0f4cfe2 + .quad 0xbfd5baf846aa1b19 + .quad 0xbfd5d1bdbf5809ca + .quad 0xbfd5e87b20c2954a + .quad 0xbfd5ff3070a793d4 + .quad 0xbfd615ddb4bec13c + .quad 0xbfd62c82f2b9c795 + .quad 0x3fd61965cdb02c1f + .quad 0x3fd602d08af091ec + .quad 0x3fd5ec433d5c35ae + .quad 0x3fd5d5bddf595f30 + .quad 0x3fd5bf406b543db2 + .quad 0x3fd5a8cadbbedfa1 + .quad 0x3fd5925d2b112a59 + .quad 0x3fd57bf753c8d1fb + .quad 0x3fd565995069514c + .quad 0x3fd54f431b7be1a9 + .quad 0x3fd538f4af8f72fe + .quad 0x3fd522ae0738a3d8 + .quad 0x3fd50c6f1d11b97c + .quad 0x3fd4f637ebba9810 + .quad 0x3fd4e0086dd8baca + .quad 0x3fd4c9e09e172c3c + .quad 0x3fd4b3c077267e9a + .quad 0x3fd49da7f3bcc41f + .quad 0x3fd487970e958770 + .quad 0x3fd4718dc271c41b + .quad 0x3fd45b8c0a17df13 + .quad 0x3fd44591e0539f49 + .quad 0x3fd42f9f3ff62642 + .quad 0x3fd419b423d5e8c7 + .quad 0x3fd403d086cea79c + .quad 0x3fd3edf463c1683e + .quad 0x3fd3d81fb5946dba + .quad 0x3fd3c25277333184 + .quad 0x3fd3ac8ca38e5c5f + .quad 0x3fd396ce359bbf54 + .quad 0x3fd3811728564cb2 + .quad 0x3fd36b6776be1117 + .quad 0x3fd355bf1bd82c8b + .quad 0x3fd3401e12aecba1 + .quad 0x3fd32a84565120a8 + .quad 0x3fd314f1e1d35ce4 + .quad 0x3fd2ff66b04ea9d4 + .quad 0x3fd2e9e2bce12286 + .quad 0x3fd2d46602adccee + .quad 0x3fd2bef07cdc9354 + .quad 0x3fd2a982269a3dbf + .quad 0x3fd2941afb186b7c + .quad 0x3fd27ebaf58d8c9d + .quad 0x3fd269621134db92 + .quad 0x3fd25410494e56c7 + .quad 0x3fd23ec5991eba49 + .quad 0x3fd22981fbef797b + .quad 0x3fd214456d0eb8d4 + .quad 0x3fd1ff0fe7cf47a7 + .quad 0x3fd1e9e1678899f4 + .quad 
0x3fd1d4b9e796c245 + .quad 0x3fd1bf99635a6b95 + .quad 0x3fd1aa7fd638d33f + .quad 0x3fd1956d3b9bc2fa + .quad 0x3fd180618ef18adf + .quad 0x3fd16b5ccbacfb73 + .quad 0x3fd1565eed455fc3 + .quad 0x3fd14167ef367783 + .quad 0x3fd12c77cd00713b + .quad 0x3fd1178e8227e47c + .quad 0x3fd102ac0a35cc1c + .quad 0x3fd0edd060b78081 + .quad 0x3fd0d8fb813eb1ef + .quad 0x3fd0c42d676162e3 + .quad 0x3fd0af660eb9e279 + .quad 0x3fd09aa572e6c6d4 + .quad 0x3fd085eb8f8ae797 + .quad 0x3fd07138604d5862 + .quad 0x3fd05c8be0d9635a + .quad 0x3fd047e60cde83b8 + .quad 0x3fd03346e0106062 + .quad 0x3fd01eae5626c691 + .quad 0x3fd00a1c6adda473 + .quad 0x3fcfeb2233ea07cd + .quad 0x3fcfc218be620a5e + .quad 0x3fcf991c6cb3b379 + .quad 0x3fcf702d36777df0 + .quad 0x3fcf474b134df229 + .quad 0x3fcf1e75fadf9bde + .quad 0x3fcef5ade4dcffe6 + .quad 0x3fceccf2c8fe920a + .quad 0x3fcea4449f04aaf5 + .quad 0x3fce7ba35eb77e2a + .quad 0x3fce530effe71012 + .quad 0x3fce2a877a6b2c12 + .quad 0x3fce020cc6235ab5 + .quad 0x3fcdd99edaf6d7e9 + .quad 0x3fcdb13db0d48940 + .quad 0x3fcd88e93fb2f450 + .quad 0x3fcd60a17f903515 + .quad 0x3fcd38666871f465 + .quad 0x3fcd1037f2655e7b + .quad 0x3fcce816157f1988 + .quad 0x3fccc000c9db3c52 + .quad 0x3fcc97f8079d44ec + .quad 0x3fcc6ffbc6f00f71 + .quad 0x3fcc480c0005ccd1 + .quad 0x3fcc2028ab17f9b4 + .quad 0x3fcbf851c067555f + .quad 0x3fcbd087383bd8ad + .quad 0x3fcba8c90ae4ad19 + .quad 0x3fcb811730b823d2 + .quad 0x3fcb5971a213acdb + .quad 0x3fcb31d8575bce3d + .quad 0x3fcb0a4b48fc1b46 + .quad 0x3fcae2ca6f672bd4 + .quad 0x3fcabb55c31693ad + .quad 0x3fca93ed3c8ad9e3 + .quad 0x3fca6c90d44b704e + .quad 0x3fca454082e6ab05 + .quad 0x3fca1dfc40f1b7f1 + .quad 0x3fc9f6c407089664 + .quad 0x3fc9cf97cdce0ec3 + .quad 0x3fc9a8778debaa38 + .quad 0x3fc981634011aa75 + .quad 0x3fc95a5adcf7017f + .quad 0x3fc9335e5d594989 + .quad 0x3fc90c6db9fcbcd9 + .quad 0x3fc8e588ebac2dbf + .quad 0x3fc8beafeb38fe8c + .quad 0x3fc897e2b17b19a5 + .quad 0x3fc871213750e994 + .quad 0x3fc84a6b759f512f + .quad 0x3fc823c16551a3c2 + .quad 0x3fc7fd22ff599d4f + .quad 0x3fc7d6903caf5ad0 + .quad 0x3fc7b0091651528c + .quad 0x3fc7898d85444c73 + .quad 0x3fc7631d82935a86 + .quad 0x3fc73cb9074fd14d + .quad 0x3fc716600c914054 + .quad 0x3fc6f0128b756abc + .quad 0x3fc6c9d07d203fc7 + .quad 0x3fc6a399dabbd383 + .quad 0x3fc67d6e9d785771 + .quad 0x3fc6574ebe8c133a + .quad 0x3fc6313a37335d76 + .quad 0x3fc60b3100b09476 + .quad 0x3fc5e533144c1719 + .quad 0x3fc5bf406b543db2 + .quad 0x3fc59958ff1d52f1 + .quad 0x3fc5737cc9018cdd + .quad 0x3fc54dabc26105d2 + .quad 0x3fc527e5e4a1b58d + .quad 0x3fc5022b292f6a45 + .quad 0x3fc4dc7b897bc1c8 + .quad 0x3fc4b6d6fefe22a4 + .quad 0x3fc4913d8333b561 + .quad 0x3fc46baf0f9f5db7 + .quad 0x3fc4462b9dc9b3dc + .quad 0x3fc420b32740fdd4 + .quad 0x3fc3fb45a59928cc + .quad 0x3fc3d5e3126bc27f + .quad 0x3fc3b08b6757f2a9 + .quad 0x3fc38b3e9e027479 + .quad 0x3fc365fcb0159016 + .quad 0x3fc340c59741142e + .quad 0x3fc31b994d3a4f85 + .quad 0x3fc2f677cbbc0a96 + .quad 0x3fc2d1610c86813a + .quad 0x3fc2ac55095f5c59 + .quad 0x3fc28753bc11aba5 + .quad 0x3fc2625d1e6ddf57 + .quad 0x3fc23d712a49c202 + .quad 0x3fc2188fd9807263 + .quad 0x3fc1f3b925f25d41 + .quad 0x3fc1ceed09853752 + .quad 0x3fc1aa2b7e23f72a + .quad 0x3fc185747dbecf34 + .quad 0x3fc160c8024b27b1 + .quad 0x3fc13c2605c398c3 + .quad 0x3fc1178e8227e47c + .quad 0x3fc0f301717cf0fb + .quad 0x3fc0ce7ecdccc28d + .quad 0x3fc0aa06912675d5 + .quad 0x3fc08598b59e3a07 + .quad 0x3fc06135354d4b18 + .quad 0x3fc03cdc0a51ec0d + .quad 0x3fc0188d2ecf6140 + .quad 0x3fbfe89139dbd566 + .quad 0x3fbfa01c9db57ce2 + .quad 
0x3fbf57bc7d9005db + .quad 0x3fbf0f70cdd992e3 + .quad 0x3fbec739830a1120 + .quad 0x3fbe7f1691a32d3e + .quad 0x3fbe3707ee30487b + .quad 0x3fbdef0d8d466db9 + .quad 0x3fbda727638446a2 + .quad 0x3fbd5f55659210e2 + .quad 0x3fbd179788219364 + .quad 0x3fbccfedbfee13a8 + .quad 0x3fbc885801bc4b23 + .quad 0x3fbc40d6425a5cb1 + .quad 0x3fbbf968769fca11 + .quad 0x3fbbb20e936d6974 + .quad 0x3fbb6ac88dad5b1c + .quad 0x3fbb23965a52ff00 + .quad 0x3fbadc77ee5aea8c + .quad 0x3fba956d3ecade63 + .quad 0x3fba4e7640b1bc38 + .quad 0x3fba0792e9277cac + .quad 0x3fb9c0c32d4d2548 + .quad 0x3fb97a07024cbe74 + .quad 0x3fb9335e5d594989 + .quad 0x3fb8ecc933aeb6e8 + .quad 0x3fb8a6477a91dc29 + .quad 0x3fb85fd927506a48 + .quad 0x3fb8197e2f40e3f0 + .quad 0x3fb7d33687c293c9 + .quad 0x3fb78d02263d82d3 + .quad 0x3fb746e100226ed9 + .quad 0x3fb700d30aeac0e1 + .quad 0x3fb6bad83c1883b6 + .quad 0x3fb674f089365a7a + .quad 0x3fb62f1be7d77743 + .quad 0x3fb5e95a4d9791cb + .quad 0x3fb5a3abb01ade25 + .quad 0x3fb55e10050e0384 + .quad 0x3fb518874226130a + .quad 0x3fb4d3115d207eac + .quad 0x3fb48dae4bc31018 + .quad 0x3fb4485e03dbdfad + .quad 0x3fb403207b414b7f + .quad 0x3fb3bdf5a7d1ee64 + .quad 0x3fb378dd7f749714 + .quad 0x3fb333d7f8183f4b + .quad 0x3fb2eee507b40301 + .quad 0x3fb2aa04a44717a5 + .quad 0x3fb26536c3d8c369 + .quad 0x3fb2207b5c78549e + .quad 0x3fb1dbd2643d190b + .quad 0x3fb1973bd1465567 + .quad 0x3fb152b799bb3cc9 + .quad 0x3fb10e45b3cae831 + .quad 0x3fb0c9e615ac4e17 + .quad 0x3fb08598b59e3a07 + .quad 0x3fb0415d89e74444 + .quad 0x3faffa6911ab9301 + .quad 0x3faf723b517fc523 + .quad 0x3faeea31c006b87c + .quad 0x3fae624c4a0b5e1b + .quad 0x3fadda8adc67ee4e + .quad 0x3fad52ed6405d86f + .quad 0x3faccb73cdddb2cc + .quad 0x3fac441e06f72a9e + .quad 0x3fabbcebfc68f420 + .quad 0x3fab35dd9b58baad + .quad 0x3faaaef2d0fb10fc + .quad 0x3faa282b8a936171 + .quad 0x3fa9a187b573de7c + .quad 0x3fa91b073efd7314 + .quad 0x3fa894aa149fb343 + .quad 0x3fa80e7023d8ccc4 + .quad 0x3fa788595a3577ba + .quad 0x3fa70265a550e777 + .quad 0x3fa67c94f2d4bb58 + .quad 0x3fa5f6e73078efb8 + .quad 0x3fa5715c4c03ceef + .quad 0x3fa4ebf43349e26f + .quad 0x3fa466aed42de3ea + .quad 0x3fa3e18c1ca0ae92 + .quad 0x3fa35c8bfaa1306b + .quad 0x3fa2d7ae5c3c5bae + .quad 0x3fa252f32f8d183f + .quad 0x3fa1ce5a62bc353a + .quad 0x3fa149e3e4005a8d + .quad 0x3fa0c58fa19dfaaa + .quad 0x3fa0415d89e74444 + .quad 0x3f9f7a9b16782856 + .quad 0x3f9e72bf2813ce51 + .quad 0x3f9d6b2725979802 + .quad 0x3f9c63d2ec14aaf2 + .quad 0x3f9b5cc258b718e6 + .quad 0x3f9a55f548c5c43f + .quad 0x3f994f6b99a24475 + .quad 0x3f98492528c8cabf + .quad 0x3f974321d3d006d3 + .quad 0x3f963d6178690bd6 + .quad 0x3f9537e3f45f3565 + .quad 0x3f9432a925980cc1 + .quad 0x3f932db0ea132e22 + .quad 0x3f9228fb1fea2e28 + .quad 0x3f912487a5507f70 + .quad 0x3f90205658935847 + .quad 0x3f8e38ce3033310c + .quad 0x3f8c317384c75f06 + .quad 0x3f8a2a9c6c170462 + .quad 0x3f882448a388a2aa + .quad 0x3f861e77e8b53fc6 + .quad 0x3f841929f96832f0 + .quad 0x3f82145e939ef1e9 + .quad 0x3f8010157588de71 + .quad 0x3f7c189cbb0e27fb + .quad 0x3f78121214586b54 + .quad 0x3f740c8a747878e2 + .quad 0x3f70080559588b35 + .quad 0x3f680904828985c0 + .quad 0x3f60040155d5889e + .quad 0x3f50020055655889 + .quad 0x0000000000000000 + .rept 56 + .byte 0 + .endr + +/* Polynomial coefficients for log part: + * coeff4 */ +double_vector _poly_coeff_1 0x3fc9999cacdb4d0a + +/* coeff3 */ +double_vector _poly_coeff_2 0xbfd0000148058ee1 + +/* coeff2 */ +double_vector _poly_coeff_3 0x3fd55555555543c5 + +/* coeff1 */ +double_vector _poly_coeff_4 0xbfdffffffffff81f + +/* General purpose 
constants for log part: ExpMask */ +double_vector _ExpMask 0x000fffffffffffff + +/* Two10 */ +double_vector _Two10 0x3f50000000000000 + +/* MinNorm */ +double_vector _MinNorm 0x0010000000000000 + +/* MaxNorm */ +double_vector _MaxNorm 0x7fefffffffffffff + +/* HalfMask */ +double_vector _HalfMask 0xfffffffffc000000 + +/* One */ +double_vector _One 0x3ff0000000000000 + +/* L2H */ +double_vector _L2H 0x3fe62e42fefa0000 + +/* L2L */ +double_vector _L2L 0x3d7cf79abc9e0000 + +/* Threshold */ +double_vector _Threshold 0x4086a00000000000 + +/* Bias */ +double_vector _Bias 0x408ff80000000000 + +/* Bias1 */ +double_vector _Bias1 0x408ff00000000000 + +/* L2L */ +double_vector _L2 0x3fe62e42fefa39ef + +/* dInfs = DP infinity, +/- == */ +.if .-__svml_spow_data != _dInfs +.err +.endif + .quad 0x7ff0000000000000 + .quad 0xfff0000000000000 + .rept 48 + .byte 0 + .endr + +/* dOnes = DP one, +/- == */ +.if .-__svml_spow_data != _dOnes +.err +.endif + .quad 0x3ff0000000000000 + .quad 0xbff0000000000000 + .rept 48 + .byte 0 + .endr + +/* dZeros = DP zero +/- == */ +.if .-__svml_spow_data != _dZeros +.err +.endif + .quad 0x0000000000000000 + .quad 0x8000000000000000 + .rept 48 + .byte 0 + .endr +.if .-__svml_spow_data != __dbT +.err +.endif + .quad 0x3feffffffc27dd9e + .quad 0x3ff00162f1a4047d + .quad 0x3ff002c603f68252 + .quad 0x3ff00429350e12af + .quad 0x3ff0058c84ed6032 + .quad 0x3ff006eff39715b2 + .quad 0x3ff00853810dde41 + .quad 0x3ff009b72d54652f + .quad 0x3ff00b1af86d5604 + .quad 0x3ff00c7ee25b5c86 + .quad 0x3ff00de2eb2124b3 + .quad 0x3ff00f4712c15ac8 + .quad 0x3ff010ab593eab39 + .quad 0x3ff0120fbe9bc2ba + .quad 0x3ff0137442db4e38 + .quad 0x3ff014d8e5fffada + .quad 0x3ff0163da80c7604 + .quad 0x3ff017a289036d56 + .quad 0x3ff0190788e78eab + .quad 0x3ff01a6ca7bb8818 + .quad 0x3ff01bd1e58207ef + .quad 0x3ff01d37423dbcbc + .quad 0x3ff01e9cbdf15549 + .quad 0x3ff02002589f8099 + .quad 0x3ff02168124aedec + .quad 0x3ff022cdeaf64cbc + .quad 0x3ff02433e2a44cc1 + .quad 0x3ff02599f9579ded + .quad 0x3ff027002f12f06d + .quad 0x3ff0286683d8f4ac + .quad 0x3ff029ccf7ac5b4d + .quad 0x3ff02b338a8fd532 + .quad 0x3ff02c9a3c861379 + .quad 0x3ff02e010d91c778 + .quad 0x3ff02f67fdb5a2c4 + .quad 0x3ff030cf0cf4572d + .quad 0x3ff032363b5096bc + .quad 0x3ff0339d88cd13bc + .quad 0x3ff03504f56c80ae + .quad 0x3ff0366c81319053 + .quad 0x3ff037d42c1ef5a2 + .quad 0x3ff0393bf63763d5 + .quad 0x3ff03aa3df7d8e5f + .quad 0x3ff03c0be7f428eb + .quad 0x3ff03d740f9de766 + .quad 0x3ff03edc567d7df7 + .quad 0x3ff04044bc95a0fe + .quad 0x3ff041ad41e9051d + .quad 0x3ff04315e67a5f2a + .quad 0x3ff0447eaa4c643e + .quad 0x3ff045e78d61c9ac + .quad 0x3ff047508fbd4502 + .quad 0x3ff048b9b1618c0b + .quad 0x3ff04a22f25154cd + .quad 0x3ff04b8c528f558b + .quad 0x3ff04cf5d21e44c4 + .quad 0x3ff04e5f7100d935 + .quad 0x3ff04fc92f39c9d4 + .quad 0x3ff051330ccbcdd5 + .quad 0x3ff0529d09b99ca8 + .quad 0x3ff054072605edfb + .quad 0x3ff0557161b379b3 + .quad 0x3ff056dbbcc4f7f8 + .quad 0x3ff05846373d212a + .quad 0x3ff059b0d11eade5 + .quad 0x3ff05b1b8a6c5706 + .quad 0x3ff05c866328d5a2 + .quad 0x3ff05df15b56e30a + .quad 0x3ff05f5c72f938cf + .quad 0x3ff060c7aa1290bd + .quad 0x3ff0623300a5a4db + .quad 0x3ff0639e76b52f6e + .quad 0x3ff0650a0c43eaf6 + .quad 0x3ff06675c1549232 + .quad 0x3ff067e195e9e01a + .quad 0x3ff0694d8a068fe7 + .quad 0x3ff06ab99dad5d0c + .quad 0x3ff06c25d0e10338 + .quad 0x3ff06d9223a43e58 + .quad 0x3ff06efe95f9ca95 + .quad 0x3ff0706b27e46455 + .quad 0x3ff071d7d966c83a + .quad 0x3ff07344aa83b324 + .quad 0x3ff074b19b3de22f + .quad 0x3ff0761eab9812b4 + .quad 
0x3ff0778bdb950247 + .quad 0x3ff078f92b376ebc + .quad 0x3ff07a669a821621 + .quad 0x3ff07bd42977b6c4 + .quad 0x3ff07d41d81b0f2b + .quad 0x3ff07eafa66ede1e + .quad 0x3ff0801d9475e2a0 + .quad 0x3ff0818ba232dbee + .quad 0x3ff082f9cfa88985 + .quad 0x3ff084681cd9ab21 + .quad 0x3ff085d689c900b6 + .quad 0x3ff0874516794a79 + .quad 0x3ff088b3c2ed48d9 + .quad 0x3ff08a228f27bc86 + .quad 0x3ff08b917b2b6667 + .quad 0x3ff08d0086fb07a6 + .quad 0x3ff08e6fb29961a8 + .quad 0x3ff08fdefe09360d + .quad 0x3ff0914e694d46b6 + .quad 0x3ff092bdf46855c0 + .quad 0x3ff0942d9f5d2582 + .quad 0x3ff0959d6a2e7893 + .quad 0x3ff0970d54df11c8 + .quad 0x3ff0987d5f71b432 + .quad 0x3ff099ed89e9231e + .quad 0x3ff09b5dd448221a + .quad 0x3ff09cce3e9174ec + .quad 0x3ff09e3ec8c7df9d + .quad 0x3ff09faf72ee2670 + .quad 0x3ff0a1203d070de5 + .quad 0x3ff0a29127155abd + .quad 0x3ff0a402311bd1f0 + .quad 0x3ff0a5735b1d38bb + .quad 0x3ff0a6e4a51c5493 + .quad 0x3ff0a8560f1beb2c + .quad 0x3ff0a9c7991ec278 + .quad 0x3ff0ab394327a0a7 + .quad 0x3ff0acab0d394c25 + .quad 0x3ff0ae1cf7568b9d + .quad 0x3ff0af8f018225f7 + .quad 0x3ff0b1012bbee259 + .quad 0x3ff0b273760f8825 + .quad 0x3ff0b3e5e076defc + .quad 0x3ff0b5586af7aebc + .quad 0x3ff0b6cb1594bf84 + .quad 0x3ff0b83de050d9ab + .quad 0x3ff0b9b0cb2ec5ca + .quad 0x3ff0bb23d6314cb7 + .quad 0x3ff0bc97015b3783 + .quad 0x3ff0be0a4caf4f81 + .quad 0x3ff0bf7db8305e3f + .quad 0x3ff0c0f143e12d8a + .quad 0x3ff0c264efc4876c + .quad 0x3ff0c3d8bbdd362e + .quad 0x3ff0c54ca82e0455 + .quad 0x3ff0c6c0b4b9bca6 + .quad 0x3ff0c834e1832a24 + .quad 0x3ff0c9a92e8d180e + .quad 0x3ff0cb1d9bda51e1 + .quad 0x3ff0cc92296da35b + .quad 0x3ff0ce06d749d876 + .quad 0x3ff0cf7ba571bd6a + .quad 0x3ff0d0f093e81eab + .quad 0x3ff0d265a2afc8f1 + .quad 0x3ff0d3dad1cb892b + .quad 0x3ff0d550213e2c8c + .quad 0x3ff0d6c5910a8081 + .quad 0x3ff0d83b213352b8 + .quad 0x3ff0d9b0d1bb711b + .quad 0x3ff0db26a2a5a9d4 + .quad 0x3ff0dc9c93f4cb4a + .quad 0x3ff0de12a5aba423 + .quad 0x3ff0df88d7cd0344 + .quad 0x3ff0e0ff2a5bb7cd + .quad 0x3ff0e2759d5a9121 + .quad 0x3ff0e3ec30cc5edd + .quad 0x3ff0e562e4b3f0df + .quad 0x3ff0e6d9b9141745 + .quad 0x3ff0e850adefa265 + .quad 0x3ff0e9c7c34962db + .quad 0x3ff0eb3ef924297d + .quad 0x3ff0ecb64f82c75e + .quad 0x3ff0ee2dc6680dd6 + .quad 0x3ff0efa55dd6ce75 + .quad 0x3ff0f11d15d1db0c + .quad 0x3ff0f294ee5c05ab + .quad 0x3ff0f40ce77820a2 + .quad 0x3ff0f5850128fe7a + .quad 0x3ff0f6fd3b717200 + .quad 0x3ff0f87596544e3f + .quad 0x3ff0f9ee11d4667f + .quad 0x3ff0fb66adf48e46 + .quad 0x3ff0fcdf6ab7995c + .quad 0x3ff0fe5848205bc4 + .quad 0x3ff0ffd14631a9c2 + .quad 0x3ff1014a64ee57d9 + .quad 0x3ff102c3a4593ac9 + .quad 0x3ff1043d04752792 + .quad 0x3ff105b68544f373 + .quad 0x3ff1073026cb73e9 + .quad 0x3ff108a9e90b7eb2 + .quad 0x3ff10a23cc07e9c6 + .quad 0x3ff10b9dcfc38b63 + .quad 0x3ff10d17f44139fe + .quad 0x3ff10e923983cc53 + .quad 0x3ff1100c9f8e1955 + .quad 0x3ff111872662f83e + .quad 0x3ff11301ce054081 + .quad 0x3ff1147c9677c9d2 + .quad 0x3ff115f77fbd6c23 + .quad 0x3ff1177289d8ffa9 + .quad 0x3ff118edb4cd5cd4 + .quad 0x3ff11a69009d5c54 + .quad 0x3ff11be46d4bd71a + .quad 0x3ff11d5ffadba653 + .quad 0x3ff11edba94fa36e + .quad 0x3ff1205778aaa817 + .quad 0x3ff121d368ef8e3b + .quad 0x3ff1234f7a213008 + .quad 0x3ff124cbac4267e5 + .quad 0x3ff12647ff56107f + .quad 0x3ff127c4735f04bd + .quad 0x3ff1294108601fcb + .quad 0x3ff12abdbe5c3d0f + .quad 0x3ff12c3a95563832 + .quad 0x3ff12db78d50ed19 + .quad 0x3ff12f34a64f37ed + .quad 0x3ff130b1e053f513 + .quad 0x3ff1322f3b62012e + .quad 0x3ff133acb77c3927 + .quad 0x3ff1352a54a57a1f + .quad 
0x3ff136a812e0a17c + .quad 0x3ff13825f2308ce0 + .quad 0x3ff139a3f2981a2e + .quad 0x3ff13b22141a278a + .quad 0x3ff13ca056b99356 + .quad 0x3ff13e1eba793c33 + .quad 0x3ff13f9d3f5c0103 + .quad 0x3ff1411be564c0e7 + .quad 0x3ff1429aac965b40 + .quad 0x3ff1441994f3afae + .quad 0x3ff145989e7f9e13 + .quad 0x3ff14717c93d068e + .quad 0x3ff14897152ec980 + .quad 0x3ff14a168257c787 + .quad 0x3ff14b9610bae185 + .quad 0x3ff14d15c05af897 + .quad 0x3ff14e95913aee1e + .quad 0x3ff15015835da3b8 + .quad 0x3ff1519596c5fb46 + .quad 0x3ff15315cb76d6e5 + .quad 0x3ff15496217318f6 + .quad 0x3ff1561698bda417 + .quad 0x3ff1579731595b27 + .quad 0x3ff15917eb492145 + .quad 0x3ff15a98c68fd9d1 + .quad 0x3ff15c19c330686b + .quad 0x3ff15d9ae12db0ef + .quad 0x3ff15f1c208a977f + .quad 0x3ff1609d814a007b + .quad 0x3ff1621f036ed081 + .quad 0x3ff163a0a6fbec71 + .quad 0x3ff165226bf4396d + .quad 0x3ff166a4525a9cd3 + .quad 0x3ff168265a31fc44 + .quad 0x3ff169a8837d3da3 + .quad 0x3ff16b2ace3f4710 + .quad 0x3ff16cad3a7afeeb + .quad 0x3ff16e2fc8334bd8 + .quad 0x3ff16fb2776b14b8 + .quad 0x3ff17135482540ad + .quad 0x3ff172b83a64b71a + .quad 0x3ff1743b4e2c5fa4 + .quad 0x3ff175be837f222d + .quad 0x3ff17741da5fe6da + .quad 0x3ff178c552d1960f + .quad 0x3ff17a48ecd71873 + .quad 0x3ff17bcca87356e9 + .quad 0x3ff17d5085a93a9b + .quad 0x3ff17ed4847bacec + .quad 0x3ff18058a4ed9787 + .quad 0x3ff181dce701e451 + .quad 0x3ff183614abb7d75 + .quad 0x3ff184e5d01d4d5b + .quad 0x3ff1866a772a3ead + .quad 0x3ff187ef3fe53c58 + .quad 0x3ff189742a513185 + .quad 0x3ff18af9367109a1 + .quad 0x3ff18c7e6447b059 + .quad 0x3ff18e03b3d8119c + .quad 0x3ff18f8925251997 + .quad 0x3ff1910eb831b4ba + .quad 0x3ff192946d00cfb6 + .quad 0x3ff1941a4395577c + .quad 0x3ff195a03bf2393e + .quad 0x3ff19726561a626d + .quad 0x3ff198ac9210c0c1 + .quad 0x3ff19a32efd8422c + .quad 0x3ff19bb96f73d4e5 + .quad 0x3ff19d4010e66763 + .quad 0x3ff19ec6d432e85c + .quad 0x3ff1a04db95c46cc + .quad 0x3ff1a1d4c06571ed + .quad 0x3ff1a35be9515937 + .quad 0x3ff1a4e33422ec69 + .quad 0x3ff1a66aa0dd1b81 + .quad 0x3ff1a7f22f82d6bc + .quad 0x3ff1a979e0170e9a + .quad 0x3ff1ab01b29cb3dd + .quad 0x3ff1ac89a716b786 + .quad 0x3ff1ae11bd880ada + .quad 0x3ff1af99f5f39f5d + .quad 0x3ff1b122505c66d5 + .quad 0x3ff1b2aaccc5534b + .quad 0x3ff1b4336b315705 + .quad 0x3ff1b5bc2ba3648e + .quad 0x3ff1b7450e1e6eb3 + .quad 0x3ff1b8ce12a56880 + .quad 0x3ff1ba57393b4544 + .quad 0x3ff1bbe081e2f88e + .quad 0x3ff1bd69ec9f762f + .quad 0x3ff1bef37973b23b + .quad 0x3ff1c07d2862a105 + .quad 0x3ff1c206f96f3724 + .quad 0x3ff1c390ec9c696f + .quad 0x3ff1c51b01ed2cfe + .quad 0x3ff1c6a53964772e + .quad 0x3ff1c82f93053d99 + .quad 0x3ff1c9ba0ed2761e + .quad 0x3ff1cb44accf16dc + .quad 0x3ff1cccf6cfe1634 + .quad 0x3ff1ce5a4f626acb + .quad 0x3ff1cfe553ff0b83 + .quad 0x3ff1d1707ad6ef85 + .quad 0x3ff1d2fbc3ed0e37 + .quad 0x3ff1d4872f445f44 + .quad 0x3ff1d612bcdfda99 + .quad 0x3ff1d79e6cc27863 + .quad 0x3ff1d92a3eef3111 + .quad 0x3ff1dab63368fd56 + .quad 0x3ff1dc424a32d624 + .quad 0x3ff1ddce834fb4b0 + .quad 0x3ff1df5adec29273 + .quad 0x3ff1e0e75c8e6927 + .quad 0x3ff1e273fcb632c5 + .quad 0x3ff1e400bf3ce98b + .quad 0x3ff1e58da42587fa + .quad 0x3ff1e71aab7308d1 + .quad 0x3ff1e8a7d5286717 + .quad 0x3ff1ea3521489e0e + .quad 0x3ff1ebc28fd6a942 + .quad 0x3ff1ed5020d5847a + .quad 0x3ff1eeddd4482bc3 + .quad 0x3ff1f06baa319b6b + .quad 0x3ff1f1f9a294d004 + .quad 0x3ff1f387bd74c660 + .quad 0x3ff1f515fad47b95 + .quad 0x3ff1f6a45ab6ecfa + .quad 0x3ff1f832dd1f1829 + .quad 0x3ff1f9c1820ffafe + .quad 0x3ff1fb50498c9397 + .quad 0x3ff1fcdf3397e057 + .quad 
0x3ff1fe6e4034dfdf + .quad 0x3ff1fffd6f669115 + .quad 0x3ff2018cc12ff324 + .quad 0x3ff2031c35940574 + .quad 0x3ff204abcc95c7b4 + .quad 0x3ff2063b863839d2 + .quad 0x3ff207cb627e5c01 + .quad 0x3ff2095b616b2eb7 + .quad 0x3ff20aeb8301b2aa + .quad 0x3ff20c7bc744e8d5 + .quad 0x3ff20e0c2e37d274 + .quad 0x3ff20f9cb7dd7108 + .quad 0x3ff2112d6438c651 + .quad 0x3ff212be334cd455 + .quad 0x3ff2144f251c9d5b + .quad 0x3ff215e039ab23ee + .quad 0x3ff2177170fb6adb + .quad 0x3ff21902cb107531 + .quad 0x3ff21a9447ed4643 + .quad 0x3ff21c25e794e1a7 + .quad 0x3ff21db7aa0a4b34 + .quad 0x3ff21f498f508707 + .quad 0x3ff220db976a997e + .quad 0x3ff2226dc25b8739 + .quad 0x3ff224001026551c + .quad 0x3ff2259280ce084e + .quad 0x3ff227251455a63b + .quad 0x3ff228b7cac0348e + .quad 0x3ff22a4aa410b938 + .quad 0x3ff22bdda04a3a6b + .quad 0x3ff22d70bf6fbea0 + .quad 0x3ff22f0401844c8d + .quad 0x3ff23097668aeb31 + .quad 0x3ff2322aee86a1ca + .quad 0x3ff233be997a77db + .quad 0x3ff235526769752b + .quad 0x3ff236e65856a1c4 + .quad 0x3ff2387a6c4505ef + .quad 0x3ff23a0ea337aa40 + .quad 0x3ff23ba2fd319789 + .quad 0x3ff23d377a35d6de + .quad 0x3ff23ecc1a47719b + .quad 0x3ff24060dd69715c + .quad 0x3ff241f5c39ee002 + .quad 0x3ff2438acceac7b2 + .quad 0x3ff2451ff95032d1 + .quad 0x3ff246b548d22c0c + .quad 0x3ff2484abb73be50 + .quad 0x3ff249e05137f4cf + .quad 0x3ff24b760a21daff + .quad 0x3ff24d0be6347c98 + .quad 0x3ff24ea1e572e597 + .quad 0x3ff2503807e0223a + .quad 0x3ff251ce4d7f3f08 + .quad 0x3ff25364b65348c6 + .quad 0x3ff254fb425f4c80 + .quad 0x3ff25691f1a65784 + .quad 0x3ff25828c42b7763 + .quad 0x3ff259bfb9f1b9f7 + .quad 0x3ff25b56d2fc2d55 + .quad 0x3ff25cee0f4ddfdd + .quad 0x3ff25e856ee9e031 + .quad 0x3ff2601cf1d33d35 + .quad 0x3ff261b4980d0613 + .quad 0x3ff2634c619a4a36 + .quad 0x3ff264e44e7e1952 + .quad 0x3ff2667c5ebb8358 + .quad 0x3ff2681492559883 + .quad 0x3ff269ace94f694f + .quad 0x3ff26b4563ac067d + .quad 0x3ff26cde016e8111 + .quad 0x3ff26e76c299ea53 + .quad 0x3ff2700fa73153d1 + .quad 0x3ff271a8af37cf5b + .quad 0x3ff27341dab06f07 + .quad 0x3ff274db299e452c + .quad 0x3ff276749c046468 + .quad 0x3ff2780e31e5df9c + .quad 0x3ff279a7eb45c9ef + .quad 0x3ff27b41c82736c8 + .quad 0x3ff27cdbc88d39d8 + .quad 0x3ff27e75ec7ae70f + .quad 0x3ff2801033f352a4 + .quad 0x3ff281aa9ef99111 + .quad 0x3ff283452d90b716 + .quad 0x3ff284dfdfbbd9b7 + .quad 0x3ff2867ab57e0e3a + .quad 0x3ff28815aeda6a2d + .quad 0x3ff289b0cbd4035f + .quad 0x3ff28b4c0c6defe6 + .quad 0x3ff28ce770ab461b + .quad 0x3ff28e82f88f1c9e + .quad 0x3ff2901ea41c8a50 + .quad 0x3ff291ba7356a657 + .quad 0x3ff2935666408820 + .quad 0x3ff294f27cdd475a + .quad 0x3ff2968eb72ffbfc + .quad 0x3ff2982b153bbe3d + .quad 0x3ff299c79703a69e + .quad 0x3ff29b643c8acdde + .quad 0x3ff29d0105d44d08 + .quad 0x3ff29e9df2e33d66 + .quad 0x3ff2a03b03bab88b + .quad 0x3ff2a1d8385dd84a + .quad 0x3ff2a37590cfb6bf + .quad 0x3ff2a5130d136e49 + .quad 0x3ff2a6b0ad2c198d + .quad 0x3ff2a84e711cd374 + .quad 0x3ff2a9ec58e8b729 + .quad 0x3ff2ab8a6492e024 + .quad 0x3ff2ad28941e6a18 + .quad 0x3ff2aec6e78e7104 + .quad 0x3ff2b0655ee61129 + .quad 0x3ff2b203fa28670e + .quad 0x3ff2b3a2b9588f7f + .quad 0x3ff2b5419c79a78c + .quad 0x3ff2b6e0a38ecc8b + .quad 0x3ff2b87fce9b1c18 + .quad 0x3ff2ba1f1da1b412 + .quad 0x3ff2bbbe90a5b29e + .quad 0x3ff2bd5e27aa3627 + .quad 0x3ff2befde2b25d5c + .quad 0x3ff2c09dc1c14733 + .quad 0x3ff2c23dc4da12e4 + .quad 0x3ff2c3ddebffdff0 + .quad 0x3ff2c57e3735ce1a + .quad 0x3ff2c71ea67efd6d + .quad 0x3ff2c8bf39de8e38 + .quad 0x3ff2ca5ff157a10f + .quad 0x3ff2cc00cced56cd + .quad 0x3ff2cda1cca2d08f + .quad 
0x3ff2cf42f07b2fbd + .quad 0x3ff2d0e4387995fe + .quad 0x3ff2d285a4a12544 + .quad 0x3ff2d42734f4ffc3 + .quad 0x3ff2d5c8e97847f6 + .quad 0x3ff2d76ac22e209d + .quad 0x3ff2d90cbf19acbd + .quad 0x3ff2daaee03e0fa3 + .quad 0x3ff2dc51259e6ce0 + .quad 0x3ff2ddf38f3de848 + .quad 0x3ff2df961d1fa5f9 + .quad 0x3ff2e138cf46ca57 + .quad 0x3ff2e2dba5b67a06 + .quad 0x3ff2e47ea071d9f4 + .quad 0x3ff2e621bf7c0f57 + .quad 0x3ff2e7c502d83fa4 + .quad 0x3ff2e9686a89909e + .quad 0x3ff2eb0bf6932845 + .quad 0x3ff2ecafa6f82ce7 + .quad 0x3ff2ee537bbbc512 + .quad 0x3ff2eff774e1179d + .quad 0x3ff2f19b926b4ba5 + .quad 0x3ff2f33fd45d888a + .quad 0x3ff2f4e43abaf5f5 + .quad 0x3ff2f688c586bbd5 + .quad 0x3ff2f82d74c4025c + .quad 0x3ff2f9d24875f205 + .quad 0x3ff2fb77409fb390 + .quad 0x3ff2fd1c5d447003 + .quad 0x3ff2fec19e6750ab + .quad 0x3ff30067040b7f19 + .quad 0x3ff3020c8e342527 + .quad 0x3ff303b23ce46cf4 + .quad 0x3ff30558101f80e3 + .quad 0x3ff306fe07e88ba0 + .quad 0x3ff308a42442b81d + .quad 0x3ff30a4a65313191 + .quad 0x3ff30bf0cab7237a + .quad 0x3ff30d9754d7b99d + .quad 0x3ff30f3e03962005 + .quad 0x3ff310e4d6f58302 + .quad 0x3ff3128bcef90f2b + .quad 0x3ff31432eba3f15f + .quad 0x3ff315da2cf956c2 + .quad 0x3ff3178192fc6cbf + .quad 0x3ff319291db06106 + .quad 0x3ff31ad0cd186190 + .quad 0x3ff31c78a1379c9b + .quad 0x3ff31e209a1140ab + .quad 0x3ff31fc8b7a87c8c + .quad 0x3ff32170fa007f51 + .quad 0x3ff32319611c7851 + .quad 0x3ff324c1ecff972d + .quad 0x3ff3266a9dad0bca + .quad 0x3ff3281373280654 + .quad 0x3ff329bc6d73b741 + .quad 0x3ff32b658c934f48 + .quad 0x3ff32d0ed089ff6c + .quad 0x3ff32eb8395af8f4 + .quad 0x3ff33061c7096d6f + .quad 0x3ff3320b79988eb2 + .quad 0x3ff333b5510b8eda + .quad 0x3ff3355f4d65a04a + .quad 0x3ff337096ea9f5ab + .quad 0x3ff338b3b4dbc1f0 + .quad 0x3ff33a5e1ffe384f + .quad 0x3ff33c08b0148c49 + .quad 0x3ff33db36521f1a3 + .quad 0x3ff33f5e3f299c69 + .quad 0x3ff341093e2ec0f2 + .quad 0x3ff342b4623493d7 + .quad 0x3ff3445fab3e49fa + .quad 0x3ff3460b194f1887 + .quad 0x3ff347b6ac6a34ec + .quad 0x3ff349626492d4e2 + .quad 0x3ff34b0e41cc2e67 + .quad 0x3ff34cba441977c4 + .quad 0x3ff34e666b7de784 + .quad 0x3ff35012b7fcb47d + .quad 0x3ff351bf299915c9 + .quad 0x3ff3536bc05642cf + .quad 0x3ff355187c377337 + .quad 0x3ff356c55d3fdef4 + .quad 0x3ff358726372be40 + .quad 0x3ff35a1f8ed3499b + .quad 0x3ff35bccdf64b9cf + .quad 0x3ff35d7a552a47ea + .quad 0x3ff35f27f0272d45 + .quad 0x3ff360d5b05ea37f + .quad 0x3ff3628395d3e47d + .quad 0x3ff36431a08a2a6f + .quad 0x3ff365dfd084afc9 + .quad 0x3ff3678e25c6af48 + .quad 0x3ff3693ca05363f3 + .quad 0x3ff36aeb402e0915 + .quad 0x3ff36c9a0559da43 + .quad 0x3ff36e48efda1358 + .quad 0x3ff36ff7ffb1f078 + .quad 0x3ff371a734e4ae11 + .quad 0x3ff373568f7588d3 + .quad 0x3ff375060f67bdb9 + .quad 0x3ff376b5b4be8a0a + .quad 0x3ff378657f7d2b4c + .quad 0x3ff37a156fa6df54 + .quad 0x3ff37bc5853ee43d + .quad 0x3ff37d75c0487869 + .quad 0x3ff37f2620c6da82 + .quad 0x3ff380d6a6bd497c + .quad 0x3ff38287522f048e + .quad 0x3ff38438231f4b3f + .quad 0x3ff385e919915d57 + .quad 0x3ff3879a35887ae9 + .quad 0x3ff3894b7707e450 + .quad 0x3ff38afcde12da2f + .quad 0x3ff38cae6aac9d71 + .quad 0x3ff38e601cd86f48 + .quad 0x3ff39011f4999132 + .quad 0x3ff391c3f1f344f1 + .quad 0x3ff3937614e8cc90 + .quad 0x3ff395285d7d6a65 + .quad 0x3ff396dacbb4610c + .quad 0x3ff3988d5f90f36a + .quad 0x3ff39a40191664ac + .quad 0x3ff39bf2f847f847 + .quad 0x3ff39da5fd28f1f8 + .quad 0x3ff39f5927bc95c8 + .quad 0x3ff3a10c78062804 + .quad 0x3ff3a2bfee08ed45 + .quad 0x3ff3a47389c82a68 + .quad 0x3ff3a6274b472498 + .quad 0x3ff3a7db32892144 + .quad 
0x3ff3a98f3f916626 + .quad 0x3ff3ab4372633941 + .quad 0x3ff3acf7cb01e0de + .quad 0x3ff3aeac4970a392 + .quad 0x3ff3b060edb2c837 + .quad 0x3ff3b215b7cb95f3 + .quad 0x3ff3b3caa7be5434 + .quad 0x3ff3b57fbd8e4aae + .quad 0x3ff3b734f93ec163 + .quad 0x3ff3b8ea5ad30097 + .quad 0x3ff3ba9fe24e50dd + .quad 0x3ff3bc558fb3fb0d + .quad 0x3ff3be0b6307484a + .quad 0x3ff3bfc15c4b81ff + .quad 0x3ff3c1777b83f1e0 + .quad 0x3ff3c32dc0b3e1ea + .quad 0x3ff3c4e42bde9c62 + .quad 0x3ff3c69abd076bd8 + .quad 0x3ff3c85174319b24 + .quad 0x3ff3ca0851607565 + .quad 0x3ff3cbbf54974607 + .quad 0x3ff3cd767dd958bd + .quad 0x3ff3cf2dcd29f984 + .quad 0x3ff3d0e5428c749e + .quad 0x3ff3d29cde04169e + .quad 0x3ff3d4549f942c57 + .quad 0x3ff3d60c874002ee + .quad 0x3ff3d7c4950ae7cb + .quad 0x3ff3d97cc8f828a2 + .quad 0x3ff3db35230b136f + .quad 0x3ff3dceda346f679 + .quad 0x3ff3dea649af204f + .quad 0x3ff3e05f1646dfca + .quad 0x3ff3e2180911840c + .quad 0x3ff3e3d122125c7f + .quad 0x3ff3e58a614cb8db + .quad 0x3ff3e743c6c3e91b + .quad 0x3ff3e8fd527b3d8a + .quad 0x3ff3eab7047606b7 + .quad 0x3ff3ec70dcb7957e + .quad 0x3ff3ee2adb433b04 + .quad 0x3ff3efe5001c48b5 + .quad 0x3ff3f19f4b46104c + .quad 0x3ff3f359bcc3e3c8 + .quad 0x3ff3f51454991573 + .quad 0x3ff3f6cf12c8f7e5 + .quad 0x3ff3f889f756ddfa + .quad 0x3ff3fa4502461adb + .quad 0x3ff3fc00339a01f9 + .quad 0x3ff3fdbb8b55e710 + .quad 0x3ff3ff77097d1e26 + .quad 0x3ff40132ae12fb8a + .quad 0x3ff402ee791ad3d5 + .quad 0x3ff404aa6a97fbea + .quad 0x3ff40666828dc8f6 + .quad 0x3ff40822c0ff9071 + .quad 0x3ff409df25f0a81b + .quad 0x3ff40b9bb16465fe + .quad 0x3ff40d58635e2070 + .quad 0x3ff40f153be12e0f + .quad 0x3ff410d23af0e5c5 + .quad 0x3ff4128f60909ec4 + .quad 0x3ff4144cacc3b08a + .quad 0x3ff4160a1f8d72dd + .quad 0x3ff417c7b8f13dd0 + .quad 0x3ff4198578f269be + .quad 0x3ff41b435f944f4c + .quad 0x3ff41d016cda476c + .quad 0x3ff41ebfa0c7ab57 + .quad 0x3ff4207dfb5fd491 + .quad 0x3ff4223c7ca61cea + .quad 0x3ff423fb249dde7b + .quad 0x3ff425b9f34a73a7 + .quad 0x3ff42778e8af371d + .quad 0x3ff4293804cf83d5 + .quad 0x3ff42af747aeb511 + .quad 0x3ff42cb6b1502661 + .quad 0x3ff42e7641b73399 + .quad 0x3ff43035f8e738de + .quad 0x3ff431f5d6e3929c + .quad 0x3ff433b5dbaf9d8b + .quad 0x3ff43576074eb6ac + .quad 0x3ff4373659c43b4c + .quad 0x3ff438f6d3138902 + .quad 0x3ff43ab7733ffdb1 + .quad 0x3ff43c783a4cf784 + .quad 0x3ff43e39283dd4f5 + .quad 0x3ff43ffa3d15f4c3 + .quad 0x3ff441bb78d8b5fc + .quad 0x3ff4437cdb8977f9 + .quad 0x3ff4453e652b9a59 + .quad 0x3ff4470015c27d0e + .quad 0x3ff448c1ed51804b + .quad 0x3ff44a83ebdc0497 + .quad 0x3ff44c4611656abf + .quad 0x3ff44e085df113da + .quad 0x3ff44fcad182614e + .quad 0x3ff4518d6c1cb4c9 + .quad 0x3ff453502dc37045 + .quad 0x3ff455131679f608 + .quad 0x3ff456d62643a8a0 + .quad 0x3ff458995d23eae9 + .quad 0x3ff45a5cbb1e2009 + .quad 0x3ff45c204035ab72 + .quad 0x3ff45de3ec6df0df + .quad 0x3ff45fa7bfca5459 + .quad 0x3ff4616bba4e3a34 + .quad 0x3ff4632fdbfd070c + .quad 0x3ff464f424da1fcc + .quad 0x3ff466b894e8e9a6 + .quad 0x3ff4687d2c2cca1e + .quad 0x3ff46a41eaa926fc + .quad 0x3ff46c06d061665a + .quad 0x3ff46dcbdd58ee98 + .quad 0x3ff46f9111932664 + .quad 0x3ff471566d1374b7 + .quad 0x3ff4731befdd40d6 + .quad 0x3ff474e199f3f251 + .quad 0x3ff476a76b5af103 + .quad 0x3ff4786d6415a514 + .quad 0x3ff47a33842776f6 + .quad 0x3ff47bf9cb93cf67 + .quad 0x3ff47dc03a5e1772 + .quad 0x3ff47f86d089b86d + .quad 0x3ff4814d8e1a1bf8 + .quad 0x3ff483147312ac00 + .quad 0x3ff484db7f76d2be + .quad 0x3ff486a2b349fab7 + .quad 0x3ff4886a0e8f8ebb + .quad 0x3ff48a31914af9e6 + .quad 0x3ff48bf93b7fa79f + .quad 
0x3ff48dc10d31039b + .quad 0x3ff48f89066279da + .quad 0x3ff49151271776a6 + .quad 0x3ff493196f536698 + .quad 0x3ff494e1df19b693 + .quad 0x3ff496aa766dd3c6 + .quad 0x3ff4987335532bad + .quad 0x3ff49a3c1bcd2c0f + .quad 0x3ff49c0529df4300 + .quad 0x3ff49dce5f8cdee0 + .quad 0x3ff49f97bcd96e5a + .quad 0x3ff4a16141c86066 + .quad 0x3ff4a32aee5d2449 + .quad 0x3ff4a4f4c29b2993 + .quad 0x3ff4a6bebe85e020 + .quad 0x3ff4a888e220b819 + .quad 0x3ff4aa532d6f21f4 + .quad 0x3ff4ac1da0748e6f + .quad 0x3ff4ade83b346e9c + .quad 0x3ff4afb2fdb233d4 + .quad 0x3ff4b17de7f14fbb + .quad 0x3ff4b348f9f53446 + .quad 0x3ff4b51433c153b3 + .quad 0x3ff4b6df9559208f + .quad 0x3ff4b8ab1ec00db1 + .quad 0x3ff4ba76cff98e3e + .quad 0x3ff4bc42a90915a7 + .quad 0x3ff4be0ea9f217aa + .quad 0x3ff4bfdad2b80852 + .quad 0x3ff4c1a7235e5bf6 + .quad 0x3ff4c3739be88737 + .quad 0x3ff4c5403c59ff09 + .quad 0x3ff4c70d04b638a6 + .quad 0x3ff4c8d9f500a999 + .quad 0x3ff4caa70d3cc7b9 + .quad 0x3ff4cc744d6e0926 + .quad 0x3ff4ce41b597e454 + .quad 0x3ff4d00f45bdcffe + .quad 0x3ff4d1dcfde3432d + .quad 0x3ff4d3aade0bb539 + .quad 0x3ff4d578e63a9dc2 + .quad 0x3ff4d747167374bd + .quad 0x3ff4d9156eb9b264 + .quad 0x3ff4dae3ef10cf42 + .quad 0x3ff4dcb2977c442f + .quad 0x3ff4de8167ff8a4e + .quad 0x3ff4e050609e1b11 + .quad 0x3ff4e21f815b7036 + .quad 0x3ff4e3eeca3b03c9 + .quad 0x3ff4e5be3b405022 + .quad 0x3ff4e78dd46ecfe6 + .quad 0x3ff4e95d95c9fe0b + .quad 0x3ff4eb2d7f5555ce + .quad 0x3ff4ecfd911452bd + .quad 0x3ff4eecdcb0a70b3 + .quad 0x3ff4f09e2d3b2bd8 + .quad 0x3ff4f26eb7aa00a1 + .quad 0x3ff4f43f6a5a6bd2 + .quad 0x3ff4f610454fea79 + .quad 0x3ff4f7e1488df9f4 + .quad 0x3ff4f9b2741817ee + .quad 0x3ff4fb83c7f1c25e + .quad 0x3ff4fd55441e778b + .quad 0x3ff4ff26e8a1b608 + .quad 0x3ff500f8b57efcb6 + .quad 0x3ff502caaab9cac1 + .quad 0x3ff5049cc8559fa7 + .quad 0x3ff5066f0e55fb31 + .quad 0x3ff508417cbe5d77 + .quad 0x3ff50a14139246db + .quad 0x3ff50be6d2d53812 + .quad 0x3ff50db9ba8ab21a + .quad 0x3ff50f8ccab63642 + .quad 0x3ff51160035b4625 + .quad 0x3ff51333647d63ad + .quad 0x3ff51506ee201112 + .quad 0x3ff516daa046d0d6 + .quad 0x3ff518ae7af525ce + .quad 0x3ff51a827e2e931a + .quad 0x3ff51c56a9f69c2a + .quad 0x3ff51e2afe50c4b9 + .quad 0x3ff51fff7b4090d2 + .quad 0x3ff521d420c984ce + .quad 0x3ff523a8eeef2553 + .quad 0x3ff5257de5b4f757 + .quad 0x3ff52753051e801a + .quad 0x3ff529284d2f4530 + .quad 0x3ff52afdbdeacc76 + .quad 0x3ff52cd357549c19 + .quad 0x3ff52ea919703a95 + .quad 0x3ff5307f04412eb4 + .quad 0x3ff5325517caff8d + .quad 0x3ff5342b54113485 + .quad 0x3ff53601b9175551 + .quad 0x3ff537d846e0e9f5 + .quad 0x3ff539aefd717ac0 + .quad 0x3ff53b85dccc9053 + .quad 0x3ff53d5ce4f5b39a + .quad 0x3ff53f3415f06dd2 + .quad 0x3ff5410b6fc04885 + .quad 0x3ff542e2f268cd8c + .quad 0x3ff544ba9ded870f + .quad 0x3ff546927251ff84 + .quad 0x3ff5486a6f99c1b1 + .quad 0x3ff54a4295c858a6 + .quad 0x3ff54c1ae4e14fc7 + .quad 0x3ff54df35ce832c3 + .quad 0x3ff54fcbfde08d9b + .quad 0x3ff551a4c7cdec9c + .quad 0x3ff5537dbab3dc60 + .quad 0x3ff55556d695e9d6 + .quad 0x3ff557301b77a236 + .quad 0x3ff55909895c9309 + .quad 0x3ff55ae320484a28 + .quad 0x3ff55cbce03e55b7 + .quad 0x3ff55e96c942442b + .quad 0x3ff56070db57a44b + .quad 0x3ff5624b16820529 + .quad 0x3ff564257ac4f625 + .quad 0x3ff56600082406f4 + .quad 0x3ff567dabea2c792 + .quad 0x3ff569b59e44c851 + .quad 0x3ff56b90a70d99ce + .quad 0x3ff56d6bd900ccf5 + .quad 0x3ff56f473421f304 + .quad 0x3ff57122b8749d87 + .quad 0x3ff572fe65fc5e58 + .quad 0x3ff574da3cbcc7a0 + .quad 0x3ff576b63cb96bd8 + .quad 0x3ff5789265f5ddca + .quad 0x3ff57a6eb875b08c + .quad 
0x3ff57c4b343c7786 + .quad 0x3ff57e27d94dc66d + .quad 0x3ff58004a7ad3148 + .quad 0x3ff581e19f5e4c6a + .quad 0x3ff583bec064ac79 + .quad 0x3ff5859c0ac3e669 + .quad 0x3ff587797e7f8f7c + .quad 0x3ff589571b9b3d44 + .quad 0x3ff58b34e21a85a7 + .quad 0x3ff58d12d200fed2 + .quad 0x3ff58ef0eb523f4a + .quad 0x3ff590cf2e11ddde + .quad 0x3ff592ad9a4371af + .quad 0x3ff5948c2fea922e + .quad 0x3ff5966aef0ad71b + .quad 0x3ff59849d7a7d883 + .quad 0x3ff59a28e9c52ec9 + .quad 0x3ff59c0825667299 + .quad 0x3ff59de78a8f3cf4 + .quad 0x3ff59fc719432727 + .quad 0x3ff5a1a6d185cad3 + .quad 0x3ff5a386b35ac1e4 + .quad 0x3ff5a566bec5a699 + .quad 0x3ff5a746f3ca1381 + .quad 0x3ff5a927526ba378 + .quad 0x3ff5ab07daadf1af + .quad 0x3ff5ace88c9499a3 + .quad 0x3ff5aec968233721 + .quad 0x3ff5b0aa6d5d6649 + .quad 0x3ff5b28b9c46c389 + .quad 0x3ff5b46cf4e2eb9d + .quad 0x3ff5b64e77357b97 + .quad 0x3ff5b830234210d3 + .quad 0x3ff5ba11f90c4902 + .quad 0x3ff5bbf3f897c221 + .quad 0x3ff5bdd621e81a81 + .quad 0x3ff5bfb87500f0c1 + .quad 0x3ff5c19af1e5e3d2 + .quad 0x3ff5c37d989a92f2 + .quad 0x3ff5c56069229db4 + .quad 0x3ff5c7436381a3f7 + .quad 0x3ff5c92687bb45ed + .quad 0x3ff5cb09d5d3241a + .quad 0x3ff5cced4dccdf4c + .quad 0x3ff5ced0efac18aa + .quad 0x3ff5d0b4bb7471a6 + .quad 0x3ff5d298b1298c02 + .quad 0x3ff5d47cd0cf09d4 + .quad 0x3ff5d6611a688d81 + .quad 0x3ff5d8458df9b9bf + .quad 0x3ff5da2a2b863193 + .quad 0x3ff5dc0ef3119855 + .quad 0x3ff5ddf3e49f91ad + .quad 0x3ff5dfd90033c193 + .quad 0x3ff5e1be45d1cc4f + .quad 0x3ff5e3a3b57d567d + .quad 0x3ff5e5894f3a0506 + .quad 0x3ff5e76f130b7d27 + .quad 0x3ff5e95500f5646d + .quad 0x3ff5eb3b18fb60b3 + .quad 0x3ff5ed215b21182a + .quad 0x3ff5ef07c76a314e + .quad 0x3ff5f0ee5dda52f4 + .quad 0x3ff5f2d51e752439 + .quad 0x3ff5f4bc093e4c90 + .quad 0x3ff5f6a31e3973bf + .quad 0x3ff5f88a5d6a41d9 + .quad 0x3ff5fa71c6d45f43 + .quad 0x3ff5fc595a7b74b4 + .quad 0x3ff5fe4118632b33 + .quad 0x3ff60029008f2c1b + .quad 0x3ff6021113032116 + .quad 0x3ff603f94fc2b41f + .quad 0x3ff605e1b6d18f82 + .quad 0x3ff607ca48335ddf + .quad 0x3ff609b303ebca24 + .quad 0x3ff60b9be9fe7f93 + .quad 0x3ff60d84fa6f29bf + .quad 0x3ff60f6e3541748a + .quad 0x3ff611579a790c29 + .quad 0x3ff613412a199d25 + .quad 0x3ff6152ae426d453 + .quad 0x3ff61714c8a45edf + .quad 0x3ff618fed795ea44 + .quad 0x3ff61ae910ff244e + .quad 0x3ff61cd374e3bb1b + .quad 0x3ff61ebe03475d1c + .quad 0x3ff620a8bc2db914 + .quad 0x3ff622939f9a7e14 + .quad 0x3ff6247ead915b83 + .quad 0x3ff62669e6160116 + .quad 0x3ff62855492c1ed7 + .quad 0x3ff62a40d6d76521 + .quad 0x3ff62c2c8f1b84a0 + .quad 0x3ff62e1871fc2e52 + .quad 0x3ff630047f7d1386 + .quad 0x3ff631f0b7a1e5e1 + .quad 0x3ff633dd1a6e5753 + .quad 0x3ff635c9a7e61a25 + .quad 0x3ff637b6600ce0ed + .quad 0x3ff639a342e65e97 + .quad 0x3ff63b905076465f + .quad 0x3ff63d7d88c04bd0 + .quad 0x3ff63f6aebc822cd + .quad 0x3ff6415879917f88 + .quad 0x3ff6434632201685 + .quad 0x3ff6453415779c9b + .quad 0x3ff64722239bc6f3 + .quad 0x3ff649105c904b09 + .quad 0x3ff64afec058dea9 + .quad 0x3ff64ced4ef937f3 + .quad 0x3ff64edc08750d5b + .quad 0x3ff650caecd015a3 + .quad 0x3ff652b9fc0e07e3 + .quad 0x3ff654a936329b85 + .quad 0x3ff656989b418844 + .quad 0x3ff658882b3e862e + .quad 0x3ff65a77e62d4da4 + .quad 0x3ff65c67cc119759 + .quad 0x3ff65e57dcef1c54 + .quad 0x3ff6604818c995ed + .quad 0x3ff662387fa4bdce + .quad 0x3ff6642911844df6 + .quad 0x3ff66619ce6c00b4 + .quad 0x3ff6680ab65f90ac + .quad 0x3ff669fbc962b8d3 + .quad 0x3ff66bed07793473 + .quad 0x3ff66dde70a6bf26 + .quad 0x3ff66fd004ef14db + .quad 0x3ff671c1c455f1d2 + .quad 0x3ff673b3aedf129f + .quad 
0x3ff675a5c48e342a + .quad 0x3ff67798056713aa + .quad 0x3ff6798a716d6eaf + .quad 0x3ff67b7d08a50316 + .quad 0x3ff67d6fcb118f12 + .quad 0x3ff67f62b8b6d12a + .quad 0x3ff68155d1988835 + .quad 0x3ff6834915ba7361 + .quad 0x3ff6853c8520522a + .quad 0x3ff687301fcde464 + .quad 0x3ff68923e5c6ea33 + .quad 0x3ff68b17d70f2412 + .quad 0x3ff68d0bf3aa52c9 + .quad 0x3ff68f003b9c3779 + .quad 0x3ff690f4aee89395 + .quad 0x3ff692e94d9328e0 + .quad 0x3ff694de179fb976 + .quad 0x3ff696d30d1207c0 + .quad 0x3ff698c82dedd681 + .quad 0x3ff69abd7a36e8c9 + .quad 0x3ff69cb2f1f101ff + .quad 0x3ff69ea8951fe5e0 + .quad 0x3ff6a09e63c75876 + .quad 0x3ff6a2945deb1e23 + .quad 0x3ff6a48a838efb9d + .quad 0x3ff6a680d4b6b5ec + .quad 0x3ff6a8775166126a + .quad 0x3ff6aa6df9a0d6c8 + .quad 0x3ff6ac64cd6ac90a + .quad 0x3ff6ae5bccc7af86 + .quad 0x3ff6b052f7bb50e6 + .quad 0x3ff6b24a4e497429 + .quad 0x3ff6b441d075e0a1 + .quad 0x3ff6b6397e445df5 + .quad 0x3ff6b83157b8b41e + .quad 0x3ff6ba295cd6ab6a + .quad 0x3ff6bc218da20c7a + .quad 0x3ff6be19ea1ea046 + .quad 0x3ff6c01272503016 + .quad 0x3ff6c20b263a8587 + .quad 0x3ff6c40405e16a8b + .quad 0x3ff6c5fd1148a969 + .quad 0x3ff6c7f648740cb9 + .quad 0x3ff6c9efab675f6a + .quad 0x3ff6cbe93a266cbe + .quad 0x3ff6cde2f4b5004b + .quad 0x3ff6cfdcdb16e5fb + .quad 0x3ff6d1d6ed4fea0d + .quad 0x3ff6d3d12b63d914 + .quad 0x3ff6d5cb95567ff7 + .quad 0x3ff6d7c62b2babf1 + .quad 0x3ff6d9c0ece72a93 + .quad 0x3ff6dbbbda8cc9c0 + .quad 0x3ff6ddb6f42057b0 + .quad 0x3ff6dfb239a5a2f3 + .quad 0x3ff6e1adab207a67 + .quad 0x3ff6e3a94894ad43 + .quad 0x3ff6e5a512060b13 + .quad 0x3ff6e7a1077863b4 + .quad 0x3ff6e99d28ef875a + .quad 0x3ff6eb99766f468d + .quad 0x3ff6ed95effb722a + .quad 0x3ff6ef929597db64 + .quad 0x3ff6f18f674853bf + .quad 0x3ff6f38c6510ad16 + .quad 0x3ff6f5898ef4b99a + .quad 0x3ff6f786e4f84bcd + .quad 0x3ff6f984671f368a + .quad 0x3ff6fb82156d4cfe + .quad 0x3ff6fd7fefe662ac + .quad 0x3ff6ff7df68e4b6c + .quad 0x3ff7017c2968db6b + .quad 0x3ff7037a8879e729 + .quad 0x3ff7057913c5437d + .quad 0x3ff70777cb4ec594 + .quad 0x3ff70976af1a42ec + .quad 0x3ff70b75bf2b915c + .quad 0x3ff70d74fb868710 + .quad 0x3ff70f74642efa85 + .quad 0x3ff71173f928c291 + .quad 0x3ff71373ba77b65f + .quad 0x3ff71573a81fad6d + .quad 0x3ff71773c2247f90 + .quad 0x3ff71974088a04f2 + .quad 0x3ff71b747b541612 + .quad 0x3ff71d751a868bc4 + .quad 0x3ff71f75e6253f32 + .quad 0x3ff72176de3409db + .quad 0x3ff7237802b6c593 + .quad 0x3ff7257953b14c84 + .quad 0x3ff7277ad127792f + .quad 0x3ff7297c7b1d2667 + .quad 0x3ff72b7e51962f56 + .quad 0x3ff72d8054966f7e + .quad 0x3ff72f828421c2b3 + .quad 0x3ff73184e03c0520 + .quad 0x3ff7338768e91346 + .quad 0x3ff7358a1e2cc9fc + .quad 0x3ff7378d000b066d + .quad 0x3ff739900e87a61c + .quad 0x3ff73b9349a686df + .quad 0x3ff73d96b16b86e5 + .quad 0x3ff73f9a45da84b1 + .quad 0x3ff7419e06f75f1a + .quad 0x3ff743a1f4c5f551 + .quad 0x3ff745a60f4a26da + .quad 0x3ff747aa5687d38f + .quad 0x3ff749aeca82dba1 + .quad 0x3ff74bb36b3f1f98 + .quad 0x3ff74db838c0804e + .quad 0x3ff74fbd330adef7 + .quad 0x3ff751c25a221d1c + .quad 0x3ff753c7ae0a1c9b + .quad 0x3ff755cd2ec6bfaa + .quad 0x3ff757d2dc5be8d3 + .quad 0x3ff759d8b6cd7af8 + .quad 0x3ff75bdebe1f5950 + .quad 0x3ff75de4f2556769 + .quad 0x3ff75feb53738927 + .quad 0x3ff761f1e17da2c4 + .quad 0x3ff763f89c7798d0 + .quad 0x3ff765ff84655034 + .quad 0x3ff76806994aae2c + .quad 0x3ff76a0ddb2b984c + .quad 0x3ff76c154a0bf47d + .quad 0x3ff76e1ce5efa903 + .quad 0x3ff77024aeda9c72 + .quad 0x3ff7722ca4d0b5ba + .quad 0x3ff77434c7d5dc1c + .quad 0x3ff7763d17edf738 + .quad 0x3ff77845951ceefb + .quad 
0x3ff77a4e3f66abb0 + .quad 0x3ff77c5716cf15f4 + .quad 0x3ff77e601b5a16bd + .quad 0x3ff780694d0b9758 + .quad 0x3ff78272abe78169 + .quad 0x3ff7847c37f1bee8 + .quad 0x3ff78685f12e3a27 + .quad 0x3ff7888fd7a0ddcc + .quad 0x3ff78a99eb4d94d8 + .quad 0x3ff78ca42c384a9f + .quad 0x3ff78eae9a64eacc + .quad 0x3ff790b935d76165 + .quad 0x3ff792c3fe939ac3 + .quad 0x3ff794cef49d8396 + .quad 0x3ff796da17f908e9 + .quad 0x3ff798e568aa181a + .quad 0x3ff79af0e6b49ee0 + .quad 0x3ff79cfc921c8b49 + .quad 0x3ff79f086ae5cbba + .quad 0x3ff7a11471144eef + .quad 0x3ff7a320a4ac03fa + .quad 0x3ff7a52d05b0da48 + .quad 0x3ff7a7399426c19b + .quad 0x3ff7a9465011aa0b + .quad 0x3ff7ab5339758409 + .quad 0x3ff7ad605056405d + .quad 0x3ff7af6d94b7d027 + .quad 0x3ff7b17b069e24de + .quad 0x3ff7b388a60d3050 + .quad 0x3ff7b5967308e4a3 + .quad 0x3ff7b7a46d953455 + .quad 0x3ff7b9b295b6123a + .quad 0x3ff7bbc0eb6f7180 + .quad 0x3ff7bdcf6ec545aa + .quad 0x3ff7bfde1fbb8295 + .quad 0x3ff7c1ecfe561c73 + .quad 0x3ff7c3fc0a9907d2 + .quad 0x3ff7c60b44883993 + .quad 0x3ff7c81aac27a6f1 + .quad 0x3ff7ca2a417b4580 + .quad 0x3ff7cc3a04870b28 + .quad 0x3ff7ce49f54eee2d + .quad 0x3ff7d05a13d6e52a + .quad 0x3ff7d26a6022e710 + .quad 0x3ff7d47ada36eb2a + .quad 0x3ff7d68b8216e919 + .quad 0x3ff7d89c57c6d8d7 + .quad 0x3ff7daad5b4ab2b8 + .quad 0x3ff7dcbe8ca66f64 + .quad 0x3ff7decfebde07de + .quad 0x3ff7e0e178f57582 + .quad 0x3ff7e2f333f0b201 + .quad 0x3ff7e5051cd3b766 + .quad 0x3ff7e71733a28014 + .quad 0x3ff7e929786106c7 + .quad 0x3ff7eb3beb134693 + .quad 0x3ff7ed4e8bbd3ae3 + .quad 0x3ff7ef615a62df7a + .quad 0x3ff7f17457083077 + .quad 0x3ff7f38781b12a4e + .quad 0x3ff7f59ada61c9cc + .quad 0x3ff7f7ae611e0c17 + .quad 0x3ff7f9c215e9eeae + .quad 0x3ff7fbd5f8c96f66 + .quad 0x3ff7fdea09c08c72 + .quad 0x3ff7fffe48d34457 + .quad 0x3ff80212b60595f7 + .quad 0x3ff80427515b808b + .quad 0x3ff8063c1ad903a4 + .quad 0x3ff8085112821f2e + .quad 0x3ff80a66385ad36d + .quad 0x3ff80c7b8c6720fb + .quad 0x3ff80e910eab08ce + .quad 0x3ff810a6bf2a8c34 + .quad 0x3ff812bc9de9acd3 + .quad 0x3ff814d2aaec6cab + .quad 0x3ff816e8e636ce15 + .quad 0x3ff818ff4fccd3c0 + .quad 0x3ff81b15e7b280b9 + .quad 0x3ff81d2cadebd863 + .quad 0x3ff81f43a27cde79 + .quad 0x3ff8215ac5699711 + .quad 0x3ff8237216b60699 + .quad 0x3ff82589966631da + .quad 0x3ff827a1447e1df3 + .quad 0x3ff829b92101d060 + .quad 0x3ff82bd12bf54ef1 + .quad 0x3ff82de9655c9fd6 + .quad 0x3ff83001cd3bc993 + .quad 0x3ff8321a6396d307 + .quad 0x3ff834332871c36a + .quad 0x3ff8364c1bd0a24e + .quad 0x3ff838653db7779f + .quad 0x3ff83a7e8e2a4ba1 + .quad 0x3ff83c980d2d26f1 + .quad 0x3ff83eb1bac41287 + .quad 0x3ff840cb96f317b4 + .quad 0x3ff842e5a1be4023 + .quad 0x3ff844ffdb2995d7 + .quad 0x3ff8471a4339232f + .quad 0x3ff84934d9f0f2e1 + .quad 0x3ff84b4f9f550fff + .quad 0x3ff84d6a936985f4 + .quad 0x3ff84f85b6326082 + .quad 0x3ff851a107b3abca + .quad 0x3ff853bc87f17443 + .quad 0x3ff855d836efc6bd + .quad 0x3ff857f414b2b067 + .quad 0x3ff85a10213e3ec4 + .quad 0x3ff85c2c5c967fb5 + .quad 0x3ff85e48c6bf8171 + .quad 0x3ff860655fbd528d + .quad 0x3ff86282279401f7 + .quad 0x3ff8649f1e479ef5 + .quad 0x3ff866bc43dc392a + .quad 0x3ff868d99855e08f + .quad 0x3ff86af71bb8a57c + .quad 0x3ff86d14ce08989e + .quad 0x3ff86f32af49cb03 + .quad 0x3ff87150bf804e0b + .quad 0x3ff8736efeb03378 + .quad 0x3ff8758d6cdd8d61 + .quad 0x3ff877ac0a0c6e38 + .quad 0x3ff879cad640e8cc + .quad 0x3ff87be9d17f1044 + .quad 0x3ff87e08fbcaf822 + .quad 0x3ff880285528b444 + .quad 0x3ff88247dd9c58df + .quad 0x3ff884679529fa86 + .quad 0x3ff886877bd5ae23 + .quad 0x3ff888a791a388ff + .quad 
0x3ff88ac7d697a0b9 + .quad 0x3ff88ce84ab60b4f + .quad 0x3ff88f08ee02df15 + .quad 0x3ff89129c08232be + .quad 0x3ff8934ac2381d54 + .quad 0x3ff8956bf328b63f + .quad 0x3ff8978d53581541 + .quad 0x3ff899aee2ca5273 + .quad 0x3ff89bd0a183864e + .quad 0x3ff89df28f87c9a5 + .quad 0x3ff8a014acdb35a2 + .quad 0x3ff8a236f981e3cd + .quad 0x3ff8a459757fee0b + .quad 0x3ff8a67c20d96e96 + .quad 0x3ff8a89efb928009 + .quad 0x3ff8aac205af3d57 + .quad 0x3ff8ace53f33c1ce + .quad 0x3ff8af08a8242919 + .quad 0x3ff8b12c40848f3b + .quad 0x3ff8b35008591095 + .quad 0x3ff8b573ffa5c9e2 + .quad 0x3ff8b798266ed839 + .quad 0x3ff8b9bc7cb8590d + .quad 0x3ff8bbe102866a27 + .quad 0x3ff8be05b7dd29b2 + .quad 0x3ff8c02a9cc0b632 + .quad 0x3ff8c24fb1352e86 + .quad 0x3ff8c474f53eb1e8 + .quad 0x3ff8c69a68e15fed + .quad 0x3ff8c8c00c215887 + .quad 0x3ff8cae5df02bc04 + .quad 0x3ff8cd0be189ab0a + .quad 0x3ff8cf3213ba46a0 + .quad 0x3ff8d1587598b023 + .quad 0x3ff8d37f07290950 + .quad 0x3ff8d5a5c86f743d + .quad 0x3ff8d7ccb970135d + .quad 0x3ff8d9f3da2f097f + .quad 0x3ff8dc1b2ab079ca + .quad 0x3ff8de42aaf887c7 + .quad 0x3ff8e06a5b0b5758 + .quad 0x3ff8e2923aed0cb7 + .quad 0x3ff8e4ba4aa1cc81 + .quad 0x3ff8e6e28a2dbba9 + .quad 0x3ff8e90af994ff81 + .quad 0x3ff8eb3398dbbdb7 + .quad 0x3ff8ed5c68061c54 + .quad 0x3ff8ef85671841bc + .quad 0x3ff8f1ae961654b0 + .quad 0x3ff8f3d7f5047c4f + .quad 0x3ff8f60183e6e012 + .quad 0x3ff8f82b42c1a7cf + .quad 0x3ff8fa553198fbb8 + .quad 0x3ff8fc7f5071045a + .quad 0x3ff8fea99f4deaa1 + .quad 0x3ff900d41e33d7d1 + .quad 0x3ff902fecd26f58f + .quad 0x3ff90529ac2b6dda + .quad 0x3ff90754bb456b0e + .quad 0x3ff9097ffa7917e2 + .quad 0x3ff90bab69ca9f6c + .quad 0x3ff90dd7093e2d1b + .quad 0x3ff91002d8d7ecbd + .quad 0x3ff9122ed89c0a7e + .quad 0x3ff9145b088eb2e4 + .quad 0x3ff9168768b412d0 + .quad 0x3ff918b3f9105783 + .quad 0x3ff91ae0b9a7ae9b + .quad 0x3ff91d0daa7e4610 + .quad 0x3ff91f3acb984c37 + .quad 0x3ff921681cf9efc3 + .quad 0x3ff923959ea75fc4 + .quad 0x3ff925c350a4cba7 + .quad 0x3ff927f132f66333 + .quad 0x3ff92a1f45a05690 + .quad 0x3ff92c4d88a6d63f + .quad 0x3ff92e7bfc0e1323 + .quad 0x3ff930aa9fda3e74 + .quad 0x3ff932d9740f89d1 + .quad 0x3ff9350878b2272d + .quad 0x3ff93737adc648dd + .quad 0x3ff9396713502192 + .quad 0x3ff93b96a953e45b + .quad 0x3ff93dc66fd5c4a2 + .quad 0x3ff93ff666d9f630 + .quad 0x3ff942268e64ad2b + .quad 0x3ff94456e67a1e16 + .quad 0x3ff946876f1e7dd2 + .quad 0x3ff948b82856019b + .quad 0x3ff94ae91224df0d + .quad 0x3ff94d1a2c8f4c1e + .quad 0x3ff94f4b77997f27 + .quad 0x3ff9517cf347aeda + .quad 0x3ff953ae9f9e1246 + .quad 0x3ff955e07ca0e0dd + .quad 0x3ff958128a545266 + .quad 0x3ff95a44c8bc9f0e + .quad 0x3ff95c7737ddff5a + .quad 0x3ff95ea9d7bcac2f + .quad 0x3ff960dca85cdecf + .quad 0x3ff9630fa9c2d0da + .quad 0x3ff96542dbf2bc4e + .quad 0x3ff967763ef0db86 + .quad 0x3ff969a9d2c1693a + .quad 0x3ff96bdd9768a084 + .quad 0x3ff96e118ceabcd7 + .quad 0x3ff97045b34bfa05 + .quad 0x3ff9727a0a90943f + .quad 0x3ff974ae92bcc816 + .quad 0x3ff976e34bd4d273 + .quad 0x3ff9791835dcf0a3 + .quad 0x3ff97b4d50d9604e + .quad 0x3ff97d829cce5f7c + .quad 0x3ff97fb819c02c8f + .quad 0x3ff981edc7b3064d + .quad 0x3ff98423a6ab2bd5 + .quad 0x3ff98659b6acdca7 + .quad 0x3ff9888ff7bc58a2 + .quad 0x3ff98ac669dde001 + .quad 0x3ff98cfd0d15b35d + .quad 0x3ff98f33e16813b0 + .quad 0x3ff9916ae6d94251 + .quad 0x3ff993a21d6d80f4 + .quad 0x3ff995d9852911ae + .quad 0x3ff998111e1036f2 + .quad 0x3ff99a48e827338e + .quad 0x3ff99c80e3724ab5 + .quad 0x3ff99eb90ff5bff1 + .quad 0x3ff9a0f16db5d730 + .quad 0x3ff9a329fcb6d4be + .quad 0x3ff9a562bcfcfd42 + .quad 
0x3ff9a79bae8c95c8 + .quad 0x3ff9a9d4d169e3b4 + .quad 0x3ff9ac0e25992ccd + .quad 0x3ff9ae47ab1eb739 + .quad 0x3ff9b08161fec979 + .quad 0x3ff9b2bb4a3daa71 + .quad 0x3ff9b4f563dfa161 + .quad 0x3ff9b72faee8f5e9 + .quad 0x3ff9b96a2b5df009 + .quad 0x3ff9bba4d942d81f + .quad 0x3ff9bddfb89bf6e9 + .quad 0x3ff9c01ac96d9580 + .quad 0x3ff9c2560bbbfd60 + .quad 0x3ff9c4917f8b7866 + .quad 0x3ff9c6cd24e050c8 + .quad 0x3ff9c908fbbed121 + .quad 0x3ff9cb45042b4467 + .quad 0x3ff9cd813e29f5f2 + .quad 0x3ff9cfbda9bf3179 + .quad 0x3ff9d1fa46ef430e + .quad 0x3ff9d43715be772a + .quad 0x3ff9d67416311aa0 + .quad 0x3ff9d8b1484b7aa2 + .quad 0x3ff9daeeac11e4c5 + .quad 0x3ff9dd2c4188a6fb + .quad 0x3ff9df6a08b40f94 + .quad 0x3ff9e1a801986d45 + .quad 0x3ff9e3e62c3a0f1d + .quad 0x3ff9e624889d448d + .quad 0x3ff9e86316c65d65 + .quad 0x3ff9eaa1d6b9a9d6 + .quad 0x3ff9ece0c87b7a6f + .quad 0x3ff9ef1fec102020 + .quad 0x3ff9f15f417bec36 + .quad 0x3ff9f39ec8c33062 + .quad 0x3ff9f5de81ea3eb2 + .quad 0x3ff9f81e6cf56995 + .quad 0x3ff9fa5e89e903d9 + .quad 0x3ff9fc9ed8c960ac + .quad 0x3ff9fedf599ad39d + .quad 0x3ffa01200c61b09a + .quad 0x3ffa0360f1224bf2 + .quad 0x3ffa05a207e0fa53 + .quad 0x3ffa07e350a210ca + .quad 0x3ffa0a24cb69e4c7 + .quad 0x3ffa0c66783ccc19 + .quad 0x3ffa0ea8571f1ced + .quad 0x3ffa10ea68152dd4 + .quad 0x3ffa132cab2355bc + .quad 0x3ffa156f204debf5 + .quad 0x3ffa17b1c7994830 + .quad 0x3ffa19f4a109c27b + .quad 0x3ffa1c37aca3b348 + .quad 0x3ffa1e7aea6b7367 + .quad 0x3ffa20be5a655c0a + .quad 0x3ffa2301fc95c6c4 + .quad 0x3ffa2545d1010d86 + .quad 0x3ffa2789d7ab8aa3 + .quad 0x3ffa29ce109998cf + .quad 0x3ffa2c127bcf931c + .quad 0x3ffa2e571951d502 + .quad 0x3ffa309be924ba55 + .quad 0x3ffa32e0eb4c9f4a + .quad 0x3ffa35261fcde079 + .quad 0x3ffa376b86acdad9 + .quad 0x3ffa39b11fedebc2 + .quad 0x3ffa3bf6eb9570ef + .quad 0x3ffa3e3ce9a7c878 + .quad 0x3ffa40831a2950d8 + .quad 0x3ffa42c97d1e68ec + .quad 0x3ffa4510128b6ff1 + .quad 0x3ffa4756da74c583 + .quad 0x3ffa499dd4dec9a2 + .quad 0x3ffa4be501cddcad + .quad 0x3ffa4e2c61465f66 + .quad 0x3ffa5073f34cb2f0 + .quad 0x3ffa52bbb7e538cc + .quad 0x3ffa5503af1452e0 + .quad 0x3ffa574bd8de6371 + .quad 0x3ffa59943547cd25 + .quad 0x3ffa5bdcc454f307 + .quad 0x3ffa5e25860a387d + .quad 0x3ffa606e7a6c0154 + .quad 0x3ffa62b7a17eb1b8 + .quad 0x3ffa6500fb46ae37 + .quad 0x3ffa674a87c85bbf + .quad 0x3ffa699447081fa2 + .quad 0x3ffa6bde390a5f91 + .quad 0x3ffa6e285dd3819f + .quad 0x3ffa7072b567ec43 + .quad 0x3ffa72bd3fcc0653 + .quad 0x3ffa7507fd043708 + .quad 0x3ffa7752ed14e5fb + .quad 0x3ffa799e10027b29 + .quad 0x3ffa7be965d15ef0 + .quad 0x3ffa7e34ee85fa0f + .quad 0x3ffa8080aa24b5a6 + .quad 0x3ffa82cc98b1fb3a + .quad 0x3ffa8518ba3234b0 + .quad 0x3ffa87650ea9cc4d + .quad 0x3ffa89b1961d2cbb + .quad 0x3ffa8bfe5090c106 + .quad 0x3ffa8e4b3e08f499 + .quad 0x3ffa90985e8a3344 + .quad 0x3ffa92e5b218e937 + .quad 0x3ffa953338b98307 + .quad 0x3ffa9780f2706da6 + .quad 0x3ffa99cedf42166e + .quad 0x3ffa9c1cff32eb19 + .quad 0x3ffa9e6b524759c1 + .quad 0x3ffaa0b9d883d0e6 + .quad 0x3ffaa30891ecbf66 + .quad 0x3ffaa5577e869486 + .quad 0x3ffaa7a69e55bfea + .quad 0x3ffaa9f5f15eb19b + .quad 0x3ffaac4577a5da02 + .quad 0x3ffaae95312fa9ec + .quad 0x3ffab0e51e009287 + .quad 0x3ffab3353e1d0565 + .quad 0x3ffab5859189747c + .quad 0x3ffab7d6184a5220 + .quad 0x3ffaba26d264110c + .quad 0x3ffabc77bfdb245d + .quad 0x3ffabec8e0b3ff90 + .quad 0x3ffac11a34f31687 + .quad 0x3ffac36bbc9cdd87 + .quad 0x3ffac5bd77b5c936 + .quad 0x3ffac80f66424e9f + .quad 0x3ffaca618846e330 + .quad 0x3ffaccb3ddc7fcb7 + .quad 0x3ffacf0666ca1167 + .quad 
0x3ffad159235197d6 + .quad 0x3ffad3ac136306fc + .quad 0x3ffad5ff3702d636 + .quad 0x3ffad8528e357d43 + .quad 0x3ffadaa618ff7445 + .quad 0x3ffadcf9d76533bf + .quad 0x3ffadf4dc96b349b + .quad 0x3ffae1a1ef15f025 + .quad 0x3ffae3f64869e00c + .quad 0x3ffae64ad56b7e60 + .quad 0x3ffae89f961f4598 + .quad 0x3ffaeaf48a89b08d + .quad 0x3ffaed49b2af3a7a + .quad 0x3ffaef9f0e945eff + .quad 0x3ffaf1f49e3d9a1f + .quad 0x3ffaf44a61af6840 + .quad 0x3ffaf6a058ee462d + .quad 0x3ffaf8f683feb114 + .quad 0x3ffafb4ce2e52685 + .quad 0x3ffafda375a62474 + .quad 0x3ffafffa3c46293a + .quad 0x3ffb025136c9b394 + .quad 0x3ffb04a8653542a2 + .quad 0x3ffb06ffc78d55e6 + .quad 0x3ffb09575dd66d48 + .quad 0x3ffb0baf28150913 + .quad 0x3ffb0e07264da9f8 + .quad 0x3ffb105f5884d106 + .quad 0x3ffb12b7bebeffb8 + .quad 0x3ffb15105900b7e6 + .quad 0x3ffb1769274e7bcf + .quad 0x3ffb19c229acce18 + .quad 0x3ffb1c1b602031c6 + .quad 0x3ffb1e74caad2a44 + .quad 0x3ffb20ce69583b61 + .quad 0x3ffb23283c25e951 + .quad 0x3ffb2582431ab8ab + .quad 0x3ffb27dc7e3b2e6b + .quad 0x3ffb2a36ed8bcff1 + .quad 0x3ffb2c9191112300 + .quad 0x3ffb2eec68cfadc2 + .quad 0x3ffb314774cbf6c3 + .quad 0x3ffb33a2b50a84f5 + .quad 0x3ffb35fe298fdfad + .quad 0x3ffb3859d2608ea7 + .quad 0x3ffb3ab5af811a00 + .quad 0x3ffb3d11c0f60a3b + .quad 0x3ffb3f6e06c3e840 + .quad 0x3ffb41ca80ef3d5d + .quad 0x3ffb44272f7c9343 + .quad 0x3ffb468412707405 + .quad 0x3ffb48e129cf6a20 + .quad 0x3ffb4b3e759e0071 + .quad 0x3ffb4d9bf5e0c23e + .quad 0x3ffb4ff9aa9c3b30 + .quad 0x3ffb525793d4f751 + .quad 0x3ffb54b5b18f8319 + .quad 0x3ffb571403d06b5b + .quad 0x3ffb59728a9c3d55 + .quad 0x3ffb5bd145f786a7 + .quad 0x3ffb5e3035e6d559 + .quad 0x3ffb608f5a6eb7d6 + .quad 0x3ffb62eeb393bcee + .quad 0x3ffb654e415a73d6 + .quad 0x3ffb67ae03c76c2a + .quad 0x3ffb6a0dfadf35e8 + .quad 0x3ffb6c6e26a66177 + .quad 0x3ffb6ece87217fa1 + .quad 0x3ffb712f1c552196 + .quad 0x3ffb738fe645d8e9 + .quad 0x3ffb75f0e4f83795 + .quad 0x3ffb78521870cffb + .quad 0x3ffb7ab380b434df + .quad 0x3ffb7d151dc6f96c + .quad 0x3ffb7f76efadb132 + .quad 0x3ffb81d8f66cf026 + .quad 0x3ffb843b32094aa4 + .quad 0x3ffb869da287556c + .quad 0x3ffb890047eba5a5 + .quad 0x3ffb8b63223ad0da + .quad 0x3ffb8dc631796cfe + .quad 0x3ffb902975ac1068 + .quad 0x3ffb928ceed751d6 + .quad 0x3ffb94f09cffc869 + .quad 0x3ffb9754802a0bab + .quad 0x3ffb99b8985ab38a + .quad 0x3ffb9c1ce596585d + .quad 0x3ffb9e8167e192dc + .quad 0x3ffba0e61f40fc29 + .quad 0x3ffba34b0bb92dca + .quad 0x3ffba5b02d4ec1ab + .quad 0x3ffba81584065220 + .quad 0x3ffbaa7b0fe479e1 + .quad 0x3ffbace0d0edd40c + .quad 0x3ffbaf46c726fc27 + .quad 0x3ffbb1acf2948e1f + .quad 0x3ffbb413533b2643 + .quad 0x3ffbb679e91f614c + .quad 0x3ffbb8e0b445dc58 + .quad 0x3ffbbb47b4b334eb + .quad 0x3ffbbdaeea6c08f0 + .quad 0x3ffbc0165574f6bb + .quad 0x3ffbc27df5d29d00 + .quad 0x3ffbc4e5cb899adf + .quad 0x3ffbc74dd69e8fdc + .quad 0x3ffbc9b617161be5 + .quad 0x3ffbcc1e8cf4df48 + .quad 0x3ffbce87383f7ac1 + .quad 0x3ffbd0f018fa8f6d + .quad 0x3ffbd3592f2abed3 + .quad 0x3ffbd5c27ad4aae0 + .quad 0x3ffbd82bfbfcf5e7 + .quad 0x3ffbda95b2a842a2 + .quad 0x3ffbdcff9edb3432 + .quad 0x3ffbdf69c09a6e20 + .quad 0x3ffbe1d417ea945a + .quad 0x3ffbe43ea4d04b36 + .quad 0x3ffbe6a967503772 + .quad 0x3ffbe9145f6efe30 + .quad 0x3ffbeb7f8d3144fc + .quad 0x3ffbedeaf09bb1c7 + .quad 0x3ffbf05689b2eaec + .quad 0x3ffbf2c2587b9729 + .quad 0x3ffbf52e5cfa5da6 + .quad 0x3ffbf79a9733e5f3 + .quad 0x3ffbfa07072cd804 + .quad 0x3ffbfc73ace9dc39 + .quad 0x3ffbfee0886f9b53 + .quad 0x3ffc014d99c2be80 + .quad 0x3ffc03bae0e7ef53 + .quad 0x3ffc06285de3d7c7 + .quad 
0x3ffc089610bb223d + .quad 0x3ffc0b03f9727980 + .quad 0x3ffc0d72180e88c1 + .quad 0x3ffc0fe06c93fb98 + .quad 0x3ffc124ef7077e06 + .quad 0x3ffc14bdb76dbc74 + .quad 0x3ffc172cadcb63b0 + .quad 0x3ffc199bda2520f2 + .quad 0x3ffc1c0b3c7fa1d9 + .quad 0x3ffc1e7ad4df946e + .quad 0x3ffc20eaa349a71c + .quad 0x3ffc235aa7c288be + .quad 0x3ffc25cae24ee890 + .quad 0x3ffc283b52f37637 + .quad 0x3ffc2aabf9b4e1c5 + .quad 0x3ffc2d1cd697dbaf + .quad 0x3ffc2f8de9a114d2 + .quad 0x3ffc31ff32d53e76 + .quad 0x3ffc3470b2390a49 + .quad 0x3ffc36e267d12a62 + .quad 0x3ffc395453a25140 + .quad 0x3ffc3bc675b131cb + .quad 0x3ffc3e38ce027f50 + .quad 0x3ffc40ab5c9aed89 + .quad 0x3ffc431e217f3095 + .quad 0x3ffc45911cb3fcfd + .quad 0x3ffc48044e3e07b0 + .quad 0x3ffc4a77b6220609 + .quad 0x3ffc4ceb5464adc8 + .quad 0x3ffc4f5f290ab517 + .quad 0x3ffc51d33418d28a + .quad 0x3ffc54477593bd1c + .quad 0x3ffc56bbed802c30 + .quad 0x3ffc59309be2d792 + .quad 0x3ffc5ba580c07778 + .quad 0x3ffc5e1a9c1dc47f + .quad 0x3ffc608fedff77ae + .quad 0x3ffc6305766a4a74 + .quad 0x3ffc657b3562f6a9 + .quad 0x3ffc67f12aee368d + .quad 0x3ffc6a675710c4cc + .quad 0x3ffc6cddb9cf5c77 + .quad 0x3ffc6f54532eb909 + .quad 0x3ffc71cb23339668 + .quad 0x3ffc744229e2b0e1 + .quad 0x3ffc76b96740c52b + .quad 0x3ffc7930db529065 + .quad 0x3ffc7ba8861cd01a + .quad 0x3ffc7e2067a44239 + .quad 0x3ffc80987feda51f + .quad 0x3ffc8310cefdb791 + .quad 0x3ffc858954d938bc + .quad 0x3ffc88021184e837 + .quad 0x3ffc8a7b05058602 + .quad 0x3ffc8cf42f5fd289 + .quad 0x3ffc8f6d90988e9c + .quad 0x3ffc91e728b47b79 + .quad 0x3ffc9460f7b85ac7 + .quad 0x3ffc96dafda8ee95 + .quad 0x3ffc99553a8af95b + .quad 0x3ffc9bcfae633dfe + .quad 0x3ffc9e4a59367fca + .quad 0x3ffca0c53b098273 + .quad 0x3ffca34053e10a1b + .quad 0x3ffca5bba3c1db4b + .quad 0x3ffca8372ab0baf6 + .quad 0x3ffcaab2e8b26e78 + .quad 0x3ffcad2eddcbbb9a + .quad 0x3ffcafab0a01688c + .quad 0x3ffcb2276d583be7 + .quad 0x3ffcb4a407d4fcb3 + .quad 0x3ffcb720d97c725c + .quad 0x3ffcb99de25364bb + .quad 0x3ffcbc1b225e9c14 + .quad 0x3ffcbe9899a2e114 + .quad 0x3ffcc1164824fcd0 + .quad 0x3ffcc3942de9b8ca + .quad 0x3ffcc6124af5deee + .quad 0x3ffcc8909f4e3990 + .quad 0x3ffccb0f2af79372 + .quad 0x3ffccd8dedf6b7bd + .quad 0x3ffcd00ce8507204 + .quad 0x3ffcd28c1a098e48 + .quad 0x3ffcd50b8326d8f2 + .quad 0x3ffcd78b23ad1ed5 + .quad 0x3ffcda0afba12d30 + .quad 0x3ffcdc8b0b07d1aa + .quad 0x3ffcdf0b51e5da58 + .quad 0x3ffce18bd04015b7 + .quad 0x3ffce40c861b52b1 + .quad 0x3ffce68d737c6096 + .quad 0x3ffce90e98680f28 + .quad 0x3ffceb8ff4e32e8c + .quad 0x3ffcee1188f28f58 + .quad 0x3ffcf093549b0289 + .quad 0x3ffcf31557e15988 + .quad 0x3ffcf59792ca6629 + .quad 0x3ffcf81a055afaab + .quad 0x3ffcfa9caf97e9b8 + .quad 0x3ffcfd1f91860666 + .quad 0x3ffcffa2ab2a2432 + .quad 0x3ffd0225fc891709 + .quad 0x3ffd04a985a7b341 + .quad 0x3ffd072d468acd9b + .quad 0x3ffd09b13f373b42 + .quad 0x3ffd0c356fb1d1ce + .quad 0x3ffd0eb9d7ff6743 + .quad 0x3ffd113e7824d20f + .quad 0x3ffd13c35026e90b + .quad 0x3ffd1648600a837b + .quad 0x3ffd18cda7d4790f + .quad 0x3ffd1b532789a1e4 + .quad 0x3ffd1dd8df2ed681 + .quad 0x3ffd205ecec8efd8 + .quad 0x3ffd22e4f65cc746 + .quad 0x3ffd256b55ef3696 + .quad 0x3ffd27f1ed8517fd + .quad 0x3ffd2a78bd23461a + .quad 0x3ffd2cffc4ce9bfe + .quad 0x3ffd2f87048bf51e + .quad 0x3ffd320e7c602d5e + .quad 0x3ffd34962c50210e + .quad 0x3ffd371e1460aced + .quad 0x3ffd39a63496ae1e + .quad 0x3ffd3c2e8cf70237 + .quad 0x3ffd3eb71d868736 + .quad 0x3ffd413fe64a1b88 + .quad 0x3ffd43c8e7469e02 + .quad 0x3ffd46522080edeb + .quad 0x3ffd48db91fdeaf0 + .quad 0x3ffd4b653bc2752c + .quad 
0x3ffd4def1dd36d29 + .quad 0x3ffd50793835b3da + .quad 0x3ffd53038aee2a9f + .quad 0x3ffd558e1601b344 + .quad 0x3ffd5818d9753003 + .quad 0x3ffd5aa3d54d8381 + .quad 0x3ffd5d2f098f90cf + .quad 0x3ffd5fba76403b6b + .quad 0x3ffd62461b64673f + .quad 0x3ffd64d1f900f8a4 + .quad 0x3ffd675e0f1ad45a + .quad 0x3ffd69ea5db6df94 + .quad 0x3ffd6c76e4d9ffed + .quad 0x3ffd6f03a4891b6e + .quad 0x3ffd71909cc9188f + .quad 0x3ffd741dcd9ede30 + .quad 0x3ffd76ab370f53a1 + .quad 0x3ffd7938d91f609f + .quad 0x3ffd7bc6b3d3ed53 + .quad 0x3ffd7e54c731e251 + .quad 0x3ffd80e3133e289e + .quad 0x3ffd837197fda9a8 + .quad 0x3ffd860055754f4c + .quad 0x3ffd888f4baa03d3 + .quad 0x3ffd8b1e7aa0b1f5 + .quad 0x3ffd8dade25e44d5 + .quad 0x3ffd903d82e7a803 + .quad 0x3ffd92cd5c41c77f + .quad 0x3ffd955d6e718fb2 + .quad 0x3ffd97edb97bed76 + .quad 0x3ffd9a7e3d65ce10 + .quad 0x3ffd9d0efa341f33 + .quad 0x3ffd9f9fefebceff + .quad 0x3ffda2311e91cc02 + .quad 0x3ffda4c2862b0536 + .quad 0x3ffda75426bc6a05 + .quad 0x3ffda9e6004aea45 + .quad 0x3ffdac7812db7638 + .quad 0x3ffdaf0a5e72fe91 + .quad 0x3ffdb19ce316746e + .quad 0x3ffdb42fa0cac95a + .quad 0x3ffdb6c29794ef50 + .quad 0x3ffdb955c779d8b8 + .quad 0x3ffdbbe9307e7867 + .quad 0x3ffdbe7cd2a7c1a1 + .quad 0x3ffdc110adfaa815 + .quad 0x3ffdc3a4c27c1fe4 + .quad 0x3ffdc63910311d9a + .quad 0x3ffdc8cd971e9631 + .quad 0x3ffdcb6257497f13 + .quad 0x3ffdcdf750b6ce17 + .quad 0x3ffdd08c836b797f + .quad 0x3ffdd321ef6c7800 + .quad 0x3ffdd5b794bec0bc + .quad 0x3ffdd84d73674b3f + .quad 0x3ffddae38b6b0f89 + .quad 0x3ffddd79dccf0603 + .quad 0x3ffde01067982789 + .quad 0x3ffde2a72bcb6d61 + .quad 0x3ffde53e296dd143 + .quad 0x3ffde7d560844d54 + .quad 0x3ffdea6cd113dc26 + .quad 0x3ffded047b2178bb + .quad 0x3ffdef9c5eb21e83 + .quad 0x3ffdf2347bcac95e + .quad 0x3ffdf4ccd2707596 + .quad 0x3ffdf76562a81feb + .quad 0x3ffdf9fe2c76c585 + .quad 0x3ffdfc972fe163fd + .quad 0x3ffdff306cecf95b + .quad 0x3ffe01c9e39e8418 + .quad 0x3ffe046393fb0315 + .quad 0x3ffe06fd7e0775aa + .quad 0x3ffe0997a1c8db99 + .quad 0x3ffe0c31ff443512 + .quad 0x3ffe0ecc967e82b9 + .quad 0x3ffe1167677cc59c + .quad 0x3ffe14027243ff3b + .quad 0x3ffe169db6d93183 + .quad 0x3ffe193935415ed1 + .quad 0x3ffe1bd4ed8189f2 + .quad 0x3ffe1e70df9eb621 + .quad 0x3ffe210d0b9de709 + .quad 0x3ffe23a9718420c3 + .quad 0x3ffe2646115667d9 + .quad 0x3ffe28e2eb19c142 + .quad 0x3ffe2b7ffed33266 + .quad 0x3ffe2e1d4c87c11e + .quad 0x3ffe30bad43c73ae + .quad 0x3ffe335895f650cf + .quad 0x3ffe35f691ba5fa4 + .quad 0x3ffe3894c78da7c2 + .quad 0x3ffe3b333775312f + .quad 0x3ffe3dd1e176045e + .quad 0x3ffe4070c5952a35 + .quad 0x3ffe430fe3d7ac06 + .quad 0x3ffe45af3c429394 + .quad 0x3ffe484ecedaeb14 + .quad 0x3ffe4aee9ba5bd26 + .quad 0x3ffe4d8ea2a814df + .quad 0x3ffe502ee3e6fdc2 + .quad 0x3ffe52cf5f6783c0 + .quad 0x3ffe5570152eb33c + .quad 0x3ffe581105419909 + .quad 0x3ffe5ab22fa54269 + .quad 0x3ffe5d53945ebd0f + .quad 0x3ffe5ff53373171e + .quad 0x3ffe62970ce75f28 + .quad 0x3ffe653920c0a430 + .quad 0x3ffe67db6f03f5ab + .quad 0x3ffe6a7df7b6637a + .quad 0x3ffe6d20badcfdf3 + .quad 0x3ffe6fc3b87cd5d9 + .quad 0x3ffe7266f09afc62 + .quad 0x3ffe750a633c8332 + .quad 0x3ffe77ae10667c5d + .quad 0x3ffe7a51f81dfa6b + .quad 0x3ffe7cf61a681052 + .quad 0x3ffe7f9a7749d178 + .quad 0x3ffe823f0ec851b6 + .quad 0x3ffe84e3e0e8a554 + .quad 0x3ffe8788edafe10a + .quad 0x3ffe8a2e35231a01 + .quad 0x3ffe8cd3b74765d6 + .quad 0x3ffe8f797421da93 + .quad 0x3ffe921f6bb78eb2 + .quad 0x3ffe94c59e0d9924 + .quad 0x3ffe976c0b291144 + .quad 0x3ffe9a12b30f0ee0 + .quad 0x3ffe9cb995c4aa3b + .quad 0x3ffe9f60b34efc02 + .quad 
0x3ffea2080bb31d5a + .quad 0x3ffea4af9ef627d4 + .quad 0x3ffea7576d1d3575 + .quad 0x3ffea9ff762d60b2 + .quad 0x3ffeaca7ba2bc471 + .quad 0x3ffeaf50391d7c09 + .quad 0x3ffeb1f8f307a346 + .quad 0x3ffeb4a1e7ef5660 + .quad 0x3ffeb74b17d9b203 + .quad 0x3ffeb9f482cbd34b + .quad 0x3ffebc9e28cad7ca + .quad 0x3ffebf4809dbdd7c + .quad 0x3ffec1f2260402d5 + .quad 0x3ffec49c7d4866b8 + .quad 0x3ffec7470fae2879 + .quad 0x3ffec9f1dd3a67df + .quad 0x3ffecc9ce5f24521 + .quad 0x3ffecf4829dae0eb + .quad 0x3ffed1f3a8f95c56 + .quad 0x3ffed49f6352d8ef + .quad 0x3ffed74b58ec78b7 + .quad 0x3ffed9f789cb5e20 + .quad 0x3ffedca3f5f4ac0a + .quad 0x3ffedf509d6d85cb + .quad 0x3ffee1fd803b0f2a + .quad 0x3ffee4aa9e626c5f + .quad 0x3ffee757f7e8c217 + .quad 0x3ffeea058cd3356e + .quad 0x3ffeecb35d26ebf2 + .quad 0x3ffeef6168e90ba5 + .quad 0x3ffef20fb01ebafb + .quad 0x3ffef4be32cd20da + .quad 0x3ffef76cf0f9649a + .quad 0x3ffefa1beaa8ae04 + .quad 0x3ffefccb1fe02556 + .quad 0x3ffeff7a90a4f33f + .quad 0x3fff022a3cfc40e1 + .quad 0x3fff04da24eb37d0 + .quad 0x3fff078a48770213 + .quad 0x3fff0a3aa7a4ca23 + .quad 0x3fff0ceb4279baea + .quad 0x3fff0f9c18faffca + .quad 0x3fff124d2b2dc491 + .quad 0x3fff14fe79173584 + .quad 0x3fff17b002bc7f5a + .quad 0x3fff1a61c822cf3c + .quad 0x3fff1d13c94f52c7 + .quad 0x3fff1fc606473809 + .quad 0x3fff22787f0fad85 + .quad 0x3fff252b33ade22f + .quad 0x3fff27de24270571 + .quad 0x3fff2a9150804723 + .quad 0x3fff2d44b8bed796 + .quad 0x3fff2ff85ce7e78a + .quad 0x3fff32ac3d00a832 + .quad 0x3fff3560590e4b38 + .quad 0x3fff3814b11602b5 + .quad 0x3fff3ac9451d0138 + .quad 0x3fff3d7e152879c2 + .quad 0x3fff4033213d9fc8 + .quad 0x3fff42e86961a731 + .quad 0x3fff459ded99c45a + .quad 0x3fff4853adeb2c11 + .quad 0x3fff4b09aa5b1398 + .quad 0x3fff4dbfe2eeb0a6 + .quad 0x3fff507657ab3963 + .quad 0x3fff532d0895e46e + .quad 0x3fff55e3f5b3e8d8 + .quad 0x3fff589b1f0a7e23 + .quad 0x3fff5b52849edc4a + .quad 0x3fff5e0a26763bb8 + .quad 0x3fff60c20495d54d + .quad 0x3fff637a1f02e25c + .quad 0x3fff663275c29cab + .quad 0x3fff68eb08da3e7a + .quad 0x3fff6ba3d84f0275 + .quad 0x3fff6e5ce42623c1 + .quad 0x3fff71162c64ddf3 + .quad 0x3fff73cfb1106d1b + .quad 0x3fff7689722e0db5 + .quad 0x3fff79436fc2fcb6 + .quad 0x3fff7bfda9d47787 + .quad 0x3fff7eb82067bc04 + .quad 0x3fff8172d382087c + .quad 0x3fff842dc3289bb5 + .quad 0x3fff86e8ef60b4ea + .quad 0x3fff89a4582f93c7 + .quad 0x3fff8c5ffd9a786e + .quad 0x3fff8f1bdfa6a377 + .quad 0x3fff91d7fe5955eb + .quad 0x3fff949459b7d14b + .quad 0x3fff9750f1c7578c + .quad 0x3fff9a0dc68d2b16 + .quad 0x3fff9ccad80e8ec8 + .quad 0x3fff9f882650c5f2 + .quad 0x3fffa245b159145c + .quad 0x3fffa503792cbe42 + .quad 0x3fffa7c17dd10856 + .quad 0x3fffaa7fbf4b37bd + .quad 0x3fffad3e3da09211 + .quad 0x3fffaffcf8d65d61 + .quad 0x3fffb2bbf0f1e031 + .quad 0x3fffb57b25f8617d + .quad 0x3fffb83a97ef28b2 + .quad 0x3fffbafa46db7db4 + .quad 0x3fffbdba32c2a8db + .quad 0x3fffc07a5ba9f2f6 + .quad 0x3fffc33ac196a548 + .quad 0x3fffc5fb648e098a + .quad 0x3fffc8bc449569e9 + .quad 0x3fffcb7d61b21108 + .quad 0x3fffce3ebbe94a01 + .quad 0x3fffd10053406061 + .quad 0x3fffd3c227bca02c + .quad 0x3fffd684396355da + .quad 0x3fffd9468839ce5a + .quad 0x3fffdc0914455712 + .quad 0x3fffdecbdd8b3dd8 + .quad 0x3fffe18ee410d0ff + .quad 0x3fffe45227db5f4b + .quad 0x3fffe715a8f037f6 + .quad 0x3fffe9d96754aab1 + .quad 0x3fffec9d630e07a4 + .quad 0x3fffef619c219f69 + .quad 0x3ffff2261294c314 + .quad 0x3ffff4eac66cc42c + .quad 0x3ffff7afb7aef4b0 + .quad 0x3ffffa74e660a715 + .quad 0x3ffffd3a52872e44 + .quad 0x3ffffffffc27dd9e + .rept 56 + .byte 0 + .endr + +/* Other general 
purpose constants: + * _dbInvLn2 */ +double_vector __dbInvLn2 0x40a71547652b82fe + +/* _dbShifter */ +double_vector __dbShifter 0x4338000000000000 + +/* _dbHALF */ +double_vector __dbHALF 0x3fe0000000000000 + +/* _dbC1 = 2^(1/2^K)-1 */ +double_vector __dbC1 0x3f362f3904051fa1 + +/* _lbLOWKBITS = 2^K-1 */ +double_vector __lbLOWKBITS 0x00000000000007ff + +/* _iAbsMask */ +float_vector __iAbsMask 0x7fffffff + +/* _iDomainRange */ +float_vector __iDomainRange 0x4059fe36 + .type __svml_spow_data,@object + .size __svml_spow_data,.-__svml_spow_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.h new file mode 100644 index 0000000000..016dbf7fce --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_powf_data.h @@ -0,0 +1,76 @@ +/* Offsets for data table for function powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef S_POWF_DATA_H +#define S_POWF_DATA_H + +#define _Log2Rcp_lookup -4218496 +#define _NMINNORM 0 +#define _NMAXVAL 64 +#define _INF 128 +#define _ABSMASK 192 +#define _DOMAINRANGE 256 +#define _Log_HA_table 320 +#define _Log_LA_table 8576 +#define _poly_coeff_1 12736 +#define _poly_coeff_2 12800 +#define _poly_coeff_3 12864 +#define _poly_coeff_4 12928 +#define _ExpMask 12992 +#define _Two10 13056 +#define _MinNorm 13120 +#define _MaxNorm 13184 +#define _HalfMask 13248 +#define _One 13312 +#define _L2H 13376 +#define _L2L 13440 +#define _Threshold 13504 +#define _Bias 13568 +#define _Bias1 13632 +#define _L2 13696 +#define _dInfs 13760 +#define _dOnes 13824 +#define _dZeros 13888 +#define __dbT 13952 +#define __dbInvLn2 30400 +#define __dbShifter 30464 +#define __dbHALF 30528 +#define __dbC1 30592 +#define __lbLOWKBITS 30656 +#define __iAbsMask 30720 +#define __iDomainRange 30784 + +.macro double_vector offset value +.if .-__svml_spow_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +.macro float_vector offset value +.if .-__svml_spow_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S new file mode 100644 index 0000000000..d86c91380e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S @@ -0,0 +1,358 @@ +/* Function sincosf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
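[Editor's note on the `double_vector`/`float_vector` macros defined in svml_s_powf_data.h above: each invocation first checks that the assembler's location counter, measured from the table base `__svml_spow_data`, equals the offset constant from this header, and raises a hard `.err` if the .S data table and the header offsets ever drift apart; it then replicates the scalar across a full 64-byte vector lane (8 quads, or 16 longs for the float form). A minimal standalone sketch of the same idiom, using a hypothetical table `__tbl` and made-up values that are not part of the patch:

	.data
	.globl	__tbl
__tbl:
.macro guarded_quad offset value
.if .-__tbl != \offset		/* location counter must match the declared offset */
.err				/* fail at assembly time on layout drift */
.endif
	.quad	\value
.endm
	guarded_quad 0, 0x3ff0000000000000	/* 1.0 at offset 0 */
	guarded_quad 8, 0x4000000000000000	/* 2.0 at offset 8 */

Because the check runs at assembly time, any mismatch between the data layout and the offsets the vector code loads from breaks the build, rather than surfacing as a silent wrong-constant load at run time.]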
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16vl4l4_sincosf) +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf +END (_ZGVeN16vl4l4_sincosf) + +/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX512_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + /* Encoding for vmovups %zmm0, 384(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x44 + .byte 0x24 + .byte 0x06 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm1, 128(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4f + .byte 0x02 + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + /* Encoding for vmovups %zmm3, 256(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x5f + .byte 0x04 + /* Encoding for vmovups %zmm4, 320(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x67 + .byte 0x05 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 416(%rsp), %ymm0 + lea 32(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax 
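/* At this point the two AVX2 calls above have left the sixteen sin
   results at 0(%rsp)..63 and the sixteen cos results at 64(%rsp)..127,
   while the thirty-two destination pointers that arrived in
   %zmm1..%zmm4 were spilled to 128(%rsp) onward.  The surrounding block
   simply pairs them up: each movq loads one saved pointer, each movl
   stores one 32-bit result through it.  */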
+ movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -112(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -176(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $344, %esp + /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x8d + .byte 0x10 + .byte 0xff + .byte 0xff + .byte 0xff + /* Encoding for vmovdqa64 %zmm2, -304(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x95 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + /* Encoding for vmovaps %zmm0, -368(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x29 + .byte 0x85 + .byte 0x90 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + leal 32(%r12), %esi + vmovups -336(%ebp), %ymm0 + leal 32(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + 
vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $344, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN16vvv_sincosf) +WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf +END (_ZGVeN16vvv_sincosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S new file mode 100644 index 0000000000..2ab33b59a7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S @@ -0,0 +1,152 @@ +/* Function sincosf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4vl4l4_sincosf) +WRAPPER_IMPL_SSE2_fFF sincosf +END (_ZGVbN4vl4l4_sincosf) +libmvec_hidden_def (_ZGVbN4vl4l4_sincosf) + +/* SSE2 ISA version as wrapper to scalar (for vector + function declared with #pragma omp declare simd notinbranch). 
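Unlike the fFF variant above, the results here do not go through two
array pointers: the destinations arrive as vectors of pointers, with
%xmm1/%xmm2 holding the four 64-bit sin addresses and %xmm3/%xmm4 the
four cos addresses (on x32 the four 32-bit sin and cos addresses fit in
%xmm1 and %xmm2 alone). The wrapper spills them, runs scalar sincosf
once per lane into two scratch slots, and stores each pair through the
corresponding saved pointer; conceptually (a sketch only, not the
generated code):

	for (i = 0; i < 4; i++)
	  sincosf (x[i], sin_ptr[i], cos_ptr[i]);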
*/ +.macro WRAPPER_IMPL_SSE2_fFF_vvv callee +#ifndef __ILP32__ + subq $120, %rsp + cfi_adjust_cfa_offset(120) + movaps %xmm0, 96(%rsp) + lea (%rsp), %rdi + movdqa %xmm1, 32(%rdi) + lea 16(%rsp), %rsi + movdqa %xmm2, 32(%rsi) + movdqa %xmm3, 48(%rsi) + movdqa %xmm4, 64(%rsi) + call JUMPTARGET(\callee) + movss 100(%rsp), %xmm0 + lea 4(%rsp), %rdi + lea 20(%rsp), %rsi + call JUMPTARGET(\callee) + movss 104(%rsp), %xmm0 + lea 8(%rsp), %rdi + lea 24(%rsp), %rsi + call JUMPTARGET(\callee) + movss 108(%rsp), %xmm0 + lea 12(%rsp), %rdi + lea 28(%rsp), %rsi + call JUMPTARGET(\callee) + movq 32(%rsp), %rdx + movq 40(%rsp), %rsi + movq 48(%rsp), %r8 + movq 56(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 64(%rsp), %rax + movq 72(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 80(%rsp), %rdi + movq 88(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movl %r8d, (%r9) + addq $120, %rsp + cfi_adjust_cfa_offset(-120) + ret +#else + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + pushq %rbx + .cfi_def_cfa_offset 24 + .cfi_offset 3, -24 + subl $88, %esp + .cfi_def_cfa_offset 112 + leal 64(%rsp), %esi + movaps %xmm1, (%esp) + leal 48(%rsp), %edi + movaps %xmm2, 16(%esp) + movq %rsi, %rbp + movq %rdi, %rbx + movaps %xmm0, 32(%esp) + call JUMPTARGET(\callee) + movups 36(%esp), %xmm0 + leal 4(%rbp), %esi + leal 4(%rbx), %edi + call JUMPTARGET(\callee) + movups 40(%esp), %xmm0 + leal 8(%rbp), %esi + leal 8(%rbx), %edi + call JUMPTARGET(\callee) + movups 44(%esp), %xmm0 + leal 12(%rbp), %esi + leal 12(%rbx), %edi + call JUMPTARGET(\callee) + movq (%esp), %rax + movss 48(%esp), %xmm0 + movdqa (%esp), %xmm4 + movdqa 16(%esp), %xmm7 + movss %xmm0, (%eax) + movss 52(%esp), %xmm0 + pextrd $1, %xmm4, %eax + movss %xmm0, (%eax) + movq 8(%esp), %rax + movss 56(%esp), %xmm0 + movss %xmm0, (%eax) + movss 60(%esp), %xmm0 + pextrd $3, %xmm4, %eax + movss %xmm0, (%eax) + movq 16(%esp), %rax + movss 64(%esp), %xmm0 + movss %xmm0, (%eax) + movss 68(%esp), %xmm0 + pextrd $1, %xmm7, %eax + movss %xmm0, (%eax) + movq 24(%esp), %rax + movss 72(%esp), %xmm0 + movss %xmm0, (%eax) + movss 76(%esp), %xmm0 + pextrd $3, %xmm7, %eax + movss %xmm0, (%eax) + addl $88, %esp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + ret +#endif +.endm + +ENTRY (_ZGVbN4vvv_sincosf) +WRAPPER_IMPL_SSE2_fFF_vvv sincosf +END (_ZGVbN4vvv_sincosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S new file mode 100644 index 0000000000..757d39c522 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S @@ -0,0 +1,200 @@ +/* Function sincosf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8vl4l4_sincosf) +WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf +END (_ZGVdN8vl4l4_sincosf) +libmvec_hidden_def (_ZGVdN8vl4l4_sincosf) + +/* AVX2 ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX2_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $224, %rsp + vmovups %ymm0, 192(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovups 208(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $184, %esp + vmovdqa %ymm1, -144(%ebp) + vmovdqa %ymm2, -176(%ebp) + vmovaps %ymm0, -208(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovups -192(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -144(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -140(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -136(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -132(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -128(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -124(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -120(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -116(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -176(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl 
-172(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -168(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -164(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -160(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -156(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -152(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -148(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $184, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVdN8vvv_sincosf) +WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN4vl4l4_sincosf +END (_ZGVdN8vvv_sincosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S new file mode 100644 index 0000000000..0955924cdd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S @@ -0,0 +1,198 @@ +/* Function sincosf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVcN8vl4l4_sincosf) +WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf +END (_ZGVcN8vl4l4_sincosf) + +/* AVX ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). 
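Eight lanes need sixteen destination addresses; in the 64-bit ABI
fourteen of them arrive packed two per register in %xmm1..%xmm7 and the
final two are passed on the stack, which is why the macro below reads
16(%rbp) and 24(%rbp) near the end. The SSE version is invoked twice,
once per 128-bit half of the input, and the eight sin and eight cos
results are then stored through the saved pointers one by one.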
*/ +.macro WRAPPER_IMPL_AVX_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + movq %rsp, %rbp + andq $-32, %rsp + subq $224, %rsp + vmovups %ymm0, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %xmm1, 96(%rdi) + vmovdqu %xmm2, 112(%rdi) + vmovdqu %xmm3, 128(%rdi) + vmovdqu %xmm4, 144(%rdi) + vmovdqu %xmm5, 160(%rdi) + lea 32(%rsp), %rsi + vmovdqu %xmm6, 144(%rsi) + vmovdqu %xmm7, 160(%rsi) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 80(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 96(%rsp), %rdx + movq 104(%rsp), %rsi + movq 112(%rsp), %r8 + movq 120(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 128(%rsp), %rax + movq 136(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 144(%rsp), %rdi + movq 152(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 160(%rsp), %r11 + movq 168(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 176(%rsp), %rsi + movq 184(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 192(%rsp), %r10 + movq 200(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 16(%rbp), %rcx + movq 24(%rbp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + popq %rbp + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $184, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovaps %xmm3, -160(%ebp) + vmovaps %xmm4, -176(%ebp) + vmovaps %ymm0, -208(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovups -192(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovss -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm7 + vmovdqa -144(%ebp), %xmm3 + vmovss %xmm0, (%eax) + vmovss -108(%ebp), %xmm0 + vpextrd $1, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -120(%ebp), %rax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -100(%ebp), %xmm0 + vpextrd $3, %xmm7, %eax + vmovdqa -160(%ebp), %xmm7 + vmovss %xmm0, (%eax) + movq -144(%ebp), %rax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -92(%ebp), %xmm0 + vpextrd $1, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -136(%ebp), %rax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -84(%ebp), %xmm0 + vpextrd $3, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -160(%ebp), %rax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -76(%ebp), %xmm0 + vpextrd $1, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -152(%ebp), %rax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -68(%ebp), %xmm0 + vpextrd $3, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -176(%ebp), %rax + vmovss -64(%ebp), %xmm0 + vmovdqa -176(%ebp), %xmm3 + vmovss %xmm0, (%eax) + vmovss -60(%ebp), %xmm0 + vpextrd $1, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -168(%ebp), %rax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -52(%ebp), %xmm0 + 
vpextrd $3, %xmm3, %eax + vmovss %xmm0, (%eax) + addl $184, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVcN8vvv_sincosf) +WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN4vl4l4_sincosf +END (_ZGVcN8vvv_sincosf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf16_core.S new file mode 100644 index 0000000000..14473da427 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf16_core.S @@ -0,0 +1,25 @@ +/* Function sinf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_sinf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +END (_ZGVeN16v_sinf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf4_core.S new file mode 100644 index 0000000000..910f39c7f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf4_core.S @@ -0,0 +1,30 @@ +/* Function sinf vectorized with SSE2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_sinf) +WRAPPER_IMPL_SSE2 sinf +END (_ZGVbN4v_sinf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_sinf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core.S new file mode 100644 index 0000000000..568c978a22 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core.S @@ -0,0 +1,29 @@ +/* Function sinf vectorized with AVX2, wrapper version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_sinf) +WRAPPER_IMPL_AVX _ZGVbN4v_sinf +END (_ZGVdN8v_sinf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_sinf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S new file mode 100644 index 0000000000..603f59ed1b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function sinf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVcN8v_sinf) +WRAPPER_IMPL_AVX _ZGVbN4v_sinf +END(_ZGVcN8v_sinf) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.S b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.S new file mode 100644 index 0000000000..19a569118f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.S @@ -0,0 +1,111 @@ +/* Data for function cosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_s_trig_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function cosf. + The table may contain polynomial, reduction, lookup coefficients + and other macro_names obtained through different methods + of research and experimental work. 
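Each entry below is expanded by float_vector into a full 64-byte line,
i.e. sixteen copies of one 32-bit constant, so the same table serves
SSE, AVX, and AVX-512 loads at their natural widths. Accordingly all
offsets in svml_s_trig_data.h are multiples of 64, and the float_vector
macro defined there stops the build with .err if the layout of this
table ever drifts out of step with those offsets.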
*/ + + .globl __svml_s_trig_data +__svml_s_trig_data: + +/* General purpose constants: + absolute value mask */ +float_vector __sAbsMask 0x7fffffff + +/* threshold for out-of-range values */ +float_vector __sRangeReductionVal 0x461c4000 + +/* +INF */ +float_vector __sRangeVal 0x7f800000 + +/* High Accuracy version polynomial coefficients: + S1 = -1.66666666664728165763e-01 */ +float_vector __sS1 0xbe2aaaab + +/* S2 = 8.33329173045453069014e-03 */ +float_vector __sS2 0x3c08885c + +/* C1 = -5.00000000000000000000e-01 */ +float_vector __sC1 0xbf000000 + +/* C2 = 4.16638942914469202550e-02 */ +float_vector __sC2 0x3d2aaa7c + +/* Range reduction PI-based constants: + PI high part */ +float_vector __sPI1 0x40490000 + +/* PI mid part 1 */ +float_vector __sPI2 0x3a7da000 + +/* PI mid part 2 */ +float_vector __sPI3 0x34222000 + +/* PI low part */ +float_vector __sPI4 0x2cb4611a + +/* PI1, PI2, and PI3 when FMA is available + PI high part (when FMA available) */ +float_vector __sPI1_FMA 0x40490fdb + +/* PI mid part (when FMA available) */ +float_vector __sPI2_FMA 0xb3bbbd2e + +/* PI low part (when FMA available) */ +float_vector __sPI3_FMA 0xa7772ced + +/* Polynomial constants for work w/o FMA, relative error ~ 2^(-26.625) */ +float_vector __sA3 0xbe2aaaa6 +float_vector __sA5 0x3c08876a +float_vector __sA7 0xb94fb7ff +float_vector __sA9 0x362edef8 + +/* Polynomial constants, work with FMA, relative error ~ 2^(-26.417) */ +float_vector __sA5_FMA 0x3c088768 +float_vector __sA7_FMA 0xb94fb6cf +float_vector __sA9_FMA 0x362ec335 + +/* 1/PI */ +float_vector __sInvPI 0x3ea2f983 + +/* right-shifter constant */ +float_vector __sRShifter 0x4b400000 + +/* PI/2 */ +float_vector __sHalfPI 0x3fc90fdb + +/* 1/2 */ +float_vector __sOneHalf 0x3f000000 + +/* high accuracy table index mask */ +float_vector __iIndexMask 0x000000ff + +/* 2^(k-1) */ +float_vector __i2pK_1 0x00000040 + +/* sign field mask */ +float_vector __sSignMask 0x80000000 + + .type __svml_s_trig_data,@object + .size __svml_s_trig_data,.-__svml_s_trig_data diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.h new file mode 100644 index 0000000000..04f4f7b1ed --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_trig_data.h @@ -0,0 +1,62 @@ +/* Offsets for data table for vectorized sinf, cosf, sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#ifndef S_TRIG_DATA_H +#define S_TRIG_DATA_H + +.macro float_vector offset value +.if .-__svml_s_trig_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#define __sAbsMask 0 +#define __sRangeReductionVal 64 +#define __sRangeVal 64*2 +#define __sS1 64*3 +#define __sS2 64*4 +#define __sC1 64*5 +#define __sC2 64*6 +#define __sPI1 64*7 +#define __sPI2 64*8 +#define __sPI3 64*9 +#define __sPI4 64*10 +#define __sPI1_FMA 64*11 +#define __sPI2_FMA 64*12 +#define __sPI3_FMA 64*13 +#define __sA3 64*14 +#define __sA5 64*15 +#define __sA7 64*16 +#define __sA9 64*17 +#define __sA5_FMA 64*18 +#define __sA7_FMA 64*19 +#define __sA9_FMA 64*20 +#define __sInvPI 64*21 +#define __sRShifter 64*22 +#define __sHalfPI 64*23 +#define __sOneHalf 64*24 +#define __iIndexMask 64*25 +#define __i2pK_1 64*26 +#define __sSignMask 64*27 +#define __dT_cosf 64*28 +#define __dT 64*92 + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h new file mode 100644 index 0000000000..cd6d58361c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h @@ -0,0 +1,371 @@ +/* Wrapper implementations of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2 callee + subq $40, %rsp + cfi_adjust_cfa_offset(40) + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + movss %xmm0, 16(%rsp) + movss 4(%rsp), %xmm0 + call JUMPTARGET(\callee) + movss %xmm0, 20(%rsp) + movss 8(%rsp), %xmm0 + call JUMPTARGET(\callee) + movss %xmm0, 24(%rsp) + movss 12(%rsp), %xmm0 + call JUMPTARGET(\callee) + movss 16(%rsp), %xmm3 + movss 20(%rsp), %xmm2 + movss 24(%rsp), %xmm1 + movss %xmm0, 28(%rsp) + unpcklps %xmm1, %xmm3 + unpcklps %xmm0, %xmm2 + unpcklps %xmm2, %xmm3 + movaps %xmm3, %xmm0 + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + ret +.endm + +/* 2 argument SSE2 ISA version as wrapper to scalar. */ +.macro WRAPPER_IMPL_SSE2_ff callee + subq $56, %rsp + cfi_adjust_cfa_offset(56) + movaps %xmm0, (%rsp) + movaps %xmm1, 16(%rsp) + call JUMPTARGET(\callee) + movss %xmm0, 32(%rsp) + movss 4(%rsp), %xmm0 + movss 20(%rsp), %xmm1 + call JUMPTARGET(\callee) + movss %xmm0, 36(%rsp) + movss 8(%rsp), %xmm0 + movss 24(%rsp), %xmm1 + call JUMPTARGET(\callee) + movss %xmm0, 40(%rsp) + movss 12(%rsp), %xmm0 + movss 28(%rsp), %xmm1 + call JUMPTARGET(\callee) + movss 32(%rsp), %xmm3 + movss 36(%rsp), %xmm2 + movss 40(%rsp), %xmm1 + movss %xmm0, 44(%rsp) + unpcklps %xmm1, %xmm3 + unpcklps %xmm0, %xmm2 + unpcklps %xmm2, %xmm3 + movaps %xmm3, %xmm0 + addq $56, %rsp + cfi_adjust_cfa_offset(-56) + ret +.endm + +/* 3 argument SSE2 ISA version as wrapper to scalar. 
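This is the sincosf shape: one vector input plus two output arrays
passed by pointer. The macro keeps the incoming %rdi/%rsi array
pointers in %rbp/%rbx, calls the scalar routine four times against a
pair of scratch slots at 24(%rsp)/28(%rsp), copies each scalar pair out
to the next element of both arrays, and rotates the next input lane
into %xmm0 with shufps/unpckhps between the calls.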
*/ +.macro WRAPPER_IMPL_SSE2_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + pushq %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + movq %rdi, %rbp + movq %rsi, %rbx + subq $40, %rsp + cfi_adjust_cfa_offset(40) + leaq 24(%rsp), %rsi + leaq 28(%rsp), %rdi + movaps %xmm0, (%rsp) + call JUMPTARGET(\callee) + leaq 24(%rsp), %rsi + leaq 28(%rsp), %rdi + movss 28(%rsp), %xmm0 + movss %xmm0, 0(%rbp) + movaps (%rsp), %xmm1 + movss 24(%rsp), %xmm0 + movss %xmm0, (%rbx) + movaps %xmm1, %xmm0 + shufps $85, %xmm1, %xmm0 + call JUMPTARGET(\callee) + movss 28(%rsp), %xmm0 + leaq 24(%rsp), %rsi + movss %xmm0, 4(%rbp) + leaq 28(%rsp), %rdi + movaps (%rsp), %xmm1 + movss 24(%rsp), %xmm0 + movss %xmm0, 4(%rbx) + movaps %xmm1, %xmm0 + unpckhps %xmm1, %xmm0 + call JUMPTARGET(\callee) + movaps (%rsp), %xmm1 + leaq 24(%rsp), %rsi + leaq 28(%rsp), %rdi + movss 28(%rsp), %xmm0 + shufps $255, %xmm1, %xmm1 + movss %xmm0, 8(%rbp) + movss 24(%rsp), %xmm0 + movss %xmm0, 8(%rbx) + movaps %xmm1, %xmm0 + call JUMPTARGET(\callee) + movss 28(%rsp), %xmm0 + movss %xmm0, 12(%rbp) + movss 24(%rsp), %xmm0 + movss %xmm0, 12(%rbx) + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + popq %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $32, %rsp + vextractf128 $1, %ymm0, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, 16(%rsp) + vmovaps (%rsp), %xmm0 + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, %xmm1 + vmovaps 16(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */ +.macro WRAPPER_IMPL_AVX_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $64, %rsp + vextractf128 $1, %ymm0, 16(%rsp) + vextractf128 $1, %ymm1, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, 32(%rsp) + vmovaps 16(%rsp), %xmm0 + vmovaps (%rsp), %xmm1 + call HIDDEN_JUMPTARGET(\callee) + vmovaps %xmm0, %xmm1 + vmovaps 32(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. 
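The 256-bit input is handled as two 128-bit halves: the first SSE call
still sees the caller's own %rdi/%rsi and writes the low four results
straight into the destination arrays, while the second call targets
stack scratch whose contents are then copied to offset 16 of each array
through the pointers saved in %r13/%r14.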
*/ +.macro WRAPPER_IMPL_AVX_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) + pushq %r14 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r14, 0) + subq $48, %rsp + movq %rsi, %r14 + vmovaps %ymm0, (%rsp) + movq %rdi, %r13 + vmovaps 16(%rsp), %xmm1 + vmovaps %xmm1, 32(%rsp) + vzeroupper + vmovaps (%rsp), %xmm0 + call HIDDEN_JUMPTARGET(\callee) + vmovaps 32(%rsp), %xmm0 + lea (%rsp), %rdi + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovaps (%rsp), %xmm0 + vmovaps 16(%rsp), %xmm1 + vmovaps %xmm0, 16(%r13) + vmovaps %xmm1, 16(%r14) + addq $48, %rsp + popq %r14 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r14) + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512 callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $128, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 64(%rsp) + vmovupd 32(%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 96(%rsp) +/* Below is encoding for vmovups 64(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x01 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_ff callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $192, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovups %zmm1, 64(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x01 + vmovups (%rsp), %ymm0 + vmovups 64(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovups %ymm0, 128(%rsp) + vmovups 32(%rsp), %ymm0 + vmovups 96(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovups %ymm0, 160(%rsp) +/* Below is encoding for vmovups 128(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x02 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ +.macro WRAPPER_IMPL_AVX512_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + pushq %r12 + pushq %r13 + subq $176, %rsp + movq %rsi, %r13 +/* Below is encoding for vmovaps %zmm0, (%rsp). 
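These AVX-512 moves are spelled out as raw bytes, presumably so the
files still assemble with binutils too old to know the mnemonics. They
are ordinary EVEX encodings: 0x62 starts the EVEX prefix, the next
three bytes select the 0F opcode map and the 512-bit vector length,
0x29 is the vmovaps store opcode, and 0x04 0x24 is the usual ModRM/SIB
pair for (%rsp). Note that an EVEX disp8 is scaled by the vector
length, so in the vmovups encodings above a trailing 0x01 or 0x02 means
a displacement of 64 or 128 bytes, not 1 or 2.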
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x29 + .byte 0x04 + .byte 0x24 + movq %rdi, %r12 + vmovaps (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovaps 32(%rsp), %ymm0 + lea 64(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovaps 64(%rsp), %ymm0 + vmovaps 96(%rsp), %ymm1 + vmovaps %ymm0, 32(%r12) + vmovaps %ymm1, 32(%r13) + addq $176, %rsp + popq %r13 + popq %r12 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c @@ -0,0 +1 @@ 
+#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c new file mode 100644 index 0000000000..d549c3ec19 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c @@ -0,0 +1,25 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <stdlib.h> +#include <math-tests-arch.h> + +#include "test-double.h" +#include "test-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias.c new file mode 100644 index 0000000000..c7048d346f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-alias.c @@ -0,0 +1,29 @@ +/* Part of test to ensure link against *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +extern int +test_finite_alias (void); + +static int +do_test (void) +{ + return test_finite_alias (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c new file mode 100644 index 0000000000..c33436dc0f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c @@ -0,0 +1,43 @@ +/* Test for vector sincos ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <math.h> + +#define N 1000 +double x[N], s[N], c[N]; +double* s_ptrs[N]; +double* c_ptrs[N]; + +int +test_sincos_abi (void) +{ + int i; + + for(i = 0; i < N; i++) + { + x[i] = i / 3; + s_ptrs[i] = &s[i]; + c_ptrs[i] = &c[i]; + } + +#pragma omp simd + for(i = 0; i < N; i++) + sincos (x[i], s_ptrs[i], c_ptrs[i]); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c new file mode 100644 index 0000000000..9be71edd93 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c @@ -0,0 +1,44 @@ +/* Test for vector sincos ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math-tests-arch.h> + +extern int test_sincos_abi (void); + +int arch_check = 1; + +static void +check_arch (void) +{ + CHECK_ARCH_EXT; + arch_check = 0; +} + +static int +do_test (void) +{ + check_arch (); + + if (arch_check) + return 77; + + return test_sincos_abi (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c new file mode 100644 index 0000000000..b4457f700a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c @@ -0,0 +1,33 @@ +/* Wrapper part of tests for SSE ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "test-double-vlen2.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m128d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow) + +#define VEC_INT_TYPE __m128i + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c new file mode 100644 index 0000000000..e6b991ceaf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c @@ -0,0 +1,40 @@ +/* Wrapper part of tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-double-vlen4.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen4_avx2 + +#define VEC_TYPE __m256d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVdN4v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow) + +#ifndef __ILP32__ +# define VEC_INT_TYPE __m256i +#else +# define VEC_INT_TYPE __m128i +#endif + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVdN4vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h new file mode 100644 index 0000000000..a15d4be31f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h @@ -0,0 +1,25 @@ +/* Tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <test-double-vlen4.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen4_avx2 + +#undef REQUIRE_AVX +#define REQUIRE_AVX2 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c new file mode 100644 index 0000000000..3606b6f55f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-double-vlen4.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m256d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVcN4v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4.h b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4.h new file mode 100644 index 0000000000..1698e621d6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen4.h @@ -0,0 +1,21 @@ +/* Tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include_next <test-double-vlen4.h> + +#define REQUIRE_AVX diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c new file mode 100644 index 0000000000..d77b43046d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX-512 versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-double-vlen8.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m512d + +VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVeN8v_cos) +VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin) +VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp) +VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow) + +#ifndef __ILP32__ +# define VEC_INT_TYPE __m512i +#else +# define VEC_INT_TYPE __m256i +#endif + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVeN8vvv_sincos) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8.h b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8.h new file mode 100644 index 0000000000..5802abc121 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-double-vlen8.h @@ -0,0 +1,21 @@ +/* Tests for AVX-512 versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include_next <test-double-vlen8.h> + +#define REQUIRE_AVX512F diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c new 
file mode 100644 index 0000000000..109307f997 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c @@ -0,0 +1,25 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <stdlib.h> +#include <math-tests-arch.h> + +#include "test-float.h" +#include "test-libmvec-alias-mod.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias.c new file mode 100644 index 0000000000..c7048d346f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-alias.c @@ -0,0 +1,29 @@ +/* Part of test to ensure link against *_finite aliases from libmvec. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +extern int +test_finite_alias (void); + +static int +do_test (void) +{ + return test_finite_alias (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c new file mode 100644 index 0000000000..5dd1efa8f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c @@ -0,0 +1,42 @@ +/* Test for vector sincosf ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <math.h> + +#define N 1000 +float x[N], s[N], c[N]; +float *s_ptrs[N]; +float *c_ptrs[N]; + +int +test_sincosf_abi (void) +{ + int i; + for(i = 0; i < N; i++) + { + x[i] = i / 3; + s_ptrs[i] = &s[i]; + c_ptrs[i] = &c[i]; + } + +#pragma omp simd + for(i = 0; i < N; i++) + sincosf (x[i], s_ptrs[i], c_ptrs[i]); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c new file mode 100644 index 0000000000..79543f5cb0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c @@ -0,0 +1,44 @@ +/* Test for vector sincosf ABI. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math-tests-arch.h> + +extern int test_sincosf_abi (void); + +int arch_check = 1; + +static void +check_arch (void) +{ + CHECK_ARCH_EXT; + arch_check = 0; +} + +static int +do_test (void) +{ + check_arch (); + + if (arch_check) + return 77; + + return test_sincosf_abi (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c new file mode 100644 index 0000000000..2e729e2770 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX-512 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
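The exit status 77 used by do_test when CHECK_ARCH_EXT trips is the test harness's "unsupported" code, so these ABI tests report skipped rather than failed on CPUs without the required extension. A sketch of the same guard, with __builtin_cpu_supports standing in (as an assumption) for the real CHECK_ARCH_EXT machinery:

extern int test_sincosf_abi (void);

static int
do_test_guarded (void)
{
  /* Hypothetical stand-in for CHECK_ARCH_EXT.  */
  if (!__builtin_cpu_supports ("avx2"))
    return 77;                  /* harness reads 77 as "unsupported" */
  return test_sincosf_abi ();
}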
*/ + +#include "test-float-vlen16.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m512 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVeN16v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf) + +#define VEC_INT_TYPE __m512i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16.h b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16.h new file mode 100644 index 0000000000..b2bfbf5371 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen16.h @@ -0,0 +1,21 @@ +/* Tests for AVX-512 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include_next <test-float-vlen16.h> + +#define REQUIRE_AVX512F diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c new file mode 100644 index 0000000000..a332a65236 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for SSE ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "test-float-vlen4.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m128 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c new file mode 100644 index 0000000000..511f9342a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c @@ -0,0 +1,43 @@ +/* Wrapper part of tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-float-vlen8.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen8_avx2 + +#define VEC_TYPE __m256 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVdN8v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf) + +/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */ +#undef VECTOR_WRAPPER_fFF + +#define VEC_INT_TYPE __m256i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h new file mode 100644 index 0000000000..4967f9d19b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h @@ -0,0 +1,25 @@ +/* Tests for AVX2 ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <test-float-vlen8.h> + +#undef VEC_SUFF +#define VEC_SUFF _vlen8_avx2 + +#undef REQUIRE_AVX +#define REQUIRE_AVX2 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c new file mode 100644 index 0000000000..5a3581b0c8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c @@ -0,0 +1,37 @@ +/* Wrapper part of tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "test-float-vlen8.h" +#include "test-math-vector-sincos.h" +#include <immintrin.h> + +#define VEC_TYPE __m256 + +VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVcN8v_cosf) +VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf) +VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf) +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_4 (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8.h b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8.h new file mode 100644 index 0000000000..23ef71c6c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-float-vlen8.h @@ -0,0 +1,21 @@ +/* Tests for AVX ISA versions of vector math functions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include_next <test-float-vlen8.h> + +#define REQUIRE_AVX diff --git a/REORG.TODO/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c b/REORG.TODO/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c new file mode 100644 index 0000000000..9746b0ae1c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c @@ -0,0 +1,66 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. 
+ Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define N 4000 +FLOAT log_arg[N]; +FLOAT exp_arg[N]; +FLOAT log_res[N]; +FLOAT exp_res[N]; +FLOAT pow_res[N]; +int arch_check = 1; + +static void +init_arg (void) +{ + int i; + + CHECK_ARCH_EXT; + + arch_check = 0; + + for (i = 0; i < N; i += 1) + { + log_arg[i] = 1.0; + exp_arg[i] = 0.0; + } +} + +int +test_finite_alias (void) +{ + int i; + + init_arg (); + + if (arch_check) return 77; + +#pragma omp simd + for (i = 0; i < N; i += 1) + { + log_res[i] = FUNC (log) (log_arg[i]); + exp_res[i] = FUNC (exp) (exp_arg[i]); + pow_res[i] = FUNC (pow) (log_arg[i], log_arg[i]); + } + + if (log_res[0] != 0.0) return 1; + if (exp_res[0] != 1.0) return 1; + if (pow_res[0] != 1.0) return 1; + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/fpu/x86_64-math-asm.h b/REORG.TODO/sysdeps/x86_64/fpu/x86_64-math-asm.h new file mode 100644 index 0000000000..4b4e40c3e7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/x86_64-math-asm.h @@ -0,0 +1,74 @@ +/* Helper macros for x86_64 libm functions. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86_64_MATH_ASM_H +#define _X86_64_MATH_ASM_H 1 + +/* Define constants for the minimum value of a floating-point + type. */ +#define DEFINE_LDBL_MIN \ + .section .rodata.cst16,"aM",@progbits,16; \ + .p2align 4; \ + .type ldbl_min,@object; \ +ldbl_min: \ + .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0; \ + .byte 0, 0, 0, 0, 0, 0; \ + .size ldbl_min, .-ldbl_min; + +/* Force an underflow exception if the given value (nonnegative or + NaN) is subnormal. The relevant constant for the minimum of the + type must have been defined, the MO macro must have been defined + for access to memory operands, and, if PIC, the PIC register must + have been loaded. */ +#define LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fucomip %st(1), %st(0); \ + fstp %st(0); \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +/* Likewise, but the argument is not a NaN. 
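The identities test_finite_alias checks are exact in IEEE 754 arithmetic (log 1 = 0, exp 0 = 1, 1 to the power 1 = 1), so its != comparisons are safe; the point of the test is that the simd loop, presumably built with -ffast-math, links against the *_finite aliases that libmvec must export in vector form. A scalar restatement of the check, as a sketch:

#include <math.h>
#include <assert.h>

static void
check_exact_identities (void)
{
  /* All three results are exact, so equality comparison is safe.  */
  assert (log (1.0) == 0.0);
  assert (exp (0.0) == 1.0);
  assert (pow (1.0, 1.0) == 1.0);
}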
*/ +#define LDBL_CHECK_FORCE_UFLOW_NONNAN \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fabs; \ + fcomip %st(1), %st(0); \ + fstp %st(0); \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +/* Likewise, but the argument is nonnegative and not a NaN. */ +#define LDBL_CHECK_FORCE_UFLOW_NONNEG \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fcomip %st(1), %st(0); \ + fstp %st(0); \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +#endif /* x86_64-math-asm.h. */ diff --git a/REORG.TODO/sysdeps/x86_64/hp-timing.h b/REORG.TODO/sysdeps/x86_64/hp-timing.h new file mode 100644 index 0000000000..1b2d2cde33 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/hp-timing.h @@ -0,0 +1,40 @@ +/* High precision, low overhead timing functions. x86-64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _HP_TIMING_H +#define _HP_TIMING_H 1 + +/* We always assume having the timestamp register. */ +#define HP_TIMING_AVAIL (1) +#define HP_SMALL_TIMING_AVAIL (1) + +/* We indeed have inlined functions. */ +#define HP_TIMING_INLINE (1) + +/* We use 64bit values for the times. */ +typedef unsigned long long int hp_timing_t; + +/* The "=A" constraint used in 32-bit mode does not work in 64-bit mode. */ +#define HP_TIMING_NOW(Var) \ + ({ unsigned int _hi, _lo; \ + asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \ + (Var) = ((unsigned long long int) _hi << 32) | _lo; }) + +#include <hp-timing-common.h> + +#endif /* hp-timing.h */ diff --git a/REORG.TODO/sysdeps/x86_64/htonl.S b/REORG.TODO/sysdeps/x86_64/htonl.S new file mode 100644 index 0000000000..dcc6bca592 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/htonl.S @@ -0,0 +1,34 @@ +/* Change byte order in word. For AMD x86-64. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
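HP_TIMING_NOW above assembles the rdtsc result from %edx:%eax by hand because, as its comment notes, the 32-bit "=A" constraint does not describe a 64-bit register pair on x86-64. The same read as a free-standing helper (a sketch; on modern parts the TSC ticks at a constant reference rate, so deltas measure elapsed time rather than core cycles):

#include <stdint.h>

static inline uint64_t
read_tsc (void)
{
  unsigned int hi, lo;
  __asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
  return ((uint64_t) hi << 32) | lo;
}

/* Usage: uint64_t t0 = read_tsc (); ...; uint64_t dt = read_tsc () - t0; */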
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + word %rdi +*/ + + .text +ENTRY (htonl) + movl %edi, %eax + bswap %eax + ret +END (htonl) + +weak_alias (htonl, ntohl) diff --git a/REORG.TODO/sysdeps/x86_64/ifuncmain8.c b/REORG.TODO/sysdeps/x86_64/ifuncmain8.c new file mode 100644 index 0000000000..c97cad0af4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ifuncmain8.c @@ -0,0 +1,32 @@ +/* Test IFUNC selector with floating-point parameters. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdlib.h> + +extern float foo (float); + +static int +do_test (void) +{ + if (foo (2) != 3) + abort (); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/ifuncmod8.c b/REORG.TODO/sysdeps/x86_64/ifuncmod8.c new file mode 100644 index 0000000000..037158b2b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ifuncmod8.c @@ -0,0 +1,37 @@ +/* Test IFUNC selector with floating-point parameters. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <emmintrin.h> + +void * foo_ifunc (void) __asm__ ("foo"); +__asm__(".type foo, %gnu_indirect_function"); + +static float +foo_impl (float x) +{ + return x + 1; +} + +void * +inhibit_stack_protector +foo_ifunc (void) +{ + __m128i xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + return foo_impl; +} diff --git a/REORG.TODO/sysdeps/x86_64/jmpbuf-offsets.h b/REORG.TODO/sysdeps/x86_64/jmpbuf-offsets.h new file mode 100644 index 0000000000..7471deaae8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/jmpbuf-offsets.h @@ -0,0 +1,29 @@ +/* Private macros for accessing __jmp_buf contents. x86-64 version. + Copyright (C) 2006-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
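The resolver in ifuncmod8.c above clobbers %xmm0 deliberately: foo takes its float argument in that register, so the test only passes if the dynamic linker preserves vector argument registers around IFUNC resolution. For contrast, a sketch of the same selector in the more common GCC attribute spelling (which would not let the test inject the clobber):

static float
foo_plus_one (float x)
{
  return x + 1;
}

/* The resolver runs once, at relocation time, and picks an
   implementation.  */
static void *
foo_resolver (void)
{
  return foo_plus_one;
}

float foo (float) __attribute__ ((ifunc ("foo_resolver")));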
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* We only need to save callee-saved registers plus the stack pointer and + program counter. */ +#define JB_RBX 0 +#define JB_RBP 1 +#define JB_R12 2 +#define JB_R13 3 +#define JB_R14 4 +#define JB_R15 5 +#define JB_RSP 6 +#define JB_PC 7 +#define JB_SIZE (8*8) diff --git a/REORG.TODO/sysdeps/x86_64/jmpbuf-unwind.h b/REORG.TODO/sysdeps/x86_64/jmpbuf-unwind.h new file mode 100644 index 0000000000..a22c77af05 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/jmpbuf-unwind.h @@ -0,0 +1,49 @@ +/* Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <setjmp.h> +#include <jmpbuf-offsets.h> +#include <stdint.h> +#include <unwind.h> +#include <sysdep.h> + +/* Test if longjmp to JMPBUF would unwind the frame + containing a local variable at ADDRESS. */ +#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \ + ((void *) (address) < (void *) demangle ((jmpbuf)[JB_RSP])) + +#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) \ + _JMPBUF_UNWINDS_ADJ (_jmpbuf, \ + (void *) (_Unwind_Ptr) _Unwind_GetCFA (_context), \ + _adj) + +static inline uintptr_t __attribute__ ((unused)) +_jmpbuf_sp (__jmp_buf regs) +{ + uintptr_t sp = regs[JB_RSP]; +#ifdef PTR_DEMANGLE + PTR_DEMANGLE (sp); +#endif + return sp; +} + +#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \ + ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj)) + +/* We use the normal longjmp for unwinding.
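_JMPBUF_UNWINDS above reduces to a single comparison: on x86-64 the stack grows downward, so a local whose address lies below the (demangled) saved %rsp lives in a frame the longjmp would discard. Spelled out in C (a sketch; PTR_DEMANGLE, the inverse of glibc's pointer-guard mangling, is elided):

#include <stdint.h>

static int
frame_would_unwind (uintptr_t demangled_saved_rsp, const void *local)
{
  /* Stack grows down: below the saved %rsp means already popped.  */
  return (uintptr_t) local < demangled_saved_rsp;
}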
*/ +#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val) diff --git a/REORG.TODO/sysdeps/x86_64/l10nflist.c b/REORG.TODO/sysdeps/x86_64/l10nflist.c new file mode 100644 index 0000000000..2e08372338 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/l10nflist.c @@ -0,0 +1,13 @@ +#ifdef __POPCNT__ +# include <popcntintrin.h> + +static inline unsigned int +pop (unsigned int x) +{ + return _mm_popcnt_u32 (x); +} +# define ARCH_POP 1 + +#endif + +#include <intl/l10nflist.c> diff --git a/REORG.TODO/sysdeps/x86_64/ldbl2mpn.c b/REORG.TODO/sysdeps/x86_64/ldbl2mpn.c new file mode 100644 index 0000000000..641b789cd4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ldbl2mpn.c @@ -0,0 +1 @@ +#include "../i386/ldbl2mpn.c" diff --git a/REORG.TODO/sysdeps/x86_64/ldsodefs.h b/REORG.TODO/sysdeps/x86_64/ldsodefs.h new file mode 100644 index 0000000000..19ff8c8209 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/ldsodefs.h @@ -0,0 +1,56 @@ +/* Run-time dynamic linker data structures for loaded ELF shared objects. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
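l10nflist.c above only swaps in _mm_popcnt_u32 when the unit is built with -mpopcnt (so that __POPCNT__ is defined); otherwise the generic intl code supplies pop. For reference, a portable fallback with the same result (Kernighan's clear-lowest-bit loop; a sketch, not the actual generic implementation):

static inline unsigned int
pop_fallback (unsigned int x)
{
  unsigned int n = 0;
  while (x != 0)
    {
      x &= x - 1;               /* clear the lowest set bit */
      ++n;
    }
  return n;
}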
*/ + +#ifndef _X86_64_LDSODEFS_H +#define _X86_64_LDSODEFS_H 1 + +#include <elf.h> +#include <cpu-features.h> + +struct La_x86_64_regs; +struct La_x86_64_retval; +struct La_x32_regs; +struct La_x32_retval; + +#define ARCH_PLTENTER_MEMBERS \ + Elf64_Addr (*x86_64_gnu_pltenter) (Elf64_Sym *, unsigned int, \ + uintptr_t *, \ + uintptr_t *, struct La_x86_64_regs *, \ + unsigned int *, const char *name, \ + long int *framesizep); \ + Elf32_Addr (*x32_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \ + uintptr_t *, struct La_x32_regs *, \ + unsigned int *, const char *name, \ + long int *framesizep) + +#define ARCH_PLTEXIT_MEMBERS \ + unsigned int (*x86_64_gnu_pltexit) (Elf64_Sym *, unsigned int, \ + uintptr_t *, \ + uintptr_t *, \ + const struct La_x86_64_regs *, \ + struct La_x86_64_retval *, \ + const char *); \ + unsigned int (*x32_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \ + uintptr_t *, \ + const struct La_x32_regs *, \ + struct La_x86_64_retval *, \ + const char *) + +#include_next <ldsodefs.h> + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/link-defines.sym b/REORG.TODO/sysdeps/x86_64/link-defines.sym new file mode 100644 index 0000000000..963c69b320 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/link-defines.sym @@ -0,0 +1,38 @@ +#include "link.h" +#include <stddef.h> + +-- +VECTOR_SIZE sizeof (La_x86_64_vector) +XMM_SIZE sizeof (La_x86_64_xmm) +YMM_SIZE sizeof (La_x86_64_ymm) +ZMM_SIZE sizeof (La_x86_64_zmm) +BND_SIZE sizeof (__int128_t) + +LR_SIZE sizeof (struct La_x86_64_regs) +LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx) +LR_R8_OFFSET offsetof (struct La_x86_64_regs, lr_r8) +LR_R9_OFFSET offsetof (struct La_x86_64_regs, lr_r9) +LR_RCX_OFFSET offsetof (struct La_x86_64_regs, lr_rcx) +LR_RSI_OFFSET offsetof (struct La_x86_64_regs, lr_rsi) +LR_RDI_OFFSET offsetof (struct La_x86_64_regs, lr_rdi) +LR_RBP_OFFSET offsetof (struct La_x86_64_regs, lr_rbp) +LR_RSP_OFFSET offsetof (struct La_x86_64_regs, lr_rsp) +LR_XMM_OFFSET offsetof (struct La_x86_64_regs, lr_xmm) +LR_VECTOR_OFFSET offsetof (struct La_x86_64_regs, lr_vector) +#ifndef __ILP32__ +LR_BND_OFFSET offsetof (struct La_x86_64_regs, lr_bnd) +#endif + +LRV_SIZE sizeof (struct La_x86_64_retval) +LRV_RAX_OFFSET offsetof (struct La_x86_64_retval, lrv_rax) +LRV_RDX_OFFSET offsetof (struct La_x86_64_retval, lrv_rdx) +LRV_XMM0_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm0) +LRV_XMM1_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm1) +LRV_ST0_OFFSET offsetof (struct La_x86_64_retval, lrv_st0) +LRV_ST1_OFFSET offsetof (struct La_x86_64_retval, lrv_st1) +LRV_VECTOR0_OFFSET offsetof (struct La_x86_64_retval, lrv_vector0) +LRV_VECTOR1_OFFSET offsetof (struct La_x86_64_retval, lrv_vector1) +#ifndef __ILP32__ +LRV_BND0_OFFSET offsetof (struct La_x86_64_retval, lrv_bnd0) +LRV_BND1_OFFSET offsetof (struct La_x86_64_retval, lrv_bnd1) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/locale-defines.sym b/REORG.TODO/sysdeps/x86_64/locale-defines.sym new file mode 100644 index 0000000000..aebff9a4f9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/locale-defines.sym @@ -0,0 +1,11 @@ +#include <locale/localeinfo.h> +#include <langinfo.h> +#include <stddef.h> + +-- + +LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) +LC_CTYPE +_NL_CTYPE_NONASCII_CASE +LOCALE_DATA_VALUES offsetof (struct __locale_data, values) +SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0]) diff --git a/REORG.TODO/sysdeps/x86_64/localplt.data b/REORG.TODO/sysdeps/x86_64/localplt.data new file mode 100644 index 
0000000000..a1840cff31 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/localplt.data @@ -0,0 +1,20 @@ +# See scripts/check-localplt.awk for how this file is processed. +# PLT use is required for the malloc family and for matherr because +# users can define their own functions and have library internals call them. +# Linker in binutils 2.26 and newer consolidates R_X86_64_JUMP_SLOT +# relocation with R_X86_64_GLOB_DAT relocation against the same symbol. +libc.so: calloc + RELA R_X86_64_GLOB_DAT +libc.so: free + RELA R_X86_64_GLOB_DAT +libc.so: malloc + RELA R_X86_64_GLOB_DAT +libc.so: memalign + RELA R_X86_64_GLOB_DAT +libc.so: realloc + RELA R_X86_64_GLOB_DAT +libm.so: matherr + RELA R_X86_64_GLOB_DAT +# The main malloc is interposed into the dynamic linker, for +# allocations after the initial link (when dlopen is used). +ld.so: malloc + RELA R_X86_64_GLOB_DAT +ld.so: calloc + RELA R_X86_64_GLOB_DAT +ld.so: realloc + RELA R_X86_64_GLOB_DAT +ld.so: free + RELA R_X86_64_GLOB_DAT +# The TLS-enabled version of these functions is interposed from libc.so. +ld.so: _dl_signal_error + RELA R_X86_64_GLOB_DAT +ld.so: _dl_catch_error + RELA R_X86_64_GLOB_DAT diff --git a/REORG.TODO/sysdeps/x86_64/lshift.S b/REORG.TODO/sysdeps/x86_64/lshift.S new file mode 100644 index 0000000000..690f83555a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/lshift.S @@ -0,0 +1,116 @@ +/* x86-64 __mpn_lshift -- + Copyright (C) 2007-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl + + .text +ENTRY (__mpn_lshift) + lea -8(rp,n,8), rp + lea -8(up,n,8), up + + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov -8(up), %r11 + xor %eax, %eax + shld %cl, %r10, %rax + mov -16(up), %r8 + lea 24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shld %cl, %r9, %rax + sub $2, n + jb L(le1) + mov -8(up), %r10 + mov -16(up), %r11 + lea -8(up), up + lea 16(rp), rp + jmp L(01) +L(le1): shl %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov -8(up), %r9 + xor %eax, %eax + shld %cl, %r8, %rax + sub $3, n + jb L(le2) + mov -16(up), %r10 + lea -16(up), up + lea 8(rp), rp + jmp L(10) +L(le2): shld %cl, %r9, %r8 + mov %r8, (rp) + shl %cl, %r9 + mov %r9, -8(rp) + ret + + .p2align 4 /* performance critical! */ +L(b11): /* n = 3, 7, 11, ... 
*/ + mov (up), %r11 + mov -8(up), %r8 + xor %eax, %eax + shld %cl, %r11, %rax + mov -16(up), %r9 + lea -24(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shld %cl, %r8, %r11 + mov (up), %r10 + mov %r11, (rp) +L(10): shld %cl, %r9, %r8 + mov -8(up), %r11 + mov %r8, -8(rp) +L(01): shld %cl, %r10, %r9 + mov -16(up), %r8 + mov %r9, -16(rp) +L(00): shld %cl, %r11, %r10 + mov -24(up), %r9 + mov %r10, -24(rp) + add $-32, up + lea -32(rp), rp + sub $4, n + jnc L(top) + +L(end): shld %cl, %r8, %r11 + mov %r11, (rp) + shld %cl, %r9, %r8 + mov %r8, -8(rp) + shl %cl, %r9 + mov %r9, -16(rp) + ret +END (__mpn_lshift) diff --git a/REORG.TODO/sysdeps/x86_64/machine-gmon.h b/REORG.TODO/sysdeps/x86_64/machine-gmon.h new file mode 100644 index 0000000000..4fdccf8d17 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/machine-gmon.h @@ -0,0 +1,38 @@ +/* x86-64-specific implementation of profiling support. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* We need a special version of the `mcount' function for x86-64, so + that it does not use __builtin_return_address (N) and avoids + clobbering registers. */ + + +/* We must not pollute the global namespace. */ +#define mcount_internal __mcount_internal + +void mcount_internal (u_long frompc, u_long selfpc); + +#define _MCOUNT_DECL(frompc, selfpc) \ +void mcount_internal (u_long frompc, u_long selfpc) + + +/* Define MCOUNT as empty since we have the implementation in another + file. */ +#define MCOUNT diff --git a/REORG.TODO/sysdeps/x86_64/memchr.S b/REORG.TODO/sysdeps/x86_64/memchr.S new file mode 100644 index 0000000000..d3be012424 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memchr.S @@ -0,0 +1,315 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>.
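The shld chains in __mpn_lshift above are a four-way unrolling of one simple contract: shift an n-limb number left by cnt bits (0 < cnt < 64) and return the bits pushed out of the top limb. A plain C reference, as a sketch (like the asm, it walks high-to-low, so rp may equal up):

#include <stddef.h>
#include <stdint.h>

static uint64_t
mpn_lshift_ref (uint64_t *rp, const uint64_t *up, size_t n,
                unsigned int cnt)
{
  uint64_t ret = up[n - 1] >> (64 - cnt);   /* bits shifted out */
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
  rp[0] = up[0] << cnt;
  return ret;
}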
*/ + +#include <sysdep.h> + +/* Fast SSE2 version using pmaxub and a 64-byte loop. */ + + .text +ENTRY(memchr) + movd %esi, %xmm1 + mov %edi, %ecx + + punpcklbw %xmm1, %xmm1 + test %rdx, %rdx + jz L(return_null) + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $16, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi + add %rcx, %rdx + sub $64, %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid + possible addition overflow. */ + neg %rcx + add $16, %rcx + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi + sub $64, %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $32, %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $16, %edx +
jle L(return_null) + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + add $32, %edx + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $16, %edx + jbe L(return_null) + + pcmpeqb 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret +END(memchr) + +strong_alias (memchr, __memchr) + +libc_hidden_builtin_def(memchr) diff --git a/REORG.TODO/sysdeps/x86_64/memcmp.S b/REORG.TODO/sysdeps/x86_64/memcmp.S new file mode 100644 index 0000000000..0828a22534 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memcmp.S @@ -0,0 +1,358 @@ +/* memcmp with SSE2 + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (memcmp) + test %rdx, %rdx + jz L(finz) + cmpq $1, %rdx + jle L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 + jge L(gt32) + /* Handle small chunks and last block of less than 32 bytes. 
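memchr above and memcmp below are both built from one SSE2 idiom: pcmpeqb compares 16 bytes at once, pmovmskb condenses the result into a 16-bit mask, and bsf locates the first hit. The same step as intrinsics (a sketch):

#include <emmintrin.h>

/* Bit i of the result is set iff p[i] == c, for i in 0..15.  */
static inline unsigned int
byte_eq_mask (const unsigned char *p, unsigned char c)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
  __m128i needle = _mm_set1_epi8 ((char) c);
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
}

/* First match in a 16-byte block (16 when there is none):
     unsigned int m = byte_eq_mask (p, c);
     size_t i = m ? __builtin_ctz (m) : 16;  */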
*/
+L(small):
+	testq	$1, %r10
+	jz	L(s2b)
+	movzbl	(%rdi),	%eax
+	movzbl	(%rdi, %rsi), %edx
+	subq	$1, %r10
+	je	L(finz1)
+	addq	$1, %rdi
+	subl	%edx, %eax
+	jnz	L(exit)
+L(s2b):
+	testq	$2, %r10
+	jz	L(s4b)
+	movzwl	(%rdi),	%eax
+	movzwl	(%rdi, %rsi), %edx
+	subq	$2, %r10
+	je	L(fin2_7)
+	addq	$2, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s4b):
+	testq	$4, %r10
+	jz	L(s8b)
+	movl	(%rdi),	%eax
+	movl	(%rdi, %rsi), %edx
+	subq	$4, %r10
+	je	L(fin2_7)
+	addq	$4, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s8b):
+	testq	$8, %r10
+	jz	L(s16b)
+	movq	(%rdi),	%rax
+	movq	(%rdi, %rsi), %rdx
+	subq	$8, %r10
+	je	L(fin2_7)
+	addq	$8, %rdi
+	cmpq	%rdx, %rax
+	jnz	L(fin2_7)
+L(s16b):
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	xorl	%eax, %eax
+	subl	$0xffff, %edx
+	jz	L(finz)
+	bsfl	%edx, %ecx
+	leaq	(%rdi, %rcx), %rcx
+	movzbl	(%rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	jmp	L(finz1)
+
+	.p2align 4,, 4
+L(finr1b):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %edx
+L(finz1):
+	subl	%edx, %eax
+L(exit):
+	ret
+
+	.p2align 4,, 4
+L(fin2_7):
+	cmpq	%rdx, %rax
+	jz	L(finz)
+	movq	%rax, %r11
+	subq	%rdx, %r11
+	bsfq	%r11, %rcx
+	sarq	$3, %rcx
+	salq	$3, %rcx
+	sarq	%cl, %rax
+	movzbl	%al, %eax
+	sarq	%cl, %rdx
+	movzbl	%dl, %edx
+	subl	%edx, %eax
+	ret
+
+	.p2align 4,, 4
+L(finz):
+	xorl	%eax, %eax
+	ret
+
+	/* For blocks bigger than 32 bytes
+	   1. Advance one of the address pointers to be 16B aligned.
+	   2. Treat the case of both address pointers aligned to 16B
+	      separately to avoid movdqu.
+	   3. Handle any blocks of greater than 64 consecutive bytes with
+	      unrolling to reduce branches.
+	   4. At least one address pointer is 16B aligned; use the
+	      memory-operand form of pcmpeqb.
+	*/
+	.p2align 4,, 4
+L(gt32):
+	movq	%rdx, %r11
+	addq	%rdi, %r11
+	movq	%rdi, %r8
+
+	andq	$15, %r8
+	jz	L(16am)
+	/* Both pointers may be misaligned.  */
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	subl	$0xffff, %edx
+	jnz	L(neq)
+	neg	%r8
+	leaq	16(%rdi, %r8), %rdi
+L(16am):
+	/* Handle two 16B aligned pointers separately.  */
+	testq	$15, %rsi
+	jz	L(ATR)
+	testq	$16, %rdi
+	jz	L(A32)
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+	pmovmskb %xmm0, %edx
+	subl	$0xffff, %edx
+	jnz	L(neq)
+	addq	$16, %rdi
+L(A32):
+	movq	%r11, %r10
+	andq	$-32, %r10
+	cmpq	%r10, %rdi
+	jge	L(mt16)
+	/* Pre-unroll to be ready for unrolled 64B loop.
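+
+	   Each 16-byte probe in this function follows one pattern; in
+	   intrinsics terms it is roughly the following (hypothetical
+	   helper shown only to document the idiom, assumes
+	   <emmintrin.h>):
+
+	     static inline int
+	     chunk_differs_16 (const void *a, const void *b)
+	     {
+	       __m128i va = _mm_loadu_si128 ((const __m128i *) a);
+	       __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
+	       // Mask is 0xffff iff all 16 byte pairs compare equal.
+	       return _mm_movemask_epi8 (_mm_cmpeq_epi8 (va, vb)) != 0xffff;
+	     }
+
+	   The assembly tests the mask with "subl $0xffff, %edx", so a
+	   nonzero result doubles as the branch condition for L(neq).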
*/ + testq $32, %rdi + jz L(A64) + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi + jge L(mt32) + +L(A64main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A64main) + +L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(A32main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A32main) +L(mt16): + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + + .p2align 4,, 4 +L(neq): + bsfl %edx, %ecx + movzbl (%rdi, %rcx), %eax + addq %rdi, %rsi + movzbl (%rsi,%rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + testq $16, %rdi + jz L(ATR32) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + je L(mt16) + +L(ATR32): + movq %r11, %r10 + andq $-64, %r10 + testq $32, %rdi + jz L(ATR64) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(ATR64): + cmpq %rdi, %r10 + je L(mt32) + +L(ATR64main): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + jne L(ATR64main) + + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %r10, %rdi + jne L(ATR32res) + + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + /* Align to 16byte to improve instruction fetch. 
*/
+	.p2align 4,, 4
+END(memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/REORG.TODO/sysdeps/x86_64/memcopy.h b/REORG.TODO/sysdeps/x86_64/memcopy.h
new file mode 100644
index 0000000000..590b6cb16b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memcopy.h
@@ -0,0 +1 @@
+/* X86-64 doesn't use memory copy functions.  */
diff --git a/REORG.TODO/sysdeps/x86_64/memcpy.S b/REORG.TODO/sysdeps/x86_64/memcpy.S
new file mode 100644
index 0000000000..d98500a78a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memcpy.S
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/REORG.TODO/sysdeps/x86_64/memcpy_chk.S b/REORG.TODO/sysdeps/x86_64/memcpy_chk.S
new file mode 100644
index 0000000000..23e9e1ade5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memcpy_chk.S
@@ -0,0 +1,33 @@
+/* Checking memcpy for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in memcpy.S.
+	   For libc.a, this is a separate source to avoid
+	   memcpy bringing in __chk_fail and all routines
+	   it calls.  */
+	.text
+ENTRY (__memcpy_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	memcpy
+END (__memcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/memmove.S b/REORG.TODO/sysdeps/x86_64/memmove.S
new file mode 100644
index 0000000000..5bbae9904f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memmove.S
@@ -0,0 +1,71 @@
+/* Optimized memmove for x86-64.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define PREFETCHNT	prefetchnta
+#define VMOVNT		movntdq
+/* Use movups and movaps for smaller code sizes.
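+   For example, for a plain vector load the SSE forms are one byte
+   shorter than their integer-domain equivalents (standard x86-64
+   encodings, listed here for illustration):
+
+     movups (%rsi), %xmm0   # 0f 10 06      3 bytes, no prefix
+     movdqu (%rsi), %xmm0   # f3 0f 6f 06   4 bytes, F3 prefix
+     movaps (%rsi), %xmm0   # 0f 28 06      3 bytes
+     movdqa (%rsi), %xmm0   # 66 0f 6f 06   4 bytes, 66 prefix
+
+   For bare loads and stores the floating-point/integer domain
+   distinction generally costs nothing, so the shorter encodings are
+   preferred here.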
*/
+#define VMOVU		movups
+#define VMOVA		movaps
+
+#define SECTION(p)		p
+
+#ifdef USE_MULTIARCH
+# if !defined SHARED || !IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)	memcpy
+# endif
+#else
+# if defined SHARED && IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)	__memcpy
+# else
+#  define MEMCPY_SYMBOL(p,s)	memcpy
+# endif
+#endif
+#if !defined SHARED || !defined USE_MULTIARCH || !IS_IN (libc)
+# define MEMPCPY_SYMBOL(p,s)	__mempcpy
+#endif
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s)	p
+# define MEMMOVE_SYMBOL(p,s)	memmove
+#endif
+
+#include "multiarch/memmove-vec-unaligned-erms.S"
+
+#ifndef USE_MULTIARCH
+libc_hidden_builtin_def (memmove)
+# if defined SHARED && IS_IN (libc)
+strong_alias (memmove, __memcpy)
+libc_hidden_ver (memmove, memcpy)
+# endif
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
+
+# if defined SHARED && IS_IN (libc)
+#  undef memcpy
+#  include <shlib-compat.h>
+versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
+
+#  if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+#  endif
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/memmove_chk.S b/REORG.TODO/sysdeps/x86_64/memmove_chk.S
new file mode 100644
index 0000000000..a87345800b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memmove_chk.S
@@ -0,0 +1,33 @@
+/* Checking memmove for x86-64.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in memmove.S.
+	   For libc.a, this is a separate source to avoid
+	   memmove bringing in __chk_fail and all routines
+	   it calls.  */
+	.text
+ENTRY (__memmove_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	memmove
+END (__memmove_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/mempcpy.S b/REORG.TODO/sysdeps/x86_64/mempcpy.S
new file mode 100644
index 0000000000..d98500a78a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/mempcpy.S
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/REORG.TODO/sysdeps/x86_64/mempcpy_chk.S b/REORG.TODO/sysdeps/x86_64/mempcpy_chk.S
new file mode 100644
index 0000000000..f912291576
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/mempcpy_chk.S
@@ -0,0 +1,33 @@
+/* Checking mempcpy for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in memcpy.S.
+	   For libc.a, this is a separate source to avoid
+	   mempcpy bringing in __chk_fail and all routines
+	   it calls.  */
+	.text
+ENTRY (__mempcpy_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	mempcpy
+END (__mempcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/memrchr.S b/REORG.TODO/sysdeps/x86_64/memrchr.S
new file mode 100644
index 0000000000..5fa0fe9c1c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/memrchr.S
@@ -0,0 +1,380 @@
+/* Fast SSE2 memrchr, using a 64-byte loop and the pmaxub instruction.
+
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (__memrchr)
+	movd	%esi, %xmm1
+
+	sub	$16, %rdx
+	jbe	L(length_less16)
+
+	punpcklbw	%xmm1, %xmm1
+	punpcklbw	%xmm1, %xmm1
+
+	add	%rdx, %rdi
+	pshufd	$0, %xmm1, %xmm1
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.
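+   Scanning backwards, the interesting bit of the pcmpeqb mask is the
+   highest set bit rather than the lowest, so matches below are
+   extracted with bsr instead of bsf.  A rough C equivalent for one
+   nonzero mask covering bytes p[0..15] (illustration only):
+
+     // Most significant set bit == right-most matching byte.
+     return (void *) (p + (31 - __builtin_clz (mask)));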
*/ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %rdi + mov %edi, %ecx + and $15, %ecx + jz L(loop_prolog) + + add $16, %rdi + add $16, %rdx + and $-16, %rdi + sub %rcx, %rdx + + .p2align 4 +L(loop_prolog): + sub $64, %rdx + jbe L(exit_loop) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %rdi + sub $64, %rdx + jbe L(exit_loop) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %edi, %ecx + and $63, %ecx + jz L(align64_loop) + + add $64, %rdi + add $64, %rdx + and $-64, %rdi + sub %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdi + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%rdi), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %rdi, %rax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %rdx + add %rax, %rdx + jl L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %rdx + add %rax, %rdx + jl L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %rdx + add 
%rax, %rdx + jl L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %rdx + add %rax, %rdx + jl L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + test %edx, %edx + jz L(return_null) + + mov %dl, %cl + pcmpeqb (%rdi), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + pmovmskb %xmm1, %eax + + and %edx, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + add $16, %edx + + pshufd $0, %xmm1, %xmm1 + + mov %edi, %ecx + and $15, %ecx + jz L(length_less16_offset0) + + mov %cl, %dh + mov %ecx, %esi + add %dl, %dh + and $-16, %rdi + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + + sar %cl, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + add %rsi, %rax + ret + + .p2align 4 +L(length_less16_part2): + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %eax + + test %eax, %eax + jnz L(length_less16_part2_return) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + + mov %esi, %ecx + sar %cl, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + add %rsi, %rax + ret + + .p2align 4 +L(length_less16_part2_return): + bsr %eax, %eax + lea 16(%rax, %rdi), %rax + ret + +END (__memrchr) +weak_alias (__memrchr, memrchr) diff --git a/REORG.TODO/sysdeps/x86_64/memset.S b/REORG.TODO/sysdeps/x86_64/memset.S new file mode 100644 index 0000000000..41278787fe --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memset.S @@ -0,0 +1,67 @@ +/* memset/bzero -- set memory area to CH/0 + Optimized version for x86-64. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define VEC_SIZE 16 +#define VEC(i) xmm##i +/* Don't use movups and movaps since it will get larger nop paddings for + alignment. 
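+
+   (movdqu/movdqa are one byte longer than movups/movaps, so with
+   them the many .p2align targets below need fewer nop bytes of
+   padding.)
+
+   The MEMSET_VDUP_TO_VEC0_AND_SET_RETURN macro defined just below
+   broadcasts the fill byte into all 16 lanes.  A rough scalar model
+   of the widening steps (illustration only, <stdint.h> types):
+
+     uint32_t w = (unsigned char) c;
+     w |= w << 8;    // punpcklbw: byte -> low word
+     w |= w << 16;   // punpcklwd: word -> low dword
+     // pshufd $0 then replicates that dword across the vector.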
*/ +#define VMOVU movdqu +#define VMOVA movdqa + +#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + punpcklbw %xmm0, %xmm0; \ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + +#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + pshufd $0, %xmm0, %xmm0 + +#define SECTION(p) p + +#ifndef MEMSET_SYMBOL +# define MEMSET_CHK_SYMBOL(p,s) p +# define MEMSET_SYMBOL(p,s) memset +#endif + +#ifndef WMEMSET_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) p +# define WMEMSET_SYMBOL(p,s) __wmemset +#endif + +#include "multiarch/memset-vec-unaligned-erms.S" + +libc_hidden_builtin_def (memset) + +#if IS_IN (libc) +libc_hidden_def (__wmemset) +weak_alias (__wmemset, wmemset) +libc_hidden_weak (wmemset) +#endif + +#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/memset_chk.S b/REORG.TODO/sysdeps/x86_64/memset_chk.S new file mode 100644 index 0000000000..33d15c0c10 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memset_chk.S @@ -0,0 +1,33 @@ +/* Checking memset for x86-64. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef SHARED + /* For libc.so this is defined in memset.S. + For libc.a, this is a separate source to avoid + memset bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__memset_chk) + cmpq %rdx, %rcx + jb __chk_fail + jmp memset +END (__memset_chk) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/memusage.h b/REORG.TODO/sysdeps/x86_64/memusage.h new file mode 100644 index 0000000000..50f960b140 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/memusage.h @@ -0,0 +1,21 @@ +/* Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#define GETSP() ({ register uintptr_t stack_ptr asm ("rsp"); stack_ptr; }) +#define GETTIME(low,high) asm ("rdtsc" : "=a" (low), "=d" (high)) + +#include <sysdeps/generic/memusage.h> diff --git a/REORG.TODO/sysdeps/x86_64/mp_clz_tab.c b/REORG.TODO/sysdeps/x86_64/mp_clz_tab.c new file mode 100644 index 0000000000..7b13a394da --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/mp_clz_tab.c @@ -0,0 +1 @@ +/* __clz_tab not needed on x86-64. */ diff --git a/REORG.TODO/sysdeps/x86_64/mul_1.S b/REORG.TODO/sysdeps/x86_64/mul_1.S new file mode 100644 index 0000000000..5c1c4335bf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/mul_1.S @@ -0,0 +1,128 @@ +/* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n_param %rdx +#define vl %rcx + +#define n %r11 + + .text +ENTRY (__mpn_mul_1) + push %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + xor %r10, %r10 + mov (up), %rax /* read first u limb early */ + mov n_param, %rbx /* move away n from rdx, mul uses it */ + mul vl + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz L(b0) + cmp $2, %ebx + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + mov %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + .p2align 4 +L(top): mov %r10, (rp,n,8) + add %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, %r10d +L(L1): mul vl + mov %r9, 8(rp,n,8) + add %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + mov %r8, 16(rp,n,8) + add %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + mov %rbx, 24(rp,n,8) + mov $0, %r8d # zero + mov %r8, %rbx # zero + add %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 # zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + mov %r10, (rp,n,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(rp,n,8) + add %r8, %rdx +L(ret): mov %rdx, %rax + + pop %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) + ret +END (__mpn_mul_1) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/Makefile b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile new file mode 100644 index 0000000000..310a3a4b72 --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile @@ -0,0 +1,42 @@ +ifeq ($(subdir),csu) +tests += test-multiarch +endif + +ifeq ($(subdir),string) + +sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ + strcmp-sse2-unaligned strncmp-ssse3 \ + memcmp-avx2-movbe \ + memcmp-sse4 memcpy-ssse3 \ + memmove-ssse3 \ + memcpy-ssse3-back \ + memmove-ssse3-back \ + memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \ + strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ + strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ + strcpy-sse2-unaligned strncpy-sse2-unaligned \ + stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ + strcat-sse2-unaligned strncat-sse2-unaligned \ + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ + strcspn-c strpbrk-c strspn-c varshift \ + memset-avx512-no-vzeroupper \ + memmove-avx-unaligned-erms \ + memmove-avx512-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms +CFLAGS-varshift.c += -msse4 +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif + +ifeq ($(subdir),wcsmbs) +sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wmemcmp-avx2-movbe \ + wcscpy-ssse3 wcscpy-c \ + wcsnlen-sse4_1 wcsnlen-c +endif + +ifeq ($(subdir),debug) +sysdep_routines += wmemset_chk-nonshared +endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S new file mode 100644 index 0000000000..639f02bde3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S @@ -0,0 +1,7 @@ +#include <sysdep.h> + + .text +ENTRY(bcopy) + xchg %rdi, %rsi + jmp __libc_memmove /* Branch to IFUNC memmove. */ +END(bcopy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..5627183aca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -0,0 +1,460 @@ +/* Enumerate available IFUNC implementations of a function. x86-64 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ifunc-impl-list.h> +#include <sysdep.h> +#include "init-arch.h" + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 5 + +/* Fill ARRAY of MAX elements with IFUNC implementations for function + NAME supported on target machine and return the number of valid + entries. */ + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + /* Support sysdeps/x86_64/multiarch/memcmp.S. 
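+
+     Each IFUNC_IMPL_ADD entry pairs one candidate implementation
+     with the feature check that gates it.  This table only feeds
+     introspection (the glibc tests and benchmarks iterate over it);
+     the runtime ifunc resolvers make the equivalent decision
+     directly.  A simplified, illustrative sketch of a memcmp
+     resolver mirroring the order of the entries below:
+
+       if (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE))
+         return __memcmp_avx2_movbe;
+       if (HAS_CPU_FEATURE (SSE4_1))
+         return __memcmp_sse4_1;
+       if (HAS_CPU_FEATURE (SSSE3))
+         return __memcmp_ssse3;
+       return __memcmp_sse2;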
*/ + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), + __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */ + IFUNC_IMPL (i, name, __memmove_chk, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memmove.S. */ + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3_back) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ + IFUNC_IMPL (i, name, __memset_chk, + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_no_vzeroupper) + ) + + /* Support sysdeps/x86_64/multiarch/memset.S. 
*/ + IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_no_vzeroupper) + ) + + /* Support sysdeps/x86_64/multiarch/stpncpy.S. */ + IFUNC_IMPL (i, name, stpncpy, + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, + __stpncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpcpy.S. */ + IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_ARCH_FEATURE (AVX_Usable), + __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp_l, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), + __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, + __strcasecmp_l_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcat.S. */ + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) + + /* Support sysdeps/x86_64/multiarch/strchr.S. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcmp.S. */ + IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), + __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), + __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcpy.S. */ + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcspn.S. 
*/ + IFUNC_IMPL (i, name, strcspn, + IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), + __strcspn_sse42) + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_ARCH_FEATURE (AVX_Usable), + __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, + __strncasecmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp_l, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), + __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, + __strncasecmp_l_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncat.S. */ + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, 1, + __strncat_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncpy.S. */ + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, 1, + __strncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strpbrk.S. */ + IFUNC_IMPL (i, name, strpbrk, + IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), + __strpbrk_sse42) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) + + + /* Support sysdeps/x86_64/multiarch/strspn.S. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), + __strspn_sse42) + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2)) + + /* Support sysdeps/x86_64/multiarch/strstr.c. */ + IFUNC_IMPL (i, name, strstr, + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscpy.S. */ + IFUNC_IMPL (i, name, wcscpy, + IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), + __wcscpy_ssse3) + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ + IFUNC_IMPL (i, name, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcsnlen, + HAS_CPU_FEATURE (SSE4_1), + __wcsnlen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */ + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), + __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset.c. 
*/ + IFUNC_IMPL (i, name, wmemset, + IFUNC_IMPL_ADD (array, i, wmemset, 1, + __wmemset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_avx512_unaligned)) + +#ifdef SHARED + /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */ + IFUNC_IMPL (i, name, __memcpy_chk, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memcpy.S. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, + __memcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)) + + /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */ + IFUNC_IMPL (i, name, __mempcpy_chk, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/mempcpy.S. 
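+
+     (mempcpy differs from memcpy only in its return value: it
+     returns the end of the destination.  The generic definition is
+     equivalent to
+
+       void *
+       mempcpy (void *dest, const void *src, size_t n)
+       {
+         return (char *) memcpy (dest, src, n) + n;
+       }
+
+     which is why every memcpy variant above has a mempcpy twin.)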
*/ + IFUNC_IMPL (i, name, mempcpy, + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)) + + /* Support sysdeps/x86_64/multiarch/strncmp.S. */ + IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), + __strncmp_sse42) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), + __strncmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */ + IFUNC_IMPL (i, name, __wmemset_chk, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1, + __wmemset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_chk_avx512_unaligned)) +#endif + + return i; +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h new file mode 100644 index 0000000000..d761985a47 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -0,0 +1,42 @@ +/* Common definition for wmemset/wmemset_chk ifunc selections. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	return OPTIMIZE (avx512_unaligned);
+      else
+	return OPTIMIZE (avx2_unaligned);
+    }
+
+  return OPTIMIZE (sse2_unaligned);
+}
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
new file mode 100644
index 0000000000..47630dd97b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -0,0 +1,425 @@
+/* memcmp/wmemcmp optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+      bytes for wmemcmp.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_avx2_movbe
+# endif
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+# define VEC_MASK ((1 << VEC_SIZE) - 1)
+
+/* Warning!
+   wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.avx,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+# endif
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.
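+
+	   For VEC_SIZE <= n <= 2 * VEC_SIZE the first and the last
+	   vector overlap but together cover all n bytes, so no scalar
+	   tail loop is needed; bytes in the overlap are simply
+	   compared twice.  Rough C model, where cmp_vec/report_vec
+	   are hypothetical helpers for one VEC_SIZE (V) chunk:
+
+	     if (cmp_vec (a, b))                    // bytes 0 .. V-1
+	       return report_vec (a, b);
+	     if (cmp_vec (a + n - V, b + n - V))    // bytes n-V .. n-1
+	       return report_vec (a + n - V, b + n - V);
+	     return 0;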
*/ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx), %edx + cmpl (%rsi, %rcx), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian with overlapping loads and bswap to avoid + branches. */ + movzwl -2(%rdi, %rdx), %eax + movzwl -2(%rsi, %rdx), %ecx + shll $16, %eax + shll $16, %ecx + movzwl (%rdi), %edi + movzwl (%rsi), %esi + orl %edi, %eax + orl %esi, %ecx + bswap %eax + bswap %ecx + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_2x_vec): + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. 
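+
+	   The four compares below are merged with vpand so that a
+	   single vpmovmskb answers "do all 128 bytes match?".  In
+	   intrinsics terms, roughly (illustration only, assumes
+	   <immintrin.h>):
+
+	     __m256i c0 = _mm256_cmpeq_epi8 (a0, b0);
+	     __m256i c1 = _mm256_cmpeq_epi8 (a1, b1);
+	     __m256i c2 = _mm256_cmpeq_epi8 (a2, b2);
+	     __m256i c3 = _mm256_cmpeq_epi8 (a3, b3);
+	     __m256i all = _mm256_and_si256 (_mm256_and_si256 (c0, c1),
+	                                     _mm256_and_si256 (c2, c3));
+	     int equal = _mm256_movemask_epi8 (all) == -1;
+
+	   When equal is false, L(4x_vec_end) re-tests the four masks
+	   individually to locate the first differing vector.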
*/ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + VZEROUPPER + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. 
*/
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	vpmovmskb %ymm1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	vpmovmskb %ymm3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	vpmovmskb %ymm4, %eax
+	subl	$VEC_MASK, %eax
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..771639f662
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -0,0 +1,1776 @@
+/* memcmp with SSE4.1, wmemcmp with SSE4.1
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_1
+# endif
+
+# define JMPTBL(I, B)	(I - B)
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), %rcx;			\
+  add		%r11, %rcx;					\
+  jmp		*%rcx;						\
+  ud2
+
+/* Warning!
+	   wmemcmp has to use SIGNED comparison for elements.
+	   memcmp has to use UNSIGNED comparison for elements.
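+
+	   Example of why this matters (wchar_t is a signed 32-bit int
+	   on x86-64; little endian):
+
+	     wchar_t a[] = { -1 };	// bytes: ff ff ff ff
+	     wchar_t b[] = { 1 };	// bytes: 01 00 00 00
+	     // wmemcmp (a, b, 1) < 0 : -1 < 1 as signed elements.
+	     // memcmp (a, b, 4) > 0  : 0xff > 0x01 on the first byte.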
+*/ + + .section .text.sse4.1,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif + pxor %xmm0, %xmm0 + cmp $79, %rdx + ja L(79bytesormore) +# ifndef USE_AS_WMEMCMP + cmp $1, %rdx + je L(firstbyte) +# endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(firstbyte): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax + ret +# endif + + .p2align 4 +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) + cmp $256, %rdx + ja L(less512bytes) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 
96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(512bytesormore): +# ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP +# else + mov __x86_data_cache_size_half(%rip), %R8_LP +# endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) + sub $64, %rdx + .p2align 4 +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(L2_L3_cache_unaglined): + sub $64, %rdx + .p2align 4 +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +/* + * This case is for machines which are sensitive for unaligned instructions. 
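   Both pointers are 16-byte aligned on this path, so the movdqa forms below
   are safe.  (The threshold computed just above is data_cache_size_half plus
   half of itself, i.e. three quarters of the per-thread data cache; beyond
   it the prefetchnta loop flavor is used.)  The 64-byte block test itself is
   the same in every loop flavor; a C sketch, using SSE2 cmpeq/movemask as a
   stand-in for the file's SSE4.1 ptest idiom (the helper name is
   illustrative):

     #include <emmintrin.h>

     // OR together the XORs of four 16-byte pairs; the 64-byte blocks
     // match iff the accumulator is all-zero.
     static int block64_differs (const unsigned char *a,
                                 const unsigned char *b)
     {
       __m128i acc = _mm_setzero_si128 ();
       for (int i = 0; i < 64; i += 16)
         {
           __m128i x = _mm_loadu_si128 ((const __m128i *) (a + i));
           __m128i y = _mm_loadu_si128 ((const __m128i *) (b + i));
           acc = _mm_or_si128 (acc, _mm_xor_si128 (x, y));
         }
       // 0xffff here means "no differing byte anywhere in the block".
       return _mm_movemask_epi8 (_mm_cmpeq_epi8 (acc, _mm_setzero_si128 ()))
              != 0xffff;
     }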
+ */ + .p2align 4 +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 
192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(512bytesormorein2aligned): +# ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP +# else + mov __x86_data_cache_size_half(%rip), %R8_LP +# endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + .p2align 4 +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +L(L2_L3_cache_aglined): + sub $64, %rdx + + .p2align 4 +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + + .p2align 4 +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + add $80, %rdi + add $80, %rsi + jmp 
L(16bytes) +L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal case for wmemcmp */ + .p2align 4 +L(65bytes): + movdqu -65(%rdi), %xmm1 + movdqu -65(%rsi), %xmm2 + mov $-65, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(49bytes): + movdqu -49(%rdi), %xmm1 + movdqu -49(%rsi), %xmm2 + mov $-49, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%rdi), %xmm1 + movdqu -33(%rsi), %xmm2 + mov $-33, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%rdi), %rax + mov -17(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(9bytes): + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(13bytes): + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(5bytes): + mov -5(%rdi), %eax + mov -5(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(67bytes): + movdqu -67(%rdi), %xmm2 + movdqu -67(%rsi), %xmm1 + mov $-67, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(51bytes): + movdqu -51(%rdi), %xmm2 + movdqu -51(%rsi), %xmm1 + mov $-51, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + movdqu -35(%rsi), %xmm1 + movdqu -35(%rdi), %xmm2 + mov $-35, %dl + pxor %xmm1, %xmm2 + ptest 
%xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + mov -19(%rdi), %rax + mov -19(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(11bytes): + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + .p2align 4 +L(15bytes): + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(7bytes): + mov -7(%rdi), %eax + mov -7(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + .p2align 4 +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin2bytes) +L(1bytes): + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret +# endif + + .p2align 4 +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rsi), %ecx + +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(69bytes): + movdqu -69(%rsi), %xmm1 + movdqu -69(%rdi), %xmm2 + mov $-69, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(53bytes): + movdqu -53(%rsi), %xmm1 + movdqu -53(%rdi), %xmm2 + mov $-53, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + movdqu -37(%rsi), %xmm1 + movdqu -37(%rdi), %xmm2 + mov $-37, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + movdqu -21(%rsi), %xmm1 + movdqu -21(%rdi), %xmm2 + mov $-21, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov $-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(71bytes): + movdqu -71(%rsi), %xmm1 + movdqu -71(%rdi), %xmm2 + mov $-71, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(55bytes): + movdqu -55(%rdi), %xmm2 + movdqu -55(%rsi), %xmm1 + mov $-55, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + movdqu -39(%rdi), %xmm2 + movdqu -39(%rsi), %xmm1 + mov $-39, %dl + pxor %xmm1, 
%xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + movdqu -23(%rdi), %xmm2 + movdqu -23(%rsi), %xmm1 + mov $-23, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +# endif + + .p2align 4 +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%rsi), %rcx + mov -8(%rdi), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(73bytes): + movdqu -73(%rsi), %xmm1 + movdqu -73(%rdi), %xmm2 + mov $-73, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(57bytes): + movdqu -57(%rdi), %xmm2 + movdqu -57(%rsi), %xmm1 + mov $-57, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + movdqu -41(%rdi), %xmm2 + movdqu -41(%rsi), %xmm1 + mov $-41, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + movdqu -25(%rdi), %xmm2 + movdqu -25(%rsi), %xmm1 + mov $-25, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(diffin2bytes) + + .p2align 4 +L(75bytes): + movdqu -75(%rsi), %xmm1 + movdqu -75(%rdi), %xmm2 + mov $-75, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(59bytes): + movdqu -59(%rdi), %xmm2 + movdqu -59(%rsi), %xmm1 + mov $-59, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + movdqu -43(%rdi), %xmm2 + movdqu -43(%rsi), %xmm1 + mov $-43, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + movdqu -27(%rdi), %xmm2 + movdqu -27(%rsi), %xmm1 + mov $-27, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + .p2align 4 +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + 
pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(77bytes): + movdqu -77(%rsi), %xmm1 + movdqu -77(%rdi), %xmm2 + mov $-77, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(61bytes): + movdqu -61(%rdi), %xmm2 + movdqu -61(%rsi), %xmm1 + mov $-61, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + movdqu -45(%rdi), %xmm2 + movdqu -45(%rsi), %xmm1 + mov $-45, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + movdqu -29(%rdi), %xmm2 + movdqu -29(%rsi), %xmm1 + mov $-29, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(79bytes): + movdqu -79(%rsi), %xmm1 + movdqu -79(%rdi), %xmm2 + mov $-79, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(63bytes): + movdqu -63(%rdi), %xmm2 + movdqu -63(%rsi), %xmm1 + mov $-63, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + movdqu -47(%rdi), %xmm2 + movdqu -47(%rsi), %xmm1 + mov $-47, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + movdqu -31(%rdi), %xmm2 + movdqu -31(%rsi), %xmm1 + mov $-31, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +# endif + .p2align 4 +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), 
%rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. + */ + .p2align 3 +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax + +# ifdef USE_AS_WMEMCMP +/* for wmemcmp */ + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + +L(diffin4bytes): +# ifndef USE_AS_WMEMCMP + cmp %cx, %ax + jne L(diffin2bytes) + shr $16, %ecx + shr $16, %eax +L(diffin2bytes): + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(end): + and $0xff, %eax + and $0xff, %ecx + sub %ecx, %eax + ret +# else + +/* for wmemcmp */ + mov $1, %eax + jl L(nequal_bigger) + neg %eax + ret + + .p2align 4 +L(nequal_bigger): + ret + +L(unreal_case): + xor %eax, %eax + ret +# endif + +END (MEMCMP) + + .section .rodata.sse4.1,"a",@progbits + .p2align 3 +# ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), 
L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(65bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(67bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(69bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(71bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(73bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(75bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(77bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) + .int JMPTBL (L(79bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..8d7d2fe67b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -0,0 +1,1990 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + atom_text_section +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx + test %rdx, %rdx + jz L(equal) +# endif + mov %rdx, %rcx + mov %rdi, %rdx + cmp $48, %rcx; + jae L(48bytesormore) /* LEN => 48 */ + + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +/* ECX >= 32. 
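   (LEN is at least 48 at this point; the entry code compared against $48.)
   After the first 16 bytes are handled, %rdi is rounded down to 16-byte
   alignment and the residual misalignment of %rsi selects one of the
   L(shr_N) paths below, which rebuild the unaligned source data from two
   aligned loads glued together with palignr.  A C sketch of that
   reconstruction, assuming SSSE3 intrinsics (the macro name is
   illustrative):

     #include <tmmintrin.h>

     // Equivalent of an unaligned 16-byte load at base+N (0 < N < 16) built
     // from two aligned loads: palignr shifts the 32-byte concatenation
     // right by N bytes.  N must be a compile-time constant.
     #define LOAD_AT_OFFSET(base, N)                                   \
       _mm_alignr_epi8 (_mm_load_si128 ((const __m128i *) (base) + 1), \
                        _mm_load_si128 ((const __m128i *) (base)), (N))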
*/ +L(48bytesormore): + movdqu (%rdi), %xmm3 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%rdi), %rdi + lea 16(%rsi), %rsi + sub $0xffff, %edx + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %rdx, %rdi + sub %rdx, %rsi + add %rdx, %rcx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %rdx, %rsi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + .p2align 2 +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + .p2align 4 +L(shr_0): + cmp $80, %rcx + lea -48(%rcx), %rcx + jae L(shr_0_gobble) + xor %eax, %eax + movdqa (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_0_gobble): + movdqa (%rsi), %xmm0 + xor %eax, %eax + pcmpeqb (%rdi), %xmm0 + sub $32, %rcx + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %rcx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%rsi), %xmm0 + movdqa 48(%rsi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%rdi), %xmm0 + pcmpeqb 48(%rdi), %xmm2 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %rcx + jge L(next) + inc %edx + add $32, %rcx +L(next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_1): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $1, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $1, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_1_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $1, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $1, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $1, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $1, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_1_gobble_next) + inc %edx + add $32, %rcx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 1(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi 
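/* The L(shr_N_gobble) loops fuse the length check into the equality test:
   `sub $32, %rcx` leaves CF set once the count drops below zero, and the
   later `sbb $0xffff, %edx` (SSE moves and compares in between do not touch
   EFLAGS) therefore yields a nonzero result either on a byte mismatch
   (edx != 0xffff) or on exhaustion (borrow in), so a single `jz` continues
   the loop only in the all-clear case.  A hedged C restatement of one
   iteration (names are illustrative, not glibc API):

     // mask is the pmovmskb result: 0xffff means 32 bytes compared equal.
     for (;;)
       {
         int borrow = remaining < 32;      // CF from `sub $32, %rcx`
         remaining -= 32;
         int d = mask - 0xffff - borrow;   // `sbb $0xffff, %edx`
         if (d != 0)                       // `jz L(shr_N_gobble_loop)` fails
           break;                          // mismatch, or count used up
         mask = compare_next_32 ();        // pcmpeqb/pand/pmovmskb
       }

   The post-loop `cmp $0, %rcx; jge` then distinguishes the two exit
   reasons.  */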
+ jmp L(less48bytes) + + + .p2align 4 +L(shr_2): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $2, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $2, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_2_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $2, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $2, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $2, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $2, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_2_gobble_next) + inc %edx + add $32, %rcx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 2(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_3): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $3, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $3, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_3_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $3, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $3, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $3, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $3, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_3_gobble_next) + inc %edx + add $32, %rcx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 3(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_4): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $4, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $4, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_4_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $4, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $4, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, 
%rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $4, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $4, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_4_gobble_next) + inc %edx + add $32, %rcx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 4(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_5): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $5, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $5, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_5_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $5, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $5, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $5, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $5, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_5_gobble_next) + inc %edx + add $32, %rcx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 5(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_6): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $6, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $6, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_6_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $6, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $6, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $6, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $6, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_6_gobble_next) + inc %edx + add $32, %rcx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 6(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_7): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae 
L(shr_7_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $7, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $7, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_7_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $7, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $7, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $7, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $7, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_7_gobble_next) + inc %edx + add $32, %rcx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 7(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_8): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $8, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $8, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_8_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $8, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $8, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $8, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $8, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_8_gobble_next) + inc %edx + add $32, %rcx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 8(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_9): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $9, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $9, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_9_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $9, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $9, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + 
palignr $9, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $9, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_9_gobble_next) + inc %edx + add $32, %rcx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 9(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_10): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $10, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $10, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_10_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $10, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $10, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $10, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $10, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_10_gobble_next) + inc %edx + add $32, %rcx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 10(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_11): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $11, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_11_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $11, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $11, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $11, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $11, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_11_gobble_next) + inc %edx + add $32, %rcx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 11(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_12): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + 
palignr $12, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $12, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_12_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $12, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $12, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $12, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $12, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_12_gobble_next) + inc %edx + add $32, %rcx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 12(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_13): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $13, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_13_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $13, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $13, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $13, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $13, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_13_gobble_next) + inc %edx + add $32, %rcx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 13(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_14): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $14, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_14_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $14, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $14, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $14, 48(%rsi), %xmm3 + sbb $0xffff, 
%edx + movdqa 48(%rsi), %xmm0 + palignr $14, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_14_gobble_next) + inc %edx + add $32, %rcx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 14(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_15): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $15, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_15_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $15, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $15, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $15, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $15, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_15_gobble_next) + inc %edx + add $32, %rcx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 15(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) +# endif + .p2align 4 +L(exit): + pmovmskb %xmm1, %r8d + sub $0xffff, %r8d + jz L(first16bytes) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + mov %r8d, %edx +L(first16bytes): + add %rax, %rsi +L(less16bytes): +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte16): + movzbl -16(%rdi), %eax + movzbl -16(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte17): + movzbl -15(%rdi), %eax + movzbl -15(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte18): + movzbl -14(%rdi), %eax + movzbl -14(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte19): + movzbl -13(%rdi), %eax + movzbl -13(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte20): + movzbl -12(%rdi), %eax + movzbl -12(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte21): + movzbl -11(%rdi), %eax + movzbl -11(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte22): + movzbl -10(%rdi), %eax + movzbl -10(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(next_24_bytes): + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz 
L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret +# else +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(second_double_word): + mov -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(fourth_double_word): + mov -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) + ret +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $0, %ecx + je L(0bytes) +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + je L(1bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + .p2align 4 +L(44bytes): + movl -44(%rdi), %eax + movl -44(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + movl -40(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + movl -36(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + movl -32(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + movl -28(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + movl -24(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + movl -20(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + movl -16(%rsi), %ecx + cmp %ecx, %eax + jne 
L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + movl -12(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + movl -8(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + movl -4(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# else + .p2align 4 +L(44bytes): + movl -44(%rdi), %eax + cmp -44(%rsi), %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + cmp -40(%rsi), %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + cmp -36(%rsi), %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + cmp -32(%rsi), %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + cmp -28(%rsi), %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + cmp -24(%rsi), %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + cmp -20(%rsi), %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(45bytes): + movl -45(%rdi), %eax + movl -45(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(41bytes): + movl -41(%rdi), %eax + movl -41(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(37bytes): + movl -37(%rdi), %eax + movl -37(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(33bytes): + movl -33(%rdi), %eax + movl -33(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(29bytes): + movl -29(%rdi), %eax + movl -29(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%rdi), %eax + movl -25(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(21bytes): + movl -21(%rdi), %eax + movl -21(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%rdi), %eax + movl -17(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%rdi), %eax + movl -13(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%rdi), %eax + movl -9(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%rdi), %eax + movl -5(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + .p2align 4 +L(46bytes): + movl -46(%rdi), %eax + movl -46(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(42bytes): + movl -42(%rdi), %eax + movl -42(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(38bytes): + movl -38(%rdi), %eax + movl -38(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(34bytes): + movl -34(%rdi), %eax + movl -34(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(30bytes): + movl -30(%rdi), %eax + movl -30(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%rdi), %eax + movl -26(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(22bytes): + movl -22(%rdi), %eax + movl -22(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%rdi), %eax + movl -18(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%rdi), %eax + movl -14(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%rdi), %eax + movl -10(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%rdi), %eax + movl -6(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmpb %cl, %al + jne L(set) + 
+	cmp	%ecx, %eax
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(47bytes):
+	movl	-47(%rdi), %eax
+	movl	-47(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%rdi), %eax
+	movl	-43(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%rdi), %eax
+	movl	-39(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%rdi), %eax
+	movl	-35(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%rdi), %eax
+	movl	-31(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%rdi), %eax
+	movl	-27(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%rdi), %eax
+	movl	-23(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%rdi), %eax
+	movl	-19(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%rdi), %eax
+	movl	-15(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%rdi), %eax
+	movl	-11(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%rdi), %eax
+	movl	-7(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmp	%ecx, %eax
+	jne	L(set)
+	movzbl	-1(%rdi), %eax
+	cmpb	-1(%rsi), %al
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(find_diff):
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpw	%cx, %ax
+	jne	L(set)
+	shr	$16, %eax
+	shr	$16, %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+
+/* We get here only if we already know there is a difference.  CMP sets
+   the carry flag iff EAX < ECX (unsigned); the first SBB below turns
+   that into -CF, and the second computes EAX + 1 - CF, so the result
+   is -1 or +1 as required.  */
+
+	cmp	%ecx, %eax
+L(set):
+	sbb	%eax, %eax
+	sbb	$-1, %eax
+	ret
+# else
+
+/* For wmemcmp.  */
+	.p2align 4
+L(find_diff):
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+# endif
+
+	.p2align 4
+L(equal):
+	xor	%eax, %eax
+	ret
+
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S
new file mode 100644
index 0000000000..0c9804b7e9
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_CPU_FEATURE (MOVBE)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__memcmp_avx2_movbe(%rip), %rax
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
+	jnz	2f
+	leaq	__memcmp_sse2(%rip), %rax
+	ret
+
+2:	HAS_CPU_FEATURE (SSE4_1)
+	jz	3f
+	leaq	__memcmp_sse4_1(%rip), %rax
+	ret
+
+3:	leaq	__memcmp_ssse3(%rip), %rax
+	ret
+
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_sse2, @function; \
+	.p2align 4; \
+	.globl __memcmp_sse2; \
+	.hidden __memcmp_sse2; \
+	__memcmp_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memcmp calls through a PLT.
+   The speedup we get from using SSE4 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
new file mode 100644
index 0000000000..4e060a27fd
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -0,0 +1,3180 @@
+/* memcpy with SSSE3 and REP string
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_back
+# define MEMCPY_CHK	__memcpy_chk_ssse3_back
+# define MEMPCPY	__mempcpy_ssse3_back
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back
+#endif
+
+#define JMPTBL(I, B)	I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.
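+
+   Storing TARGET - TABLE instead of an absolute address keeps each
+   entry position-independent, so the table needs no dynamic relocations
+   and costs only one RIP-relative lea at run time.  As a rough C model
+   of the dispatch (illustrative only; the names are hypothetical and
+   the computed goto is a GNU extension):
+
+       int32_t off = table[index];           // entry holds target - table
+       void *target = (char *) table + off;  // rebuild the absolute address
+       goto *target;                         // jmp *INDEX
+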
*/ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %rdi, %rax +#ifdef USE_AS_MEMPCPY + add %rdx, %rax +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +L(copy_forward): +#endif +L(start): + cmp $144, %rdx + jae L(144bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jbe L(bk_write) +#endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +#endif + + .p2align 4 +L(144bytesormore): + +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + .p2align 4 +L(copy_backward): +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + shl $1, %rcx + cmp %rcx, %rdx + ja L(gobble_mem_bwd) + + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + .p2align 4 +L(shl_0): + + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 +#ifdef DATA_CACHE_SIZE + cmp $DATA_CACHE_SIZE_HALF, %R9_LP +#else + cmp __x86_data_cache_size_half(%rip), %R9_LP +#endif + jae L(gobble_mem_fwd) + sub $0x80, %rdx + .p2align 4 +L(shl_0_loop): + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $0x80, %rdx +L(copy_backward_loop): + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, -0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), 
%xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) + + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_1): + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + movaps -0x01(%rsi), %xmm1 + + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_2): + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + movaps -0x02(%rsi), %xmm1 + + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x42(%rsi), %xmm5 + 
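The L(shl_N) and L(shl_N_bwd) loops in this file handle a source that sits N bytes past a 16-byte boundary: only aligned loads are issued, each output vector is assembled from two neighbouring loads with palignr, and every 16-byte store lands on an aligned address; each unrolled pass moves 0x80 bytes. A sketch of one forward pass in C, under the same modelling assumptions as earlier (hypothetical names, not part of this file):

    #include <stdint.h>
    #include <string.h>

    /* One unrolled L(shl_N) iteration: eight aligned 16-byte stores,
       each merged from two adjacent aligned loads via the palignr
       concatenate-and-extract trick.  N is the source misalignment.  */
    static void
    shl_n_pass (uint8_t *dst, const uint8_t *src_aligned, unsigned n)
    {
      for (int i = 0; i < 8; i++)
        {
          uint8_t concat[32];
          memcpy (concat, src_aligned + 16 * i, 32);  /* aligned loads  */
          memcpy (dst + 16 * i, concat + n, 16);      /* aligned store  */
        }
    }
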
palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_3): + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + movaps -0x03(%rsi), %xmm1 + + movaps -0x13(%rsi), %xmm2 + palignr $3, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x23(%rsi), %xmm3 + palignr $3, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x33(%rsi), %xmm4 + palignr $3, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x43(%rsi), %xmm5 + palignr $3, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x53(%rsi), %xmm6 + palignr $3, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x63(%rsi), %xmm7 + palignr $3, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x73(%rsi), %xmm8 + palignr $3, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x83(%rsi), %xmm9 + palignr $3, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_3_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_4): + sub $0x80, %rdx + movaps -0x04(%rsi), %xmm1 + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movaps 0x4c(%rsi), %xmm6 + movaps 0x5c(%rsi), %xmm7 + movaps 0x6c(%rsi), %xmm8 + movaps 0x7c(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $4, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $4, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $4, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $4, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $4, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $4, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $4, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + movaps -0x04(%rsi), %xmm1 + + movaps -0x14(%rsi), %xmm2 + palignr $4, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps 
-0x24(%rsi), %xmm3 + palignr $4, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x34(%rsi), %xmm4 + palignr $4, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x44(%rsi), %xmm5 + palignr $4, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x54(%rsi), %xmm6 + palignr $4, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x64(%rsi), %xmm7 + palignr $4, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x74(%rsi), %xmm8 + palignr $4, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x84(%rsi), %xmm9 + palignr $4, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_4_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_5): + sub $0x80, %rdx + movaps -0x05(%rsi), %xmm1 + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movaps 0x4b(%rsi), %xmm6 + movaps 0x5b(%rsi), %xmm7 + movaps 0x6b(%rsi), %xmm8 + movaps 0x7b(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $5, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $5, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $5, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $5, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $5, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $5, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $5, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_5) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + movaps -0x05(%rsi), %xmm1 + + movaps -0x15(%rsi), %xmm2 + palignr $5, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x25(%rsi), %xmm3 + palignr $5, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x35(%rsi), %xmm4 + palignr $5, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x45(%rsi), %xmm5 + palignr $5, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x55(%rsi), %xmm6 + palignr $5, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x65(%rsi), %xmm7 + palignr $5, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x75(%rsi), %xmm8 + palignr $5, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x85(%rsi), %xmm9 + palignr $5, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_5_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_6): + sub $0x80, %rdx + movaps -0x06(%rsi), %xmm1 + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movaps 0x4a(%rsi), %xmm6 + movaps 0x5a(%rsi), %xmm7 + movaps 0x6a(%rsi), %xmm8 + movaps 0x7a(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $6, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $6, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $6, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $6, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $6, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $6, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $6, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_6) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY 
(L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + movaps -0x06(%rsi), %xmm1 + + movaps -0x16(%rsi), %xmm2 + palignr $6, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x26(%rsi), %xmm3 + palignr $6, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x36(%rsi), %xmm4 + palignr $6, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x46(%rsi), %xmm5 + palignr $6, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x56(%rsi), %xmm6 + palignr $6, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x66(%rsi), %xmm7 + palignr $6, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x76(%rsi), %xmm8 + palignr $6, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x86(%rsi), %xmm9 + palignr $6, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_6_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_7): + sub $0x80, %rdx + movaps -0x07(%rsi), %xmm1 + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movaps 0x49(%rsi), %xmm6 + movaps 0x59(%rsi), %xmm7 + movaps 0x69(%rsi), %xmm8 + movaps 0x79(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $7, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $7, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $7, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $7, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $7, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $7, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $7, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_7) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + movaps -0x07(%rsi), %xmm1 + + movaps -0x17(%rsi), %xmm2 + palignr $7, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x27(%rsi), %xmm3 + palignr $7, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x37(%rsi), %xmm4 + palignr $7, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x47(%rsi), %xmm5 + palignr $7, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x57(%rsi), %xmm6 + palignr $7, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x67(%rsi), %xmm7 + palignr $7, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x77(%rsi), %xmm8 + palignr $7, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x87(%rsi), %xmm9 + palignr $7, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_7_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_8): + sub $0x80, %rdx + movaps -0x08(%rsi), %xmm1 + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movaps 0x48(%rsi), %xmm6 + movaps 0x58(%rsi), %xmm7 + movaps 0x68(%rsi), %xmm8 + movaps 0x78(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $8, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $8, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $8, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $8, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $8, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $8, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $8, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + 
palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_8) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + movaps -0x08(%rsi), %xmm1 + + movaps -0x18(%rsi), %xmm2 + palignr $8, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x28(%rsi), %xmm3 + palignr $8, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x38(%rsi), %xmm4 + palignr $8, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x48(%rsi), %xmm5 + palignr $8, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x58(%rsi), %xmm6 + palignr $8, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x68(%rsi), %xmm7 + palignr $8, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x78(%rsi), %xmm8 + palignr $8, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x88(%rsi), %xmm9 + palignr $8, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_8_bwd) +L(shl_8_end_bwd): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_9): + sub $0x80, %rdx + movaps -0x09(%rsi), %xmm1 + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movaps 0x47(%rsi), %xmm6 + movaps 0x57(%rsi), %xmm7 + movaps 0x67(%rsi), %xmm8 + movaps 0x77(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $9, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $9, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $9, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $9, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $9, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $9, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $9, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_9) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + movaps -0x09(%rsi), %xmm1 + + movaps -0x19(%rsi), %xmm2 + palignr $9, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x29(%rsi), %xmm3 + palignr $9, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x39(%rsi), %xmm4 + palignr $9, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x49(%rsi), %xmm5 + palignr $9, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x59(%rsi), %xmm6 + palignr $9, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x69(%rsi), %xmm7 + palignr $9, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x79(%rsi), %xmm8 + palignr $9, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x89(%rsi), %xmm9 + palignr $9, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_9_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_10): + sub $0x80, %rdx + movaps -0x0a(%rsi), %xmm1 + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movaps 0x46(%rsi), %xmm6 + movaps 0x56(%rsi), %xmm7 + movaps 0x66(%rsi), %xmm8 + movaps 0x76(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $10, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $10, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $10, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr 
$10, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $10, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $10, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $10, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_10) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + movaps -0x0a(%rsi), %xmm1 + + movaps -0x1a(%rsi), %xmm2 + palignr $10, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2a(%rsi), %xmm3 + palignr $10, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3a(%rsi), %xmm4 + palignr $10, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4a(%rsi), %xmm5 + palignr $10, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5a(%rsi), %xmm6 + palignr $10, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6a(%rsi), %xmm7 + palignr $10, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7a(%rsi), %xmm8 + palignr $10, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8a(%rsi), %xmm9 + palignr $10, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_10_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_11): + sub $0x80, %rdx + movaps -0x0b(%rsi), %xmm1 + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movaps 0x45(%rsi), %xmm6 + movaps 0x55(%rsi), %xmm7 + movaps 0x65(%rsi), %xmm8 + movaps 0x75(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $11, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $11, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $11, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $11, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $11, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $11, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $11, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_11) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + movaps -0x0b(%rsi), %xmm1 + + movaps -0x1b(%rsi), %xmm2 + palignr $11, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2b(%rsi), %xmm3 + palignr $11, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3b(%rsi), %xmm4 + palignr $11, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4b(%rsi), %xmm5 + palignr $11, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5b(%rsi), %xmm6 + palignr $11, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6b(%rsi), %xmm7 + palignr $11, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7b(%rsi), %xmm8 + palignr $11, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8b(%rsi), %xmm9 + palignr $11, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_11_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_12): + sub $0x80, %rdx + movdqa -0x0c(%rsi), %xmm1 + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movaps 0x44(%rsi), %xmm6 + movaps 0x54(%rsi), %xmm7 + movaps 0x64(%rsi), %xmm8 + 
movaps 0x74(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $12, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $12, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $12, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $12, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $12, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $12, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $12, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + + lea 0x80(%rdi), %rdi + jae L(shl_12) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + movaps -0x0c(%rsi), %xmm1 + + movaps -0x1c(%rsi), %xmm2 + palignr $12, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2c(%rsi), %xmm3 + palignr $12, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3c(%rsi), %xmm4 + palignr $12, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4c(%rsi), %xmm5 + palignr $12, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5c(%rsi), %xmm6 + palignr $12, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6c(%rsi), %xmm7 + palignr $12, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7c(%rsi), %xmm8 + palignr $12, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8c(%rsi), %xmm9 + palignr $12, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_12_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_13): + sub $0x80, %rdx + movaps -0x0d(%rsi), %xmm1 + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movaps 0x43(%rsi), %xmm6 + movaps 0x53(%rsi), %xmm7 + movaps 0x63(%rsi), %xmm8 + movaps 0x73(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $13, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $13, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $13, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $13, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $13, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $13, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $13, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_13) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + movaps -0x0d(%rsi), %xmm1 + + movaps -0x1d(%rsi), %xmm2 + palignr $13, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2d(%rsi), %xmm3 + palignr $13, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3d(%rsi), %xmm4 + palignr $13, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4d(%rsi), %xmm5 + palignr $13, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5d(%rsi), %xmm6 + palignr $13, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6d(%rsi), %xmm7 + palignr $13, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7d(%rsi), %xmm8 + palignr $13, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8d(%rsi), %xmm9 + palignr $13, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_13_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_14): + sub $0x80, 
%rdx + movaps -0x0e(%rsi), %xmm1 + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movaps 0x42(%rsi), %xmm6 + movaps 0x52(%rsi), %xmm7 + movaps 0x62(%rsi), %xmm8 + movaps 0x72(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $14, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $14, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $14, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $14, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $14, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $14, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $14, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_14) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + movaps -0x0e(%rsi), %xmm1 + + movaps -0x1e(%rsi), %xmm2 + palignr $14, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2e(%rsi), %xmm3 + palignr $14, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3e(%rsi), %xmm4 + palignr $14, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4e(%rsi), %xmm5 + palignr $14, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5e(%rsi), %xmm6 + palignr $14, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6e(%rsi), %xmm7 + palignr $14, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7e(%rsi), %xmm8 + palignr $14, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8e(%rsi), %xmm9 + palignr $14, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_14_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_15): + sub $0x80, %rdx + movaps -0x0f(%rsi), %xmm1 + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movaps 0x41(%rsi), %xmm6 + movaps 0x51(%rsi), %xmm7 + movaps 0x61(%rsi), %xmm8 + movaps 0x71(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $15, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $15, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $15, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $15, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $15, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $15, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $15, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_15) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + movaps -0x0f(%rsi), %xmm1 + + movaps -0x1f(%rsi), %xmm2 + palignr $15, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2f(%rsi), %xmm3 + palignr $15, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3f(%rsi), %xmm4 + palignr $15, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4f(%rsi), %xmm5 + palignr $15, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5f(%rsi), %xmm6 + palignr $15, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6f(%rsi), %xmm7 + palignr $15, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7f(%rsi), %xmm8 + palignr $15, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8f(%rsi), %xmm9 + palignr $15, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), 
%rdi + lea -0x80(%rsi), %rsi + jae L(shl_15_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_fwd): + movdqu (%rsi), %xmm1 + movdqu %xmm0, (%r8) + movdqa %xmm1, (%rdi) + sub $16, %rdx + add $16, %rsi + add $16, %rdi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif + cmp %rcx, %rdx + ja L(bigger_in_fwd) + mov %rdx, %rcx +L(bigger_in_fwd): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy_fwd) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy_fwd) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy_fwd): + sub $0x80, %rdx +L(gobble_mem_fwd_loop): + sub $0x80, %rdx + prefetcht0 0x200(%rsi) + prefetcht0 0x300(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lfence + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_mem_fwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_fwd_end) + add $0x80, %rdx +L(ll_cache_copy_fwd): + add %rcx, %rdx +L(ll_cache_copy_fwd_start): + sub $0x80, %rdx +L(gobble_ll_loop_fwd): + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + prefetchnta 0x1c0(%rdi) + prefetchnta 0x280(%rdi) + sub $0x80, %rdx + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_ll_loop_fwd) +L(gobble_mem_fwd_end): + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_bwd): + add %rdx, %rsi + add %rdx, %rdi + + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $-16, %rdi + sub %rdi, %r9 + sub %r9, %rsi + sub %r9, %rdx + + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif + cmp %rcx, %rdx + ja L(bigger) + mov %rdx, %rcx +L(bigger): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy): + sub $0x80, %rdx +L(gobble_mem_bwd_loop): + sub $0x80, %rdx + prefetcht0 -0x200(%rsi) + prefetcht0 -0x300(%rsi) + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu 
-0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + lfence + movntdq %xmm1, -0x10(%rdi) + movntdq %xmm2, -0x20(%rdi) + movntdq %xmm3, -0x30(%rdi) + movntdq %xmm4, -0x40(%rdi) + movntdq %xmm5, -0x50(%rdi) + movntdq %xmm6, -0x60(%rdi) + movntdq %xmm7, -0x70(%rdi) + movntdq %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_mem_bwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_bwd_end) + add $0x80, %rdx +L(ll_cache_copy): + add %rcx, %rdx +L(ll_cache_copy_bwd_start): + sub $0x80, %rdx +L(gobble_ll_loop): + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + prefetchnta -0x1c0(%rdi) + prefetchnta -0x280(%rdi) + sub $0x80, %rdx + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + movdqa %xmm1, -0x10(%rdi) + movdqa %xmm2, -0x20(%rdi) + movdqa %xmm3, -0x30(%rdi) + movdqa %xmm4, -0x40(%rdi) + movdqa %xmm5, -0x50(%rdi) + movdqa %xmm6, -0x60(%rdi) + movdqa %xmm7, -0x70(%rdi) + movdqa %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_ll_loop) +L(gobble_mem_bwd_end): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rsi + sub %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(fwd_write_128bytes): + lddqu -128(%rsi), %xmm0 + movdqu %xmm0, -128(%rdi) +L(fwd_write_112bytes): + lddqu -112(%rsi), %xmm0 + movdqu %xmm0, -112(%rdi) +L(fwd_write_96bytes): + lddqu -96(%rsi), %xmm0 + movdqu %xmm0, -96(%rdi) +L(fwd_write_80bytes): + lddqu -80(%rsi), %xmm0 + movdqu %xmm0, -80(%rdi) +L(fwd_write_64bytes): + lddqu -64(%rsi), %xmm0 + movdqu %xmm0, -64(%rdi) +L(fwd_write_48bytes): + lddqu -48(%rsi), %xmm0 + movdqu %xmm0, -48(%rdi) +L(fwd_write_32bytes): + lddqu -32(%rsi), %xmm0 + movdqu %xmm0, -32(%rdi) +L(fwd_write_16bytes): + lddqu -16(%rsi), %xmm0 + movdqu %xmm0, -16(%rdi) +L(fwd_write_0bytes): + ret + + + .p2align 4 +L(fwd_write_143bytes): + lddqu -143(%rsi), %xmm0 + movdqu %xmm0, -143(%rdi) +L(fwd_write_127bytes): + lddqu -127(%rsi), %xmm0 + movdqu %xmm0, -127(%rdi) +L(fwd_write_111bytes): + lddqu -111(%rsi), %xmm0 + movdqu %xmm0, -111(%rdi) +L(fwd_write_95bytes): + lddqu -95(%rsi), %xmm0 + movdqu %xmm0, -95(%rdi) +L(fwd_write_79bytes): + lddqu -79(%rsi), %xmm0 + movdqu %xmm0, -79(%rdi) +L(fwd_write_63bytes): + lddqu -63(%rsi), %xmm0 + movdqu %xmm0, -63(%rdi) +L(fwd_write_47bytes): + lddqu -47(%rsi), %xmm0 + movdqu %xmm0, -47(%rdi) +L(fwd_write_31bytes): + lddqu -31(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -31(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_15bytes): + mov -15(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -15(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_142bytes): + lddqu -142(%rsi), %xmm0 + movdqu %xmm0, -142(%rdi) +L(fwd_write_126bytes): + lddqu -126(%rsi), %xmm0 + movdqu %xmm0, -126(%rdi) +L(fwd_write_110bytes): + lddqu -110(%rsi), %xmm0 + movdqu %xmm0, -110(%rdi) +L(fwd_write_94bytes): + lddqu -94(%rsi), %xmm0 + movdqu %xmm0, -94(%rdi) +L(fwd_write_78bytes): + lddqu -78(%rsi), %xmm0 + movdqu %xmm0, -78(%rdi) +L(fwd_write_62bytes): + lddqu -62(%rsi), %xmm0 + movdqu %xmm0, -62(%rdi) +L(fwd_write_46bytes): + lddqu -46(%rsi), %xmm0 + movdqu %xmm0, -46(%rdi) +L(fwd_write_30bytes): + lddqu -30(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -30(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_14bytes): + mov -14(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, 
-14(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_141bytes): + lddqu -141(%rsi), %xmm0 + movdqu %xmm0, -141(%rdi) +L(fwd_write_125bytes): + lddqu -125(%rsi), %xmm0 + movdqu %xmm0, -125(%rdi) +L(fwd_write_109bytes): + lddqu -109(%rsi), %xmm0 + movdqu %xmm0, -109(%rdi) +L(fwd_write_93bytes): + lddqu -93(%rsi), %xmm0 + movdqu %xmm0, -93(%rdi) +L(fwd_write_77bytes): + lddqu -77(%rsi), %xmm0 + movdqu %xmm0, -77(%rdi) +L(fwd_write_61bytes): + lddqu -61(%rsi), %xmm0 + movdqu %xmm0, -61(%rdi) +L(fwd_write_45bytes): + lddqu -45(%rsi), %xmm0 + movdqu %xmm0, -45(%rdi) +L(fwd_write_29bytes): + lddqu -29(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -29(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_13bytes): + mov -13(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -13(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_140bytes): + lddqu -140(%rsi), %xmm0 + movdqu %xmm0, -140(%rdi) +L(fwd_write_124bytes): + lddqu -124(%rsi), %xmm0 + movdqu %xmm0, -124(%rdi) +L(fwd_write_108bytes): + lddqu -108(%rsi), %xmm0 + movdqu %xmm0, -108(%rdi) +L(fwd_write_92bytes): + lddqu -92(%rsi), %xmm0 + movdqu %xmm0, -92(%rdi) +L(fwd_write_76bytes): + lddqu -76(%rsi), %xmm0 + movdqu %xmm0, -76(%rdi) +L(fwd_write_60bytes): + lddqu -60(%rsi), %xmm0 + movdqu %xmm0, -60(%rdi) +L(fwd_write_44bytes): + lddqu -44(%rsi), %xmm0 + movdqu %xmm0, -44(%rdi) +L(fwd_write_28bytes): + lddqu -28(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -28(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_12bytes): + mov -12(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -12(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_139bytes): + lddqu -139(%rsi), %xmm0 + movdqu %xmm0, -139(%rdi) +L(fwd_write_123bytes): + lddqu -123(%rsi), %xmm0 + movdqu %xmm0, -123(%rdi) +L(fwd_write_107bytes): + lddqu -107(%rsi), %xmm0 + movdqu %xmm0, -107(%rdi) +L(fwd_write_91bytes): + lddqu -91(%rsi), %xmm0 + movdqu %xmm0, -91(%rdi) +L(fwd_write_75bytes): + lddqu -75(%rsi), %xmm0 + movdqu %xmm0, -75(%rdi) +L(fwd_write_59bytes): + lddqu -59(%rsi), %xmm0 + movdqu %xmm0, -59(%rdi) +L(fwd_write_43bytes): + lddqu -43(%rsi), %xmm0 + movdqu %xmm0, -43(%rdi) +L(fwd_write_27bytes): + lddqu -27(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -27(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_11bytes): + mov -11(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -11(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_138bytes): + lddqu -138(%rsi), %xmm0 + movdqu %xmm0, -138(%rdi) +L(fwd_write_122bytes): + lddqu -122(%rsi), %xmm0 + movdqu %xmm0, -122(%rdi) +L(fwd_write_106bytes): + lddqu -106(%rsi), %xmm0 + movdqu %xmm0, -106(%rdi) +L(fwd_write_90bytes): + lddqu -90(%rsi), %xmm0 + movdqu %xmm0, -90(%rdi) +L(fwd_write_74bytes): + lddqu -74(%rsi), %xmm0 + movdqu %xmm0, -74(%rdi) +L(fwd_write_58bytes): + lddqu -58(%rsi), %xmm0 + movdqu %xmm0, -58(%rdi) +L(fwd_write_42bytes): + lddqu -42(%rsi), %xmm0 + movdqu %xmm0, -42(%rdi) +L(fwd_write_26bytes): + lddqu -26(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -26(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_10bytes): + mov -10(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -10(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_137bytes): + lddqu -137(%rsi), %xmm0 + movdqu %xmm0, -137(%rdi) +L(fwd_write_121bytes): + lddqu -121(%rsi), %xmm0 + movdqu %xmm0, -121(%rdi) +L(fwd_write_105bytes): + lddqu -105(%rsi), %xmm0 + movdqu %xmm0, -105(%rdi) +L(fwd_write_89bytes): + lddqu -89(%rsi), %xmm0 + movdqu 
%xmm0, -89(%rdi) +L(fwd_write_73bytes): + lddqu -73(%rsi), %xmm0 + movdqu %xmm0, -73(%rdi) +L(fwd_write_57bytes): + lddqu -57(%rsi), %xmm0 + movdqu %xmm0, -57(%rdi) +L(fwd_write_41bytes): + lddqu -41(%rsi), %xmm0 + movdqu %xmm0, -41(%rdi) +L(fwd_write_25bytes): + lddqu -25(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -25(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_9bytes): + mov -9(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -9(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_136bytes): + lddqu -136(%rsi), %xmm0 + movdqu %xmm0, -136(%rdi) +L(fwd_write_120bytes): + lddqu -120(%rsi), %xmm0 + movdqu %xmm0, -120(%rdi) +L(fwd_write_104bytes): + lddqu -104(%rsi), %xmm0 + movdqu %xmm0, -104(%rdi) +L(fwd_write_88bytes): + lddqu -88(%rsi), %xmm0 + movdqu %xmm0, -88(%rdi) +L(fwd_write_72bytes): + lddqu -72(%rsi), %xmm0 + movdqu %xmm0, -72(%rdi) +L(fwd_write_56bytes): + lddqu -56(%rsi), %xmm0 + movdqu %xmm0, -56(%rdi) +L(fwd_write_40bytes): + lddqu -40(%rsi), %xmm0 + movdqu %xmm0, -40(%rdi) +L(fwd_write_24bytes): + lddqu -24(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -24(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_135bytes): + lddqu -135(%rsi), %xmm0 + movdqu %xmm0, -135(%rdi) +L(fwd_write_119bytes): + lddqu -119(%rsi), %xmm0 + movdqu %xmm0, -119(%rdi) +L(fwd_write_103bytes): + lddqu -103(%rsi), %xmm0 + movdqu %xmm0, -103(%rdi) +L(fwd_write_87bytes): + lddqu -87(%rsi), %xmm0 + movdqu %xmm0, -87(%rdi) +L(fwd_write_71bytes): + lddqu -71(%rsi), %xmm0 + movdqu %xmm0, -71(%rdi) +L(fwd_write_55bytes): + lddqu -55(%rsi), %xmm0 + movdqu %xmm0, -55(%rdi) +L(fwd_write_39bytes): + lddqu -39(%rsi), %xmm0 + movdqu %xmm0, -39(%rdi) +L(fwd_write_23bytes): + lddqu -23(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -23(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_134bytes): + lddqu -134(%rsi), %xmm0 + movdqu %xmm0, -134(%rdi) +L(fwd_write_118bytes): + lddqu -118(%rsi), %xmm0 + movdqu %xmm0, -118(%rdi) +L(fwd_write_102bytes): + lddqu -102(%rsi), %xmm0 + movdqu %xmm0, -102(%rdi) +L(fwd_write_86bytes): + lddqu -86(%rsi), %xmm0 + movdqu %xmm0, -86(%rdi) +L(fwd_write_70bytes): + lddqu -70(%rsi), %xmm0 + movdqu %xmm0, -70(%rdi) +L(fwd_write_54bytes): + lddqu -54(%rsi), %xmm0 + movdqu %xmm0, -54(%rdi) +L(fwd_write_38bytes): + lddqu -38(%rsi), %xmm0 + movdqu %xmm0, -38(%rdi) +L(fwd_write_22bytes): + lddqu -22(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -22(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_133bytes): + lddqu -133(%rsi), %xmm0 + movdqu %xmm0, -133(%rdi) +L(fwd_write_117bytes): + lddqu -117(%rsi), %xmm0 + movdqu %xmm0, -117(%rdi) +L(fwd_write_101bytes): + lddqu -101(%rsi), %xmm0 + movdqu %xmm0, -101(%rdi) +L(fwd_write_85bytes): + lddqu -85(%rsi), %xmm0 + movdqu %xmm0, -85(%rdi) +L(fwd_write_69bytes): + lddqu -69(%rsi), %xmm0 + movdqu %xmm0, -69(%rdi) +L(fwd_write_53bytes): + lddqu -53(%rsi), %xmm0 + movdqu %xmm0, -53(%rdi) +L(fwd_write_37bytes): + lddqu -37(%rsi), %xmm0 + movdqu %xmm0, -37(%rdi) +L(fwd_write_21bytes): + lddqu -21(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -21(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 
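
The L(fwd_write_*) and L(bwd_write_*) blocks are fall-through ladders: the jump tables dispatch into the middle of a ladder, each entry copies one 16-byte chunk at a fixed offset, and control falls through to the next smaller entry until the 0-byte case returns. Lengths that are not multiples of 16 work because the entry chunk overlaps the final 16-byte chunk; fwd_write_31bytes, for instance, loads bytes -31..-16 and -16..-1 and stores both, writing one byte twice. A minimal C sketch of that overlapping-chunk tail copy, using a hypothetical copy_tail helper in place of the file's jump-table dispatch:

#include <string.h>
#include <stddef.h>

/* Copy the last n bytes (17 <= n <= 32) with two possibly
   overlapping 16-byte chunks, mirroring L(fwd_write_17bytes)
   through L(fwd_write_31bytes).  dst_end/src_end point one past
   the end, as %rdi/%rsi do on the forward path.  */
static void
copy_tail (unsigned char *dst_end, const unsigned char *src_end, size_t n)
{
  unsigned char a[16], b[16];
  memcpy (a, src_end - n, 16);    /* like lddqu -n(%rsi)  */
  memcpy (b, src_end - 16, 16);   /* like lddqu -16(%rsi) */
  memcpy (dst_end - n, a, 16);    /* the stores overlap by 32 - n bytes */
  memcpy (dst_end - 16, b, 16);
}

int
main (void)
{
  unsigned char src[32] = "abcdefghijklmnopqrstuvwxyz01234";
  unsigned char dst[32] = { 0 };
  copy_tail (dst + 31, src + 31, 31);
  return memcmp (dst, src, 31) != 0;
}

As in the assembly, both loads happen before either store, so the sequence also stays correct when source and destination overlap memmove-style.
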
+L(fwd_write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_132bytes): + lddqu -132(%rsi), %xmm0 + movdqu %xmm0, -132(%rdi) +L(fwd_write_116bytes): + lddqu -116(%rsi), %xmm0 + movdqu %xmm0, -116(%rdi) +L(fwd_write_100bytes): + lddqu -100(%rsi), %xmm0 + movdqu %xmm0, -100(%rdi) +L(fwd_write_84bytes): + lddqu -84(%rsi), %xmm0 + movdqu %xmm0, -84(%rdi) +L(fwd_write_68bytes): + lddqu -68(%rsi), %xmm0 + movdqu %xmm0, -68(%rdi) +L(fwd_write_52bytes): + lddqu -52(%rsi), %xmm0 + movdqu %xmm0, -52(%rdi) +L(fwd_write_36bytes): + lddqu -36(%rsi), %xmm0 + movdqu %xmm0, -36(%rdi) +L(fwd_write_20bytes): + lddqu -20(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -20(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_131bytes): + lddqu -131(%rsi), %xmm0 + movdqu %xmm0, -131(%rdi) +L(fwd_write_115bytes): + lddqu -115(%rsi), %xmm0 + movdqu %xmm0, -115(%rdi) +L(fwd_write_99bytes): + lddqu -99(%rsi), %xmm0 + movdqu %xmm0, -99(%rdi) +L(fwd_write_83bytes): + lddqu -83(%rsi), %xmm0 + movdqu %xmm0, -83(%rdi) +L(fwd_write_67bytes): + lddqu -67(%rsi), %xmm0 + movdqu %xmm0, -67(%rdi) +L(fwd_write_51bytes): + lddqu -51(%rsi), %xmm0 + movdqu %xmm0, -51(%rdi) +L(fwd_write_35bytes): + lddqu -35(%rsi), %xmm0 + movdqu %xmm0, -35(%rdi) +L(fwd_write_19bytes): + lddqu -19(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -19(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_130bytes): + lddqu -130(%rsi), %xmm0 + movdqu %xmm0, -130(%rdi) +L(fwd_write_114bytes): + lddqu -114(%rsi), %xmm0 + movdqu %xmm0, -114(%rdi) +L(fwd_write_98bytes): + lddqu -98(%rsi), %xmm0 + movdqu %xmm0, -98(%rdi) +L(fwd_write_82bytes): + lddqu -82(%rsi), %xmm0 + movdqu %xmm0, -82(%rdi) +L(fwd_write_66bytes): + lddqu -66(%rsi), %xmm0 + movdqu %xmm0, -66(%rdi) +L(fwd_write_50bytes): + lddqu -50(%rsi), %xmm0 + movdqu %xmm0, -50(%rdi) +L(fwd_write_34bytes): + lddqu -34(%rsi), %xmm0 + movdqu %xmm0, -34(%rdi) +L(fwd_write_18bytes): + lddqu -18(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -18(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_2bytes): + movzwl -2(%rsi), %edx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_129bytes): + lddqu -129(%rsi), %xmm0 + movdqu %xmm0, -129(%rdi) +L(fwd_write_113bytes): + lddqu -113(%rsi), %xmm0 + movdqu %xmm0, -113(%rdi) +L(fwd_write_97bytes): + lddqu -97(%rsi), %xmm0 + movdqu %xmm0, -97(%rdi) +L(fwd_write_81bytes): + lddqu -81(%rsi), %xmm0 + movdqu %xmm0, -81(%rdi) +L(fwd_write_65bytes): + lddqu -65(%rsi), %xmm0 + movdqu %xmm0, -65(%rdi) +L(fwd_write_49bytes): + lddqu -49(%rsi), %xmm0 + movdqu %xmm0, -49(%rdi) +L(fwd_write_33bytes): + lddqu -33(%rsi), %xmm0 + movdqu %xmm0, -33(%rdi) +L(fwd_write_17bytes): + lddqu -17(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -17(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_1bytes): + movzbl -1(%rsi), %edx + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(bwd_write_128bytes): + lddqu 112(%rsi), %xmm0 + movdqu %xmm0, 112(%rdi) +L(bwd_write_112bytes): + lddqu 96(%rsi), %xmm0 + movdqu %xmm0, 96(%rdi) +L(bwd_write_96bytes): + lddqu 80(%rsi), %xmm0 + movdqu %xmm0, 80(%rdi) +L(bwd_write_80bytes): + lddqu 64(%rsi), %xmm0 + movdqu %xmm0, 64(%rdi) +L(bwd_write_64bytes): + lddqu 48(%rsi), %xmm0 + movdqu %xmm0, 
48(%rdi) +L(bwd_write_48bytes): + lddqu 32(%rsi), %xmm0 + movdqu %xmm0, 32(%rdi) +L(bwd_write_32bytes): + lddqu 16(%rsi), %xmm0 + movdqu %xmm0, 16(%rdi) +L(bwd_write_16bytes): + lddqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +L(bwd_write_0bytes): + ret + + .p2align 4 +L(bwd_write_143bytes): + lddqu 127(%rsi), %xmm0 + movdqu %xmm0, 127(%rdi) +L(bwd_write_127bytes): + lddqu 111(%rsi), %xmm0 + movdqu %xmm0, 111(%rdi) +L(bwd_write_111bytes): + lddqu 95(%rsi), %xmm0 + movdqu %xmm0, 95(%rdi) +L(bwd_write_95bytes): + lddqu 79(%rsi), %xmm0 + movdqu %xmm0, 79(%rdi) +L(bwd_write_79bytes): + lddqu 63(%rsi), %xmm0 + movdqu %xmm0, 63(%rdi) +L(bwd_write_63bytes): + lddqu 47(%rsi), %xmm0 + movdqu %xmm0, 47(%rdi) +L(bwd_write_47bytes): + lddqu 31(%rsi), %xmm0 + movdqu %xmm0, 31(%rdi) +L(bwd_write_31bytes): + lddqu 15(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 15(%rdi) + movdqu %xmm1, (%rdi) + ret + + + .p2align 4 +L(bwd_write_15bytes): + mov 7(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 7(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_142bytes): + lddqu 126(%rsi), %xmm0 + movdqu %xmm0, 126(%rdi) +L(bwd_write_126bytes): + lddqu 110(%rsi), %xmm0 + movdqu %xmm0, 110(%rdi) +L(bwd_write_110bytes): + lddqu 94(%rsi), %xmm0 + movdqu %xmm0, 94(%rdi) +L(bwd_write_94bytes): + lddqu 78(%rsi), %xmm0 + movdqu %xmm0, 78(%rdi) +L(bwd_write_78bytes): + lddqu 62(%rsi), %xmm0 + movdqu %xmm0, 62(%rdi) +L(bwd_write_62bytes): + lddqu 46(%rsi), %xmm0 + movdqu %xmm0, 46(%rdi) +L(bwd_write_46bytes): + lddqu 30(%rsi), %xmm0 + movdqu %xmm0, 30(%rdi) +L(bwd_write_30bytes): + lddqu 14(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 14(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_14bytes): + mov 6(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 6(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_141bytes): + lddqu 125(%rsi), %xmm0 + movdqu %xmm0, 125(%rdi) +L(bwd_write_125bytes): + lddqu 109(%rsi), %xmm0 + movdqu %xmm0, 109(%rdi) +L(bwd_write_109bytes): + lddqu 93(%rsi), %xmm0 + movdqu %xmm0, 93(%rdi) +L(bwd_write_93bytes): + lddqu 77(%rsi), %xmm0 + movdqu %xmm0, 77(%rdi) +L(bwd_write_77bytes): + lddqu 61(%rsi), %xmm0 + movdqu %xmm0, 61(%rdi) +L(bwd_write_61bytes): + lddqu 45(%rsi), %xmm0 + movdqu %xmm0, 45(%rdi) +L(bwd_write_45bytes): + lddqu 29(%rsi), %xmm0 + movdqu %xmm0, 29(%rdi) +L(bwd_write_29bytes): + lddqu 13(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 13(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_13bytes): + mov 5(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 5(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_140bytes): + lddqu 124(%rsi), %xmm0 + movdqu %xmm0, 124(%rdi) +L(bwd_write_124bytes): + lddqu 108(%rsi), %xmm0 + movdqu %xmm0, 108(%rdi) +L(bwd_write_108bytes): + lddqu 92(%rsi), %xmm0 + movdqu %xmm0, 92(%rdi) +L(bwd_write_92bytes): + lddqu 76(%rsi), %xmm0 + movdqu %xmm0, 76(%rdi) +L(bwd_write_76bytes): + lddqu 60(%rsi), %xmm0 + movdqu %xmm0, 60(%rdi) +L(bwd_write_60bytes): + lddqu 44(%rsi), %xmm0 + movdqu %xmm0, 44(%rdi) +L(bwd_write_44bytes): + lddqu 28(%rsi), %xmm0 + movdqu %xmm0, 28(%rdi) +L(bwd_write_28bytes): + lddqu 12(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 12(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_12bytes): + mov 4(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 4(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_139bytes): + lddqu 123(%rsi), %xmm0 + movdqu %xmm0, 123(%rdi) +L(bwd_write_123bytes): + lddqu 107(%rsi), %xmm0 + movdqu %xmm0, 107(%rdi) +L(bwd_write_107bytes): + lddqu 91(%rsi), %xmm0 + 
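
The dispatch tables further below (L(table_144_bytes_bwd) and L(table_144_bytes_fwd)) store JMPTBL (I, B) = I - B: each 32-bit entry is the offset of a ladder label relative to the table base, which keeps the tables position-independent and half the size of absolute 64-bit pointers. BRANCH_TO_JMPTBL_ENTRY rebuilds the target at run time: lea the table address, movslq the selected entry, add with lea, then an indirect jmp. A sketch of the same relative-offset dispatch in GNU C, using the documented computed-goto extension and hypothetical handler labels in place of the bwd_write entries:

/* Build with GCC or Clang; &&label and goto * are GNU extensions.  */
static int
dispatch (unsigned int idx)
{
  /* Offsets relative to the first label, like JMPTBL (I, B) = I - B.  */
  static const int table[] = {
    &&handle0 - &&handle0,   /* stand-in for L(bwd_write_0bytes) */
    &&handle1 - &&handle0,   /* stand-in for L(bwd_write_1bytes) */
    &&handle2 - &&handle0,
  };
  goto *(&&handle0 + table[idx]);  /* lea/movslq/lea/jmp in one step */
handle2: return 2;
handle1: return 1;
handle0: return 0;
}

int
main (void)
{
  return dispatch (2) == 2 ? 0 : 1;
}
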
movdqu %xmm0, 91(%rdi) +L(bwd_write_91bytes): + lddqu 75(%rsi), %xmm0 + movdqu %xmm0, 75(%rdi) +L(bwd_write_75bytes): + lddqu 59(%rsi), %xmm0 + movdqu %xmm0, 59(%rdi) +L(bwd_write_59bytes): + lddqu 43(%rsi), %xmm0 + movdqu %xmm0, 43(%rdi) +L(bwd_write_43bytes): + lddqu 27(%rsi), %xmm0 + movdqu %xmm0, 27(%rdi) +L(bwd_write_27bytes): + lddqu 11(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 11(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_11bytes): + mov 3(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 3(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_138bytes): + lddqu 122(%rsi), %xmm0 + movdqu %xmm0, 122(%rdi) +L(bwd_write_122bytes): + lddqu 106(%rsi), %xmm0 + movdqu %xmm0, 106(%rdi) +L(bwd_write_106bytes): + lddqu 90(%rsi), %xmm0 + movdqu %xmm0, 90(%rdi) +L(bwd_write_90bytes): + lddqu 74(%rsi), %xmm0 + movdqu %xmm0, 74(%rdi) +L(bwd_write_74bytes): + lddqu 58(%rsi), %xmm0 + movdqu %xmm0, 58(%rdi) +L(bwd_write_58bytes): + lddqu 42(%rsi), %xmm0 + movdqu %xmm0, 42(%rdi) +L(bwd_write_42bytes): + lddqu 26(%rsi), %xmm0 + movdqu %xmm0, 26(%rdi) +L(bwd_write_26bytes): + lddqu 10(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 10(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_10bytes): + mov 2(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 2(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_137bytes): + lddqu 121(%rsi), %xmm0 + movdqu %xmm0, 121(%rdi) +L(bwd_write_121bytes): + lddqu 105(%rsi), %xmm0 + movdqu %xmm0, 105(%rdi) +L(bwd_write_105bytes): + lddqu 89(%rsi), %xmm0 + movdqu %xmm0, 89(%rdi) +L(bwd_write_89bytes): + lddqu 73(%rsi), %xmm0 + movdqu %xmm0, 73(%rdi) +L(bwd_write_73bytes): + lddqu 57(%rsi), %xmm0 + movdqu %xmm0, 57(%rdi) +L(bwd_write_57bytes): + lddqu 41(%rsi), %xmm0 + movdqu %xmm0, 41(%rdi) +L(bwd_write_41bytes): + lddqu 25(%rsi), %xmm0 + movdqu %xmm0, 25(%rdi) +L(bwd_write_25bytes): + lddqu 9(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 9(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_9bytes): + mov 1(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 1(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_136bytes): + lddqu 120(%rsi), %xmm0 + movdqu %xmm0, 120(%rdi) +L(bwd_write_120bytes): + lddqu 104(%rsi), %xmm0 + movdqu %xmm0, 104(%rdi) +L(bwd_write_104bytes): + lddqu 88(%rsi), %xmm0 + movdqu %xmm0, 88(%rdi) +L(bwd_write_88bytes): + lddqu 72(%rsi), %xmm0 + movdqu %xmm0, 72(%rdi) +L(bwd_write_72bytes): + lddqu 56(%rsi), %xmm0 + movdqu %xmm0, 56(%rdi) +L(bwd_write_56bytes): + lddqu 40(%rsi), %xmm0 + movdqu %xmm0, 40(%rdi) +L(bwd_write_40bytes): + lddqu 24(%rsi), %xmm0 + movdqu %xmm0, 24(%rdi) +L(bwd_write_24bytes): + lddqu 8(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 8(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_8bytes): + mov (%rsi), %rdx + mov %rdx, (%rdi) + ret + + .p2align 4 +L(bwd_write_135bytes): + lddqu 119(%rsi), %xmm0 + movdqu %xmm0, 119(%rdi) +L(bwd_write_119bytes): + lddqu 103(%rsi), %xmm0 + movdqu %xmm0, 103(%rdi) +L(bwd_write_103bytes): + lddqu 87(%rsi), %xmm0 + movdqu %xmm0, 87(%rdi) +L(bwd_write_87bytes): + lddqu 71(%rsi), %xmm0 + movdqu %xmm0, 71(%rdi) +L(bwd_write_71bytes): + lddqu 55(%rsi), %xmm0 + movdqu %xmm0, 55(%rdi) +L(bwd_write_55bytes): + lddqu 39(%rsi), %xmm0 + movdqu %xmm0, 39(%rdi) +L(bwd_write_39bytes): + lddqu 23(%rsi), %xmm0 + movdqu %xmm0, 23(%rdi) +L(bwd_write_23bytes): + lddqu 7(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 7(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_7bytes): + mov 3(%rsi), %edx + mov (%rsi), %ecx + mov 
%edx, 3(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_134bytes): + lddqu 118(%rsi), %xmm0 + movdqu %xmm0, 118(%rdi) +L(bwd_write_118bytes): + lddqu 102(%rsi), %xmm0 + movdqu %xmm0, 102(%rdi) +L(bwd_write_102bytes): + lddqu 86(%rsi), %xmm0 + movdqu %xmm0, 86(%rdi) +L(bwd_write_86bytes): + lddqu 70(%rsi), %xmm0 + movdqu %xmm0, 70(%rdi) +L(bwd_write_70bytes): + lddqu 54(%rsi), %xmm0 + movdqu %xmm0, 54(%rdi) +L(bwd_write_54bytes): + lddqu 38(%rsi), %xmm0 + movdqu %xmm0, 38(%rdi) +L(bwd_write_38bytes): + lddqu 22(%rsi), %xmm0 + movdqu %xmm0, 22(%rdi) +L(bwd_write_22bytes): + lddqu 6(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 6(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_6bytes): + mov 2(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 2(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_133bytes): + lddqu 117(%rsi), %xmm0 + movdqu %xmm0, 117(%rdi) +L(bwd_write_117bytes): + lddqu 101(%rsi), %xmm0 + movdqu %xmm0, 101(%rdi) +L(bwd_write_101bytes): + lddqu 85(%rsi), %xmm0 + movdqu %xmm0, 85(%rdi) +L(bwd_write_85bytes): + lddqu 69(%rsi), %xmm0 + movdqu %xmm0, 69(%rdi) +L(bwd_write_69bytes): + lddqu 53(%rsi), %xmm0 + movdqu %xmm0, 53(%rdi) +L(bwd_write_53bytes): + lddqu 37(%rsi), %xmm0 + movdqu %xmm0, 37(%rdi) +L(bwd_write_37bytes): + lddqu 21(%rsi), %xmm0 + movdqu %xmm0, 21(%rdi) +L(bwd_write_21bytes): + lddqu 5(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 5(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_5bytes): + mov 1(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 1(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_132bytes): + lddqu 116(%rsi), %xmm0 + movdqu %xmm0, 116(%rdi) +L(bwd_write_116bytes): + lddqu 100(%rsi), %xmm0 + movdqu %xmm0, 100(%rdi) +L(bwd_write_100bytes): + lddqu 84(%rsi), %xmm0 + movdqu %xmm0, 84(%rdi) +L(bwd_write_84bytes): + lddqu 68(%rsi), %xmm0 + movdqu %xmm0, 68(%rdi) +L(bwd_write_68bytes): + lddqu 52(%rsi), %xmm0 + movdqu %xmm0, 52(%rdi) +L(bwd_write_52bytes): + lddqu 36(%rsi), %xmm0 + movdqu %xmm0, 36(%rdi) +L(bwd_write_36bytes): + lddqu 20(%rsi), %xmm0 + movdqu %xmm0, 20(%rdi) +L(bwd_write_20bytes): + lddqu 4(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 4(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_4bytes): + mov (%rsi), %edx + mov %edx, (%rdi) + ret + + .p2align 4 +L(bwd_write_131bytes): + lddqu 115(%rsi), %xmm0 + movdqu %xmm0, 115(%rdi) +L(bwd_write_115bytes): + lddqu 99(%rsi), %xmm0 + movdqu %xmm0, 99(%rdi) +L(bwd_write_99bytes): + lddqu 83(%rsi), %xmm0 + movdqu %xmm0, 83(%rdi) +L(bwd_write_83bytes): + lddqu 67(%rsi), %xmm0 + movdqu %xmm0, 67(%rdi) +L(bwd_write_67bytes): + lddqu 51(%rsi), %xmm0 + movdqu %xmm0, 51(%rdi) +L(bwd_write_51bytes): + lddqu 35(%rsi), %xmm0 + movdqu %xmm0, 35(%rdi) +L(bwd_write_35bytes): + lddqu 19(%rsi), %xmm0 + movdqu %xmm0, 19(%rdi) +L(bwd_write_19bytes): + lddqu 3(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 3(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_3bytes): + mov 1(%rsi), %dx + mov (%rsi), %cx + mov %dx, 1(%rdi) + mov %cx, (%rdi) + ret + + .p2align 4 +L(bwd_write_130bytes): + lddqu 114(%rsi), %xmm0 + movdqu %xmm0, 114(%rdi) +L(bwd_write_114bytes): + lddqu 98(%rsi), %xmm0 + movdqu %xmm0, 98(%rdi) +L(bwd_write_98bytes): + lddqu 82(%rsi), %xmm0 + movdqu %xmm0, 82(%rdi) +L(bwd_write_82bytes): + lddqu 66(%rsi), %xmm0 + movdqu %xmm0, 66(%rdi) +L(bwd_write_66bytes): + lddqu 50(%rsi), %xmm0 + movdqu %xmm0, 50(%rdi) +L(bwd_write_50bytes): + lddqu 34(%rsi), %xmm0 + movdqu %xmm0, 34(%rdi) +L(bwd_write_34bytes): + lddqu 
18(%rsi), %xmm0 + movdqu %xmm0, 18(%rdi) +L(bwd_write_18bytes): + lddqu 2(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 2(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_2bytes): + movzwl (%rsi), %edx + mov %dx, (%rdi) + ret + + .p2align 4 +L(bwd_write_129bytes): + lddqu 113(%rsi), %xmm0 + movdqu %xmm0, 113(%rdi) +L(bwd_write_113bytes): + lddqu 97(%rsi), %xmm0 + movdqu %xmm0, 97(%rdi) +L(bwd_write_97bytes): + lddqu 81(%rsi), %xmm0 + movdqu %xmm0, 81(%rdi) +L(bwd_write_81bytes): + lddqu 65(%rsi), %xmm0 + movdqu %xmm0, 65(%rdi) +L(bwd_write_65bytes): + lddqu 49(%rsi), %xmm0 + movdqu %xmm0, 49(%rdi) +L(bwd_write_49bytes): + lddqu 33(%rsi), %xmm0 + movdqu %xmm0, 33(%rdi) +L(bwd_write_33bytes): + lddqu 17(%rsi), %xmm0 + movdqu %xmm0, 17(%rdi) +L(bwd_write_17bytes): + lddqu 1(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 1(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_1bytes): + movzbl (%rsi), %edx + mov %dl, (%rdi) + ret + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_144_bytes_bwd): + .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_40bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_98bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) + + .p2align 3 +L(table_144_bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_69bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) + .int JMPTBL 
(L(fwd_write_127bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) + + .p2align 3 +L(shl_table_fwd): + .int JMPTBL (L(shl_0), L(shl_table_fwd)) + .int JMPTBL (L(shl_1), L(shl_table_fwd)) + .int JMPTBL (L(shl_2), L(shl_table_fwd)) + .int JMPTBL (L(shl_3), L(shl_table_fwd)) + .int JMPTBL (L(shl_4), L(shl_table_fwd)) + .int JMPTBL (L(shl_5), L(shl_table_fwd)) + .int JMPTBL (L(shl_6), L(shl_table_fwd)) + .int JMPTBL (L(shl_7), L(shl_table_fwd)) + .int JMPTBL (L(shl_8), L(shl_table_fwd)) + .int JMPTBL (L(shl_9), L(shl_table_fwd)) + .int JMPTBL (L(shl_10), L(shl_table_fwd)) + .int JMPTBL (L(shl_11), L(shl_table_fwd)) + .int JMPTBL (L(shl_12), L(shl_table_fwd)) + .int JMPTBL (L(shl_13), L(shl_table_fwd)) + .int JMPTBL (L(shl_14), L(shl_table_fwd)) + .int JMPTBL (L(shl_15), L(shl_table_fwd)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..f3ea52a46c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -0,0 +1,3150 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+#endif
+
+#define JMPTBL(I, B)	I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register that contains the index
+   into the jump table.  SCALE is the scale of INDEX.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+	lea	TABLE(%rip), %r11; \
+	movslq	(%r11, INDEX, SCALE), INDEX; \
+	lea	(%r11, INDEX), INDEX; \
+	jmp	*INDEX; \
+	ud2
+
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	mov	%rdi, %rax
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%rsi, %rdi
+	jb	L(copy_forward)
+	je	L(write_0bytes)
+	cmp	$79, %rdx
+	jbe	L(copy_forward)
+	jmp	L(copy_backward)
+L(copy_forward):
+#endif
+L(start):
+	cmp	$79, %rdx
+	lea	L(table_less_80bytes)(%rip), %r11
+	ja	L(80bytesormore)
+	movslq	(%r11, %rdx, 4), %r9
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+	.p2align 4
+L(80bytesormore):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dil, %sil
+	jle	L(copy_backward)
+#endif
+
+	movdqu	(%rsi), %xmm0
+	mov	%rdi, %rcx
+	and	$-16, %rdi
+	add	$16, %rdi
+	mov	%rcx, %r8
+	sub	%rdi, %rcx
+	add	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_fwd)
+	and	$0xf, %r9
+	jz	L(shl_0)
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_data_cache_size_half(%rip), %RCX_LP
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
+
+	.p2align 4
+L(copy_backward):
+	movdqu	-16(%rsi, %rdx), %xmm0
+	add	%rdx, %rsi
+	lea	-16(%rdi, %rdx), %r8
+	add	%rdx, %rdi
+
+	mov	%rdi, %rcx
+	and	$0xf, %rcx
+	xor	%rcx, %rdi
+	sub	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_bwd)
+	and	$0xf, %r9
+	jz	L(shl_0_bwd)
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_data_cache_size_half(%rip), %RCX_LP
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
+
+	.p2align 4
+L(shl_0):
+	sub	$16, %rdx
+	movdqa	(%rsi), %xmm1
+	add	$16, %rsi
+	movdqa	%xmm1, (%rdi)
+	add	$16, %rdi
+	cmp	$128, %rdx
+	movdqu	%xmm0, (%r8)
+	ja	L(shl_0_gobble)
+	cmp	$64, %rdx
+	jb	L(shl_0_less_64bytes)
+	movaps	(%rsi), %xmm4
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	%xmm4, (%rdi)
+	movaps	%xmm1, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm3, 48(%rdi)
+	sub	$64, %rdx
+	add	$64, %rsi
+	add	$64, %rdi
+L(shl_0_less_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY 
(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_loop) +L(shl_0_gobble_cache_loop): + movdqa (%rsi), %xmm4 + movaps 0x10(%rsi), %xmm1 + movaps 0x20(%rsi), %xmm2 + movaps 0x30(%rsi), %xmm3 + + movdqa %xmm4, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + + sub $128, %rdx + movaps 0x40(%rsi), %xmm4 + movaps 0x50(%rsi), %xmm5 + movaps 0x60(%rsi), %xmm6 + movaps 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_cache_less_64bytes) + + movdqa (%rsi), %xmm4 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm4, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm4 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm4, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_cache_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x280(%rsi) + + movdqa (%rsi), %xmm0 + movdqa 0x10(%rsi), %xmm1 + movdqa 0x20(%rsi), %xmm2 + movdqa 0x30(%rsi), %xmm3 + movdqa 0x40(%rsi), %xmm4 + movdqa 0x50(%rsi), %xmm5 + movdqa 0x60(%rsi), %xmm6 + movdqa 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_less_64bytes) + + movdqa (%rsi), %xmm0 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm0 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm0, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_mem_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_less_32bytes) + movdqa (%rsi), %xmm0 + sub $0x20, %rdx + movdqa 0x10(%rsi), %xmm1 + add $0x20, %rsi + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + add $0x20, %rdi +L(shl_0_mem_less_32bytes): + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $16, %rdx + movdqa -0x10(%rsi), %xmm1 + sub $16, %rsi + movdqa %xmm1, -0x10(%rdi) + sub $16, %rdi + cmp $0x80, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble_bwd) + cmp $64, %rdx + jb L(shl_0_less_64bytes_bwd) + movaps -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + sub $64, %rdx + sub $0x40, %rsi + sub $0x40, %rdi +L(shl_0_less_64bytes_bwd): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_bwd): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_bwd_loop) +L(shl_0_gobble_bwd_loop): + movdqa -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + + 
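/* shl_0 covers copies where source and destination share 16-byte
   alignment after the prologue, so no palignr splicing is needed:
   the gobble loops (this one runs backward) move 128 bytes per
   iteration with aligned movdqa/movaps.  The _mem_ variants, taken
   when the copy is at least half the data cache, add prefetcht0 to
   hide memory latency.  */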
movdqa %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + + sub $0x80, %rdx + movaps -0x50(%rsi), %xmm4 + movaps -0x60(%rsi), %xmm5 + movaps -0x70(%rsi), %xmm6 + movaps -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_gobble_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_gobble_bwd_less_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_bwd_loop): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x280(%rsi) + movdqa -0x10(%rsi), %xmm0 + movdqa -0x20(%rsi), %xmm1 + movdqa -0x30(%rsi), %xmm2 + movdqa -0x40(%rsi), %xmm3 + movdqa -0x50(%rsi), %xmm4 + movdqa -0x60(%rsi), %xmm5 + movdqa -0x70(%rsi), %xmm6 + movdqa -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + movdqa %xmm2, -0x30(%rdi) + movdqa %xmm3, -0x40(%rdi) + movdqa %xmm4, -0x50(%rdi) + movdqa %xmm5, -0x60(%rdi) + movdqa %xmm6, -0x70(%rdi) + movdqa %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_mem_bwd_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_bwd_less_32bytes) + movdqa -0x10(%rsi), %xmm0 + sub $0x20, %rdx + movdqa -0x20(%rsi), %xmm1 + sub $0x20, %rsi + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + sub $0x20, %rdi +L(shl_0_mem_bwd_less_32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1): + lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_fwd) + lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 +L(L1_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_1_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_1_loop_L1): + sub $64, %rdx + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $1, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $1, %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $1, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_1_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_1_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_bwd) + lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 +L(L1_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_1_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_1_bwd_loop_L1): + movaps 
-0x11(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x21(%rsi), %xmm3 + movaps -0x31(%rsi), %xmm4 + movaps -0x41(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $1, %xmm2, %xmm1 + palignr $1, %xmm3, %xmm2 + palignr $1, %xmm4, %xmm3 + palignr $1, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_1_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_1_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2): + lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_fwd) + lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 +L(L2_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_2_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_2_loop_L1): + sub $64, %rdx + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $2, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $2, %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $2, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_2_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_2_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_bwd) + lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 +L(L2_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_2_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_2_bwd_loop_L1): + movaps -0x12(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x22(%rsi), %xmm3 + movaps -0x32(%rsi), %xmm4 + movaps -0x42(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $2, %xmm2, %xmm1 + palignr $2, %xmm3, %xmm2 + palignr $2, %xmm4, %xmm3 + palignr $2, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_2_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_2_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3): + lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_fwd) + lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 +L(L3_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_3_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_3_loop_L1): + sub $64, %rdx + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $3, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $3, %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $3, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_3_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_3_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 + cmp %rcx, %rdx + 
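
Each L(shl_N) loop handles a source that sits N bytes past a 16-byte boundary: it issues only aligned 16-byte loads (at %rsi - N plus multiples of 16) and splices adjacent blocks with palignr $N to rebuild the unaligned byte stream, which is then written with aligned stores; the loop carries the previous last block in %xmm1, so it needs just four new loads per 64 bytes. The _L2 loop variants, chosen when the copy is at least half the data cache, add prefetchnta. A C intrinsics sketch of one such spliced 64-byte step for N = 3, as a hypothetical standalone helper (compile with -mssse3; it assumes src - 3 through src + 76 are readable, as they are in the interior of the copy):

#include <emmintrin.h>   /* SSE2: aligned loads/stores */
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */

/* Copy 64 bytes from src to 16-byte-aligned dst, where src is
   3 bytes past a 16-byte boundary.  */
static void
copy64_misalign3 (unsigned char *dst, const unsigned char *src)
{
  const __m128i *s = (const __m128i *) (src - 3);  /* aligned base */
  __m128i b0 = _mm_load_si128 (s);
  __m128i b1 = _mm_load_si128 (s + 1);
  __m128i b2 = _mm_load_si128 (s + 2);
  __m128i b3 = _mm_load_si128 (s + 3);
  __m128i b4 = _mm_load_si128 (s + 4);
  /* _mm_alignr_epi8 (hi, lo, 3) yields bytes 3..18 of lo##hi,
     i.e. palignr $3, lo, hi in AT&T operand order.  */
  _mm_store_si128 ((__m128i *) dst,     _mm_alignr_epi8 (b1, b0, 3));
  _mm_store_si128 ((__m128i *) dst + 1, _mm_alignr_epi8 (b2, b1, 3));
  _mm_store_si128 ((__m128i *) dst + 2, _mm_alignr_epi8 (b3, b2, 3));
  _mm_store_si128 ((__m128i *) dst + 3, _mm_alignr_epi8 (b4, b3, 3));
}

int
main (void)
{
  static unsigned char src[96] __attribute__ ((aligned (16)));
  static unsigned char dst[64] __attribute__ ((aligned (16)));
  for (int i = 0; i < 96; ++i)
    src[i] = (unsigned char) i;
  copy64_misalign3 (dst, src + 3);
  return (dst[0] == 3 && dst[63] == 66) ? 0 : 1;
}

The sketch spends five aligned loads to produce 64 output bytes; the assembly loop amortizes that to four by reusing the previous iteration's last block as the next iteration's first.
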
movaps -0x03(%rsi), %xmm1 + jb L(L3_bwd) + lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 +L(L3_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_3_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_3_bwd_loop_L1): + movaps -0x13(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x23(%rsi), %xmm3 + movaps -0x33(%rsi), %xmm4 + movaps -0x43(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $3, %xmm2, %xmm1 + palignr $3, %xmm3, %xmm2 + palignr $3, %xmm4, %xmm3 + palignr $3, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_3_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_3_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4): + lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_fwd) + lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 +L(L4_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_4_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_4_loop_L1): + sub $64, %rdx + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $4, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $4, %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $4, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_4_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_4_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_bwd) + lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 +L(L4_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_4_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_4_bwd_loop_L1): + movaps -0x14(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x24(%rsi), %xmm3 + movaps -0x34(%rsi), %xmm4 + movaps -0x44(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $4, %xmm2, %xmm1 + palignr $4, %xmm3, %xmm2 + palignr $4, %xmm4, %xmm3 + palignr $4, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_4_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_4_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5): + lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_fwd) + lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 +L(L5_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_5_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_5_loop_L1): + sub $64, %rdx + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $5, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $5, %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $5, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_5_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_5_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps 
%xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_bwd) + lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 +L(L5_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_5_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_5_bwd_loop_L1): + movaps -0x15(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x25(%rsi), %xmm3 + movaps -0x35(%rsi), %xmm4 + movaps -0x45(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $5, %xmm2, %xmm1 + palignr $5, %xmm3, %xmm2 + palignr $5, %xmm4, %xmm3 + palignr $5, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_5_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_5_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6): + lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_fwd) + lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 +L(L6_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_6_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_6_loop_L1): + sub $64, %rdx + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $6, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $6, %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $6, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_6_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_6_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_bwd) + lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 +L(L6_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_6_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_6_bwd_loop_L1): + movaps -0x16(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x26(%rsi), %xmm3 + movaps -0x36(%rsi), %xmm4 + movaps -0x46(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $6, %xmm2, %xmm1 + palignr $6, %xmm3, %xmm2 + palignr $6, %xmm4, %xmm3 + palignr $6, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_6_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_6_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7): + lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_fwd) + lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 +L(L7_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_7_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_7_loop_L1): + sub $64, %rdx + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $7, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $7, %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $7, %xmm1, %xmm2 + movdqa 
%xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_7_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_7_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_bwd) + lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 +L(L7_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_7_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_7_bwd_loop_L1): + movaps -0x17(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x27(%rsi), %xmm3 + movaps -0x37(%rsi), %xmm4 + movaps -0x47(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $7, %xmm2, %xmm1 + palignr $7, %xmm3, %xmm2 + palignr $7, %xmm4, %xmm3 + palignr $7, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_7_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_7_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8): + lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_fwd) + lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 +L(L8_fwd): + lea -64(%rdx), %rdx + jmp *%r9 +L(shl_8_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_8_loop_L1): + sub $64, %rdx + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $8, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $8, %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $8, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_8_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 + .p2align 4 +L(shl_8_end): + lea 64(%rdx), %rdx + movaps %xmm4, -0x20(%rdi) + add %rdx, %rsi + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_bwd) + lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 +L(L8_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_8_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_8_bwd_loop_L1): + movaps -0x18(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x28(%rsi), %xmm3 + movaps -0x38(%rsi), %xmm4 + movaps -0x48(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $8, %xmm2, %xmm1 + palignr $8, %xmm3, %xmm2 + palignr $8, %xmm4, %xmm3 + palignr $8, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_8_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_8_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9): + lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_fwd) + lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 +L(L9_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_9_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_9_loop_L1): + sub $64, %rdx + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), 
%xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $9, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $9, %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $9, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_9_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_9_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_bwd) + lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 +L(L9_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_9_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_9_bwd_loop_L1): + movaps -0x19(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x29(%rsi), %xmm3 + movaps -0x39(%rsi), %xmm4 + movaps -0x49(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $9, %xmm2, %xmm1 + palignr $9, %xmm3, %xmm2 + palignr $9, %xmm4, %xmm3 + palignr $9, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_9_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_9_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10): + lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_fwd) + lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 +L(L10_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_10_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_10_loop_L1): + sub $64, %rdx + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $10, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $10, %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $10, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_10_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_10_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_bwd) + lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 +L(L10_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_10_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_10_bwd_loop_L1): + movaps -0x1a(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2a(%rsi), %xmm3 + movaps -0x3a(%rsi), %xmm4 + movaps -0x4a(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $10, %xmm2, %xmm1 + palignr $10, %xmm3, %xmm2 + palignr $10, %xmm4, %xmm3 + palignr $10, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_10_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_10_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11): + lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 + cmp %rcx, %rdx + 
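+/* Every L(shl_N) entry dispatches through a computed jump: %r9 is + pointed at the copy loop, and when the length in %rdx is at least + %rcx the offset of the L2 variant is added, so large copies take + the loop that issues prefetchnta. The ud2 after each jmp *%r9 + keeps the decoder from running past the indirect branch. */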
movaps -0x0b(%rsi), %xmm1 + jb L(L11_fwd) + lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 +L(L11_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_11_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_11_loop_L1): + sub $64, %rdx + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $11, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $11, %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $11, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_11_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_11_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_bwd) + lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 +L(L11_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_11_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_11_bwd_loop_L1): + movaps -0x1b(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2b(%rsi), %xmm3 + movaps -0x3b(%rsi), %xmm4 + movaps -0x4b(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $11, %xmm2, %xmm1 + palignr $11, %xmm3, %xmm2 + palignr $11, %xmm4, %xmm3 + palignr $11, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_11_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_11_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12): + lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_fwd) + lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 +L(L12_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_12_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_12_loop_L1): + sub $64, %rdx + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $12, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $12, %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $12, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_12_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_12_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_bwd) + lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 +L(L12_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_12_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_12_bwd_loop_L1): + movaps -0x1c(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2c(%rsi), %xmm3 + movaps -0x3c(%rsi), %xmm4 + movaps -0x4c(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $12, %xmm2, %xmm1 + palignr $12, %xmm3, %xmm2 + palignr $12, %xmm4, %xmm3 + palignr $12, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb 
L(shl_12_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_12_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13): + lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_fwd) + lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 +L(L13_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_13_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_13_loop_L1): + sub $64, %rdx + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $13, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $13, %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $13, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_13_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_13_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_bwd) + lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 +L(L13_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_13_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_13_bwd_loop_L1): + movaps -0x1d(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2d(%rsi), %xmm3 + movaps -0x3d(%rsi), %xmm4 + movaps -0x4d(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $13, %xmm2, %xmm1 + palignr $13, %xmm3, %xmm2 + palignr $13, %xmm4, %xmm3 + palignr $13, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_13_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_13_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14): + lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_fwd) + lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 +L(L14_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_14_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_14_loop_L1): + sub $64, %rdx + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $14, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $14, %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $14, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_14_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_14_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_bwd) + lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 +L(L14_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_14_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_14_bwd_loop_L1): + movaps -0x1e(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2e(%rsi), %xmm3 + movaps -0x3e(%rsi), %xmm4 + movaps 
-0x4e(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $14, %xmm2, %xmm1 + palignr $14, %xmm3, %xmm2 + palignr $14, %xmm4, %xmm3 + palignr $14, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_14_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_14_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15): + lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_fwd) + lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 +L(L15_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_15_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_15_loop_L1): + sub $64, %rdx + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $15, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $15, %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $15, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_15_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_15_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_bwd) + lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 +L(L15_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_15_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_15_bwd_loop_L1): + movaps -0x1f(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2f(%rsi), %xmm3 + movaps -0x3f(%rsi), %xmm4 + movaps -0x4f(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $15, %xmm2, %xmm1 + palignr $15, %xmm3, %xmm2 + palignr $15, %xmm4, %xmm3 + palignr $15, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_15_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_15_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(write_72bytes): + movdqu -72(%rsi), %xmm0 + movdqu -56(%rsi), %xmm1 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -72(%rdi) + movdqu %xmm1, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_64bytes): + movdqu -64(%rsi), %xmm0 + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + movdqu %xmm0, -64(%rdi) + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_56bytes): + movdqu -56(%rsi), %xmm0 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_48bytes): + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 
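+/* The L(write_N) entries are reached through BRANCH_TO_JMPTBL_ENTRY + with %rsi and %rdi pointing one past the end of the copy, so each + entry moves the final N bytes at negative offsets. All loads are + issued before any store, which keeps the entries safe when source + and destination overlap. */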
+ mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_40bytes): + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_32bytes): + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_24bytes): + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_16bytes): + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) +L(write_0bytes): + ret + + .p2align 4 +L(write_73bytes): + movdqu -73(%rsi), %xmm0 + movdqu -57(%rsi), %xmm1 + mov -41(%rsi), %rcx + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %r8 + mov -4(%rsi), %edx + movdqu %xmm0, -73(%rdi) + movdqu %xmm1, -57(%rdi) + mov %rcx, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %r8, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_65bytes): + movdqu -65(%rsi), %xmm0 + movdqu -49(%rsi), %xmm1 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -65(%rdi) + movdqu %xmm1, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_57bytes): + movdqu -57(%rsi), %xmm0 + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -57(%rdi) + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_49bytes): + movdqu -49(%rsi), %xmm0 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_41bytes): + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_33bytes): + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_25bytes): + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_17bytes): + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_9bytes): + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_1bytes): + mov -1(%rsi), %dl + 
mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_74bytes): + movdqu -74(%rsi), %xmm0 + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -74(%rdi) + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_66bytes): + movdqu -66(%rsi), %xmm0 + movdqu -50(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -66(%rdi) + movdqu %xmm1, -50(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_58bytes): + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_50bytes): + movdqu -50(%rsi), %xmm0 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -50(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_42bytes): + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_34bytes): + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_26bytes): + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_18bytes): + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_10bytes): + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_2bytes): + mov -2(%rsi), %dx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(write_75bytes): + movdqu -75(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -75(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_67bytes): + movdqu -67(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -67(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_59bytes): + movdqu -59(%rsi), %xmm0 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), 
%r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_51bytes): + movdqu -51(%rsi), %xmm0 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -51(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_43bytes): + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_35bytes): + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_27bytes): + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_19bytes): + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_11bytes): + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(write_76bytes): + movdqu -76(%rsi), %xmm0 + movdqu -60(%rsi), %xmm1 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -76(%rdi) + movdqu %xmm1, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_68bytes): + movdqu -68(%rsi), %xmm0 + movdqu -52(%rsi), %xmm1 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -68(%rdi) + movdqu %xmm1, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_60bytes): + movdqu -60(%rsi), %xmm0 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_52bytes): + movdqu -52(%rsi), %xmm0 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_44bytes): + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_36bytes): + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov 
-12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_28bytes): + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_20bytes): + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_12bytes): + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_77bytes): + movdqu -77(%rsi), %xmm0 + movdqu -61(%rsi), %xmm1 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -77(%rdi) + movdqu %xmm1, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_69bytes): + movdqu -69(%rsi), %xmm0 + movdqu -53(%rsi), %xmm1 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -69(%rdi) + movdqu %xmm1, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_61bytes): + movdqu -61(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_53bytes): + movdqu -53(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_45bytes): + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_37bytes): + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_29bytes): + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_21bytes): + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_13bytes): + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_78bytes): + movdqu -78(%rsi), %xmm0 + movdqu -62(%rsi), %xmm1 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov 
-14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -78(%rdi) + movdqu %xmm1, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_70bytes): + movdqu -70(%rsi), %xmm0 + movdqu -54(%rsi), %xmm1 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -70(%rdi) + movdqu %xmm1, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_62bytes): + movdqu -62(%rsi), %xmm0 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_54bytes): + movdqu -54(%rsi), %xmm0 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_46bytes): + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_38bytes): + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_30bytes): + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_22bytes): + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_14bytes): + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_79bytes): + movdqu -79(%rsi), %xmm0 + movdqu -63(%rsi), %xmm1 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -79(%rdi) + movdqu %xmm1, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_71bytes): + movdqu -71(%rsi), %xmm0 + movdqu -55(%rsi), %xmm1 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -71(%rdi) + movdqu %xmm1, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_63bytes): + movdqu -63(%rsi), %xmm0 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 
4 +L(write_55bytes): + movdqu -55(%rsi), %xmm0 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_47bytes): + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_39bytes): + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_31bytes): + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_23bytes): + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_15bytes): + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(large_page_fwd): + movdqu (%rsi), %xmm1 + lea 16(%rsi), %rsi + movdqu %xmm0, (%r8) + movntdq %xmm1, (%rdi) + lea 16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + shl $2, %rcx + cmp %rcx, %rdx + jb L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif +L(large_page_loop): + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(large_page_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_fwd_start): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x200(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(ll_cache_copy_fwd_start) + cmp $-0x40, %rdx + lea 
0x80(%rdx), %rdx + jl L(large_page_ll_less_fwd_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_fwd_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#endif + .p2align 4 +L(large_page_bwd): + movdqu -0x10(%rsi), %xmm1 + lea -16(%rsi), %rsi + movdqu %xmm0, (%r8) + movdqa %xmm1, -0x10(%rdi) + lea -16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jb L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif +L(large_page_bwd_loop): + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + movntdq %xmm4, -0x50(%rdi) + movntdq %xmm5, -0x60(%rdi) + movntdq %xmm6, -0x70(%rdi) + movntdq %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(large_page_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_bwd_64bytes): + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_bwd_start): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x200(%rsi) + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(ll_cache_copy_bwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_bwd_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) +#endif + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_less_80bytes): + .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_7bytes), 
L(table_less_80bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) + 
.int JMPTBL (L(write_71bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) + + .p2align 3 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S new file mode 100644 index 0000000000..af2770397c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S @@ -0,0 +1,75 @@ +/* Multiple versions of memcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memcpy before the initialization + happened. 
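+ The choice is made once, at load time, by the IFUNC resolver + below, which tests the feature bits the dynamic linker has already + collected. A minimal C sketch of the mechanism (cpu_has_avx is a + hypothetical feature test, not the logic used here): + + extern __typeof (memcpy) __memcpy_sse2_unaligned, __memcpy_avx_unaligned; + static __typeof (memcpy) *memcpy_resolver (void) + { + return cpu_has_avx () ? __memcpy_avx_unaligned : __memcpy_sse2_unaligned; + } + void *memcpy (void *, const void *, size_t) + __attribute__ ((ifunc ("memcpy_resolver")));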
*/ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__new_memcpy) + .type __new_memcpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memcpy_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memcpy_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memcpy_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memcpy_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memcpy_ssse3(%rip), %RAX_LP +2: ret +END(__new_memcpy) + +# undef memcpy +# include <shlib-compat.h> +versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S new file mode 100644 index 0000000000..8737fb9755 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -0,0 +1,72 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. 
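+ The resolver below mirrors the memcpy one, minus the Prefer_ERMS + shortcut; static binaries simply use the generic ../memcpy_chk.S.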
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memcpy_chk) + .type __memcpy_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memcpy_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memcpy_chk_ssse3(%rip), %RAX_LP +2: ret +END(__memcpy_chk) +# else +# include "../memcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S new file mode 100644 index 0000000000..e195e93f15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..f3ef10577c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S @@ -0,0 +1,420 @@ +/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#if IS_IN (libc) + +# include "asm-syntax.h" + + .section .text.avx512,"ax",@progbits +# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (__mempcpy_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_avx512_no_vzeroupper) + +ENTRY (__mempcpy_avx512_no_vzeroupper) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (__mempcpy_avx512_no_vzeroupper) +# endif + +# ifdef SHARED +ENTRY (__memmove_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_avx512_no_vzeroupper) +# endif + +ENTRY (__memmove_avx512_no_vzeroupper) + mov %rdi, %rax +# ifdef USE_AS_MEMPCPY + add %rdx, %rax +# endif +L(start): + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) + +L(check): + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret + +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret + +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): +# ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r8 +# else + mov __x86_shared_cache_size_half(%rip), %r8 +# endif + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), 
%zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret + +L(1024bytesormore): + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + +/* Loop with unaligned memory access. */ +L(gobble_512bytes_loop): + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret + +L(1024bytesormore_bkw): + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + +/* Backward loop with unaligned memory access. 
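(Editorial note: zmm8 through zmm15 were filled with the first 512 source bytes before entering this loop and are flushed to the first 512 destination bytes only after it; deferring those stores both disposes of the sub-512-byte remainder without a separate path and keeps a closely overlapping backward copy from reading source bytes the loop has already overwritten.)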
*/ +L(gobble_512bytes_loop_bkw): + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret + +L(preloop_large): + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 + +/* Align destination for access with non-temporal stores in the loop. */ + mov %rdi, %r8 + and $-0x80, %rdi + add $0x80, %rdi + sub %rdi, %r8 + sub %r8, %rsi + add %r8, %rdx +L(gobble_256bytes_nt_loop): + prefetcht1 0x200(%rsi) + prefetcht1 0x240(%rsi) + prefetcht1 0x280(%rsi) + prefetcht1 0x2C0(%rsi) + prefetcht1 0x300(%rsi) + prefetcht1 0x340(%rsi) + prefetcht1 0x380(%rsi) + prefetcht1 0x3C0(%rsi) + vmovdqu64 (%rsi), %zmm0 + vmovdqu64 0x40(%rsi), %zmm1 + vmovdqu64 0x80(%rsi), %zmm2 + vmovdqu64 0xC0(%rsi), %zmm3 + vmovntdq %zmm0, (%rdi) + vmovntdq %zmm1, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm3, 0xC0(%rdi) + sub $256, %rdx + add $256, %rsi + add $256, %rdi + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop) + sfence + vmovups %zmm4, (%rax) + vmovups %zmm5, 0x40(%rax) + jmp L(check) + +L(preloop_large_bkw): + vmovups -0x80(%rcx), %zmm4 + vmovups -0x40(%rcx), %zmm5 + +/* Align end of destination for access with non-temporal stores. 
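(Editorial note: the sequence below rounds r9, the exclusive end of the destination, down to a 128-byte boundary and adjusts the source cursor and the length to match; roughly, in C, assuming the same variable roles:

  size_t tail = (uintptr_t) dst_end & 0x7f;  // bytes past the last 128-byte boundary
  dst_end -= tail;  src_end -= tail;  len -= tail;

No bytes are lost: zmm4 and zmm5 captured the last 128 source bytes just above, and r8 is rebuilt to the original end so they can be stored there after the loop.)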
*/ + mov %r9, %r8 + and $-0x80, %r9 + sub %r9, %r8 + sub %r8, %rcx + sub %r8, %rdx + add %r9, %r8 +L(gobble_256bytes_nt_loop_bkw): + prefetcht1 -0x400(%rcx) + prefetcht1 -0x3C0(%rcx) + prefetcht1 -0x380(%rcx) + prefetcht1 -0x340(%rcx) + prefetcht1 -0x300(%rcx) + prefetcht1 -0x2C0(%rcx) + prefetcht1 -0x280(%rcx) + prefetcht1 -0x240(%rcx) + vmovdqu64 -0x100(%rcx), %zmm0 + vmovdqu64 -0xC0(%rcx), %zmm1 + vmovdqu64 -0x80(%rcx), %zmm2 + vmovdqu64 -0x40(%rcx), %zmm3 + vmovntdq %zmm0, -0x100(%r9) + vmovntdq %zmm1, -0xC0(%r9) + vmovntdq %zmm2, -0x80(%r9) + vmovntdq %zmm3, -0x40(%r9) + sub $256, %rdx + sub $256, %rcx + sub $256, %r9 + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop_bkw) + sfence + vmovups %zmm4, -0x80(%r8) + vmovups %zmm5, -0x40(%r8) + jmp L(check) +END (__memmove_avx512_no_vzeroupper) + +# ifdef SHARED +strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper) +strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S new file mode 100644 index 0000000000..aac1515cf6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define SECTION(p) p##.avx512 +# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S new file mode 100644 index 0000000000..f9a4e9aff9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_back +#define MEMCPY_CHK __memmove_chk_ssse3_back +#include "memcpy-ssse3-back.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S new file mode 100644 index 0000000000..295430b1ef --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S new file mode 100644 index 0000000000..dee3ec529c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -0,0 +1,553 @@ +/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
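(Editorial note: this file is a template rather than a standalone implementation. The thin wrappers shown earlier, memmove-avx-unaligned-erms.S and memmove-avx512-unaligned-erms.S, pre-define VEC_SIZE, VEC(i), VMOVU, VMOVA, VMOVNT, SECTION and MEMMOVE_SYMBOL and then include this source, so one body assembles per ISA; for instance the AVX wrapper's definitions expand

  VEC(0)                                 to  ymm0
  VMOVU                                  to  vmovdqu
  MEMMOVE_SYMBOL (__memmove, unaligned)  to  __memmove_avx_unaligned

while the SSE2 instance comes from memmove.S further below, which defines MEMMOVE_SYMBOL(p,s) as p##_sse2_##s before pulling in the base ../memmove.S.)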
*/ + +/* memmove/memcpy/mempcpy is implemented as: + 1. Use overlapping load and store to avoid branch. + 2. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 3. If size is 8 * VEC_SIZE or less, load all sources into registers + and store them together. + 4. If address of destination > address of source, backward copy + 4 * VEC_SIZE at a time with unaligned load and aligned store. + Load the first 4 * VEC and last VEC before the loop and store + them after the loop to support overlapping addresses. + 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned + load and aligned store. Load the last 4 * VEC and first VEC + before the loop and store them after the loop to support + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store + instead of aligned store. */ + +#include <sysdep.h> + +#ifndef MEMCPY_SYMBOL +# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMPCPY_SYMBOL +# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMMOVE_CHK_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set + up REP MOVSB operation, REP MOVSB isn't faster on short data. The + memcpy micro benchmark in glibc shows that 2KB is the approximate + value above which REP MOVSB becomes faster than SSE2 optimization + on processors with Enhanced REP MOVSB. Since larger register size + can move more data with a single load and store, the threshold is + higher with larger register size. */ +#ifndef REP_MOVSB_THRESHOLD +# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) +#endif + +#ifndef PREFETCH +# define PREFETCH(addr) prefetcht0 addr +#endif + +/* Assume 64-byte prefetch size. */ +#ifndef PREFETCH_SIZE +# define PREFETCH_SIZE 64 +#endif + +#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) + +#if PREFETCH_SIZE == 64 +# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base) +# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base) +# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) +# else +# error Unsupported PREFETCHED_LOAD_SIZE! +# endif +#else +# error Unsupported PREFETCH_SIZE! +#endif + +#ifndef SECTION +# error SECTION is not defined!
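(Editorial note: with the scaling above, the REP MOVSB cut-over works out to 2048 * (16/16) = 2 KiB for the SSE2 instance, 2048 * (32/16) = 4 KiB for AVX, and 2048 * (64/16) = 8 KiB for AVX-512, matching the stated rationale that wider registers move more per iteration and so deserve a higher threshold.)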
+#endif + + .section SECTION(.text),"ax",@progbits +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +#endif + +#if VEC_SIZE == 16 || defined SHARED +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +#endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax +L(start): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(last_2x_vec): +#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(nop): +#endif + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +# if VEC_SIZE == 16 +# if defined SHARED +/* Only used to measure performance of REP MOVSB. */ +ENTRY (__mempcpy_erms) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_movsb) +END (__mempcpy_erms) +# endif + +ENTRY (__memmove_erms) + movq %rdi, %rax +L(start_movsb): + movq %rdx, %rcx + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je 2f + leaq (%rsi,%rcx), %rdx + cmpq %rdx, %rdi + jb L(movsb_backward) +1: + rep movsb +2: + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +# if defined SHARED +strong_alias (__memmove_erms, __memcpy_erms) +# endif +# endif + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_erms) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax +L(start_erms): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(movsb_more_2x_vec) +L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(movsb): + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ +# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8) +# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE! +# endif + jb L(more_8x_vec_backward) +1: + movq %rdx, %rcx + rep movsb +L(nop): + ret +#endif + +L(less_vec): + /* Less than 1 VEC. */ +#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! 
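(Editorial sketch of the size ladder that follows, shown for the 8 to 15 byte case: both ends of the source are loaded before either store, which is exactly what makes one code path valid for overlapping memmove. Illustration in C, not the glibc source; the assembly does the same through integer registers:

  uint64_t head, tail;
  memcpy (&head, src, 8);           // may alias dst
  memcpy (&tail, src + n - 8, 8);   // overlaps head when n < 16
  memcpy (dst, &head, 8);
  memcpy (dst + n - 8, &tail, 8);

Every rung of the ladder, 32-63, 16-31, 8-15, 4-7 and 2-3, uses the same two-overlapping-accesses shape at its own width.)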
+#endif +#if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +#endif +#if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +#endif + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) +1: + ret +#if VEC_SIZE > 32 +L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret +#endif +#if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret +#endif +L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret + +#if defined USE_MULTIARCH && IS_IN (libc) +L(movsb_more_2x_vec): + cmpq $REP_MOVSB_THRESHOLD, %rdx + ja L(movsb) +#endif +L(more_2x_vec): + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret +L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret + +L(more_8x_vec): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. 
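(Editorial note: at this point r8 holds (dst & (VEC_SIZE - 1)) - VEC_SIZE, a value in [-VEC_SIZE, -1], so the two subq above advance src and dst to the next VEC_SIZE boundary and the addq below shrinks the length to match; the unaligned head bytes skipped this way were already captured in VEC(4) and are stored at the saved start, r11, after the loop.)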
*/ + addq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_forward) +#endif +L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_backward) +#endif +L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret + +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +L(large_forward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) +L(loop_large_forward): + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. 
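(Editorial note: the sfence above is essential, not decorative: vmovntdq stores are weakly ordered, and the fence makes them globally visible before the ordinary trailing stores below and before the function returns.)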
*/ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(large_backward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) +L(loop_large_backward): + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret +#endif +END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +#ifdef SHARED +# if IS_IN (libc) +# ifdef USE_MULTIARCH +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) +strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) +# endif +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) +# endif +#endif +#if VEC_SIZE == 16 || defined SHARED +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), + MEMCPY_SYMBOL (__memcpy, unaligned)) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S new file mode 100644 index 0000000000..8c534e83e0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S @@ -0,0 +1,101 @@ +/* Multiple versions of memmove + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. 
*/ +#if IS_IN (libc) + .text +ENTRY(__libc_memmove) + .type __libc_memmove, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memmove_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memmove_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_ssse3(%rip), %RAX_LP +2: ret +END(__libc_memmove) +#endif + +#if IS_IN (libc) +# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +libc_hidden_ver (__memmove_sse2_unaligned, memmove) +libc_hidden_ver (__memcpy_sse2_unaligned, memcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy) + +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memmove calls through a PLT. + The speedup we get from using SSE2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def +# endif +strong_alias (__libc_memmove, memmove) +#endif + +#if !defined SHARED || !IS_IN (libc) +weak_alias (__mempcpy, mempcpy) +#endif + +#include "../memmove.S" + +#if defined SHARED && IS_IN (libc) +# include <shlib-compat.h> +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +/* Use __memmove_sse2_unaligned to support overlapping addresses. */ +compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5); +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S new file mode 100644 index 0000000000..7870dd0247 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S @@ -0,0 +1,71 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memmove functions for static binaries. 
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memmove_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_chk_ssse3(%rip), %RAX_LP +2: ret +END(__memmove_chk) +# else +# include "../memmove_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S new file mode 100644 index 0000000000..b8b2b28094 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S @@ -0,0 +1,73 @@ +/* Multiple versions of mempcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before the initialization + happened. 
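(Editorial note: this is why the guard below reads 'defined SHARED && IS_IN (libc)' where memmove.S used IS_IN (libc) alone; in a static binary, early startup may call mempcpy before CPU feature detection has run, so static libc keeps a direct, non-IFUNC definition.)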
*/ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __mempcpy_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __mempcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __mempcpy_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __mempcpy_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_ssse3(%rip), %RAX_LP +2: ret +END(__mempcpy) + +weak_alias (__mempcpy, mempcpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S new file mode 100644 index 0000000000..072b22c49f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -0,0 +1,72 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. 
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__mempcpy_chk) + .type __mempcpy_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_chk_ssse3(%rip), %RAX_LP +2: ret +END(__mempcpy_chk) +# else +# include "../mempcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S new file mode 100644 index 0000000000..7ab3d89849 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -0,0 +1,22 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %ymm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + +# define SECTION(p) p##.avx +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..1f66602398 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S @@ -0,0 +1,194 @@ +/* memset optimized with AVX512 for KNL hardware. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
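(Editorial note on the AVX2 memset wrapper just above: MEMSET_VDUP_TO_VEC0_AND_SET_RETURN splats the fill byte across ymm0 and establishes memset's return value in one macro. An intrinsics sketch of the equivalent effect, names hypothetical:

  __m256i v = _mm256_set1_epi8 ((char) c);  // the vpbroadcastb step
  void *ret = dst;                          // movq rdi into rax

and WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN differs only in using vpbroadcastd, i.e. _mm256_set1_epi32, to splat a whole wchar_t.)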
*/ + +#include <sysdep.h> + +#if IS_IN (libc) + +#include "asm-syntax.h" +#ifndef MEMSET +# define MEMSET __memset_avx512_no_vzeroupper +# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper +#endif + + .section .text.avx512,"ax",@progbits +#if defined PIC +ENTRY (MEMSET_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMSET_CHK) +#endif + +ENTRY (MEMSET) + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi + mov %rdi, %rax + vpshufb %xmm0, %xmm1, %xmm0 + cmp $16, %rdx + jb L(less_16bytes) + cmp $512, %rdx + vbroadcastss %xmm0, %zmm2 + ja L(512bytesormore) + cmp $256, %rdx + jb L(less_256bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm2, -0x20(%rsi) + ret + +L(less_32bytes): + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm0, -0x10(%rsi) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + vmovq %xmm0, (%rdi) + vmovq %xmm0, -0x08(%rsi) + ret + +L(less_8bytes): + vmovd %xmm0, %ecx + cmp $4, %dl + jb L(less_4bytes) + mov %ecx, (%rdi) + mov %ecx, -0x04(%rsi) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov %cx, (%rdi) + mov %cx, -0x02(%rsi) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): + mov __x86_shared_cache_size_half(%rip), %rcx + cmp %rcx, %rdx + ja L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, 0x100(%rdi) + vmovups %zmm2, 0x140(%rdi) + vmovups %zmm2, 0x180(%rdi) + vmovups %zmm2, 0x1C0(%rdi) + vmovups %zmm2, -0x200(%rsi) + vmovups %zmm2, -0x1C0(%rsi) + vmovups %zmm2, -0x180(%rsi) + vmovups %zmm2, -0x140(%rsi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +/* Align on 64 and loop with aligned stores. */ +L(1024bytesormore): + sub $0x100, %rsi + vmovups %zmm2, (%rax) + and $-0x40, %rdi + add $0x40, %rdi + +L(gobble_256bytes_loop): + vmovaps %zmm2, (%rdi) + vmovaps %zmm2, 0x40(%rdi) + vmovaps %zmm2, 0x80(%rdi) + vmovaps %zmm2, 0xC0(%rdi) + add $0x100, %rdi + cmp %rsi, %rdi + jb L(gobble_256bytes_loop) + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + ret + +/* Align on 128 and loop with non-temporal stores. 
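(Editorial note: the and/add pair at the top of L(preloop_large) below advances rdi to the next 128-byte boundary, a skip of 0x80 - (dst & 0x7f), at most 128 bytes, and the two unaligned zmm stores through rax cover exactly those skipped head bytes, so the aligned non-temporal loop starts on a clean boundary with nothing missed.)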
*/ +L(preloop_large): + and $-0x80, %rdi + add $0x80, %rdi + vmovups %zmm2, (%rax) + vmovups %zmm2, 0x40(%rax) + sub $0x200, %rsi + +L(gobble_512bytes_nt_loop): + vmovntdq %zmm2, (%rdi) + vmovntdq %zmm2, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm2, 0xC0(%rdi) + vmovntdq %zmm2, 0x100(%rdi) + vmovntdq %zmm2, 0x140(%rdi) + vmovntdq %zmm2, 0x180(%rdi) + vmovntdq %zmm2, 0x1C0(%rdi) + add $0x200, %rdi + cmp %rsi, %rdi + jb L(gobble_512bytes_nt_loop) + sfence + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + vmovups %zmm2, 0x100(%rsi) + vmovups %zmm2, 0x140(%rsi) + vmovups %zmm2, 0x180(%rsi) + vmovups %zmm2, 0x1C0(%rsi) + ret +END (MEMSET) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S new file mode 100644 index 0000000000..0783979ca5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -0,0 +1,24 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define SECTION(p) p##.avx512 +# define MEMSET_SYMBOL(p,s) p##_avx512_##s +# define WMEMSET_SYMBOL(p,s) p##_avx512_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S new file mode 100644 index 0000000000..2eb9e3744e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -0,0 +1,263 @@ +/* memset/bzero with unaligned store and rep stosb + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memset is implemented as: + 1. Use overlapping store to avoid branch. + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. + 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. 
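(Editorial sketch of the branch-free overlapping store named in step 3, using 16-byte vectors for illustration; assumption: VEC_SIZE <= n <= 2 * VEC_SIZE:

  #include <emmintrin.h>
  static void set_between_16_and_32 (char *d, __m128i v, size_t n)
  {
    _mm_storeu_si128 ((__m128i *) d, v);              // first 16 bytes
    _mm_storeu_si128 ((__m128i *) (d + n - 16), v);   // last 16, may overlap
  }

For n == 16 the two stores land on the same bytes and for n == 32 they abut; every size in between is covered without ever branching on n.)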
*/ + +#include <sysdep.h> + +#ifndef MEMSET_CHK_SYMBOL +# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) +#endif + +#ifndef WMEMSET_CHK_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +#ifndef VZEROUPPER_SHORT_RETURN +# if VEC_SIZE > 16 +# define VZEROUPPER_SHORT_RETURN vzeroupper +# else +# define VZEROUPPER_SHORT_RETURN rep +# endif +#endif + +#ifndef MOVQ +# if VEC_SIZE > 16 +# define MOVQ vmovq +# else +# define MOVQ movq +# endif +#endif + +/* Threshold to use Enhanced REP STOSB. Since there is overhead to set + up REP STOSB operation, REP STOSB isn't faster on short data. The + memset micro benchmark in glibc shows that 2KB is the approximate + value above which REP STOSB becomes faster on processors with + Enhanced REP STOSB. Since the stored value is fixed, larger register + size has minimal impact on threshold. */ +#ifndef REP_STOSB_THRESHOLD +# define REP_STOSB_THRESHOLD 2048 +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if VEC_SIZE == 16 && IS_IN (libc) +ENTRY (__bzero) + movq %rdi, %rax /* Set return value. */ + movq %rsi, %rdx /* Set n. */ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) +END (__bzero) +weak_alias (__bzero, bzero) +#endif + +#if IS_IN (libc) +# if defined SHARED +ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +# endif + +ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shlq $2, %rdx + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) +END (WMEMSET_SYMBOL (__wmemset, unaligned)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) +#endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMSET_SYMBOL (__memset, unaligned)) + +# if VEC_SIZE == 16 +/* Only used to measure performance of REP STOSB. */ +ENTRY (__memset_erms) +# else +/* Provide a symbol to debugger. */ +ENTRY (MEMSET_SYMBOL (__memset, erms)) +# endif +L(stosb): + /* Issue vzeroupper before rep stosb. */ + VZEROUPPER + movq %rdx, %rcx + movzbl %sil, %eax + movq %rdi, %rdx + rep stosb + movq %rdx, %rax + ret +# if VEC_SIZE == 16 +END (__memset_erms) +# else +END (MEMSET_SYMBOL (__memset, erms)) +# endif + +# if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) +# endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
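(Editorial note: this is the idiom sketched in the header comment: the two VMOVU stores below cover bytes [0, VEC_SIZE) and [n - VEC_SIZE, n), which overlap for any n under 2 * VEC_SIZE and coincide exactly when n == VEC_SIZE.)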
*/ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret + +L(stosb_more_2x_vec): + cmpq $REP_STOSB_THRESHOLD, %rdx + ja L(stosb) +#endif +L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_start) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx + VMOVU %VEC(0), (%rdi) + andq $-(VEC_SIZE * 4), %rcx + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) + addq %rdi, %rdx + andq $-(VEC_SIZE * 4), %rdx + cmpq %rdx, %rcx + je L(return) +L(loop): + VMOVA %VEC(0), (%rcx) + VMOVA %VEC(0), VEC_SIZE(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) + addq $(VEC_SIZE * 4), %rcx + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN + ret +L(less_vec): + /* Less than 1 VEC. */ +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +# endif +# if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +# endif +# if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +# endif + MOVQ %xmm0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movb %cl, (%rdi) +1: + VZEROUPPER + ret +# if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ +L(between_32_63): + vmovdqu %ymm0, -32(%rdi,%rdx) + vmovdqu %ymm0, (%rdi) + VZEROUPPER + ret +# endif +# if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu %xmm0, -16(%rdi,%rdx) + vmovdqu %xmm0, (%rdi) + VZEROUPPER + ret +# endif + /* From 8 to 15. No branch when size == 8. */ +L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) + VZEROUPPER + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) + VZEROUPPER + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) + VZEROUPPER + ret +END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S new file mode 100644 index 0000000000..11f27378b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S @@ -0,0 +1,82 @@ +/* Multiple versions of memset + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
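(Editorial note on L(loop_start) above: the eight unconditional VMOVU stores blanket up to 4 * VEC_SIZE bytes at each end of the buffer, rcx becomes dst + 4 * VEC_SIZE rounded down to a 4 * VEC_SIZE boundary and rdx the similarly rounded end, so the aligned VMOVA loop touches only the interior and is skipped outright, je L(return), when the two ends already covered everything.)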
*/ + +#include <sysdep.h> +#include <shlib-compat.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) +ENTRY(memset) + .type memset, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memset_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + lea __memset_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_sse2_unaligned(%rip), %RAX_LP +1: + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + lea __memset_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memset_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_avx512_unaligned(%rip), %RAX_LP +2: ret +END(memset) +#endif + +#if IS_IN (libc) +# define MEMSET_SYMBOL(p,s) p##_sse2_##s +# define WMEMSET_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memset calls through a PLT. + The speedup we get from using SSE2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \ + .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \ + .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned +# endif + +# undef weak_alias +# define weak_alias(original, alias) \ + .weak bzero; bzero = __bzero + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S new file mode 100644 index 0000000000..7e08311cdf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S @@ -0,0 +1,61 @@ +/* Multiple versions of memset_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
*/ +#if IS_IN (libc) +# ifdef SHARED +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memset_chk_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_chk_sse2_unaligned(%rip), %RAX_LP +1: + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + lea __memset_chk_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_chk_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + lea __memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memset_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_chk_avx512_unaligned(%rip), %RAX_LP +2: ret +END(__memset_chk) + +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else +# include "../memset_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c new file mode 100644 index 0000000000..453f183747 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c @@ -0,0 +1,36 @@ +/* Count bits in CPU set. x86-64 multi-arch version. + This file is part of the GNU C Library. + Copyright (C) 2008-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sched.h> +#include "init-arch.h" + +#define __sched_cpucount static generic_cpucount +#include <posix/sched_cpucount.c> +#undef __sched_cpucount + +#define POPCNT(l) \ + ({ __cpu_mask r; \ + asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\ + r; }) +#define __sched_cpucount static popcount_cpucount +#include <posix/sched_cpucount.c> +#undef __sched_cpucount + +libc_ifunc (__sched_cpucount, + HAS_CPU_FEATURE (POPCOUNT) ? 
popcount_cpucount : generic_cpucount); diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S new file mode 100644 index 0000000000..34231f8b46 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000000..d971c2da38 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..ee81ab6ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,9 @@ +/* Multiple versions of stpcpy + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S new file mode 100644 index 0000000000..658520f78f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000000..14ed16f6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..2698ca6a8c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,8 @@ +/* Multiple versions of stpncpy + All versions must be listed in ifunc-impl-list.c. 
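(Editorial note: the stubs above show how one strcpy body is recombined by pre-defines before inclusion:

  strcpy.S alone                   defines  strcpy
  USE_AS_STPCPY                    defines  stpcpy   (returns the end pointer)
  USE_AS_STRNCPY                   defines  strncpy  (length-limited)
  USE_AS_STPCPY + USE_AS_STRNCPY   defines  stpncpy

with STRCPY naming the concrete symbol each instance emits, for example __stpncpy_sse2_unaligned or __stpncpy_ssse3.)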
*/ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S new file mode 100644 index 0000000000..fb2f9ae14a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S @@ -0,0 +1,6 @@ +#define USE_SSSE3 1 +#define USE_AS_STRCASECMP_L +#define NO_NOLOCALE_ALIAS +#define STRCMP __strcasecmp_l_ssse3 +#define __strcasecmp __strcasecmp_ssse3 +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S new file mode 100644 index 0000000000..49f5b9fd95 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S @@ -0,0 +1,8 @@ +/* Multiple versions of strcasecmp and strcasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) +libc_hidden_def (strcasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S new file mode 100644 index 0000000000..d0a8a1518a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -0,0 +1,279 @@ +/* strcat with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_sse2_unaligned +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. 
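+
+   The scan below locates the terminating NUL of the destination
+   string: an unaligned 16-byte load (or a masked aligned probe when
+   the load would cross a cache line) handles the head, then 16-byte
+   pcmpeqb probes and a 64-byte pminub loop take over, leaving the
+   NUL's offset in %rax so that L(StartStrcpyPart) can append at
+   %r9 + %rax (%r9 holds the saved destination pointer).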
*/ + + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) + +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + jmp 
L(StartStrcpyPart) + + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-sse2-unaligned.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..edd683d778 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -0,0 +1,867 @@ +/* strcat with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. 
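+
+   This variant checks the first 16 bytes with plain byte compares,
+   then 16-byte pcmpeqb probes and a 64-byte aligned loop locate the
+   NUL, again leaving its offset in %rax for L(StartStrcpyPart).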
*/ + + xor %eax, %eax + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64): + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) + +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(exit) + +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(exit) + +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(exit) + +L(aligned_64_exit_16): + lea -48(%rax), %rax + +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + jmp 
L(StartStrcpyPart) + + .p2align 4 +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail1): + add $1, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail2): + add $2, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail3): + add $3, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail4): + add $4, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail5): + add $5, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail6): + add $6, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail7): + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail8): + add $8, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail9): + add $9, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail10): + add $10, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail11): + add $11, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail12): + add $12, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail13): + add $13, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail14): + add $14, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail15): + add $15, %eax + + .p2align 4 +L(StartStrcpyPart): + mov %rsi, %rcx + lea (%rdi, %rax), %rdx +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(StrncatExit0) + cmp $8, %r8 + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) + cmpb $0, 8(%rcx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) + cmpb $0, 15(%rcx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + je L(StrncatExit16) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-ssse3.S" + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit1): + xor %ah, %ah + movb %ah, 1(%rdx) +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit2): + xor %ah, %ah + movb %ah, 2(%rdx) +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit3): + xor %ah, %ah + movb %ah, 3(%rdx) +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + 
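+	/* copy the remaining byte at offset 2 */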
movb 2(%rcx), %al + movb %al, 2(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit4): + xor %ah, %ah + movb %ah, 4(%rdx) +L(Exit4): + mov (%rcx), %eax + mov %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit5): + xor %ah, %ah + movb %ah, 5(%rdx) +L(Exit5): + mov (%rcx), %eax + mov %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit6): + xor %ah, %ah + movb %ah, 6(%rdx) +L(Exit6): + mov (%rcx), %eax + mov %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit7): + xor %ah, %ah + movb %ah, 7(%rdx) +L(Exit7): + mov (%rcx), %eax + mov %eax, (%rdx) + mov 3(%rcx), %eax + mov %eax, 3(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8): + xor %ah, %ah + movb %ah, 8(%rdx) +L(Exit8): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit9): + xor %ah, %ah + movb %ah, 9(%rdx) +L(Exit9): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movb 8(%rcx), %al + movb %al, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit10): + xor %ah, %ah + movb %ah, 10(%rdx) +L(Exit10): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movw 8(%rcx), %ax + movw %ax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit11): + xor %ah, %ah + movb %ah, 11(%rdx) +L(Exit11): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit12): + xor %ah, %ah + movb %ah, 12(%rdx) +L(Exit12): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit13): + xor %ah, %ah + movb %ah, 13(%rdx) +L(Exit13): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 5(%rcx), %xmm1 + movlpd %xmm1, 5(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit14): + xor %ah, %ah + movb %ah, 14(%rdx) +L(Exit14): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 6(%rcx), %xmm1 + movlpd %xmm1, 6(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15): + xor %ah, %ah + movb %ah, 15(%rdx) +L(Exit15): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit16): + xor %ah, %ah + movb %ah, 16(%rdx) +L(Exit16): + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz L(Exit9) + cmp $9, %r8 + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %r8 + je 
L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $8, %r8 + ja L(ExitHighCase3) + cmp $1, %r8 + je L(StrncatExit1) + cmp $2, %r8 + je L(StrncatExit2) + cmp $3, %r8 + je L(StrncatExit3) + cmp $4, %r8 + je L(StrncatExit4) + cmp $5, %r8 + je L(StrncatExit5) + cmp $6, %r8 + je L(StrncatExit6) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + xor %ah, %ah + movb %ah, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase3): + cmp $9, %r8 + je L(StrncatExit9) + cmp $10, %r8 + je L(StrncatExit10) + cmp $11, %r8 + je L(StrncatExit11) + cmp $12, %r8 + je L(StrncatExit12) + cmp $13, %r8 + je L(StrncatExit13) + cmp $14, %r8 + je L(StrncatExit14) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + xor %ah, %ah + movb %ah, 16(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit0): + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15Bytes): + cmp $9, %r8 + je L(StrncatExit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $11, %r8 + je L(StrncatExit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%rcx) + jz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + +# endif +END (STRCAT) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S new file mode 100644 index 0000000000..0e0e5dda9c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S @@ -0,0 +1,85 @@ +/* Multiple versions of strcat + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef USE_AS_STRNCAT +# ifndef STRCAT +# define STRCAT strcat +# endif +#endif + +#ifdef USE_AS_STRNCAT +# define STRCAT_SSSE3 __strncat_ssse3 +# define STRCAT_SSE2 __strncat_sse2 +# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned +# define __GI_STRCAT __GI_strncat +# define __GI___STRCAT __GI___strncat +#else +# define STRCAT_SSSE3 __strcat_ssse3 +# define STRCAT_SSE2 __strcat_sse2 +# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned +# define __GI_STRCAT __GI_strcat +# define __GI___STRCAT __GI___strcat +#endif + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCAT_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + leaq STRCAT_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jz 2f + leaq STRCAT_SSSE3(%rip), %rax +2: ret +END(STRCAT) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCAT_SSE2, @function; \ + .align 16; \ + .globl STRCAT_SSE2; \ + .hidden STRCAT_SSE2; \ + STRCAT_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcat calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2 +#endif + +#ifndef USE_AS_STRNCAT +# include "../strcat.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S new file mode 100644 index 0000000000..cbbd0b33d3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S @@ -0,0 +1,280 @@ +/* strchr with SSE2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> +# include "asm-syntax.h" + + atom_text_section +ENTRY (__strchr_sse2_no_bsf) + movd %esi, %xmm1 + movq %rdi, %rcx + punpcklbw %xmm1, %xmm1 + andq $~15, %rdi + pxor %xmm2, %xmm2 + punpcklbw %xmm1, %xmm1 + orl $0xffffffff, %esi + movdqa (%rdi), %xmm0 + pshufd $0, %xmm1, %xmm1 + subq %rdi, %rcx + movdqa %xmm0, %xmm3 + leaq 16(%rdi), %rdi + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + shl %cl, %esi + pmovmskb %xmm0, %eax + pmovmskb %xmm3, %edx + andl %esi, %eax + andl %esi, %edx + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + +L(loop): + movdqa (%rdi), %xmm0 + leaq 16(%rdi), %rdi + movdqa %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm0, %eax + pmovmskb %xmm3, %edx + or %eax, %edx + jz L(loop) + + pmovmskb %xmm3, %edx + test %eax, %eax + jnz L(matches) + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +L(matches): + /* There is a match. First find where NULL is. */ + leaq -16(%rdi), %rdi + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_high_case2) + + mov %al, %cl + and $15, %cl + jnz L(match_case2_4) + + mov %dl, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %al + jnz L(Exit5) + test $0x10, %dl + jnz L(return_null) + test $0x20, %al + jnz L(Exit6) + test $0x20, %dl + jnz L(return_null) + test $0x40, %al + jnz L(Exit7) + test $0x40, %dl + jnz L(return_null) + lea 7(%rdi), %rax + ret + + .p2align 4 +L(match_case2_4): + test $0x01, %al + jnz L(Exit1) + test $0x01, %dl + jnz L(return_null) + test $0x02, %al + jnz L(Exit2) + test $0x02, %dl + jnz L(return_null) + test $0x04, %al + jnz L(Exit3) + test $0x04, %dl + jnz L(return_null) + lea 3(%rdi), %rax + ret + + .p2align 4 +L(match_high_case2): + test %dl, %dl + jnz L(return_null) + + mov %ah, %cl + and $15, %cl + jnz L(match_case2_12) + + mov %dh, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %ah + jnz L(Exit13) + test $0x10, %dh + jnz L(return_null) + test $0x20, %ah + jnz L(Exit14) + test $0x20, %dh + jnz L(return_null) + test $0x40, %ah + jnz L(Exit15) + test $0x40, %dh + jnz L(return_null) + lea 15(%rdi), %rax + ret + + .p2align 4 +L(match_case2_12): + test $0x01, %ah + jnz L(Exit9) + test $0x01, %dh + jnz L(return_null) + test $0x02, %ah + jnz L(Exit10) + test $0x02, %dh + jnz L(return_null) + test $0x04, %ah + jnz L(Exit11) + test $0x04, %dh + jnz L(return_null) + lea 11(%rdi), %rax + ret + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_high_case1) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + lea 7(%rdi), %rax + ret + + .p2align 4 +L(match_high_case1): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + lea 15(%rdi), %rax + ret + + .p2align 4 +L(Exit1): + lea (%rdi), %rax + ret + + .p2align 4 +L(Exit2): + lea 1(%rdi), %rax + ret + + .p2align 4 +L(Exit3): + lea 2(%rdi), %rax + ret + + .p2align 4 +L(Exit4): + lea 3(%rdi), %rax + ret + + .p2align 4 +L(Exit5): + lea 4(%rdi), %rax + ret + + .p2align 4 +L(Exit6): + lea 5(%rdi), %rax + ret + + .p2align 4 +L(Exit7): + lea 6(%rdi), %rax + ret + + .p2align 4 +L(Exit9): + lea 8(%rdi), %rax + ret + + .p2align 4 +L(Exit10): + 
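+	/* The matching byte is at offset 9 of the current 16-byte chunk;
+	   the neighbouring L(ExitN) stubs return the analogous offsets.  */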
lea 9(%rdi), %rax + ret + + .p2align 4 +L(Exit11): + lea 10(%rdi), %rax + ret + + .p2align 4 +L(Exit12): + lea 11(%rdi), %rax + ret + + .p2align 4 +L(Exit13): + lea 12(%rdi), %rax + ret + + .p2align 4 +L(Exit14): + lea 13(%rdi), %rax + ret + + .p2align 4 +L(Exit15): + lea 14(%rdi), %rax + ret + +END (__strchr_sse2_no_bsf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S new file mode 100644 index 0000000000..c9f54ca2e2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strchr + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(strchr) + .type strchr, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strchr_sse2(%rip), %rax +2: HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + leaq __strchr_sse2_no_bsf(%rip), %rax +3: ret +END(strchr) + + + +# undef ENTRY +# define ENTRY(name) \ + .type __strchr_sse2, @function; \ + .align 16; \ + .globl __strchr_sse2; \ + .hidden __strchr_sse2; \ + __strchr_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strchr_sse2, .-__strchr_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strchr calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strchr; __GI_strchr = __strchr_sse2 +#endif + +#include "../strchr.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S new file mode 100644 index 0000000000..b0992dce39 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -0,0 +1,213 @@ +/* strcmp with unaligned loads + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +#include "sysdep.h" + +ENTRY ( __strcmp_sse2_unaligned) + movl %edi, %eax + xorl %edx, %edx + pxor %xmm7, %xmm7 + orl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pminub %xmm1, %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + testq %rax, %rax + je L(next_48_bytes) +L(return): + bsfq %rax, %rdx + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(next_48_bytes): + movdqu 16(%rdi), %xmm6 + movdqu 16(%rsi), %xmm3 + movdqu 32(%rdi), %xmm5 + pcmpeqb %xmm6, %xmm3 + movdqu 32(%rsi), %xmm2 + pminub %xmm6, %xmm3 + pcmpeqb %xmm1, %xmm3 + movdqu 48(%rdi), %xmm4 + pcmpeqb %xmm5, %xmm2 + pmovmskb %xmm3, %edx + movdqu 48(%rsi), %xmm0 + pminub %xmm5, %xmm2 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm2, %eax + salq $16, %rdx + pminub %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %rax + orq %rdx, %rax + pmovmskb %xmm0, %ecx + movq %rcx, %rdx + salq $48, %rdx + orq %rdx, %rax + jne L(return) +L(main_loop_header): + leaq 64(%rdi), %rdx + movl $4096, %ecx + pxor %xmm9, %xmm9 + andq $-64, %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax + addq %rsi, %rdx + movq %rdx, %rsi + andl $4095, %esi + subq %rsi, %rcx + shrq $6, %rcx + movq %rcx, %rsi + jmp L(loop_start) + + .p2align 4 +L(loop): + addq $64, %rax + addq $64, %rdx +L(loop_start): + testq %rsi, %rsi + leaq -1(%rsi), %rsi + je L(loop_cross_page) +L(back_to_loop): + movdqu (%rdx), %xmm0 + movdqu 16(%rdx), %xmm1 + movdqa (%rax), %xmm2 + movdqa 16(%rax), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqu 32(%rdx), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqu 48(%rdx), %xmm6 + pminub %xmm3, %xmm1 + movdqa 32(%rax), %xmm2 + pminub %xmm1, %xmm0 + movdqa 48(%rax), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + pminub %xmm5, %xmm0 + pminub %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %ecx + testl %ecx, %ecx + je L(loop) + pcmpeqb %xmm7, %xmm5 + movdqu (%rdx), %xmm0 + pcmpeqb %xmm7, %xmm1 + movdqa (%rax), %xmm2 + pcmpeqb %xmm2, %xmm0 + pminub %xmm2, %xmm0 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rcx + orq %rdi, %rcx + salq $48, %rsi + orq %rsi, %rcx + bsfq %rcx, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(loop_cross_page): + xor %r10, %r10 + movq %rdx, %r9 + and $63, %r9 + subq %r9, %r10 + + movdqa (%rdx, %r10), %xmm0 + movdqa 16(%rdx, %r10), %xmm1 + movdqu (%rax, %r10), %xmm2 + movdqu 16(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqa 32(%rdx, %r10), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqa 48(%rdx, %r10), %xmm6 + pminub %xmm3, %xmm1 + movdqu 32(%rax, %r10), %xmm2 + movdqu 48(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + + pcmpeqb %xmm7, %xmm0 + pcmpeqb %xmm7, %xmm1 + pcmpeqb %xmm7, %xmm5 + pcmpeqb %xmm7, %xmm6 + + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rdi + orq %rcx, %rdi + salq $48, %rsi + orq %rsi, %rdi + movq %r9, %rcx + movq $63, %rsi + shrq %cl, %rdi + test %rdi, %rdi + je L(back_to_loop) + bsfq %rdi, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 
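+/* Byte-by-byte comparison path, entered when the andl $4095/cmpl $4032
+   check at entry finds that a 64-byte read from either string might
+   cross a page boundary: compare one byte at a time until 64 bytes
+   have matched, then switch to the aligned main loop.  */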
+L(cross_page_loop): + cmpb %cl, %al + jne L(different) + addq $1, %rdx + cmpq $64, %rdx + je L(main_loop_header) +L(cross_page): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + testb %al, %al + jne L(cross_page_loop) + xorl %eax, %eax +L(different): + subl %ecx, %eax + ret +END (__strcmp_sse2_unaligned) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S new file mode 100644 index 0000000000..ed26d4a8fb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -0,0 +1,1792 @@ +/* strcmp with SSE4.2 + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +/* We use 0x1a: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to find out if two 16byte data elements are the same + and the offset of the first different byte. There are 4 cases: + + 1. Both 16byte data elements are valid and identical. + 2. Both 16byte data elements have EOS and identical. + 3. Both 16byte data elements are valid and they differ at offset X. + 4. At least one 16byte data element has EOS at offset X. Two 16byte + data elements must differ at or before offset X. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: + + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2 16 0 1 1 + 3 X 1 0 0 + 4 0 <= X 1 0/1 0/1 + + We exit from the loop for cases 2, 3 and 4 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for + case 2. */ + + /* Put all SSE 4.2 functions together. */ + .section .text.SECTION,"ax",@progbits + .align 16 + .type STRCMP_SSE42, @function + .globl STRCMP_SSE42 + .hidden STRCMP_SSE42 +#ifdef USE_AS_STRCASECMP_L +ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ +#endif +#ifdef USE_AS_STRNCASECMP_L +ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. */ +#endif + + +#ifdef USE_AVX +# define movdqa vmovdqa +# define movdqu vmovdqu +# define pmovmskb vpmovmskb +# define pcmpistri vpcmpistri +# define psubb vpsubb +# define pcmpeqb vpcmpeqb +# define psrldq vpsrldq +# define pslldq vpslldq +# define palignr vpalignr +# define pxor vpxor +# define D(arg) arg, arg +#else +# define D(arg) arg +#endif + +STRCMP_SSE42: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. 
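+ *
+ * For illustration only (not part of this file): the 0x1a comparison
+ * described in the table above maps onto the SSE4.2 intrinsics.  A
+ * minimal sketch, assuming -msse4.2 and a hypothetical helper name:
+ *
+ *   #include <nmmintrin.h>
+ *   // ECX result: offset of the first differing byte, or 16 when
+ *   // the two 16-byte blocks compare equal (cases 1 and 2 above).
+ *   static inline int
+ *   first_diff_16 (__m128i a, __m128i b)
+ *   {
+ *     return _mm_cmpistri (a, b, 0x1a);
+ *   }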
+ */ +#ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +#endif +#ifdef USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +LABEL(belowupper): + .quad 0x4040404040404040 + .quad 0x4040404040404040 +LABEL(topupper): +# ifdef USE_AVX + .quad 0x5a5a5a5a5a5a5a5a + .quad 0x5a5a5a5a5a5a5a5a +# else + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +# endif +LABEL(touppermask): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa LABEL(belowupper)(%rip), %xmm4 +# define UCLOW_reg %xmm4 + movdqa LABEL(topupper)(%rip), %xmm5 +# define UCHIGH_reg %xmm5 + movdqa LABEL(touppermask)(%rip), %xmm6 +# define LCQWORD_reg %xmm6 +#endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef USE_AVX +# define TOLOWER(reg1, reg2) \ + vpcmpgtb UCLOW_reg, reg1, %xmm7; \ + vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ + vpcmpgtb UCLOW_reg, reg2, %xmm9; \ + vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ + vpandn %xmm7, %xmm8, %xmm8; \ + vpandn %xmm9, %xmm10, %xmm10; \ + vpand LCQWORD_reg, %xmm8, %xmm8; \ + vpand LCQWORD_reg, %xmm10, %xmm10; \ + vpor reg1, %xmm8, reg1; \ + vpor reg2, %xmm10, reg2 +# else +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm7; \ + movdqa UCHIGH_reg, %xmm8; \ + movdqa reg2, %xmm9; \ + movdqa UCHIGH_reg, %xmm10; \ + pcmpgtb UCLOW_reg, %xmm7; \ + pcmpgtb reg1, %xmm8; \ + pcmpgtb UCLOW_reg, %xmm9; \ + pcmpgtb reg2, %xmm10; \ + pand %xmm8, %xmm7; \ + pand %xmm10, %xmm9; \ + pand LCQWORD_reg, %xmm7; \ + pand LCQWORD_reg, %xmm9; \ + por %xmm7, reg1; \ + por %xmm9, reg2 +# endif + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? 
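+					   (%xmm0 was zeroed above, so each
+					   NUL byte of %xmm1 yields 0xff)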
*/ + pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ + psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz)/* finish comparison */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte + * alignment. Use relative offset difference between the two to + * determine which case below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ +#else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ +#endif + psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. 
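+	 * Each iteration issues two 16-byte pcmpistri probes; in the
+	 * strncmp/strncasecmp variants the remaining-length counter in
+	 * %r11 is reduced by 16 after each probe.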
+ */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + .p2align 4 +LABEL(ashr_0_use): + movdqa (%rdi,%rdx), %xmm0 +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + movdqa (%rdi,%rdx), %xmm0 +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + jmp LABEL(ashr_0_use) + + + .p2align 4 +LABEL(ashr_0_exit_use): + jnc LABEL(strcmp_exitz) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + lea -16(%rdx, %rcx), %rcx + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rax,4), %eax + movl (%rcx,%rdx,4), %edx +#endif + sub %edx, %eax + ret + + + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pslldq $15, D(%xmm2) /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ + psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
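+	 *
+	 * Worked example: if %rdi+1 sits at page offset 0xff8, %r10
+	 * starts out as 0xff8 - 0x1000 = -8; after one "add $16" it is
+	 * positive (8), i.e. the next 16-byte window would run past the
+	 * end of the page, so the nibble path runs first.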
+ */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_1_use): + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + +LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_1_use) + + .p2align 4 +LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $1, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $14, %ecx + ja LABEL(nibble_ashr_1_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pslldq $14, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_2_use): + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + +LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_2_use) + + .p2align 4 +LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $2, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $13, %ecx + ja LABEL(nibble_ashr_2_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pslldq $13, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + +LABEL(loop_ashr_3_use): + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + +LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_3_use) + + .p2align 4 +LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $3, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $12, %ecx + ja LABEL(nibble_ashr_3_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pslldq $12, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_4_use): + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + +LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_4_use) + + .p2align 4 +LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $4, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $11, %ecx + ja LABEL(nibble_ashr_4_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pslldq $11, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_5_use): + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + +LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $5, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $5, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_5_use) + + .p2align 4 +LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $5, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $10, %ecx + ja LABEL(nibble_ashr_5_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pslldq $10, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_6_use): + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + +LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_6_use) + + .p2align 4 +LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $6, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $9, %ecx + ja LABEL(nibble_ashr_6_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pslldq $9, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
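+ *
+ * The loop reconstructs the unaligned source with palignr; as a
+ * sketch, with prev and cur the previous and current aligned
+ * 16-byte blocks of %rdi,
+ *   xmm0 = low 16 bytes of ((cur:prev) >> (7 * 8)),
+ * i.e. the 16 consecutive bytes starting 7 bytes into prev, read
+ * without loading across an unchecked page boundary.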
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_7_use): + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + +LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_7_use) + + .p2align 4 +LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $7, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $8, %ecx + ja LABEL(nibble_ashr_7_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pslldq $8, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
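+ *
+ * In the strncmp/strncasecmp variants %r11 holds the number of bytes
+ * still allowed; each "sub $16, %r11; jbe strcmp_exitz" returns a
+ * zero result once the length limit is reached without finding a
+ * difference.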
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_8_use): + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + +LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_8_use) + + .p2align 4 +LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $8, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $7, %ecx + ja LABEL(nibble_ashr_8_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pslldq $7, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
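+ *
+ * The strncmp variants also leave the nibble path early when the
+ * index returned by pcmpistri in %rcx already reaches the remaining
+ * length limit in %r11 (the "cmp %r11, %rcx" guard below).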
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_9_use): + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + +LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + + palignr $9, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $9, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_9_use) + + .p2align 4 +LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $9, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $6, %ecx + ja LABEL(nibble_ashr_9_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pslldq $6, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
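+ *
+ * The pcmpistri $0x1a in the loop is an "equal each" byte compare
+ * with negative polarity; as a sketch of its effect:
+ *   ecx = first i in 0..15 with a[i] != b[i] (16 if none),
+ *   CF  = such an i exists, ZF = a null byte was seen,
+ * so jbe (CF or ZF set) leaves the loop on a difference or at the
+ * end of the string.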
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_10_use): + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + +LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_10_use) + + .p2align 4 +LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $10, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $5, %ecx + ja LABEL(nibble_ashr_10_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pslldq $5, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
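+ *
+ * In the case-insensitive variants the TOLOWER macro maps both
+ * 16-byte fragments to lowercase before the compare; roughly, in C:
+ *   for (i = 0; i < 16; i++)
+ *     { a[i] = tolower (a[i]); b[i] = tolower (b[i]); }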
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_11_use): + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + +LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_11_use) + + .p2align 4 +LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $11, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $4, %ecx + ja LABEL(nibble_ashr_11_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pslldq $4, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_12_use): + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + +LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_12_use) + + .p2align 4 +LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $12, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $3, %ecx + ja LABEL(nibble_ashr_12_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pslldq $3, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
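+ *
+ * In the nibble path pcmpistri $0x3a compares the shifted remainder
+ * with itself under masked-negative polarity, so %ecx ends up as the
+ * number of bytes left before the terminating null (a sketch of the
+ * effect, not the exact flag semantics); the loop restarts only if
+ * all 16 - 13 = 3 remaining bytes are non-null.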
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_13_use): + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + +LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_13_use) + + .p2align 4 +LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $13, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $2, %ecx + ja LABEL(nibble_ashr_13_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pslldq $2, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_14_use): + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + +LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_14_use) + + .p2align 4 +LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $14, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $1, %ecx + ja LABEL(nibble_ashr_14_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pslldq $1, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
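+ *
+ * The bound checked in the nibble path shrinks with the shift:
+ * psrldq $15 below leaves 16 - 15 = 1 valid byte, hence cmp $0
+ * there, versus cmp $11 in the ashr_4 case, where 16 - 4 = 12 bytes
+ * remain.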
+ */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_15_use): + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + +LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_15_use) + + .p2align 4 +LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $15, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $0, %ecx + ja LABEL(nibble_ashr_15_restart_use) + +LABEL(nibble_ashr_exit_use): +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + .p2align 4 +LABEL(exit_use): + jnc LABEL(strcmp_exitz) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + add %rcx, %rdx + lea -16(%rdi, %r9), %rdi + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + test %r8d, %r8d + jz LABEL(ret_use) + xchg %eax, %edx +LABEL(ret_use): +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rdx,4), %edx + movl (%rcx,%rax,4), %eax +#endif + + sub %edx, %eax + ret + +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 + // XXX Same as code above +LABEL(Byte0): + movzx (%rsi), %ecx + movzx (%rdi), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + cfi_endproc + .size STRCMP_SSE42, .-STRCMP_SSE42 + +#undef UCLOW_reg +#undef UCHIGH_reg +#undef 
LCQWORD_reg +#undef TOLOWER + + /* Put all SSE 4.2 functions together. */ + .section .rodata.SECTION,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) + +#undef LABEL +#undef GLABEL +#undef SECTION +#undef movdqa +#undef movdqu +#undef pmovmskb +#undef pcmpistri +#undef psubb +#undef pcmpeqb +#undef psrldq +#undef pslldq +#undef palignr +#undef pxor +#undef D diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..1b7fa33c91 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define USE_SSSE3 1 +# define STRCMP __strcmp_ssse3 +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S new file mode 100644 index 0000000000..54f8f7dd44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S @@ -0,0 +1,209 @@ +/* Multiple versions of strcmp + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. 
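+   In C terms (a sketch; rem is %r11, off is %rcx):
+     new = rem + off - 16;   -- may wrap around, since rem is unsigned
+     if (new > rem || new == 0)
+       return 0;             -- count exhausted before any difference
+     rem = new;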
*/ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# define STRCMP_SSE42 __strncmp_sse42 +# define STRCMP_SSSE3 __strncmp_ssse3 +# define STRCMP_SSE2 __strncmp_sse2 +# define __GI_STRCMP __GI_strncmp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER + +# define STRCMP_AVX __strcasecmp_l_avx +# define STRCMP_SSE42 __strcasecmp_l_sse42 +# define STRCMP_SSSE3 __strcasecmp_l_ssse3 +# define STRCMP_SSE2 __strcasecmp_l_sse2 +# define __GI_STRCMP __GI___strcasecmp_l +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" + +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# define STRCMP_AVX __strncasecmp_l_avx +# define STRCMP_SSE42 __strncasecmp_l_sse42 +# define STRCMP_SSSE3 __strncasecmp_l_ssse3 +# define STRCMP_SSE2 __strncasecmp_l_sse2 +# define __GI_STRCMP __GI___strncasecmp_l +#else +# define USE_AS_STRCMP +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# define STRCMP_SSE42 __strcmp_sse42 +# define STRCMP_SSSE3 __strcmp_ssse3 +# define STRCMP_SSE2 __strcmp_sse2 +# define __GI_STRCMP __GI_strcmp +# endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc) + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +#ifdef USE_AS_STRCMP + leaq __strcmp_sse2_unaligned(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 3f +#else + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq STRCMP_SSE42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +#endif +2: leaq STRCMP_SSSE3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq STRCMP_SSE2(%rip), %rax +3: ret +END(STRCMP) + +# ifdef USE_AS_STRCASECMP_L +ENTRY(__strcasecmp) + .type __strcasecmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strcasecmp_avx(%rip), %rax + HAS_ARCH_FEATURE (AVX_Usable) + jnz 3f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq __strcasecmp_sse42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +2: leaq __strcasecmp_ssse3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq __strcasecmp_sse2(%rip), %rax +3: ret +END(__strcasecmp) +weak_alias (__strcasecmp, strcasecmp) +# endif +# ifdef USE_AS_STRNCASECMP_L +ENTRY(__strncasecmp) + .type __strncasecmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strncasecmp_avx(%rip), %rax + HAS_ARCH_FEATURE (AVX_Usable) + jnz 3f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq __strncasecmp_sse42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +2: leaq __strncasecmp_ssse3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq __strncasecmp_sse2(%rip), %rax +3: ret +END(__strncasecmp) +weak_alias (__strncasecmp, strncasecmp) +# endif + +# undef LABEL +# define LABEL(l) .L##l##_sse42 +# define GLABEL(l) l##_sse42 +# define SECTION sse4.2 +# include "strcmp-sse42.S" + + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define LABEL(l) .L##l##_avx +# define GLABEL(l) l##_avx 
+# define USE_AVX 1 +# undef STRCMP_SSE42 +# define STRCMP_SSE42 STRCMP_AVX +# define SECTION avx +# include "strcmp-sse42.S" +# endif + + +# undef ENTRY +# define ENTRY(name) \ + .type STRCMP_SSE2, @function; \ + .align 16; \ + .globl STRCMP_SSE2; \ + .hidden STRCMP_SSE2; \ + STRCMP_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2 + +# ifdef USE_AS_STRCASECMP_L +# define ENTRY2(name) \ + .type __strcasecmp_sse2, @function; \ + .align 16; \ + .globl __strcasecmp_sse2; \ + .hidden __strcasecmp_sse2; \ + __strcasecmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# define END2(name) \ + cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2 +# endif + +# ifdef USE_AS_STRNCASECMP_L +# define ENTRY2(name) \ + .type __strncasecmp_sse2, @function; \ + .align 16; \ + .globl __strncasecmp_sse2; \ + .hidden __strncasecmp_sse2; \ + __strncasecmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# define END2(name) \ + cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2 +# endif + +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcmp calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 +#endif + +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S new file mode 100644 index 0000000000..6a5ab7ab26 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -0,0 +1,1889 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_sse2_unaligned +# endif + +# endif + +# define JMPTBL(I, B) I - B +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + +# ifndef USE_AS_STRCAT + +.text +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +# endif + + and $63, %rcx + cmp $32, %rcx + jbe L(SourceStringAlignmentLess32) + + and $-16, %rsi + and $15, %rcx + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 + pmovmskb %xmm1, %rdx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $16, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $17, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY + add $16, %r10 + cmp %r10, %r8 + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(Unalign16Both): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $48, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm4 + movdqu %xmm3, (%rdi, %rcx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm1 + movdqu %xmm4, (%rdi, %rcx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm1) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb 
%xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movdqu %xmm3, (%rdi, %rcx) + mov %rsi, %rdx + lea 16(%rsi, %rcx), %rsi + and $-0x40, %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea 128(%r8, %rdx), %r8 +# endif +L(Unaligned64Loop): + movaps (%rsi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rsi), %xmm5 + movaps 32(%rsi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rsi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %rdi + add $64, %rsi + movdqu %xmm4, -64(%rdi) + movaps (%rsi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%rdi) + movaps 16(%rsi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%rsi), %xmm3 + movdqu %xmm6, -32(%rdi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%rdi) + movaps 48(%rsi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %rcx, %rcx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 48(%rdi, %rdx), %rax +# endif + movdqu %xmm7, 48(%rdi) + add $15, %r8 + sub %rdx, %r8 + lea 49(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $48, %rsi + add $48, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLess32): + pxor %xmm0, %xmm0 + movdqu (%rsi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb %xmm2, %xmm0 + movdqu %xmm1, (%rdi) + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $32, %r8 +# else + cmp $33, %r8 +# endif + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + + and $-16, %rsi + and $15, %rcx + jmp L(Unalign16Both) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + .p2align 4 +L(CopyFrom1To16BytesTail): + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi +# if defined USE_AS_STRNCPY && 
!defined USE_AS_STRCAT + sub $16, %r8 +# endif +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + movdqu %xmm4, (%rdi) + add $63, %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 16(%rdi, %rdx), %rax +# endif + movdqu %xmm5, 16(%rdi) + add $47, %r8 + sub %rdx, %r8 + lea 17(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $16, %rsi + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %rdx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 32(%rdi, %rdx), %rax +# endif + movdqu %xmm6, 32(%rdi) + add $31, %r8 + sub %rdx, %r8 + lea 33(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $32, %rsi + add $32, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) +# endif + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz 
L(CopyFrom1To16BytesTailCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +# endif + +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ + + .p2align 4 +L(Exit1): + mov %dh, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %cx + mov %cx, (%rdi) + mov %dh, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit5): + mov (%rsi), %ecx + mov %dh, 4(%rdi) + mov %ecx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $5, %r8 + lea 5(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $6, %r8 + lea 6(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $7, %r8 + lea 7(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $8, %r8 + lea 8(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rsi), %rcx + mov %dh, 8(%rdi) + mov %rcx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $9, %r8 + lea 9(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $10, %r8 + lea 10(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $11, %r8 + lea 11(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY 
+ lea 11(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $12, %r8 + lea 12(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $13, %r8 + lea 13(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $14, %r8 + lea 14(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $15, %r8 + lea 15(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 + lea 16(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + mov %dh, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $17, %r8 + lea 17(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $18, %r8 + lea 18(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $19, %r8 + lea 19(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $20, %r8 + lea 20(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dh, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $21, %r8 + lea 21(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $22, %r8 + lea 22(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $23, %r8 + lea 23(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu 
%xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $24, %r8 + lea 24(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + mov %dh, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $25, %r8 + lea 25(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $26, %r8 + lea 26(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $27, %r8 + lea 27(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $28, %r8 + lea 28(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $29, %r8 + lea 29(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $30, %r8 + lea 30(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $31, %r8 + lea 31(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $32, %r8 + lea 32(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit0): +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, (%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit1): + mov (%rsi), %dl + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) +# ifdef 
USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 3(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 4(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit5): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 5(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 6(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 7(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 8(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit9): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 9(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 10(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 11(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 12(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 13(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 14(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 15(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 16(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit17): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 17(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 
18(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 19(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 20(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 21(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 22(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 23(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 24(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 25(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 26(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 27(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 28(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 29(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 30(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 31(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit32): + movdqu (%rsi), %xmm0 + movdqu 
16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 32(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 32(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 33(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3): + mov %edx, -1(%rdi) + ret + + .p2align 4 +L(Fill4): + mov %edx, (%rdi) + ret + + .p2align 4 +L(Fill5): + mov %edx, (%rdi) + mov %dl, 4(%rdi) + ret + + .p2align 4 +L(Fill6): + mov %edx, (%rdi) + mov %dx, 4(%rdi) + ret + + .p2align 4 +L(Fill7): + mov %rdx, -1(%rdi) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rdi) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rdi) + mov %dl, 8(%rdi) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rdi) + mov %dx, 8(%rdi) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rdi) + mov %edx, 7(%rdi) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rdi) + mov %edx, 8(%rdi) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rdi) + mov %rdx, 5(%rdi) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rdi) + mov %rdx, 6(%rdi) + ret + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%rdi) + ret + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%rdi) + ret + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%rdi, %rcx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add $15, %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%rdi) + add $16, %rdi + + mov %rdi, %rsi + and $0xf, %rsi + sub %rsi, %rdi + add %rsi, %r8 + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + add $64, %rdi + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + add $32, %rdi + sub $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillExit): + add $16, %r8 + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%r8), %rcx + and $-16, %rcx + add $48, %r8 + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%rdi) +# ifdef USE_AS_STPCPY + lea 64(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 64(%rdi) +# endif + ret + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %rcx, %rcx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz 
L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm4, (%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm5) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm5, 16(%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm6) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm6, 32(%rdi) + lea 16(%rdi, %rcx), %rdi + lea 16(%rsi, %rcx), %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), 
L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) +# ifndef USE_AS_STRCAT + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# endif +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000000..47aaeae671 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -0,0 +1,3551 @@ +/* strcpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + + .section .text.ssse3,"ax",@progbits +ENTRY (STRCPY) + + mov %rsi, %rcx +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 +# endif + mov %rdi, %rdx +# ifdef USE_AS_STRNCPY + test %r8, %r8 + jz L(Exit0) + cmp $8, %r8 + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%rcx) + jz L(Exit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + je L(Exit16) +# endif + cmpb $0, 15(%rcx) + jz L(Exit16) +# endif + +# ifdef USE_AS_STRNCPY + mov %rcx, %rsi + sub $16, %r8 + and $0xf, %rsi + +/* add 16 bytes rcx_offset to r8 */ + + add %rsi, %r8 +# endif + lea 16(%rcx), %rsi + and $-16, %rsi + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) + pcmpeqb (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + +/* convert byte mask in xmm0 to bit mask */ + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx + and $-16, %rdx + sub %rdx, %rax + +# ifdef USE_AS_STRNCPY + add %rax, %rsi + lea -1(%rsi), %rsi + and $1<<31, %esi + test %rsi, %rsi + jnz L(ContinueCopy) + lea 16(%r8), %r8 + +L(ContinueCopy): +# endif + sub %rax, %rcx + mov %rcx, %rax + and $0xf, %rax + mov $0, %rsi + +/* case: rcx_offset == rdx_offset */ + + jz L(Align16Both) + + cmp $8, %rax + jae L(ShlHigh8) + cmp $1, %rax + je L(Shl1) + cmp $2, %rax + je L(Shl2) + cmp $3, %rax + je L(Shl3) + cmp $4, %rax + je L(Shl4) + cmp $5, %rax + je L(Shl5) + cmp $6, %rax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %rax + je L(Shl9) + cmp $10, %rax + je L(Shl10) + cmp $11, %rax + je L(Shl11) + cmp $12, %rax + je L(Shl12) + cmp $13, %rax + je L(Shl13) + cmp $14, %rax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqb 
%xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + lea 112(%r8, %rax), %r8 +# endif + mov $-0x40, %rsi + + .p2align 4 +L(Aligned64Loop): + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%r8), %r8 +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%rcx), %xmm1 + movaps 15(%rcx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 31(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -15(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -1(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl1LoopStart): + movaps 15(%rcx), %xmm2 + movaps 31(%rcx), %xmm3 + movaps %xmm3, %xmm6 + 
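/* Scan 64 bytes per iteration: pminub folds the four 16-byte
   blocks into one minimum vector, so the single pcmpeqb against
   zero below detects a null byte anywhere in the 64 bytes.  */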
movaps 47(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 63(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %rax, %rax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movdqu -1(%rcx), %xmm1 + mov $15, %rsi + movdqu %xmm1, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%rcx), %xmm1 + movaps 14(%rcx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 30(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -14(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -2(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl2LoopStart): + movaps 14(%rcx), %xmm2 + movaps 30(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %rax, %rax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movdqu -2(%rcx), %xmm1 + mov $14, %rsi + movdqu %xmm1, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%rcx), %xmm1 + movaps 13(%rcx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe 
L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 29(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -13(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -3(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl3LoopStart): + movaps 13(%rcx), %xmm2 + movaps 29(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %rax, %rax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movdqu -3(%rcx), %xmm1 + mov $13, %rsi + movdqu %xmm1, -3(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -4(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl4LoopStart): + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + 
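/* A copy of the last block is kept in %xmm7 for the next
   iteration's palignr; the palignr shifts are issued before the
   branch on the null test, which falls back to the
   16-bytes-at-a-time path at L(Shl4Start) when a null was seen.  */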
movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movdqu -4(%rcx), %xmm1 + mov $12, %rsi + movdqu %xmm1, -4(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%rcx), %xmm1 + movaps 11(%rcx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 27(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -11(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -5(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl5LoopStart): + movaps 11(%rcx), %xmm2 + movaps 27(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %rax, %rax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movdqu -5(%rcx), %xmm1 + mov $11, %rsi + movdqu %xmm1, -5(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%rcx), %xmm1 + movaps 10(%rcx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + 
pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 26(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -10(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -6(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl6LoopStart): + movaps 10(%rcx), %xmm2 + movaps 26(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %rax, %rax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + mov (%rcx), %r9 + mov 6(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 6(%rdx) + mov $10, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%rcx), %xmm1 + movaps 9(%rcx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 25(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -9(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -7(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl7LoopStart): + movaps 9(%rcx), %xmm2 + movaps 25(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %rax, %rax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave7) +# 
endif + palignr $7, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + mov (%rcx), %r9 + mov 5(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 5(%rdx) + mov $9, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -8(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl8LoopStart): + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%rcx), %xmm1 + movaps 7(%rcx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz 
L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 23(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -7(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -9(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl9LoopStart): + movaps 7(%rcx), %xmm2 + movaps 23(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %rax, %rax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + mov -1(%rcx), %r9 + mov $7, %rsi + mov %r9, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%rcx), %xmm1 + movaps 6(%rcx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 22(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -6(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -10(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl10LoopStart): + movaps 6(%rcx), %xmm2 + movaps 22(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + test %rax, %rax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps 
%xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + mov -2(%rcx), %r9 + mov $6, %rsi + mov %r9, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%rcx), %xmm1 + movaps 5(%rcx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 21(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -5(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -11(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl11LoopStart): + movaps 5(%rcx), %xmm2 + movaps 21(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %rax, %rax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + mov -3(%rcx), %r9 + mov $5, %rsi + mov %r9, -3(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx 
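+/* Another aligned 16-byte block has been loaded and tested; for
+   strncpy, first verify that 16 bytes of the length limit remain
+   before committing the next store.  */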
+# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -12(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%rcx), %xmm1 + movaps 3(%rcx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 19(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -3(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -13(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl13LoopStart): + movaps 3(%rcx), %xmm2 + movaps 19(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %rax, %rax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + mov -1(%rcx), %r9d + mov $3, %rsi + mov %r9d, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + 
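/* The ShlN blocks handle a source that still has offset N within
   its 16-byte block once the destination has been aligned: pairs
   of aligned loads are merged with palignr $N so that every store
   is an aligned movaps.  Shl14 below is the N == 14 case.  */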
.p2align 4 +L(Shl14): + movaps -14(%rcx), %xmm1 + movaps 2(%rcx), %xmm2 +L(Shl14Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 18(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -2(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -14(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl14LoopStart): + movaps 2(%rcx), %xmm2 + movaps 18(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + test %rax, %rax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + mov -2(%rcx), %r9d + mov $2, %rsi + mov %r9d, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%rcx), %xmm1 + movaps 1(%rcx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps 
%xmm2, (%rdx) + lea 17(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -1(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -15(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl15LoopStart): + movaps 1(%rcx), %xmm2 + movaps 17(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 33(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %rax, %rax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + mov -3(%rcx), %r9d + mov $1, %rsi + mov %r9d, -3(%rdx) +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %r8 +# endif + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %r8 + lea 8(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 15(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $16, %r8 + lea 16(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + cmp $1, %r8 + je L(Exit1) + test $0x01, %al + jnz L(Exit1) + cmp $2, %r8 + je L(Exit2) + test $0x02, %al + jnz L(Exit2) + cmp $3, %r8 + je L(Exit3) + test $0x04, %al + jnz L(Exit3) + cmp $4, %r8 + je L(Exit4) + test $0x08, %al + jnz L(Exit4) + cmp $5, %r8 + je L(Exit5) + test $0x10, %al + jnz L(Exit5) + cmp $6, %r8 + je L(Exit6) + test $0x20, %al + jnz L(Exit6) + cmp $7, %r8 + je L(Exit7) + test $0x40, %al + jnz L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $9, %r8 + je L(Exit9) + test $0x01, %ah + jnz L(Exit9) + cmp $10, %r8 + je L(Exit10) + test $0x02, %ah + jnz L(Exit10) + cmp $11, %r8 + je L(Exit11) + test $0x04, %ah + jnz L(Exit11) + cmp $12, %r8 + je L(Exit12) + test $0x8, %ah + jnz L(Exit12) + cmp $13, %r8 + je L(Exit13) + test $0x10, %ah + jnz L(Exit13) + cmp $14, %r8 + je L(Exit14) + test $0x20, %ah + jnz L(Exit14) + cmp 
$15, %r8 + je L(Exit15) + test $0x40, %ah + jnz L(Exit15) + jmp L(Exit16) + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $16, %r8 + je L(Exit16) + cmp $8, %r8 + je L(Exit8) + jg L(More8Case3) + cmp $4, %r8 + je L(Exit4) + jg L(More4Case3) + cmp $2, %r8 + jl L(Exit1) + je L(Exit2) + jg L(Exit3) +L(More8Case3): /* but less than 16 */ + cmp $12, %r8 + je L(Exit12) + jl L(Less12Case3) + cmp $14, %r8 + jl L(Exit13) + je L(Exit14) + jg L(Exit15) +L(More4Case3): /* but less than 8 */ + cmp $6, %r8 + jl L(Exit5) + je L(Exit6) + jg L(Exit7) +L(Less12Case3): /* but more than 8 */ + cmp $10, %r8 + jl L(Exit9) + je L(Exit10) + jg L(Exit11) +# endif + + .p2align 4 +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) +# ifdef USE_AS_STPCPY + lea (%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $1, %r8 + lea 1(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) +# ifdef USE_AS_STPCPY + lea 1(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $2, %r8 + lea 2(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) +# ifdef USE_AS_STPCPY + lea 2(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $3, %r8 + lea 3(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) +# ifdef USE_AS_STPCPY + lea 3(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $4, %r8 + lea 4(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit5): + movl (%rcx), %eax + movl %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 4(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $5, %r8 + lea 5(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit6): + movl (%rcx), %eax + movl %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 5(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $6, %r8 + lea 6(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit7): + movl (%rcx), %eax + movl %eax, (%rdx) + movl 3(%rcx), %eax + movl %eax, 3(%rdx) +# ifdef USE_AS_STPCPY + lea 6(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $7, %r8 + lea 7(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %eax + mov %eax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 8(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $9, %r8 + lea 9(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rcx), %rax 
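+/* Ten bytes are copied as two overlapping stores: the 8-byte word
+   just loaded covers bytes 0-7, and the 4-byte store below covers
+   bytes 6-9, so no byte-by-byte loop is needed.  */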
+ mov %rax, (%rdx) + mov 6(%rcx), %eax + mov %eax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 9(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $10, %r8 + lea 10(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 10(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $11, %r8 + lea 11(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 11(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $12, %r8 + lea 12(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %rax + mov %rax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 12(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $13, %r8 + lea 13(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %rax + mov %rax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 13(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $14, %r8 + lea 14(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %r8 + lea 15(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + movb %dl, (%rcx) + ret + + .p2align 4 +L(Fill2): + movw %dx, (%rcx) + ret + + .p2align 4 +L(Fill3): + movw %dx, (%rcx) + movb %dl, 2(%rcx) + ret + + .p2align 4 +L(Fill4): + movl %edx, (%rcx) + ret + + .p2align 4 +L(Fill5): + movl %edx, (%rcx) + movb %dl, 4(%rcx) + ret + + .p2align 4 +L(Fill6): + movl %edx, (%rcx) + movw %dx, 4(%rcx) + ret + + .p2align 4 +L(Fill7): + movl %edx, (%rcx) + movl %edx, 3(%rcx) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rcx) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rcx) + movb %dl, 8(%rcx) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rcx) + movw %dx, 8(%rcx) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rcx) + movl %edx, 7(%rcx) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rcx) + movl %edx, 8(%rcx) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rcx) + mov %rdx, 5(%rcx) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rcx) + mov %rdx, 6(%rcx) + ret + + .p2align 4 +L(Fill15): + mov %rdx, (%rcx) + mov %rdx, 7(%rcx) + ret + + .p2align 4 +L(Fill16): + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + ret + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%r8), %r8 +L(FillFrom1To16Bytes): + test %r8, %r8 + jz L(Fill0) + cmp $16, %r8 + je L(Fill16) + cmp $8, %r8 + je L(Fill8) + jg L(FillMore8) + cmp $4, %r8 + je L(Fill4) + jg L(FillMore4) + cmp $2, %r8 + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + 
cmp $12, %r8 + je L(Fill12) + jl L(FillLess12) + cmp $14, %r8 + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %r8 + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %r8 + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + .p2align 4 +L(StrncpyFillTailWithZero1): + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit1) + + pxor %xmm0, %xmm0 + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + + lea 16(%rcx), %rcx + + mov %rcx, %rdx + and $0xf, %rdx + sub %rdx, %rcx + add %rdx, %r8 + xor %rdx, %rdx + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + movdqa %xmm0, 32(%rcx) + movdqa %xmm0, 48(%rcx) + lea 64(%rcx), %rcx + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + lea 32(%rcx), %rcx + sub $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + + .p2align 4 +L(Exit0): + mov %rdx, %rax + ret + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $9, %r8 + je L(Exit9) + cmpb $0, 8(%rcx) + jz L(Exit9) + cmp $10, %r8 + je L(Exit10) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $11, %r8 + je L(Exit11) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $12, %r8 + je L(Exit12) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $13, %r8 + je L(Exit13) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $14, %r8 + je L(Exit14) + cmpb $0, 13(%rcx) + jz L(Exit14) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $1, %r8 + je L(Exit1) + cmpb $0, (%rcx) + jz L(Exit1) + cmp $2, %r8 + je L(Exit2) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $3, %r8 + je L(Exit3) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $4, %r8 + je L(Exit4) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $5, %r8 + je L(Exit5) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $6, %r8 + je L(Exit6) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $7, %r8 + je L(Exit7) + cmpb $0, 6(%rcx) + jz L(Exit7) + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + +# endif +# endif + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(StrncpyLeaveCase2OrCase3): + test %rax, %rax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + lea 64(%r8), %r8 + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz 
L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase2) +/*--------------------------------------------------*/ + .p2align 4 +L(StrncpyExit1Case2OrCase3): + movdqu -1(%rcx), %xmm0 + movdqu %xmm0, -1(%rdx) + mov $15, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit2Case2OrCase3): + movdqu -2(%rcx), %xmm0 + movdqu %xmm0, -2(%rdx) + mov $14, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit3Case2OrCase3): + movdqu -3(%rcx), %xmm0 + movdqu %xmm0, -3(%rdx) + mov $13, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit4Case2OrCase3): + movdqu -4(%rcx), %xmm0 + movdqu %xmm0, -4(%rdx) + mov $12, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit5Case2OrCase3): + movdqu -5(%rcx), %xmm0 + movdqu %xmm0, -5(%rdx) + mov $11, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit6Case2OrCase3): + mov (%rcx), %rsi + mov 6(%rcx), %r9d + mov %r9d, 6(%rdx) + mov %rsi, (%rdx) + test %rax, %rax + mov $10, %rsi + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit7Case2OrCase3): + mov (%rcx), %rsi + mov 5(%rcx), %r9d + mov %r9d, 5(%rdx) + mov %rsi, (%rdx) + test %rax, %rax + mov $9, %rsi + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit8Case2OrCase3): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit9Case2OrCase3): + mov -1(%rcx), %r9 + mov $7, %rsi + mov %r9, -1(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit10Case2OrCase3): + mov -2(%rcx), %r9 + mov $6, %rsi + mov %r9, -2(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit11Case2OrCase3): + mov -3(%rcx), %r9 + mov $5, %rsi + mov %r9, -3(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit12Case2OrCase3): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit13Case2OrCase3): + mov -1(%rcx), %r9d + mov $3, %rsi + mov %r9d, -1(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit14Case2OrCase3): + mov -2(%rcx), %r9d + mov $2, %rsi + mov %r9d, -2(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit15Case2OrCase3): + mov -3(%rcx), %r9d + mov $1, %rsi + mov %r9d, -3(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + palignr $1, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps 
%xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit1): + lea 15(%rdx, %rsi), %rdx + lea 15(%rcx, %rsi), %rcx + mov -15(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -15(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + palignr $2, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit2): + lea 14(%rdx, %rsi), %rdx + lea 14(%rcx, %rsi), %rcx + mov -14(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -14(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + palignr $3, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit3): + lea 13(%rdx, %rsi), %rdx + lea 13(%rcx, %rsi), %rcx + mov -13(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -13(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + palignr $4, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit4): + lea 12(%rdx, %rsi), %rdx + lea 12(%rcx, %rsi), %rcx + mov -12(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -12(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + palignr $5, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit5): + lea 11(%rdx, %rsi), %rdx + lea 11(%rcx, %rsi), %rcx + mov -11(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -11(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + palignr $6, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit6): + 
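	/* Each L(StrncpyExitN) tail points rdx and rcx one past the end
+	   of the pending tail block (rsi counts the bytes already
+	   handled), copies the last 16 - N bytes with overlapping loads
+	   and stores relative to that end, clears rsi, and branches to
+	   the common code that finishes the length-limited copy.  */
+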
lea 10(%rdx, %rsi), %rdx + lea 10(%rcx, %rsi), %rcx + mov -10(%rcx), %rsi + movw -2(%rcx), %ax + mov %rsi, -10(%rdx) + movw %ax, -2(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + palignr $7, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit7): + lea 9(%rdx, %rsi), %rdx + lea 9(%rcx, %rsi), %rcx + mov -9(%rcx), %rsi + movb -1(%rcx), %ah + mov %rsi, -9(%rdx) + movb %ah, -1(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + palignr $8, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit8): + lea 8(%rdx, %rsi), %rdx + lea 8(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + palignr $9, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit9): + lea 7(%rdx, %rsi), %rdx + lea 7(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + palignr $10, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit10): + lea 6(%rdx, %rsi), %rdx + lea 6(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + palignr $11, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit11): + lea 5(%rdx, %rsi), %rdx + lea 5(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add 
$48, %r8 + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + palignr $12, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit12): + lea 4(%rdx, %rsi), %rdx + lea 4(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + palignr $13, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit13): + lea 3(%rdx, %rsi), %rdx + lea 3(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + palignr $14, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit14): + lea 2(%rdx, %rsi), %rdx + lea 2(%rcx, %rsi), %rcx + movw -2(%rcx), %ax + xor %rsi, %rsi + movw %ax, -2(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + palignr $15, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit15): + lea 1(%rdx, %rsi), %rdx + lea 1(%rcx, %rsi), %rcx + movb -1(%rcx), %ah + xor %rsi, %rsi + movb %ah, -1(%rdx) + jmp L(CopyFrom1To16BytesCase3) + +# endif +# ifndef USE_AS_STRCAT +END (STRCPY) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..77819ddc50 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,99 @@ +/* Multiple versions of strcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned +# define __GI_STRCPY __GI_strcpy +# endif +#endif + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCPY_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + leaq STRCPY_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jz 2f + leaq STRCPY_SSSE3(%rip), %rax +2: ret +END(STRCPY) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_SSE2, @function; \ + .align 16; \ + .globl STRCPY_SSE2; \ + .hidden STRCPY_SSE2; \ + STRCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 0000000000..67991b5ca7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,173 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#include <nmmintrin.h>
+#include <string.h>
+#include "varshift.h"
+
+/* We use 0x2:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_POSITIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16byte data element has any byte A and
+   the offset of the first byte.  There are 3 cases:
+
+   1. The first 16byte data element has the byte A at the offset X.
+   2. The first 16byte data element has EOS and doesn't have the byte A.
+   3. The first 16byte data element is valid and doesn't have the byte A.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
+
+   case	 ECX	CFlag	ZFlag	SFlag
+    1	  X	  1	 0/1	  0
+    2	 16	  0	  1	  0
+    3	 16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2, i.e. whenever either CFlag
+   or ZFlag is set (the C code below tests them with _mm_cmpistrc and
+   _mm_cmpistrz).  If CFlag == 1, ECX has the offset X for case 1.  */
+
+#ifndef STRCSPN_SSE2
+# define STRCSPN_SSE2 __strcspn_sse2
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+  if (*a == 0)
+    RETURN (NULL, strlen (s));
+
+  const char *aligned;
+  __m128i mask;
+  int offset = (int) ((size_t) a & 15);
+  if (offset != 0)
+    {
+      /* Load masks.  */
+      aligned = (const char *) ((size_t) a & -16L);
+      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+      mask = __m128i_shift_right (mask0, offset);
+
+      /* Find where the NULL terminator is.  */
+      int length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16 - offset)
+	{
+	  /* There is no NULL terminator.  */
+	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
+	  length += index;
+
+	  /* Don't use SSE4.2 if the length of A > 16.  */
+	  if (length > 16)
+	    return STRCSPN_SSE2 (s, a);
+
+	  if (index != 0)
+	    {
+	      /* Combine mask0 and mask1.  We could play games with
+		 palignr, but frankly this data should be in L1 now
+		 so do the merge via an unaligned load.  */
+	      mask = _mm_loadu_si128 ((__m128i *) a);
+	    }
+	}
+    }
+  else
+    {
+      /* A is aligned.  */
+      mask = _mm_load_si128 ((__m128i *) a);
+
+      /* Find where the NULL terminator is.  */
+      int length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16)
+	{
+	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
+	     of A > 16.  */
+	  if (a[16] != 0)
+	    return STRCSPN_SSE2 (s, a);
+	}
+    }
+
+  offset = (int) ((size_t) s & 15);
+  if (offset != 0)
+    {
+      /* Check partial string.  */
+      aligned = (const char *) ((size_t) s & -16L);
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+      value = __m128i_shift_right (value, offset);
+
+      int length = _mm_cmpistri (mask, value, 0x2);
+      /* No need to check ZFlag since ZFlag is always 1.  */
+      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      if (cflag)
+	RETURN ((char *) (s + length), length);
+      /* Find where the NULL terminator is.
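+	 The self-compare with imm8 0x3a (_SIDD_SBYTE_OPS
+	 | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY) leaves
+	 set bits only at and past the terminating null, so the index it
+	 returns is the position of the first null byte in the chunk, or
+	 16 when the chunk has none.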
*/ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. */ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 0000000000..d102c7e80b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,69 @@ +/* Multiple versions of strcspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. 
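+   The ENTRY block that follows is strcspn's IFUNC resolver rather than
+   strcspn itself: the dynamic linker runs it once while resolving the
+   symbol and caches the returned address, picking the SSE4.2 variant
+   when HAS_CPU_FEATURE (SSE4_2) is true and the SSE2 fallback
+   otherwise, so later calls pay no selection cost.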
*/ +#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc) + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCSPN_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +#endif + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S new file mode 100644 index 0000000000..6728678688 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S @@ -0,0 +1,6 @@ +#define USE_SSSE3 1 +#define USE_AS_STRNCASECMP_L +#define NO_NOLOCALE_ALIAS +#define STRCMP __strncasecmp_l_ssse3 +#define __strncasecmp __strncasecmp_ssse3 +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S new file mode 100644 index 0000000000..9c0149788e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S @@ -0,0 +1,8 @@ +/* Multiple versions of strncasecmp and strncasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) +libc_hidden_def (strncasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c new file mode 100644 index 0000000000..a3cdbff689 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2); +#endif + +#include "string/strncat.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S new file mode 100644 index 0000000000..133e1d20b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_sse2_unaligned +#include "strcat-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S new file mode 100644 index 0000000000..6c45ff3ec7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_ssse3 +#include "strcat-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S new file mode 100644 index 0000000000..5c1bf41453 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncat + All versions must be listed in ifunc-impl-list.c. 
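+   This file holds no code of its own: defining STRCAT as strncat and
+   USE_AS_STRNCAT before including strcat.S reuses the strcat selector
+   and its SSE2/SSSE3 backends in their length-bounded form under the
+   strncat names.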
*/ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..96380a46be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S @@ -0,0 +1,6 @@ +#ifdef SHARED +# define USE_SSSE3 1 +# define STRCMP __strncmp_ssse3 +# define USE_AS_STRNCMP +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S new file mode 100644 index 0000000000..fd5eb1397c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncmp + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c new file mode 100644 index 0000000000..296c32cb5d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_sse2 +#ifdef SHARED +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); +#endif + +#include "strncpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S new file mode 100644 index 0000000000..fcc23a754a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000000..bf82ee447d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S new file mode 100644 index 0000000000..6d87a0ba35 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncpy + All versions must be listed in ifunc-impl-list.c. */ +#define STRCPY strncpy +#define USE_AS_STRNCPY +#include "strcpy.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..bbf5c49d89 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,8 @@ +/* Don't define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#ifdef SHARED +# define USE_AS_STRPBRK +# define STRCSPN_SSE2 __strpbrk_sse2 +# define STRCSPN_SSE42 __strpbrk_sse42 +# include "strcspn-c.c" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 0000000000..7201d6376f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,5 @@ +/* Multiple versions of strpbrk + All versions must be listed in ifunc-impl-list.c. 
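+   strpbrk is likewise built from the strcspn sources: with
+   USE_AS_STRPBRK defined, the RETURN (val1, val2) macro in strcspn-c.c
+   returns the matching pointer instead of the skipped length, so a
+   single scanner serves both interfaces.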
*/ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 0000000000..1704606b80 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,145 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <nmmintrin.h> +#include <string.h> +#include "varshift.h" + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + mask = __m128i_shift_right (mask0, offset); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. */ + mask = _mm_loadu_si128 ((__m128i *) a); + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. 
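+	 Instead of an unaligned 16-byte load at s, which could cross
+	 into an unmapped page, load from the enclosing aligned address
+	 and shift the chunk right by the misalignment with
+	 __m128i_shift_right, a pshufb against a sliding index table that
+	 zero-fills the vacated tail bytes.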
*/ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + + int length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 0000000000..adf7d9e533 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,50 @@ +/* Multiple versions of strspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strspn_sse2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jz 2f + leaq __strspn_sse42(%rip), %rax +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_sse2, @function; \ + .globl __strspn_sse2; \ + .align 16; \ + __strspn_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 +#endif + +#include "../strspn.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S new file mode 100644 index 0000000000..138979d10a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S @@ -0,0 +1,374 @@ +/* strstr with unaligned loads + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(__strstr_sse2_unaligned) + movzbl (%rsi), %eax + testb %al, %al + je L(empty) + movzbl 1(%rsi), %edx + testb %dl, %dl + je L(strchr) + movd %eax, %xmm1 + movd %edx, %xmm2 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4031, %rax + punpcklbw %xmm2, %xmm2 + punpcklwd %xmm1, %xmm1 + punpcklwd %xmm2, %xmm2 + pshufd $0, %xmm1, %xmm1 + pshufd $0, %xmm2, %xmm2 + ja L(cross_page) + movdqu (%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 1(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 16(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 17(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %r8d + pmovmskb %xmm0, %eax + salq $16, %rax + orq %rax, %r8 + je L(next_32_bytes) +L(next_pair_index): + bsf %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero1) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found1) + cmpb 2(%rax), %dl + jne L(next_pair) + xorl %edx, %edx + jmp L(pair_loop_start) + + .p2align 4 +L(strchr): + movzbl %al, %esi + jmp __strchr_sse2 + + .p2align 4 +L(pair_loop): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair) +L(pair_loop_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop) +L(found1): + ret +L(zero1): + xorl %eax, %eax + ret + + .p2align 4 +L(next_pair): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index) + + .p2align 4 +L(next_32_bytes): + movdqu 32(%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 33(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 49(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %eax + salq $32, %rax + pmovmskb %xmm0, %r8d + salq $48, %r8 + orq %rax, %r8 + je L(loop_header) +L(next_pair2_index): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero2) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found2) + cmpb 2(%rax), %dl + jne L(next_pair2) + xorl %edx, %edx + jmp L(pair_loop2_start) + + .p2align 4 +L(pair_loop2): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair2) +L(pair_loop2_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop2) +L(found2): + ret + L(zero2): + xorl %eax, %eax + ret +L(empty): + mov %rdi, %rax + ret + + .p2align 4 +L(next_pair2): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair2_index) +L(loop_header): + movq $-512, %r11 + movq %rdi, %r9 + + pxor %xmm7, %xmm7 + andq $-64, %rdi + + .p2align 4 +L(loop): + movdqa 64(%rdi), %xmm3 + movdqu 63(%rdi), %xmm6 + movdqa %xmm3, %xmm0 + pxor %xmm2, %xmm3 + pxor %xmm1, %xmm6 + movdqa 80(%rdi), %xmm10 + por %xmm3, %xmm6 + pminub %xmm10, %xmm0 + movdqu 79(%rdi), %xmm3 + pxor %xmm2, %xmm10 + pxor %xmm1, %xmm3 + movdqa 96(%rdi), %xmm9 + por %xmm10, %xmm3 + pminub %xmm9, %xmm0 + pxor %xmm2, %xmm9 + movdqa 112(%rdi), %xmm8 + addq $64, %rdi + pminub %xmm6, %xmm3 + movdqu 31(%rdi), %xmm4 + pminub %xmm8, %xmm0 + pxor %xmm2, %xmm8 + pxor %xmm1, %xmm4 + por %xmm9, %xmm4 + pminub %xmm4, %xmm3 + movdqu 47(%rdi), %xmm5 + pxor %xmm1, %xmm5 + por %xmm8, %xmm5 + pminub %xmm5, %xmm3 + pminub %xmm3, 
%xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + testl %eax, %eax + je L(loop) + pminub (%rdi), %xmm6 + pminub 32(%rdi),%xmm4 + pminub 48(%rdi),%xmm5 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm5 + pmovmskb %xmm6, %edx + movdqa 16(%rdi), %xmm8 + pcmpeqb %xmm7, %xmm4 + movdqu 15(%rdi), %xmm0 + pmovmskb %xmm5, %r8d + movdqa %xmm8, %xmm3 + pmovmskb %xmm4, %ecx + pcmpeqb %xmm1,%xmm0 + pcmpeqb %xmm2,%xmm3 + salq $32, %rcx + pcmpeqb %xmm7,%xmm8 + salq $48, %r8 + pminub %xmm0,%xmm3 + orq %rcx, %rdx + por %xmm3,%xmm8 + orq %rdx, %r8 + pmovmskb %xmm8, %eax + salq $16, %rax + orq %rax, %r8 + je L(loop) +L(next_pair_index3): + bsfq %r8, %rcx + addq %rdi, %rcx + cmpb $0, (%rcx) + je L(zero) + xorl %eax, %eax + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(success3) + cmpb 1(%rcx), %dl + jne L(next_pair3) + jmp L(pair_loop_start3) + + .p2align 4 +L(pair_loop3): + addq $1, %rax + cmpb 1(%rcx,%rax), %dl + jne L(next_pair3) +L(pair_loop_start3): + movzbl 3(%rsi,%rax), %edx + testb %dl, %dl + jne L(pair_loop3) +L(success3): + lea -1(%rcx), %rax + ret + + .p2align 4 +L(next_pair3): + addq %rax, %r11 + movq %rdi, %rax + subq %r9, %rax + cmpq %r11, %rax + jl L(switch_strstr) + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index3) + jmp L(loop) + + .p2align 4 +L(switch_strstr): + movq %rdi, %rdi + jmp __strstr_sse2 + + .p2align 4 +L(cross_page): + + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqa (%rax), %xmm3 + movdqu -1(%rax), %xmm4 + movdqa %xmm3, %xmm8 + movdqa 16(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm8 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm7 + pminub %xmm4, %xmm3 + movdqu 15(%rax), %xmm4 + pcmpeqb %xmm0, %xmm7 + por %xmm3, %xmm8 + movdqa %xmm5, %xmm3 + movdqa 32(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm6 + pmovmskb %xmm8, %ecx + pminub %xmm4, %xmm3 + movdqu 31(%rax), %xmm4 + por %xmm3, %xmm7 + movdqa %xmm5, %xmm3 + pcmpeqb %xmm0, %xmm6 + movdqa 48(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm7, %r8d + pcmpeqb %xmm2, %xmm3 + pcmpeqb %xmm5, %xmm0 + pminub %xmm4, %xmm3 + movdqu 47(%rax), %xmm4 + por %xmm3, %xmm6 + movdqa %xmm5, %xmm3 + salq $16, %r8 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm6, %r10d + pminub %xmm4, %xmm3 + por %xmm3, %xmm0 + salq $32, %r10 + orq %r10, %r8 + orq %rcx, %r8 + movl %edi, %ecx + pmovmskb %xmm0, %edx + subl %eax, %ecx + salq $48, %rdx + orq %rdx, %r8 + shrq %cl, %r8 + je L(loop_header) +L(next_pair_index4): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero) + + cmpq %rax,%rdi + je L(next_pair4) + + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found3) + cmpb 1(%rax), %dl + jne L(next_pair4) + xorl %edx, %edx + jmp L(pair_loop_start4) + + .p2align 4 +L(pair_loop4): + addq $1, %rdx + cmpb 1(%rax,%rdx), %cl + jne L(next_pair4) +L(pair_loop_start4): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop4) +L(found3): + subq $1, %rax + ret + + .p2align 4 +L(next_pair4): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index4) + jmp L(loop_header) + + .p2align 4 +L(found): + rep + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + +END(__strstr_sse2_unaligned) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c new file mode 100644 index 0000000000..a7d181d797 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c @@ -0,0 +1,50 @@ +/* Multiple versions of strstr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2012-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Redefine strstr so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +#undef strstr +#define strstr __redirect_strstr +#include <string.h> +#undef strstr + +#define STRSTR __strstr_sse2 +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2); +#endif + +#include "string/strstr.c" + +extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; + +#include "init-arch.h" + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_strstr) __libc_strstr; +libc_ifunc (__libc_strstr, + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + ? __strstr_sse2_unaligned + : __strstr_sse2) + +#undef strstr +strong_alias (__libc_strstr, strstr) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c new file mode 100644 index 0000000000..597d64e1e8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c @@ -0,0 +1,96 @@ +/* Test CPU feature data. + This file is part of the GNU C Library. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpu-features.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +static char *cpu_flags; + +/* Search for flags in /proc/cpuinfo and store line + in cpu_flags. 
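+   A typical entry looks like "flags : fpu vme ... sse4_1 sse4_2 avx";
+   check_proc below substring-matches each feature name against the
+   saved line and reports any disagreement with the corresponding glibc
+   feature macro.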
*/ +void +get_cpuinfo (void) +{ + FILE *f; + char *line = NULL; + size_t len = 0; + ssize_t read; + + f = fopen ("/proc/cpuinfo", "r"); + if (f == NULL) + { + printf ("cannot open /proc/cpuinfo\n"); + exit (1); + } + + while ((read = getline (&line, &len, f)) != -1) + { + if (strncmp (line, "flags", 5) == 0) + { + cpu_flags = strdup (line); + break; + } + } + fclose (f); + free (line); +} + +int +check_proc (const char *proc_name, int flag, const char *name) +{ + int found = 0; + + printf ("Checking %s:\n", name); + printf (" init-arch %d\n", flag); + if (strstr (cpu_flags, proc_name) != NULL) + found = 1; + printf (" cpuinfo (%s) %d\n", proc_name, found); + + if (found != flag) + printf (" *** failure ***\n"); + + return (found != flag); +} + +static int +do_test (int argc, char **argv) +{ + int fails; + + get_cpuinfo (); + fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable), + "HAS_ARCH_FEATURE (AVX_Usable)"); + fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable), + "HAS_ARCH_FEATURE (FMA4_Usable)"); + fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2), + "HAS_CPU_FEATURE (SSE4_2)"); + fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1) + , "HAS_CPU_FEATURE (SSE4_1)"); + fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3), + "HAS_CPU_FEATURE (SSSE3)"); + fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT), + "HAS_CPU_FEATURE (POPCOUNT)"); + + printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails); + + return (fails != 0); +} + +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c new file mode 100644 index 0000000000..1c3e34845d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c @@ -0,0 +1,25 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "varshift.h" + +const int8_t ___m128i_shift_right[31] attribute_hidden = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h new file mode 100644 index 0000000000..07bb76c4bf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h @@ -0,0 +1,30 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> +#include <tmmintrin.h> + +extern const int8_t ___m128i_shift_right[31] attribute_hidden; + +static __inline__ __m128i +__m128i_shift_right (__m128i value, unsigned long int offset) +{ + return _mm_shuffle_epi8 (value, + _mm_loadu_si128 ((__m128i *) (___m128i_shift_right + + offset))); +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c new file mode 100644 index 0000000000..a51a83a9be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcscpy __wcscpy_sse2 +#endif + +#include "wcsmbs/wcscpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..53857ce4f5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -0,0 +1,552 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) +# include <sysdep.h> + + .section .text.ssse3,"ax",@progbits +ENTRY (__wcscpy_ssse3) + + mov %rsi, %rcx + mov %rdi, %rdx + + cmpl $0, (%rcx) + jz L(Exit4) + cmpl $0, 4(%rcx) + jz L(Exit8) + cmpl $0, 8(%rcx) + jz L(Exit12) + cmpl $0, 12(%rcx) + jz L(Exit16) + + lea 16(%rcx), %rsi + and $-16, %rsi + + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) + + pcmpeqd (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx + and $-16, %rdx + sub %rdx, %rax + sub %rax, %rcx + mov %rcx, %rax + and $0xf, %rax + mov $0, %rsi + +/* case: rcx_offset == rdx_offset */ + + jz L(Align16Both) + + cmp $4, %rax + je L(Shl4) + cmp $8, %rax + je L(Shl8) + jmp L(Shl12) + +L(Align16Both): + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx + + mov $-0x40, %rsi + + .p2align 4 +L(Aligned64Loop): + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqd %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm5, %xmm0 + + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm6, %xmm0 + + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqd %xmm7, %xmm0 + + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov $-0x40, %rsi + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + + .p2align 4 +L(Shl4): + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + 
test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx + + movaps -4(%rcx), %xmm1 + + .p2align 4 +L(Shl4LoopStart): + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) + + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movdqu -4(%rcx), %xmm1 + mov $12, %rsi + movdqu %xmm1, -4(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx + + movaps -8(%rcx), %xmm1 + + .p2align 4 +L(Shl8LoopStart): + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) + + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), 
%xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx + + movaps -12(%rcx), %xmm1 + + .p2align 4 +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + + mov (%rcx), %rax + mov %rax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit12) + + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) + mov %rdi, %rax + ret + +END(__wcscpy_ssse3) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S new file mode 100644 index 0000000000..9150ab6d18 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S @@ -0,0 +1,40 @@ +/* Multiple versions of wcscpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + + .text +ENTRY(wcscpy) + .type wcscpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_CPU_FEATURE (SSSE3) + jnz 2f + leaq __wcscpy_sse2(%rip), %rax + ret + +2: leaq __wcscpy_ssse3(%rip), %rax + ret + +END(wcscpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c new file mode 100644 index 0000000000..e1ec7cfbb5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c @@ -0,0 +1,9 @@ +#if IS_IN (libc) +# include <wchar.h> + +# define WCSNLEN __wcsnlen_sse2 + +extern __typeof (wcsnlen) __wcsnlen_sse2; +#endif + +#include "wcsmbs/wcsnlen.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S new file mode 100644 index 0000000000..a8cab0cb00 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -0,0 +1,5 @@ +#define AS_WCSLEN +#define AS_STRNLEN +#define strlen __wcsnlen_sse4_1 + +#include "../strlen.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c new file mode 100644 index 0000000000..304f62eec3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c @@ -0,0 +1,45 @@ +/* Multiple versions of wcsnlen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. 
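+
+   The block below uses the C-level ifunc-redirect scheme: __wcsnlen is
+   temporarily #defined to __redirect_wcsnlen while <wchar.h> is
+   included so the header declares the redirect symbol, and
+   IFUNC_SELECTOR then picks an implementation once at relocation time.
+   Outside of glibc the same idea can be sketched with GCC's ifunc
+   attribute, roughly (hypothetical names):
+
+     extern void *wcsnlen_resolver (void);
+     size_t my_wcsnlen (const wchar_t *, size_t)
+       __attribute__ ((ifunc ("wcsnlen_resolver")));
+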
*/ +#if IS_IN (libc) +# define __wcsnlen __redirect_wcsnlen +# include <wchar.h> +# undef __wcsnlen + +# define SYMBOL_NAME wcsnlen +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); +weak_alias (__wcsnlen, wcsnlen); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S new file mode 100644 index 0000000000..bfa1a16a35 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c new file mode 100644 index 0000000000..46b6715e18 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c @@ -0,0 +1,9 @@ +#if IS_IN (libc) +# include <wchar.h> + +# define WMEMCMP __wmemcmp_sse2 + +extern __typeof (wmemcmp) __wmemcmp_sse2; +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000000..b07973a4f6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_1 + +#include "memcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000000..a41ef95fc1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S new file mode 100644 index 0000000000..94b25a214c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S @@ -0,0 +1,55 @@ +/* Multiple versions of wmemcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. 
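+
+   The selector below prefers __wmemcmp_avx2_movbe, but only when AVX2
+   and MOVBE are usable, unaligned AVX loads are fast, and the CPU is
+   not flagged as preferring to avoid VZEROUPPER; otherwise it falls
+   back to __wmemcmp_sse4_1 when SSE4.1 is available, then
+   __wmemcmp_ssse3, and finally the plain __wmemcmp_sse2 version.
+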
*/ +#if IS_IN (libc) + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 1f + HAS_ARCH_FEATURE (AVX2_Usable) + jz 1f + HAS_CPU_FEATURE (MOVBE) + jz 1f + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz 1f + leaq __wmemcmp_avx2_movbe(%rip), %rax + ret + +1: HAS_CPU_FEATURE (SSSE3) + jnz 2f + leaq __wmemcmp_sse2(%rip), %rax + ret + +2: HAS_CPU_FEATURE (SSE4_1) + jz 3f + leaq __wmemcmp_sse4_1(%rip), %rax + ret + +3: leaq __wmemcmp_ssse3(%rip), %rax + ret + +END(wmemcmp) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c new file mode 100644 index 0000000000..dd35be6e49 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c @@ -0,0 +1,33 @@ +/* Multiple versions of wmemset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wmemset __redirect_wmemset +# define __wmemset __redirect___wmemset +# include <wchar.h> +# undef wmemset +# undef __wmemset + +# define SYMBOL_NAME wmemset +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ()); +weak_alias (__wmemset, wmemset) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S new file mode 100644 index 0000000000..0a537fe272 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of wmemset_chk for x86-64. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include "../wmemset_chk.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c new file mode 100644 index 0000000000..d3ded5595b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of wmemset_chk. 
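+   (This is the ifunc selection used in libc.so; statically linked
+   binaries use wmemset_chk-nonshared.S above instead.)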
+ All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __wmemset_chk __redirect_wmemset_chk +# include <wchar.h> +# undef __wmemset_chk + +# define SYMBOL_NAME wmemset_chk +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk, + IFUNC_SELECTOR ()); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/nptl/Makefile b/REORG.TODO/sysdeps/x86_64/nptl/Makefile new file mode 100644 index 0000000000..bad3831869 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/Makefile @@ -0,0 +1,20 @@ +# Copyright (C) 2002-2017 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +ifeq ($(subdir),csu) +gen-as-const-headers += tcb-offsets.sym +endif diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_init.c b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_init.c new file mode 100644 index 0000000000..f249c6fef5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_init.c @@ -0,0 +1 @@ +#include <sysdeps/i386/nptl/pthread_spin_init.c> diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_lock.S b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_lock.S new file mode 100644 index 0000000000..36ba181d9b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_lock.S @@ -0,0 +1,34 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <lowlevellock.h> +#include <sysdep.h> + +ENTRY(pthread_spin_lock) +1: LOCK + decl 0(%rdi) + jne 2f + xor %eax, %eax + ret + + .align 16 +2: rep + nop + cmpl $0, 0(%rdi) + jg 1b + jmp 2b +END(pthread_spin_lock) diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_trylock.S new file mode 100644 index 0000000000..3419f1fec8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_trylock.S @@ -0,0 +1,37 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <pthread-errnos.h> +#include <sysdep.h> + + +#ifdef UP +# define LOCK +#else +# define LOCK lock +#endif + +ENTRY(pthread_spin_trylock) + movl $1, %eax + xorl %ecx, %ecx + LOCK + cmpxchgl %ecx, (%rdi) + movl $EBUSY, %eax + cmovel %ecx, %eax + retq +END(pthread_spin_trylock) diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_unlock.S b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_unlock.S new file mode 100644 index 0000000000..58f9388a36 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthread_spin_unlock.S @@ -0,0 +1,29 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(pthread_spin_unlock) + movl $1, (%rdi) + xorl %eax, %eax + retq +END(pthread_spin_unlock) + + /* The implementation of pthread_spin_init is identical. */ + .globl pthread_spin_init +pthread_spin_init = pthread_spin_unlock diff --git a/REORG.TODO/sysdeps/x86_64/nptl/pthreaddef.h b/REORG.TODO/sysdeps/x86_64/nptl/pthreaddef.h new file mode 100644 index 0000000000..f248ecac80 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/pthreaddef.h @@ -0,0 +1,44 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Default stack size. */ +#define ARCH_STACK_DEFAULT_SIZE (2 * 1024 * 1024) + +/* Required stack pointer alignment at beginning. SSE requires 16 + bytes. */ +#define STACK_ALIGN 16 + +/* Minimal stack size after allocating thread descriptor and guard size. */ +#define MINIMAL_REST_STACK 2048 + +/* Alignment requirement for TCB. + + We need to store post-AVX vector registers in the TCB and we want the + storage to be aligned to at least 32 bytes. + + Some processors such as Intel Atom pay a big penalty on every + access using a segment override if that segment's base is not + aligned to the size of a cache line. (See Intel 64 and IA-32 + Architectures Optimization Reference Manual, section 13.3.3.3, + "Segment Base".) On such machines, a cache line is 64 bytes. */ +#define TCB_ALIGNMENT 64 + + +/* Location of current stack frame. The frame pointer is not usable. */ +#define CURRENT_STACK_FRAME \ + ({ register char *frame __asm__("rsp"); frame; }) diff --git a/REORG.TODO/sysdeps/x86_64/nptl/tcb-offsets.sym b/REORG.TODO/sysdeps/x86_64/nptl/tcb-offsets.sym new file mode 100644 index 0000000000..8a25c482cb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/tcb-offsets.sym @@ -0,0 +1,27 @@ +#include <sysdep.h> +#include <tls.h> +#include <kernel-features.h> + +RESULT offsetof (struct pthread, result) +TID offsetof (struct pthread, tid) +CANCELHANDLING offsetof (struct pthread, cancelhandling) +CLEANUP_JMP_BUF offsetof (struct pthread, cleanup_jmp_buf) +CLEANUP offsetof (struct pthread, cleanup) +CLEANUP_PREV offsetof (struct _pthread_cleanup_buffer, __prev) +MUTEX_FUTEX offsetof (pthread_mutex_t, __data.__lock) +MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads) +POINTER_GUARD offsetof (tcbhead_t, pointer_guard) +VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache) +#ifndef __ASSUME_PRIVATE_FUTEX +PRIVATE_FUTEX offsetof (tcbhead_t, private_futex) +#endif + +-- Not strictly offsets, but these values are also used in the TCB. +TCB_CANCELSTATE_BITMASK CANCELSTATE_BITMASK +TCB_CANCELTYPE_BITMASK CANCELTYPE_BITMASK +TCB_CANCELING_BITMASK CANCELING_BITMASK +TCB_CANCELED_BITMASK CANCELED_BITMASK +TCB_EXITING_BITMASK EXITING_BITMASK +TCB_CANCEL_RESTMASK CANCEL_RESTMASK +TCB_TERMINATED_BITMASK TERMINATED_BITMASK +TCB_PTHREAD_CANCELED PTHREAD_CANCELED diff --git a/REORG.TODO/sysdeps/x86_64/nptl/tls.h b/REORG.TODO/sysdeps/x86_64/nptl/tls.h new file mode 100644 index 0000000000..9b8ad82550 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/nptl/tls.h @@ -0,0 +1,367 @@ +/* Definition for thread-local data handling. nptl/x86_64 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _TLS_H +#define _TLS_H 1 + +#ifndef __ASSEMBLER__ +# include <asm/prctl.h> /* For ARCH_SET_FS. */ +# include <stdbool.h> +# include <stddef.h> +# include <stdint.h> +# include <stdlib.h> +# include <sysdep.h> +# include <libc-pointer-arith.h> /* For cast_to_integer. */ +# include <kernel-features.h> +# include <dl-dtv.h> + +/* Replacement type for __m128 since this file is included by ld.so, + which is compiled with -mno-sse. It must not change the alignment + of rtld_savespace_sse. */ +typedef struct +{ + int i[4]; +} __128bits; + + +typedef struct +{ + void *tcb; /* Pointer to the TCB. Not necessarily the + thread descriptor used by libpthread. */ + dtv_t *dtv; + void *self; /* Pointer to the thread descriptor. */ + int multiple_threads; + int gscope_flag; + uintptr_t sysinfo; + uintptr_t stack_guard; + uintptr_t pointer_guard; + unsigned long int vgetcpu_cache[2]; +# ifndef __ASSUME_PRIVATE_FUTEX + int private_futex; +# else + int __glibc_reserved1; +# endif + int __glibc_unused1; + /* Reservation of some values for the TM ABI. */ + void *__private_tm[4]; + /* GCC split stack support. */ + void *__private_ss; + long int __glibc_reserved2; + /* Must be kept even if it is no longer used by glibc since programs, + like AddressSanitizer, depend on the size of tcbhead_t. */ + __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32))); + + void *__padding[8]; +} tcbhead_t; + +#else /* __ASSEMBLER__ */ +# include <tcb-offsets.h> +#endif + + +/* Alignment requirement for the stack. */ +#define STACK_ALIGN 16 + + +#ifndef __ASSEMBLER__ +/* Get system call information. */ +# include <sysdep.h> + +#ifndef LOCK_PREFIX +# ifdef UP +# define LOCK_PREFIX /* nothing */ +# else +# define LOCK_PREFIX "lock;" +# endif +#endif + +/* This is the size of the initial TCB. Can't be just sizeof (tcbhead_t), + because NPTL getpid, __libc_alloca_cutoff etc. need (almost) the whole + struct pthread even when not linked with -lpthread. */ +# define TLS_INIT_TCB_SIZE sizeof (struct pthread) + +/* Alignment requirements for the initial TCB. */ +# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread) + +/* This is the size of the TCB. */ +# define TLS_TCB_SIZE sizeof (struct pthread) + +/* Alignment requirements for the TCB. */ +# define TLS_TCB_ALIGN __alignof__ (struct pthread) + +/* The TCB can have any size and the memory following the address the + thread pointer points to is unspecified. Allocate the TCB there. */ +# define TLS_TCB_AT_TP 1 +# define TLS_DTV_AT_TP 0 + +/* Get the thread descriptor definition. */ +# include <nptl/descr.h> + + +/* Install the dtv pointer. The pointer passed is to the element with + index -1 which contain the length. */ +# define INSTALL_DTV(descr, dtvp) \ + ((tcbhead_t *) (descr))->dtv = (dtvp) + 1 + +/* Install new dtv for current thread. 
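+   Unlike INSTALL_DTV above, which stores into an explicitly given
+   descriptor, this writes through the running thread's %fs segment
+   via THREAD_SETMEM.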
*/ +# define INSTALL_NEW_DTV(dtvp) \ + ({ struct pthread *__pd; \ + THREAD_SETMEM (__pd, header.dtv, (dtvp)); }) + +/* Return dtv of given thread descriptor. */ +# define GET_DTV(descr) \ + (((tcbhead_t *) (descr))->dtv) + + +/* Code to initially initialize the thread pointer. This might need + special attention since 'errno' is not yet available and if the + operation can cause a failure 'errno' must not be touched. + + We have to make the syscall for both uses of the macro since the + address might be (and probably is) different. */ +# define TLS_INIT_TP(thrdescr) \ + ({ void *_thrdescr = (thrdescr); \ + tcbhead_t *_head = _thrdescr; \ + int _result; \ + \ + _head->tcb = _thrdescr; \ + /* For now the thread descriptor is at the same address. */ \ + _head->self = _thrdescr; \ + \ + /* It is a simple syscall to set the %fs value for the thread. */ \ + asm volatile ("syscall" \ + : "=a" (_result) \ + : "0" ((unsigned long int) __NR_arch_prctl), \ + "D" ((unsigned long int) ARCH_SET_FS), \ + "S" (_thrdescr) \ + : "memory", "cc", "r11", "cx"); \ + \ + _result ? "cannot set %fs base address for thread-local storage" : 0; \ + }) + +# define TLS_DEFINE_INIT_TP(tp, pd) void *tp = (pd) + + +/* Return the address of the dtv for the current thread. */ +# define THREAD_DTV() \ + ({ struct pthread *__pd; \ + THREAD_GETMEM (__pd, header.dtv); }) + + +/* Return the thread descriptor for the current thread. + + The contained asm must *not* be marked volatile since otherwise + assignments like + pthread_descr self = thread_self(); + do not get optimized away. */ +# define THREAD_SELF \ + ({ struct pthread *__self; \ + asm ("mov %%fs:%c1,%0" : "=r" (__self) \ + : "i" (offsetof (struct pthread, header.self))); \ + __self;}) + +/* Magic for libthread_db to know how to do THREAD_SELF. */ +# define DB_THREAD_SELF_INCLUDE <sys/reg.h> /* For the FS constant. */ +# define DB_THREAD_SELF CONST_THREAD_AREA (64, FS) + +/* Read member of the thread descriptor directly. */ +# define THREAD_GETMEM(descr, member) \ + ({ __typeof (descr->member) __value; \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%fs:%P2,%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct pthread, member))); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%fs:%P1,%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member))); \ + else \ + { \ + if (sizeof (__value) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %%fs:%P1,%q0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member))); \ + } \ + __value; }) + + +/* Same as THREAD_GETMEM, but the member offset can be non-constant. */ +# define THREAD_GETMEM_NC(descr, member, idx) \ + ({ __typeof (descr->member[0]) __value; \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%fs:%P2(%q3),%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%fs:%P1(,%q2,4),%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member[0])), "r" (idx));\ + else \ + { \ + if (sizeof (__value) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %%fs:%P1(,%q2,8),%q0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + } \ + __value; }) + + +/* Loading addresses of objects on x86-64 needs to be treated special + when generating PIC code. 
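+   Under PIC the address of a global symbol is not a link-time
+   constant, so the "i" (immediate) operand constraint cannot be used;
+   the value must arrive in a register or as a literal number, hence
+   "nr" below instead of "ir".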
*/ +#ifdef __pic__ +# define IMM_MODE "nr" +#else +# define IMM_MODE "ir" +#endif + + +/* Set member of the thread descriptor directly. */ +# define THREAD_SETMEM(descr, member, value) \ + ({ if (sizeof (descr->member) == 1) \ + asm volatile ("movb %b0,%%fs:%P1" : \ + : "iq" (value), \ + "i" (offsetof (struct pthread, member))); \ + else if (sizeof (descr->member) == 4) \ + asm volatile ("movl %0,%%fs:%P1" : \ + : IMM_MODE (value), \ + "i" (offsetof (struct pthread, member))); \ + else \ + { \ + if (sizeof (descr->member) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %q0,%%fs:%P1" : \ + : IMM_MODE ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct pthread, member))); \ + }}) + + +/* Same as THREAD_SETMEM, but the member offset can be non-constant. */ +# define THREAD_SETMEM_NC(descr, member, idx, value) \ + ({ if (sizeof (descr->member[0]) == 1) \ + asm volatile ("movb %b0,%%fs:%P1(%q2)" : \ + : "iq" (value), \ + "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else if (sizeof (descr->member[0]) == 4) \ + asm volatile ("movl %0,%%fs:%P1(,%q2,4)" : \ + : IMM_MODE (value), \ + "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else \ + { \ + if (sizeof (descr->member[0]) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movq %q0,%%fs:%P1(,%q2,8)" : \ + : IMM_MODE ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + }}) + + +/* Atomic compare and exchange on TLS, returning old value. */ +# define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ + ({ __typeof (descr->member) __ret; \ + __typeof (oldval) __old = (oldval); \ + if (sizeof (descr->member) == 4) \ + asm volatile (LOCK_PREFIX "cmpxchgl %2, %%fs:%P3" \ + : "=a" (__ret) \ + : "0" (__old), "r" (newval), \ + "i" (offsetof (struct pthread, member))); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); \ + __ret; }) + + +/* Atomic logical and. */ +# define THREAD_ATOMIC_AND(descr, member, val) \ + (void) ({ if (sizeof ((descr)->member) == 4) \ + asm volatile (LOCK_PREFIX "andl %1, %%fs:%P0" \ + :: "i" (offsetof (struct pthread, member)), \ + "ir" (val)); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); }) + + +/* Atomic set bit. */ +# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ + (void) ({ if (sizeof ((descr)->member) == 4) \ + asm volatile (LOCK_PREFIX "orl %1, %%fs:%P0" \ + :: "i" (offsetof (struct pthread, member)), \ + "ir" (1 << (bit))); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); }) + + +/* Set the stack guard field in TCB head. */ +# define THREAD_SET_STACK_GUARD(value) \ + THREAD_SETMEM (THREAD_SELF, header.stack_guard, value) +# define THREAD_COPY_STACK_GUARD(descr) \ + ((descr)->header.stack_guard \ + = THREAD_GETMEM (THREAD_SELF, header.stack_guard)) + + +/* Set the pointer guard field in the TCB head. */ +# define THREAD_SET_POINTER_GUARD(value) \ + THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value) +# define THREAD_COPY_POINTER_GUARD(descr) \ + ((descr)->header.pointer_guard \ + = THREAD_GETMEM (THREAD_SELF, header.pointer_guard)) + + +/* Get and set the global scope generation counter in the TCB head. 
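+   Broadly: the dynamic linker sets the flag to USED around lazy
+   symbol lookups, and on unload it marks stale flags WAIT and blocks
+   until every thread has reset its flag.  That is why
+   THREAD_GSCOPE_RESET_FLAG below uses an atomic xchg and wakes the
+   futex whenever a waiter had registered itself.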
*/ +# define THREAD_GSCOPE_FLAG_UNUSED 0 +# define THREAD_GSCOPE_FLAG_USED 1 +# define THREAD_GSCOPE_FLAG_WAIT 2 +# define THREAD_GSCOPE_RESET_FLAG() \ + do \ + { int __res; \ + asm volatile ("xchgl %0, %%fs:%P1" \ + : "=r" (__res) \ + : "i" (offsetof (struct pthread, header.gscope_flag)), \ + "0" (THREAD_GSCOPE_FLAG_UNUSED)); \ + if (__res == THREAD_GSCOPE_FLAG_WAIT) \ + lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ + } \ + while (0) +# define THREAD_GSCOPE_SET_FLAG() \ + THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED) +# define THREAD_GSCOPE_WAIT() \ + GL(dl_wait_lookup_done) () + +#endif /* __ASSEMBLER__ */ + +#endif /* tls.h */ diff --git a/REORG.TODO/sysdeps/x86_64/preconfigure b/REORG.TODO/sysdeps/x86_64/preconfigure new file mode 100644 index 0000000000..c8f1e0e132 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/preconfigure @@ -0,0 +1,42 @@ +# This file is generated from configure.ac by Autoconf. DO NOT EDIT! + # Local preconfigure fragment for sysdeps/x86_64 + +test -n "$base_machine" || case "$machine" in +x86_64) + base_machine=x86_64 + # Check if we are building for x32. + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC compiles in -mx32 mode by default" >&5 +$as_echo_n "checking whether $CC compiles in -mx32 mode by default... " >&6; } +if ${libc_cv_x32+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifndef __ILP32__ +# error not x32 +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + libc_cv_x32=yes +else + libc_cv_x32=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x32" >&5 +$as_echo "$libc_cv_x32" >&6; } + if test $libc_cv_x32 = yes; then + machine=x86_64/x32 + else + machine=x86_64/64 + fi + ;; +esac diff --git a/REORG.TODO/sysdeps/x86_64/preconfigure.ac b/REORG.TODO/sysdeps/x86_64/preconfigure.ac new file mode 100644 index 0000000000..600700ea1a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/preconfigure.ac @@ -0,0 +1,20 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local preconfigure fragment for sysdeps/x86_64 + +test -n "$base_machine" || case "$machine" in +x86_64) + base_machine=x86_64 + # Check if we are building for x32. + AC_CACHE_CHECK(whether $CC compiles in -mx32 mode by default, + libc_cv_x32, [dnl + AC_TRY_COMPILE(dnl +[#ifndef __ILP32__ +# error not x32 +#endif], [], libc_cv_x32=yes, libc_cv_x32=no)]) + if test $libc_cv_x32 = yes; then + machine=x86_64/x32 + else + machine=x86_64/64 + fi + ;; +esac diff --git a/REORG.TODO/sysdeps/x86_64/rawmemchr.S b/REORG.TODO/sysdeps/x86_64/rawmemchr.S new file mode 100644 index 0000000000..0405c7bb99 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/rawmemchr.S @@ -0,0 +1,202 @@ +/* fast SSE2 memchr with 64 byte loop and pmaxub instruction using + + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (__rawmemchr) + movd %rsi, %xmm1 + mov %rdi, %rcx + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %rcx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches) + add $16, %rdi + and $-16, %rdi + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + add $16, %rdi + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + and $-64, %rdi + + .p2align 4 +L(align64_loop): + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + +END (__rawmemchr) + +weak_alias (__rawmemchr, rawmemchr) +libc_hidden_builtin_def (__rawmemchr) diff --git a/REORG.TODO/sysdeps/x86_64/rshift.S b/REORG.TODO/sysdeps/x86_64/rshift.S new file mode 100644 index 0000000000..1686339e5c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/rshift.S @@ -0,0 +1,114 @@ +/* x86-64 __mpn_rshift -- + Copyright (C) 2007-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. 
+ + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl + + .text +ENTRY (__mpn_rshift) + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov 8(up), %r11 + xor %eax, %eax + shrd %cl, %r10, %rax + mov 16(up), %r8 + lea 8(up), up + lea -24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shrd %cl, %r9, %rax + sub $2, n + jb L(le1) + mov 8(up), %r10 + mov 16(up), %r11 + lea 16(up), up + lea -16(rp), rp + jmp L(01) +L(le1): shr %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov 8(up), %r9 + xor %eax, %eax + shrd %cl, %r8, %rax + sub $3, n + jb L(le2) + mov 16(up), %r10 + lea 24(up), up + lea -8(rp), rp + jmp L(10) +L(le2): shrd %cl, %r9, %r8 + mov %r8, (rp) + shr %cl, %r9 + mov %r9, 8(rp) + ret + + .p2align 4 +L(b11): /* n = 3, 7, 11, ... */ + mov (up), %r11 + mov 8(up), %r8 + xor %eax, %eax + shrd %cl, %r11, %rax + mov 16(up), %r9 + lea 32(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shrd %cl, %r8, %r11 + mov -8(up), %r10 + mov %r11, (rp) +L(10): shrd %cl, %r9, %r8 + mov (up), %r11 + mov %r8, 8(rp) +L(01): shrd %cl, %r10, %r9 + mov 8(up), %r8 + mov %r9, 16(rp) +L(00): shrd %cl, %r11, %r10 + mov 16(up), %r9 + mov %r10, 24(rp) + add $32, up + lea 32(rp), rp + sub $4, n + jnc L(top) + +L(end): shrd %cl, %r8, %r11 + mov %r11, (rp) + shrd %cl, %r9, %r8 + mov %r8, 8(rp) + shr %cl, %r9 + mov %r9, 16(rp) + ret +END (__mpn_rshift) diff --git a/REORG.TODO/sysdeps/x86_64/sched_cpucount.c b/REORG.TODO/sysdeps/x86_64/sched_cpucount.c new file mode 100644 index 0000000000..408ddc9d61 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/sched_cpucount.c @@ -0,0 +1,25 @@ +/* Copyright (C) 2007-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
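+
+   Below, when glibc itself is built for AMD Family 10h
+   (-march=amdfam10 predefines __amdfam10, where popcnt is always
+   available), the generic population-count fallback in
+   posix/sched_cpucount.c is overridden with the hardware popcntq
+   instruction.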
*/ + +#ifdef __amdfam10 +# define POPCNT(l) \ + ({ __cpu_mask r; \ + asm ("popcntq %1, %0" : "=r" (r) : "0" (l)); \ + r; }) +#endif + +#include <posix/sched_cpucount.c> diff --git a/REORG.TODO/sysdeps/x86_64/setjmp.S b/REORG.TODO/sysdeps/x86_64/setjmp.S new file mode 100644 index 0000000000..3a889033cd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/setjmp.S @@ -0,0 +1,66 @@ +/* setjmp for x86-64. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <asm-syntax.h> +#include <stap-probe.h> + +ENTRY (__sigsetjmp) + /* Save registers. */ + movq %rbx, (JB_RBX*8)(%rdi) +#ifdef PTR_MANGLE +# ifdef __ILP32__ + /* Save the high bits of %rbp first, since PTR_MANGLE will + only handle the low bits but we cannot presume %rbp is + being used as a pointer and truncate it. Here we write all + of %rbp, but the low bits will be overwritten below. */ + movq %rbp, (JB_RBP*8)(%rdi) +# endif + mov %RBP_LP, %RAX_LP + PTR_MANGLE (%RAX_LP) + mov %RAX_LP, (JB_RBP*8)(%rdi) +#else + movq %rbp, (JB_RBP*8)(%rdi) +#endif + movq %r12, (JB_R12*8)(%rdi) + movq %r13, (JB_R13*8)(%rdi) + movq %r14, (JB_R14*8)(%rdi) + movq %r15, (JB_R15*8)(%rdi) + lea 8(%rsp), %RDX_LP /* Save SP as it will be after we return. */ +#ifdef PTR_MANGLE + PTR_MANGLE (%RDX_LP) +#endif + movq %rdx, (JB_RSP*8)(%rdi) + mov (%rsp), %RAX_LP /* Save PC we are returning to now. */ + LIBC_PROBE (setjmp, 3, LP_SIZE@%RDI_LP, -4@%esi, LP_SIZE@%RAX_LP) +#ifdef PTR_MANGLE + PTR_MANGLE (%RAX_LP) +#endif + movq %rax, (JB_PC*8)(%rdi) + +#if IS_IN (rtld) + /* In ld.so we never save the signal mask. */ + xorl %eax, %eax + retq +#else + /* Make a tail call to __sigjmp_save; it takes the same args. 
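+     %rdi still holds the jmp_buf and %esi the savemask argument, so
+     no argument shuffling is needed before the jump.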
*/ + jmp __sigjmp_save +#endif +END (__sigsetjmp) +hidden_def (__sigsetjmp) diff --git a/REORG.TODO/sysdeps/x86_64/stack-aliasing.h b/REORG.TODO/sysdeps/x86_64/stack-aliasing.h new file mode 100644 index 0000000000..2efdacb3b4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stack-aliasing.h @@ -0,0 +1 @@ +#include <sysdeps/i386/i686/stack-aliasing.h> diff --git a/REORG.TODO/sysdeps/x86_64/stackguard-macros.h b/REORG.TODO/sysdeps/x86_64/stackguard-macros.h new file mode 100644 index 0000000000..1948800cd0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stackguard-macros.h @@ -0,0 +1,11 @@ +#include <stdint.h> + +#define STACK_CHK_GUARD \ + ({ uintptr_t x; \ + asm ("mov %%fs:%c1, %0" : "=r" (x) \ + : "i" (offsetof (tcbhead_t, stack_guard))); x; }) + +#define POINTER_CHK_GUARD \ + ({ uintptr_t x; \ + asm ("mov %%fs:%c1, %0" : "=r" (x) \ + : "i" (offsetof (tcbhead_t, pointer_guard))); x; }) diff --git a/REORG.TODO/sysdeps/x86_64/stackinfo.h b/REORG.TODO/sysdeps/x86_64/stackinfo.h new file mode 100644 index 0000000000..a1cbb43322 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stackinfo.h @@ -0,0 +1,43 @@ +/* Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains a bit of information about the stack allocation + of the processor. */ + +#ifndef _STACKINFO_H +#define _STACKINFO_H 1 + +#include <elf.h> + +/* On x86_64 the stack grows down. */ +#define _STACK_GROWS_DOWN 1 + +/* Default to an executable stack. PF_X can be overridden if PT_GNU_STACK is + * present, but it is presumed absent. */ +#define DEFAULT_STACK_PERMS (PF_R|PF_W|PF_X) + +/* Access to the stack pointer. The macros are used in alloca_account + for which they need to act as barriers as well, hence the additional + (unnecessary) parameters. */ +#define stackinfo_get_sp() \ + ({ void *p__; asm volatile ("mov %%" RSP_LP ", %0" : "=r" (p__)); p__; }) +#define stackinfo_sub_sp(ptr) \ + ({ ptrdiff_t d__; \ + asm volatile ("sub %%" RSP_LP " , %0" : "=r" (d__) : "0" (ptr)); \ + d__; }) + +#endif /* stackinfo.h */ diff --git a/REORG.TODO/sysdeps/x86_64/start.S b/REORG.TODO/sysdeps/x86_64/start.S new file mode 100644 index 0000000000..62a00eaeaa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/start.S @@ -0,0 +1,131 @@ +/* Startup code compliant to the ELF x86-64 ABI. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2001. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This is the canonical entry point, usually the first thing in the text + segment. The SVR4/i386 ABI (pages 3-31, 3-32) says that when the entry + point runs, most registers' values are unspecified, except for: + + %rdx Contains a function pointer to be registered with `atexit'. + This is how the dynamic linker arranges to have DT_FINI + functions called for shared libraries that have been loaded + before this code runs. + + %rsp The stack contains the arguments and environment: + 0(%rsp) argc + LP_SIZE(%rsp) argv[0] + ... + (LP_SIZE*argc)(%rsp) NULL + (LP_SIZE*(argc+1))(%rsp) envp[0] + ... + NULL +*/ + +#include <sysdep.h> + +ENTRY (_start) + /* Clearing the frame pointer is insufficient; use CFI. */ + cfi_undefined (rip) + /* Clear the frame pointer. The ABI suggests this be done, to mark + the outermost frame obviously. */ + xorl %ebp, %ebp + + /* Extract the arguments as encoded on the stack and set up + the arguments for __libc_start_main (int (*main) (int, char **, char **), + int argc, char **argv, + void (*init) (void), void (*fini) (void), + void (*rtld_fini) (void), void *stack_end). + The arguments are passed via registers and on the stack: + main: %rdi + argc: %rsi + argv: %rdx + init: %rcx + fini: %r8 + rtld_fini: %r9 + stack_end: stack. */ + + mov %RDX_LP, %R9_LP /* Address of the shared library termination + function. */ +#ifdef __ILP32__ + mov (%rsp), %esi /* Simulate popping 4-byte argument count. */ + add $4, %esp +#else + popq %rsi /* Pop the argument count. */ +#endif + /* argv starts just at the current stack top. */ + mov %RSP_LP, %RDX_LP + /* Align the stack to a 16-byte boundary to follow the ABI. */ + and $~15, %RSP_LP + + /* Push one word of garbage; together with the stack address pushed + below, this keeps the stack 16-byte aligned for the call. */ + pushq %rax + + /* Provide the highest stack address to the user code (for stacks + which grow downwards). */ + pushq %rsp + +#ifdef SHARED + /* Pass address of our own entry points to .fini and .init. */ + mov __libc_csu_fini@GOTPCREL(%rip), %R8_LP + mov __libc_csu_init@GOTPCREL(%rip), %RCX_LP + + mov main@GOTPCREL(%rip), %RDI_LP +#else + /* Pass address of our own entry points to .fini and .init.
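The register shuffling above is easier to follow against the C-level picture of the initial stack: argc sits at the stack pointer, argv starts one word above it, and the environment array begins one slot past argv's terminating NULL. A small portable model of that layout, using the same arrangement the ABI comment above describes:

    #include <stdio.h>

    int main (int argc, char **argv)
    {
      /* Initial stack:  sp[0] argc, sp[1..argc] argv[], sp[argc+1] NULL,
         then envp[] terminated by another NULL.  */
      char **envp = argv + argc + 1;   /* skip argv and its NULL slot */
      for (int i = 0; i < 3 && envp[i] != NULL; ++i)
        printf ("envp[%d] = %.40s\n", i, envp[i]);
      return 0;
    }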
*/ + mov $__libc_csu_fini, %R8_LP + mov $__libc_csu_init, %RCX_LP + + mov $main, %RDI_LP +#endif + + /* Call the user's main function, and exit with its value. + But let the libc call main. Since __libc_start_main in + libc.so is called very early, lazy binding isn't relevant + here. Use indirect branch via GOT to avoid extra branch + to PLT slot. In case of static executable, ld in binutils + 2.26 or above can convert indirect branch into direct + branch. */ + call *__libc_start_main@GOTPCREL(%rip) + + hlt /* Crash if somehow `exit' does return. */ +END (_start) + +/* Define a symbol for the first piece of initialized data. */ + .data + .globl __data_start +__data_start: + .long 0 + .weak data_start + data_start = __data_start diff --git a/REORG.TODO/sysdeps/x86_64/stpcpy.S b/REORG.TODO/sysdeps/x86_64/stpcpy.S new file mode 100644 index 0000000000..ec23de1416 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/stpcpy.S @@ -0,0 +1,8 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy + +#include <sysdeps/x86_64/strcpy.S> + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/x86_64/strcasecmp.S b/REORG.TODO/sysdeps/x86_64/strcasecmp.S new file mode 100644 index 0000000000..fe49e820f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcasecmp.S @@ -0,0 +1 @@ +/* In strcasecmp_l.S. */ diff --git a/REORG.TODO/sysdeps/x86_64/strcasecmp_l-nonascii.c b/REORG.TODO/sysdeps/x86_64/strcasecmp_l-nonascii.c new file mode 100644 index 0000000000..30e8969603 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcasecmp_l-nonascii.c @@ -0,0 +1,8 @@ +#include <string.h> + +extern int __strcasecmp_l_nonascii (const char *__s1, const char *__s2, + __locale_t __loc); + +#define __strcasecmp_l __strcasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strcasecmp.c> diff --git a/REORG.TODO/sysdeps/x86_64/strcasecmp_l.S b/REORG.TODO/sysdeps/x86_64/strcasecmp_l.S new file mode 100644 index 0000000000..5456b3a49e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcasecmp_l.S @@ -0,0 +1,6 @@ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) +libc_hidden_def (strcasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/strcat.S b/REORG.TODO/sysdeps/x86_64/strcat.S new file mode 100644 index 0000000000..44e6512339 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcat.S @@ -0,0 +1,258 @@ +/* strcat(dest, src) -- Append SRC on the end of DEST. + Optimized for x86-64. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* Will be removed when new strcpy implementation gets merged. */ + + .text +ENTRY (strcat) + movq %rdi, %rcx /* Dest. register. 
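stpcpy.S above is not a separate implementation: it assembles strcpy.S again with the entry point renamed, because the two routines differ only in their return value (the end of the copy rather than its start). Roughly, in C, with illustrative my_-prefixed helpers that are not glibc's code, the relationship and the way strcat builds on it look like:

    #include <stdio.h>

    /* stpcpy returns a pointer to the terminating NUL it wrote.  */
    static char *my_stpcpy (char *d, const char *s)
    {
      while ((*d = *s++) != '\0')
        d++;
      return d;
    }

    /* strcat is "find the end, copy there, return the start".  */
    static char *my_strcat (char *d, const char *s)
    {
      char *end = d;
      while (*end != '\0')            /* first step: find end of dest */
        end++;
      my_stpcpy (end, s);             /* second step: copy source */
      return d;                       /* strcat returns the destination */
    }

    int main (void)
    {
      char buf[32] = "foo";
      puts (my_strcat (buf, "bar"));  /* prints foobar */
      return 0;
    }

The assembly that follows implements exactly those two steps, only word by word instead of byte by byte.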
*/ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* Duplicate destination pointer. */ + movq $0xfefefefefefefeff,%r8 + + /* First step: Find end of destination. */ + jz 4f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => start copy */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + + + + /* Now the source is aligned. Scan for NUL byte. */ + .p2align 4 +4: + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? 
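The 0xfefefefefefefeff constant gives an exact word-at-a-time NUL test: adding it to an 8-byte word carries out of every byte position whose byte is nonzero, so the carry chain breaks precisely at a NUL byte. The jnc/xor/or/inc sequence above computes, in C terms, the predicate below (a sketch with uint64_t words; locating which byte was NUL is then the job of the byte tests around label 3):

    #include <stdint.h>
    #include <stdio.h>

    /* Nonzero iff some byte of W is 0, mirroring the asm: add the magic
       value, branch on the carry flag, then xor/or/inc.  */
    static int has_nul_byte (uint64_t w)
    {
      const uint64_t magic = 0xfefefefefefefeffull;
      uint64_t sum = w + magic;
      if (sum >= w)                 /* no carry out: the asm's jnc path */
        return 1;
      /* Bits 8,16,...,56 of (sum ^ w) | magic hold the per-byte
         carries; +1 overflows to 0 only if every carry was set.  */
      return (((sum ^ w) | magic) + 1) != 0;
    }

    int main (void)
    {
      printf ("%d %d\n",
              has_nul_byte (0x6162636465666768ull),    /* "abcdefgh": 0 */
              has_nul_byte (0x6162630065666768ull));   /* has a 0: 1 */
      return 0;
    }

The same predicate reappears in the copy loop below, where it decides whether a whole word can be stored.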
*/ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + +2: + /* Second step: Copy source to destination. */ + + movq %rsi, %rcx /* duplicate */ + andl $7,%ecx /* mask alignment bits */ + movq %rax, %rdx /* move around */ + jz 22f /* aligned => start loop */ + + neg %ecx /* align to 8 bytes. */ + addl $8, %ecx + /* Align the source pointer. */ +21: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 24f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 21b + + /* Now the source is aligned. Unfortunately we cannot force both + source and destination to be aligned, so ignore the alignment of + the destination. */ + .p2align 4 +22: + /* 1st unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0.
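Stated portably, the copy loop just shown tests each source word with that predicate, stores all 8 bytes when it is NUL-free, and otherwise falls through to the byte loop at label 23. A C sketch of the structure, without the alignment preamble and the fourfold unrolling; note the overread, as the word loads may read up to 7 bytes past the NUL, which the assembly makes safe by keeping the loads aligned (so they cannot cross a page boundary) while this sketch simply sizes the source buffer generously:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    static int has_nul_byte (uint64_t w)   /* as in the earlier sketch */
    {
      const uint64_t magic = 0xfefefefefefefeffull;
      uint64_t sum = w + magic;
      return sum >= w || (((sum ^ w) | magic) + 1) != 0;
    }

    static char *wordwise_strcpy (char *d, const char *s)
    {
      char *d0 = d;
      uint64_t w;
      for (;;)                       /* S is assumed 8-byte aligned */
        {
          memcpy (&w, s, 8);         /* one 8-byte load */
          if (has_nul_byte (w))
            break;                   /* NUL somewhere in this word */
          memcpy (d, &w, 8);         /* store the whole word */
          s += 8, d += 8;
        }
      while ((*d++ = *s++) != '\0')  /* finish byte by byte */
        ;
      return d0;
    }

    int main (void)
    {
      _Alignas (8) char src[32] = "hello, wordwise world";
      char dst[32];
      puts (wordwise_strcpy (dst, src));
      return 0;
    }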
*/ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 22b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +23: + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL? */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL? */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 23b /* and look at next two bytes in %rax. */ + + +24: + movq %rdi, %rax /* The original destination is the return value. */ + retq +END (strcat) +libc_hidden_builtin_def (strcat) diff --git a/REORG.TODO/sysdeps/x86_64/strchr.S b/REORG.TODO/sysdeps/x86_64/strchr.S new file mode 100644 index 0000000000..16c1726803 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strchr.S @@ -0,0 +1,187 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For AMD x86-64. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (strchr) + movd %esi, %xmm1 + movl %edi, %eax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpl $4032, %eax + punpcklwd %xmm1, %xmm1 + pshufd $0, %xmm1, %xmm1 + jg L(cross_page) + movdqu (%rdi), %xmm0 + pxor %xmm3, %xmm3 + movdqa %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + je L(next_48_bytes) + bsf %eax, %eax +#ifdef AS_STRCHRNUL + leaq (%rdi,%rax), %rax +#else + movl $0, %edx + leaq (%rdi,%rax), %rax + cmpb %sil, (%rax) + cmovne %rdx, %rax +#endif + ret + + .p2align 3 + L(next_48_bytes): + movdqu 16(%rdi), %xmm0 + movdqa %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %ecx + movdqu 32(%rdi), %xmm0 + movdqa %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm0 + salq $16, %rcx + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %eax + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm0, %xmm3 + salq $32, %rax + pcmpeqb %xmm1, %xmm0 + orq %rcx, %rax + por %xmm3, %xmm0 + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rax + testq %rax, %rax + jne L(return) +L(loop_start): + /* We use this alignment to force the loop to be aligned to 8 but + not 16 bytes. This gives better scheduling on AMD processors.
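Each 16-byte block above is probed for "matches the wanted character or is NUL" by OR-ing two pcmpeqb results and collapsing them to a bitmask with pmovmskb; bsf then yields the first hit. The same single-block step written with SSE2 intrinsics (a sketch of one unaligned probe, not the page-safe full routine):

    #include <emmintrin.h>   /* SSE2 */
    #include <string.h>
    #include <stdio.h>

    /* Bit i of the result is set iff p[i] == c or p[i] == '\0'.  */
    static int match_mask_16 (const char *p, char c)
    {
      __m128i data = _mm_loadu_si128 ((const __m128i *) p);
      __m128i hits = _mm_or_si128 (_mm_cmpeq_epi8 (data, _mm_set1_epi8 (c)),
                                   _mm_cmpeq_epi8 (data, _mm_setzero_si128 ()));
      return _mm_movemask_epi8 (hits);
    }

    int main (void)
    {
      char buf[16];
      memset (buf, 'x', sizeof buf);
      memcpy (buf, "find the q", 10);          /* 'q' at offset 9, no NUL */
      int m = match_mask_16 (buf, 'q');
      printf ("mask %#x, first hit at %d\n", m, __builtin_ctz (m));
      return 0;
    }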
*/ + .p2align 4 + pxor %xmm6, %xmm6 + andq $-64, %rdi + .p2align 3 +L(loop64): + addq $64, %rdi + movdqa (%rdi), %xmm5 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + pxor %xmm1, %xmm5 + movdqa 48(%rdi), %xmm4 + pxor %xmm1, %xmm2 + pxor %xmm1, %xmm3 + pminub (%rdi), %xmm5 + pxor %xmm1, %xmm4 + pminub 16(%rdi), %xmm2 + pminub 32(%rdi), %xmm3 + pminub %xmm2, %xmm5 + pminub 48(%rdi), %xmm4 + pminub %xmm3, %xmm5 + pminub %xmm4, %xmm5 + pcmpeqb %xmm6, %xmm5 + pmovmskb %xmm5, %eax + + testl %eax, %eax + je L(loop64) + + movdqa (%rdi), %xmm5 + movdqa %xmm5, %xmm0 + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm6, %xmm0 + por %xmm0, %xmm5 + pcmpeqb %xmm6, %xmm2 + pcmpeqb %xmm6, %xmm3 + pcmpeqb %xmm6, %xmm4 + + pmovmskb %xmm5, %ecx + pmovmskb %xmm2, %eax + salq $16, %rax + pmovmskb %xmm3, %r8d + pmovmskb %xmm4, %edx + salq $32, %r8 + orq %r8, %rax + orq %rcx, %rax + salq $48, %rdx + orq %rdx, %rax + .p2align 3 +L(return): + bsfq %rax, %rax +#ifdef AS_STRCHRNUL + leaq (%rdi,%rax), %rax +#else + movl $0, %edx + leaq (%rdi,%rax), %rax + cmpb %sil, (%rax) + cmovne %rdx, %rax +#endif + ret + .p2align 4 + +L(cross_page): + movq %rdi, %rdx + pxor %xmm2, %xmm2 + andq $-64, %rdx + movdqa %xmm1, %xmm0 + movdqa (%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %r8d + movdqa 16(%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %eax + movdqa 32(%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + salq $16, %rax + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %r9d + movdqa 48(%rdx), %xmm3 + pcmpeqb %xmm3, %xmm2 + salq $32, %r9 + pcmpeqb %xmm3, %xmm0 + orq %r9, %rax + orq %r8, %rax + por %xmm2, %xmm0 + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rax + movl %edi, %ecx + subb %dl, %cl + shrq %cl, %rax + testq %rax, %rax + jne L(return) + jmp L(loop_start) + +END (strchr) + +#ifndef AS_STRCHRNUL +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/strchrnul.S b/REORG.TODO/sysdeps/x86_64/strchrnul.S new file mode 100644 index 0000000000..841dfc2783 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strchrnul.S @@ -0,0 +1,27 @@ +/* strchrnul (str, ch) -- Return pointer to first occurrence of CH in STR + or terminating NUL byte. + For AMD x86-64. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define strchr __strchrnul +#define AS_STRCHRNUL +#include "strchr.S" + +weak_alias (__strchrnul, strchrnul) diff --git a/REORG.TODO/sysdeps/x86_64/strcmp.S b/REORG.TODO/sysdeps/x86_64/strcmp.S new file mode 100644 index 0000000000..076be04df5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcmp.S @@ -0,0 +1,2267 @@ +/* Highly optimized version for x86-64. 
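strchrnul.S, shown above, reuses the whole scan by defining AS_STRCHRNUL: the only difference from strchr is the final fixup that turns "pointer to the match or the NUL" into a NULL return when the character was absent. The relationship in C:

    #include <stdio.h>

    /* Like strchr, but returns a pointer to the terminating NUL
       instead of NULL when C does not occur.  */
    static char *my_strchrnul (const char *s, int c)
    {
      while (*s != (char) c && *s != '\0')
        s++;
      return (char *) s;
    }

    static char *my_strchr (const char *s, int c)
    {
      char *p = my_strchrnul (s, c);       /* shared scan */
      return *p == (char) c ? p : NULL;    /* the extra fixup */
    }

    int main (void)
    {
      const char *s = "abc";
      printf ("'%s' / '%s'\n", my_strchrnul (s, 'z'), my_strchr (s, 'b'));
      return 0;
    }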
+ Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Based on i686 version contributed by Ulrich Drepper + <drepper@cygnus.com>, 1999. + Updated with SSE2 support contributed by Intel Corporation. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#undef UPDATE_STRNCMP_COUNTER + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 +#else +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# endif +#endif + +#ifndef USE_SSSE3 + .text +#else + .section .text.ssse3,"ax",@progbits +#endif + +#ifdef USE_AS_STRCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END2 (__strcasecmp) +# ifndef NO_NOLOCALE_ALIAS +weak_alias (__strcasecmp, strcasecmp) +libc_hidden_def (__strcasecmp) +# endif + /* FALLTHROUGH to strcasecmp_l. */ +#elif defined USE_AS_STRNCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END2 (__strncasecmp) +# ifndef NO_NOLOCALE_ALIAS +weak_alias (__strncasecmp, strncasecmp) +libc_hidden_def (__strncasecmp) +# endif + /* FALLTHROUGH to strncasecmp_l. */ +#endif + +ENTRY (STRCMP) +#ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +#elif defined USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. 
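Both case-insensitive entry points test the locale's _NL_CTYPE_NONASCII_CASE flag, as above, and bail out to the C fallback when single-byte case conversion is not plain ASCII. On the fast path the vector code computes the equivalent of a byte-wise ASCII fold; a semantic sketch (the real fallback goes through the locale's tolower table rather than this hardcoded fold):

    #include <stdio.h>

    /* ASCII-only fold, matching the vector TOLOWER below: bytes in
       'A'..'Z' get 0x20 added, everything else passes through.  */
    static unsigned char fold (unsigned char c)
    {
      return (c >= 'A' && c <= 'Z') ? (unsigned char) (c + 0x20) : c;
    }

    static int ascii_strcasecmp (const char *a, const char *b)
    {
      const unsigned char *s1 = (const unsigned char *) a;
      const unsigned char *s2 = (const unsigned char *) b;
      while (fold (*s1) == fold (*s2) && *s1 != '\0')
        s1++, s2++;
      return fold (*s1) - fold (*s2);
    }

    int main (void)
    {
      printf ("%d %d\n",
              ascii_strcasecmp ("Hello", "hellO"),   /* 0 */
              ascii_strcasecmp ("abc", "abd"));      /* negative */
      return 0;
    }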
*/ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +#endif + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64-bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa .Lbelowupper(%rip), %xmm5 +# define UCLOW_reg %xmm5 + movdqa .Ltopupper(%rip), %xmm6 +# define UCHIGH_reg %xmm6 + movdqa .Ltouppermask(%rip), %xmm7 +# define LCQWORD_reg %xmm7 +#endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm8; \ + movdqa UCHIGH_reg, %xmm9; \ + movdqa reg2, %xmm10; \ + movdqa UCHIGH_reg, %xmm11; \ + pcmpgtb UCLOW_reg, %xmm8; \ + pcmpgtb reg1, %xmm9; \ + pcmpgtb UCLOW_reg, %xmm10; \ + pcmpgtb reg2, %xmm11; \ + pand %xmm9, %xmm8; \ + pand %xmm11, %xmm10; \ + pand LCQWORD_reg, %xmm8; \ + pand LCQWORD_reg, %xmm10; \ + por %xmm8, reg1; \ + por %xmm10, reg2 + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results */ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparison */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use.
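The TOLOWER macro above is a branchless 16-byte case fold: two signed pcmpgtb compares against the .Lbelowupper ('A' - 1 = 0x40) and .Ltopupper ('Z' + 1 = 0x5b) constants are ANDed into a mask, which gates OR-ing in the 0x20 case bit from .Ltouppermask. One register's worth of the same computation with intrinsics:

    #include <emmintrin.h>
    #include <stdio.h>

    static __m128i tolower_16 (__m128i v)
    {
      __m128i gt_low  = _mm_cmpgt_epi8 (v, _mm_set1_epi8 (0x40)); /* v > 'A'-1 */
      __m128i lt_high = _mm_cmpgt_epi8 (_mm_set1_epi8 (0x5b), v); /* v < 'Z'+1 */
      __m128i mask    = _mm_and_si128 (gt_low, lt_high);
      /* OR in the case bit only where both compares matched.  */
      return _mm_or_si128 (v, _mm_and_si128 (mask, _mm_set1_epi8 (0x20)));
    }

    int main (void)
    {
      char buf[17] = "MiXeD case 123!?";
      __m128i v = _mm_loadu_si128 ((const __m128i *) buf);
      _mm_storeu_si128 ((__m128i *) buf, tolower_16 (v));
      printf ("%.16s\n", buf);     /* mixed case 123!? */
      return 0;
    }

Since pcmpgtb is a signed compare, bytes at 0x80 and above test negative and pass through unchanged, which is fine for the ASCII-only fast path.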
+ */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ +#else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ +#endif + psubb %xmm0, %xmm1 /* packed sub of comparison results */ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx will equal r9d only if the remaining (16 - rcx) bytes compare + * equal and no null char was seen among them. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32 bytes per iteration. + */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + jmp LABEL(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars?
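The crosscache path above masks both pointers down to 16-byte alignment, swaps them if necessary so one in-block offset is the larger, and then jumps through unaligned_table into one of the ashr_N bodies, each hard-wired for a single relative byte shift. A C sketch of just the classification step (the chosen shift amount is what each ashr_N bakes into its psrldq/pslldq immediates):

    #include <stdint.h>
    #include <stdio.h>

    static void classify (const void *s1, const void *s2)
    {
      unsigned off1 = (uintptr_t) s1 & 15;   /* offset within 16B block */
      unsigned off2 = (uintptr_t) s2 & 15;
      if (off1 == off2)
        puts ("same offset: ashr_0, no byte shifting needed");
      else
        printf ("relative shift %u: merge each pair of blocks with a "
                "%u-byte shift\n", (off2 - off1) & 15, (off2 - off1) & 15);
    }

    int main (void)
    {
      char buf[64];
      classify (buf, buf + 16);   /* equal offsets mod 16 */
      classify (buf, buf + 3);    /* 3-byte relative shift */
      return 0;
    }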
*/ + pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + +#ifndef USE_SSSE3 + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + +#ifndef USE_SSSE3 + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz LABEL(ashr_1_exittail) /* find null char*/ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $15, %r11 + jbe LABEL(ashr_1_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* substract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) + + /* + * Once find null char, determine if there is a string mismatch + * before the null char. 
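Inside gobble_ashr_1, the misaligned 16-byte window is rebuilt from two aligned loads: psrldq/pslldq plus por on plain SSE2, or a single palignr on SSSE3. With intrinsics, merging the previous block and the current one at a one-byte shift looks like the sketch below; because the shift count is an instruction immediate, one copy of the loop is needed per shift, which is why this file carries more than a dozen nearly identical ashr_N blocks:

    #include <emmintrin.h>    /* SSE2 */
    #ifdef __SSSE3__
    # include <tmmintrin.h>   /* palignr */
    #endif
    #include <stdio.h>

    /* Bytes 1..15 of PREV followed by byte 0 of CURR: the 16-byte
       window starting one byte into PREV, i.e. the ashr_1 merge.  */
    static __m128i merge_ashr_1 (__m128i prev, __m128i curr)
    {
    #ifdef __SSSE3__
      return _mm_alignr_epi8 (curr, prev, 1);
    #else
      return _mm_or_si128 (_mm_srli_si128 (prev, 1),
                           _mm_slli_si128 (curr, 15));
    #endif
    }

    int main (void)
    {
      char a[32] = "0123456789abcdefGHIJKLMNOPQRSTU";
      __m128i prev = _mm_loadu_si128 ((const __m128i *) a);
      __m128i curr = _mm_loadu_si128 ((const __m128i *) (a + 16));
      char out[17] = { 0 };
      _mm_storeu_si128 ((__m128i *) out, merge_ashr_1 (prev, curr));
      puts (out);    /* 123456789abcdefG */
      return 0;
    }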
+ */ + .p2align 4 +LABEL(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz LABEL(ashr_2_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $14, %r11 + jbe LABEL(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_2) + + .p2align 4 +LABEL(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from 
less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_3): + add $16, %r10 + jg LABEL(nibble_ashr_3) + +LABEL(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_3) + + .p2align 4 +LABEL(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff8, %edx + jnz LABEL(ashr_3_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $13, %r11 + jbe LABEL(ashr_3_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_3) + + .p2align 4 +LABEL(ashr_3_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_4): + add $16, %r10 + jg LABEL(nibble_ashr_4) + +LABEL(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_4) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_4) + + .p2align 4 +LABEL(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff0, %edx + jnz LABEL(ashr_4_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $12, %r11 + jbe LABEL(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_4) + + .p2align 4 +LABEL(ashr_4_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_5): + add $16, %r10 + jg LABEL(nibble_ashr_5) + +LABEL(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_5) + + .p2align 4 +LABEL(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffe0, %edx + jnz LABEL(ashr_5_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $11, %r11 + jbe LABEL(ashr_5_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_5) + + .p2align 4 +LABEL(ashr_5_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_6): + add $16, %r10 + jg LABEL(nibble_ashr_6) + +LABEL(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_6) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_6) + + .p2align 4 +LABEL(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffc0, %edx + jnz LABEL(ashr_6_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $10, %r11 + jbe LABEL(ashr_6_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_6) + + .p2align 4 +LABEL(ashr_6_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_7): + add $16, %r10 + jg LABEL(nibble_ashr_7) + +LABEL(gobble_ashr_7): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_7) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_7) + + .p2align 4 +LABEL(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff80, %edx + jnz LABEL(ashr_7_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $9, %r11 + jbe LABEL(ashr_7_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_7) + + .p2align 4 +LABEL(ashr_7_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_8): + add $16, %r10 + jg LABEL(nibble_ashr_8) + +LABEL(gobble_ashr_8): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_8) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_8) + + .p2align 4 +LABEL(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff00, %edx + jnz LABEL(ashr_8_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, %r11 + jbe LABEL(ashr_8_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_8) + + .p2align 4 +LABEL(ashr_8_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_9): + add $16, %r10 + jg LABEL(nibble_ashr_9) + +LABEL(gobble_ashr_9): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_9) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 /* store for next cycle */ + jmp LABEL(loop_ashr_9) + + .p2align 4 +LABEL(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfe00, %edx + jnz LABEL(ashr_9_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, %r11 + jbe LABEL(ashr_9_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_9) + + .p2align 4 +LABEL(ashr_9_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_10): + add $16, %r10 + jg LABEL(nibble_ashr_10) + +LABEL(gobble_ashr_10): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_10) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_10) + + .p2align 4 +LABEL(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfc00, %edx + jnz LABEL(ashr_10_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, %r11 + jbe LABEL(ashr_10_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_10) + + .p2align 4 +LABEL(ashr_10_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_11): + add $16, %r10 + jg LABEL(nibble_ashr_11) + +LABEL(gobble_ashr_11): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_11) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_11) + + .p2align 4 +LABEL(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf800, %edx + jnz LABEL(ashr_11_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, %r11 + jbe LABEL(ashr_11_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_11) + + .p2align 4 +LABEL(ashr_11_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_12): + add $16, %r10 + jg LABEL(nibble_ashr_12) + +LABEL(gobble_ashr_12): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_12) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_12) + + .p2align 4 +LABEL(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf000, %edx + jnz LABEL(ashr_12_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, %r11 + jbe LABEL(ashr_12_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_12) + + .p2align 4 +LABEL(ashr_12_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
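+ *
+ * The nibble check only tests the bytes of the previous chunk that
+ * can still flow into the next merge; as a C sketch, with N the
+ * shift amount (13 in the block below):
+ *
+ *   if (nul_mask & (0xffff << N))   /* 0xe000 when N == 13 */
+ *     goto exittail;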
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_13): + add $16, %r10 + jg LABEL(nibble_ashr_13) + +LABEL(gobble_ashr_13): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_13) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_13) + + .p2align 4 +LABEL(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xe000, %edx + jnz LABEL(ashr_13_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, %r11 + jbe LABEL(ashr_13_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_13) + + .p2align 4 +LABEL(ashr_13_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_14):
+ add $16, %r10
+ jg LABEL(nibble_ashr_14)
+
+LABEL(gobble_ashr_14):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+#ifndef USE_SSSE3
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_14) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+#ifndef USE_SSSE3
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_14)
+
+ .p2align 4
+LABEL(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xc000, %edx
+ jnz LABEL(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, %r11
+ jbe LABEL(ashr_14_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_14)
+
+ .p2align 4
+LABEL(ashr_14_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $14, %xmm0
+ psrldq $14, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ */
+ .p2align 4
+LABEL(ashr_15):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
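+ *
+ * For the strncmp/strncasecmp_l builds, %r11 carries the remaining
+ * length budget; each 16-byte step amounts to this C sketch:
+ *
+ *   r11 -= 16;
+ *   if (r11 <= 0)
+ *     return 0;         /* length limit reached: treat as equal */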
+ */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_15): + add $16, %r10 + jg LABEL(nibble_ashr_15) + +LABEL(gobble_ashr_15): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_15) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + +#ifndef USE_SSSE3 + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_15) + + .p2align 4 +LABEL(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0x8000, %edx + jnz LABEL(ashr_15_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmpq $1, %r11 + jbe LABEL(ashr_15_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_15) + + .p2align 4 +LABEL(ashr_15_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $15, %xmm3 + psrldq $15, %xmm0 + + .p2align 4 +LABEL(aftertail): + TOLOWER (%xmm1, %xmm3) + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + not %edx + + .p2align 4 +LABEL(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 +LABEL(Byte0): + movzx (%rsi), %ecx + movzx (%rdi), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret +END (STRCMP) + + .section .rodata,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - 
LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+libc_hidden_builtin_def (STRCMP)
diff --git a/REORG.TODO/sysdeps/x86_64/strcpy.S b/REORG.TODO/sysdeps/x86_64/strcpy.S
new file mode 100644
index 0000000000..0351b0820d
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/strcpy.S
@@ -0,0 +1,156 @@
+/* strcpy/stpcpy implementation for x86-64.
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Andreas Jaeger <aj@suse.de>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef USE_AS_STPCPY
+# define STRCPY strcpy
+#endif
+
+ .text
+ENTRY (STRCPY)
+ movq %rsi, %rcx /* Source register. */
+ andl $7, %ecx /* mask alignment bits */
+ movq %rdi, %rdx /* Duplicate destination pointer. */
+
+ jz 5f /* aligned => start loop */
+
+ neg %ecx /* We need to align to 8 bytes. */
+ addl $8,%ecx
+ /* Search the first bytes directly. */
+0:
+ movb (%rsi), %al /* Fetch a byte */
+ testb %al, %al /* Is it NUL? */
+ movb %al, (%rdx) /* Store it */
+ jz 4f /* If it was NUL, done! */
+ incq %rsi
+ incq %rdx
+ decl %ecx
+ jnz 0b
+
+5:
+ movq $0xfefefefefefefeff,%r8
+
+ /* Now the source is aligned. Unfortunately we cannot force
+ to have both source and destination aligned, so ignore the
+ alignment of the destination. */
+ .p2align 4
+1:
+ /* 1st unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word.
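+ (As a C sketch, the zero-byte test this builds up to is the
+ classic
+ uint64_t m = 0xfefefefefefefeffULL; /* -0x0101010101010101 */
+ int has_nul = ((((w + m) ^ w) | m) + 1) != 0;
+ because a 0x00 byte is the only value that breaks the carry
+ chain of w + m.)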
We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 1b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +3: + /* Note that stpcpy needs to return with the value of the NUL + byte. */ + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL?. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 3b /* and look at next two bytes in %rax. */ + +4: +#ifdef USE_AS_STPCPY + movq %rdx, %rax /* Destination is return value. */ +#else + movq %rdi, %rax /* Source is return value. */ +#endif + retq +END (STRCPY) +#ifndef USE_AS_STPCPY +libc_hidden_builtin_def (strcpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/strcspn.S b/REORG.TODO/sysdeps/x86_64/strcspn.S new file mode 100644 index 0000000000..a1d1f7dfba --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strcspn.S @@ -0,0 +1,125 @@ +/* strcspn (str, ss) -- Return the length of the initial segment of STR + which contains no characters from SS. + For AMD x86-64. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>. + Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>. + Adopted for x86-64 by Andreas Jaeger <aj@suse.de>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */
+#define STRPBRK_P (defined strcspn)
+
+ .text
+ENTRY (strcspn)
+
+ movq %rdi, %rdx /* Save SRC. */
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. */
+ movq %rdi, %r8 /* Save value. */
+ subq $256, %rsp /* Make space for 256 bytes. */
+ cfi_adjust_cfa_offset(256)
+ movl $32, %ecx /* 32*8 bytes = 256 bytes. */
+ movq %rsp, %rdi
+ xorl %eax, %eax /* We store 0s. */
+ cld
+ rep
+ stosq
+
+ movq %rsi, %rax /* Setup skipset. */
+
+/* For understanding the following code remember that %rcx == 0 now.
+ Although all the following instructions only modify %cl we always
+ have a correct zero-extended 64-bit value in %rcx. */
+
+ .p2align 4
+L(2): movb (%rax), %cl /* get byte from skipset */
+ testb %cl, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+
+ movb 1(%rax), %cl /* get byte from skipset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+
+ movb 2(%rax), %cl /* get byte from skipset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+
+ movb 3(%rax), %cl /* get byte from skipset */
+ addq $4, %rax /* increment skipset pointer */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from skipset */
+
+L(1): leaq -4(%rdx), %rax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions
+ 1. a character in the skipset was found
+ and
+ 2. the end of the string was found
+ But as a sign that the character is in the skipset we store its
+ value in the table. But the value of NUL is NUL so the loop
+ terminates for NUL in every case. */
+
+ .p2align 4
+L(3): addq $4, %rax /* adjust pointer for full loop round */
+
+ movb (%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ je L(4) /* yes => return */
+
+ movb 1(%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ je L(5) /* yes => return */
+
+ movb 2(%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ jz L(6) /* yes => return */
+
+ movb 3(%rax), %cl /* get byte from string */
+ cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
+ jne L(3) /* no => start loop again */
+
+ incq %rax /* adjust pointer */
+L(6): incq %rax
+L(5): incq %rax
+
+L(4): addq $256, %rsp /* remove skipset */
+ cfi_adjust_cfa_offset(-256)
+#if STRPBRK_P
+ xorl %edx,%edx
+ orb %cl, %cl /* was last character NUL?
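+
+ (The whole function in C sketch form, with illustrative names:
+
+ unsigned char table[256] = { 0 };
+ for (const unsigned char *r = (const unsigned char *) ss; *r; ++r)
+ table[*r] = *r; /* table[c] == c iff c is in SS */
+ const unsigned char *p = (const unsigned char *) str;
+ while (table[*p] != *p) /* NUL stops too, since table[0] == 0 */
+ ++p;
+ return p - str; /* strcspn; strpbrk instead returns
+ *p ? (char *) p : NULL */
+
+ The NUL test here picks between those two strpbrk results.)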
*/ + cmovzq %rdx, %rax /* Yes: return NULL */ +#else + subq %rdx, %rax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ +#endif + ret +END (strcspn) +libc_hidden_builtin_def (strcspn) diff --git a/REORG.TODO/sysdeps/x86_64/strlen.S b/REORG.TODO/sysdeps/x86_64/strlen.S new file mode 100644 index 0000000000..b5ab117c79 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strlen.S @@ -0,0 +1,258 @@ +/* SSE2 version of strlen/wcslen. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + +/* Long lived register in strlen(s), strnlen(s, n) are: + + %xmm3 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + +.text +ENTRY(strlen) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +#define FIND_ZERO \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +#ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %rsi, %rsi + jne L(n_nonzero) + xor %rax, %rax + ret +L(n_nonzero): +# ifdef AS_WCSLEN + shlq $2, %rsi +# endif + +/* Initialize long lived registers. */ + + add %rdi, %rsi + mov %rsi, %r10 + and $-64, %r10 + mov %rsi, %r11 +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +#ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +#else +# define STRNLEN_PROLOG andq $-64, %rax; +#endif + +/* Ignore bits in mask that come before start of string. */ +#define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + SHIFT_RETURN; \ + ret + +#ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +#else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm4 + PCMPEQ %xmm0, %xmm4 + pmovmskb %xmm4, %edx + test %edx, %edx + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. 
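+ As a C sketch, FIND_ZERO combines the four 16-bit pmovmskb
+ results into one 64-bit mask:
+
+ mask = (uint64_t) mask0 /* bytes 0..15 */
+ | (uint64_t) mask16 << 16 /* bytes 16..31 */
+ | (uint64_t) mask32 << 32 /* bytes 32..47 */
+ | (uint64_t) mask48 << 48; /* bytes 48..63 */
+
+ so bsf (mask) is the index of the first NUL byte in the 64-byte
+ block.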
*/ + andq $-16, %rax + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +#endif + + /* When no zero byte is found xmm1-3 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +#ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax + SHIFT_RETURN + ret +#endif + .p2align 4 +L(loop_init): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 +#ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + + .p2align 4 +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. */ + pxor %xmm0, %xmm0 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + + .p2align 4 +L(exit): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + + .p2align 4 +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#endif + +END(strlen) +libc_hidden_builtin_def (strlen) diff --git a/REORG.TODO/sysdeps/x86_64/strncase.S b/REORG.TODO/sysdeps/x86_64/strncase.S new file mode 100644 index 0000000000..2de2ce4b96 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncase.S @@ -0,0 +1 @@ +/* In strncase_l.S. 
*/ diff --git a/REORG.TODO/sysdeps/x86_64/strncase_l-nonascii.c b/REORG.TODO/sysdeps/x86_64/strncase_l-nonascii.c new file mode 100644 index 0000000000..8664863778 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncase_l-nonascii.c @@ -0,0 +1,8 @@ +#include <string.h> + +extern int __strncasecmp_l_nonascii (const char *__s1, const char *__s2, + size_t __n, __locale_t __loc); + +#define __strncasecmp_l __strncasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strncase.c> diff --git a/REORG.TODO/sysdeps/x86_64/strncase_l.S b/REORG.TODO/sysdeps/x86_64/strncase_l.S new file mode 100644 index 0000000000..c725cd85b3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncase_l.S @@ -0,0 +1,6 @@ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) +libc_hidden_def (strncasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/strncmp.S b/REORG.TODO/sysdeps/x86_64/strncmp.S new file mode 100644 index 0000000000..0af34e7f15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strncmp.S @@ -0,0 +1,3 @@ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/strnlen.S b/REORG.TODO/sysdeps/x86_64/strnlen.S new file mode 100644 index 0000000000..d3c43ac482 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strnlen.S @@ -0,0 +1,6 @@ +#define AS_STRNLEN +#define strlen __strnlen +#include "strlen.S" + +weak_alias (__strnlen, strnlen); +libc_hidden_builtin_def (strnlen) diff --git a/REORG.TODO/sysdeps/x86_64/strpbrk.S b/REORG.TODO/sysdeps/x86_64/strpbrk.S new file mode 100644 index 0000000000..9b97ada84e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strpbrk.S @@ -0,0 +1,2 @@ +#define strcspn strpbrk +#include <sysdeps/x86_64/strcspn.S> diff --git a/REORG.TODO/sysdeps/x86_64/strrchr.S b/REORG.TODO/sysdeps/x86_64/strrchr.S new file mode 100644 index 0000000000..e6a33bc599 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/strrchr.S @@ -0,0 +1,228 @@ +/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + + +#include <sysdep.h> + + .text +ENTRY (strrchr) + movd %esi, %xmm1 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4032, %rax + punpcklwd %xmm1, %xmm1 + pshufd $0, %xmm1, %xmm1 + ja L(cross_page) + movdqu (%rdi), %xmm0 + pxor %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm0, %ecx + pmovmskb %xmm3, %edx + testq %rdx, %rdx + je L(next_48_bytes) + leaq -1(%rdx), %rax + xorq %rdx, %rax + andq %rcx, %rax + je L(exit) + bsrq %rax, %rax + addq %rdi, %rax + ret + + .p2align 4 +L(next_48_bytes): + movdqu 16(%rdi), %xmm4 + movdqa %xmm4, %xmm5 + movdqu 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm5 + movdqu 48(%rdi), %xmm0 + pmovmskb %xmm5, %edx + movdqa %xmm3, %xmm5 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm0, %xmm2 + salq $16, %rdx + pmovmskb %xmm3, %r8d + pmovmskb %xmm5, %eax + pmovmskb %xmm2, %esi + salq $32, %r8 + salq $32, %rax + pcmpeqb %xmm1, %xmm0 + orq %rdx, %rax + movq %rsi, %rdx + pmovmskb %xmm4, %esi + salq $48, %rdx + salq $16, %rsi + orq %r8, %rsi + orq %rcx, %rsi + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rsi + orq %rdx, %rax + je L(loop_header2) + leaq -1(%rax), %rcx + xorq %rax, %rcx + andq %rcx, %rsi + je L(exit) + bsrq %rsi, %rsi + leaq (%rdi,%rsi), %rax + ret + + .p2align 4 +L(loop_header2): + testq %rsi, %rsi + movq %rdi, %rcx + je L(no_c_found) +L(loop_header): + addq $64, %rdi + pxor %xmm7, %xmm7 + andq $-64, %rdi + jmp L(loop_entry) + + .p2align 4 +L(loop64): + testq %rdx, %rdx + cmovne %rdx, %rsi + cmovne %rdi, %rcx + addq $64, %rdi +L(loop_entry): + movdqa 32(%rdi), %xmm3 + pxor %xmm6, %xmm6 + movdqa 48(%rdi), %xmm2 + movdqa %xmm3, %xmm0 + movdqa 16(%rdi), %xmm4 + pminub %xmm2, %xmm0 + movdqa (%rdi), %xmm5 + pminub %xmm4, %xmm0 + pminub %xmm5, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + movdqa %xmm5, %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r9d + movdqa %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + movdqa %xmm3, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $16, %rdx + pmovmskb %xmm0, %r10d + movdqa %xmm2, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %r10 + orq %r10, %rdx + pmovmskb %xmm0, %r8d + orq %r9, %rdx + salq $48, %r8 + orq %r8, %rdx + testl %eax, %eax + je L(loop64) + pcmpeqb %xmm6, %xmm4 + pcmpeqb %xmm6, %xmm3 + pcmpeqb %xmm6, %xmm5 + pmovmskb %xmm4, %eax + pmovmskb %xmm3, %r10d + pcmpeqb %xmm6, %xmm2 + pmovmskb %xmm5, %r9d + salq $32, %r10 + salq $16, %rax + pmovmskb %xmm2, %r8d + orq %r10, %rax + orq %r9, %rax + salq $48, %r8 + orq %r8, %rax + leaq -1(%rax), %r8 + xorq %rax, %r8 + andq %r8, %rdx + cmovne %rdi, %rcx + cmovne %rdx, %rsi + bsrq %rsi, %rsi + leaq (%rcx,%rsi), %rax + ret + + .p2align 4 +L(no_c_found): + movl $1, %esi + xorl %ecx, %ecx + jmp L(loop_header) + + .p2align 4 +L(exit): + xorl %eax, %eax + ret + + .p2align 4 +L(cross_page): + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqu (%rax), %xmm5 + movdqa %xmm5, %xmm6 + movdqu 16(%rax), %xmm4 + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm0, %xmm6 + movdqu 32(%rax), %xmm3 + pmovmskb %xmm6, %esi + movdqa %xmm4, %xmm6 + movdqu 48(%rax), %xmm2 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm6 + pmovmskb %xmm6, %edx + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm0, %xmm6 + pcmpeqb %xmm2, %xmm0 + salq $16, %rdx + pmovmskb %xmm3, %r9d + pmovmskb %xmm6, %r8d + pmovmskb %xmm0, %ecx + salq $32, %r9 + salq $32, %r8 + pcmpeqb %xmm1, %xmm2 + orq %r8, %rdx + salq $48, %rcx + pmovmskb %xmm5, %r8d + orq %rsi, %rdx + pmovmskb %xmm4, %esi + orq %rcx, %rdx + pmovmskb 
%xmm2, %ecx
+ salq $16, %rsi
+ salq $48, %rcx
+ orq %r9, %rsi
+ orq %r8, %rsi
+ orq %rcx, %rsi
+ movl %edi, %ecx
+ subl %eax, %ecx
+ shrq %cl, %rdx
+ shrq %cl, %rsi
+ testq %rdx, %rdx
+ je L(loop_header2)
+ leaq -1(%rdx), %rax
+ xorq %rdx, %rax
+ andq %rax, %rsi
+ je L(exit)
+ bsrq %rsi, %rax
+ addq %rdi, %rax
+ ret
+END (strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/REORG.TODO/sysdeps/x86_64/strspn.S b/REORG.TODO/sysdeps/x86_64/strspn.S
new file mode 100644
index 0000000000..3da576f3d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/strspn.S
@@ -0,0 +1,115 @@
+/* strspn (str, ss) -- Return the length of the initial segment of STR
+ which contains only characters from SS.
+ For AMD x86-64.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
+ Adopted for x86-64 by Andreas Jaeger <aj@suse.de>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (strspn)
+
+ movq %rdi, %rdx /* Save SRC. */
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. */
+ movq %rdi, %r8 /* Save value. */
+ subq $256, %rsp /* Make space for 256 bytes. */
+ cfi_adjust_cfa_offset(256)
+ movl $32, %ecx /* 32*8 bytes = 256 bytes. */
+ movq %rsp, %rdi
+ xorl %eax, %eax /* We store 0s. */
+ cld
+ rep
+ stosq
+
+ movq %rsi, %rax /* Setup stopset. */
+
+/* For understanding the following code remember that %rcx == 0 now.
+ Although all the following instructions only modify %cl we always
+ have a correct zero-extended 64-bit value in %rcx. */
+
+ .p2align 4
+L(2): movb (%rax), %cl /* get byte from stopset */
+ testb %cl, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+
+ movb 1(%rax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+
+ movb 2(%rax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+
+ movb 3(%rax), %cl /* get byte from stopset */
+ addq $4, %rax /* increment stopset pointer */
+ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from stopset */
+
+L(1): leaq -4(%rdx), %rax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions
+ 1. a character in the stopset was found
+ and
+ 2. the end of the string was found
+ But as a sign that the character is in the stopset we store its
+ value in the table. But the value of NUL is NUL so the loop
+ terminates for NUL in every case. */
+
+ .p2align 4
+L(3): addq $4, %rax /* adjust pointer for full loop round */
+
+ movb (%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jz L(4) /* no => return */
+
+ movb 1(%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jz L(5) /* no => return */
+
+ movb 2(%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jz L(6) /* no => return */
+
+ movb 3(%rax), %cl /* get byte from string */
+ testb %cl, (%rsp,%rcx) /* is it contained in stopset? */
+ jnz L(3) /* yes => start loop again */
+
+ incq %rax /* adjust pointer */
+L(6): incq %rax
+L(5): incq %rax
+
+L(4): addq $256, %rsp /* remove stopset */
+ cfi_adjust_cfa_offset(-256)
+ subq %rdx, %rax /* we have to return the number of valid
+ characters, so compute distance to first
+ non-valid character */
+ ret
+END (strspn)
+libc_hidden_builtin_def (strspn)
diff --git a/REORG.TODO/sysdeps/x86_64/sub_n.S b/REORG.TODO/sysdeps/x86_64/sub_n.S
new file mode 100644
index 0000000000..44c0d88c58
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/sub_n.S
@@ -0,0 +1,23 @@
+/* x86-64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+ and store difference in a third limb vector.
+ Copyright (C) 2006-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#define func __mpn_sub_n
+#define ADCSBB sbb
+
+#include "add_n.S"
diff --git a/REORG.TODO/sysdeps/x86_64/submul_1.S b/REORG.TODO/sysdeps/x86_64/submul_1.S
new file mode 100644
index 0000000000..77f772cb0b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/submul_1.S
@@ -0,0 +1,23 @@
+/* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+ Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB.
If not, + see <http://www.gnu.org/licenses/>. */ + +#define func __mpn_submul_1 +#define ADDSUB sub + +#include "addmul_1.S" diff --git a/REORG.TODO/sysdeps/x86_64/sysdep.h b/REORG.TODO/sysdeps/x86_64/sysdep.h new file mode 100644 index 0000000000..1c52544fa3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/sysdep.h @@ -0,0 +1,169 @@ +/* Assembler macros for x86-64. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86_64_SYSDEP_H +#define _X86_64_SYSDEP_H 1 + +#include <sysdeps/generic/sysdep.h> + +#ifdef __ASSEMBLER__ + +/* Syntactic details of assembler. */ + +/* This macro is for setting proper CFI with DW_CFA_expression describing + the register as saved relative to %rsp instead of relative to the CFA. + Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset + from %rsp. */ +#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ + 0x77, off & 0x7F | 0x80, off >> 7 + +/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */ +#define ALIGNARG(log2) 1<<log2 +#define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + +/* Define an entry point visible from C. */ +#define ENTRY(name) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(4); \ + C_LABEL(name) \ + cfi_startproc; \ + CALL_MCOUNT + +#undef END +#define END(name) \ + cfi_endproc; \ + ASM_SIZE_DIRECTIVE(name) + +#define ENTRY_CHK(name) ENTRY (name) +#define END_CHK(name) END (name) + +/* If compiled for profiling, call `mcount' at the start of each function. */ +#ifdef PROF +/* The mcount code relies on a normal frame pointer being on the stack + to locate our caller, so push one just for its benefit. */ +#define CALL_MCOUNT \ + pushq %rbp; \ + cfi_adjust_cfa_offset(8); \ + movq %rsp, %rbp; \ + cfi_def_cfa_register(%rbp); \ + call JUMPTARGET(mcount); \ + popq %rbp; \ + cfi_def_cfa(rsp,8); +#else +#define CALL_MCOUNT /* Do nothing. */ +#endif + +/* Since C identifiers are not normally prefixed with an underscore + on this system, the asm identifier `syscall_error' intrudes on the + C name space. Make sure we use an innocuous name. */ +#define syscall_error __syscall_error +#define mcount _mcount + +#define PSEUDO(name, syscall_name, args) \ +lose: \ + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ + jb lose + +#undef PSEUDO_END +#define PSEUDO_END(name) \ + END (name) + +#undef JUMPTARGET +#ifdef SHARED +# ifdef BIND_NOW +# define JUMPTARGET(name) *name##@GOTPCREL(%rip) +# else +# define JUMPTARGET(name) name##@PLT +# endif +#else +/* For static archives, branch to target directly. */ +# define JUMPTARGET(name) name +#endif + +/* Local label name for asm code. */ +#ifndef L +/* ELF-like local names start with `.L'. 
*/ +# define L(name) .L##name +#endif + +#define atom_text_section .section ".text.atom", "ax" + +/* Long and pointer size in bytes. */ +#define LP_SIZE 8 + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) insn##q + +/* Assembler address directive. */ +#define ASM_ADDR .quad + +/* Registers to hold long and pointer. */ +#define RAX_LP rax +#define RBP_LP rbp +#define RBX_LP rbx +#define RCX_LP rcx +#define RDI_LP rdi +#define RDX_LP rdx +#define RSI_LP rsi +#define RSP_LP rsp +#define R8_LP r8 +#define R9_LP r9 +#define R10_LP r10 +#define R11_LP r11 +#define R12_LP r12 +#define R13_LP r13 +#define R14_LP r14 +#define R15_LP r15 + +#else /* __ASSEMBLER__ */ + +/* Long and pointer size in bytes. */ +#define LP_SIZE "8" + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) #insn "q" + +/* Assembler address directive. */ +#define ASM_ADDR ".quad" + +/* Registers to hold long and pointer. */ +#define RAX_LP "rax" +#define RBP_LP "rbp" +#define RBX_LP "rbx" +#define RCX_LP "rcx" +#define RDI_LP "rdi" +#define RDX_LP "rdx" +#define RSI_LP "rsi" +#define RSP_LP "rsp" +#define R8_LP "r8" +#define R9_LP "r9" +#define R10_LP "r10" +#define R11_LP "r11" +#define R12_LP "r12" +#define R13_LP "r13" +#define R14_LP "r14" +#define R15_LP "r15" + +#endif /* __ASSEMBLER__ */ + +#endif /* _X86_64_SYSDEP_H */ diff --git a/REORG.TODO/sysdeps/x86_64/tls-macros.h b/REORG.TODO/sysdeps/x86_64/tls-macros.h new file mode 100644 index 0000000000..22d2a4b592 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tls-macros.h @@ -0,0 +1,39 @@ +#define TLS_LE(x) \ + ({ int *__l; \ + asm ("mov %%fs:0,%0\n\t" \ + "lea " #x "@tpoff(%0), %0" \ + : "=r" (__l)); \ + __l; }) + +#define TLS_IE(x) \ + ({ int *__l; \ + asm ("mov %%fs:0,%0\n\t" \ + "add " #x "@gottpoff(%%rip),%0" \ + : "=r" (__l)); \ + __l; }) + +#define TLS_LD(x) \ + ({ int *__l, __c, __d; \ + asm ("leaq " #x "@tlsld(%%rip),%%rdi\n\t" \ + "call __tls_get_addr@plt\n\t" \ + "leaq " #x "@dtpoff(%%rax), %%rax" \ + : "=a" (__l), "=&c" (__c), "=&d" (__d) \ + : : "rdi", "rsi", "r8", "r9", "r10", "r11"); \ + __l; }) + +#ifdef __ILP32__ +# define TLS_GD_PREFIX +#else +# define TLS_GD_PREFIX ".byte 0x66\n\t" +#endif + +#define TLS_GD(x) \ + ({ int *__l, __c, __d; \ + asm (TLS_GD_PREFIX \ + "leaq " #x "@tlsgd(%%rip),%%rdi\n\t" \ + ".word 0x6666\n\t" \ + "rex64\n\t" \ + "call __tls_get_addr@plt" \ + : "=a" (__l), "=&c" (__c), "=&d" (__d) \ + : : "rdi", "rsi", "r8", "r9", "r10", "r11"); \ + __l; }) diff --git a/REORG.TODO/sysdeps/x86_64/tlsdesc.c b/REORG.TODO/sysdeps/x86_64/tlsdesc.c new file mode 100644 index 0000000000..20d821ac66 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tlsdesc.c @@ -0,0 +1,150 @@ +/* Manage TLS descriptors. x86_64 version. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <link.h> +#include <ldsodefs.h> +#include <elf/dynamic-link.h> +#include <tls.h> +#include <dl-tlsdesc.h> +#include <dl-unmap-segments.h> +#include <tlsdeschtab.h> + +/* The following 2 functions take a caller argument, that contains the + address expected to be in the TLS descriptor. If it's changed, we + want to return immediately. */ + +/* This function is used to lazily resolve TLS_DESC RELA relocations. + The argument location is used to hold a pointer to the relocation. */ + +void +attribute_hidden +_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td, + struct link_map *l) +{ + const ElfW(Rela) *reloc = td->arg; + + if (_dl_tlsdesc_resolve_early_return_p + (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr))) + return; + + /* The code below was borrowed from _dl_fixup(). */ + const ElfW(Sym) *const symtab + = (const void *) D_PTR (l, l_info[DT_SYMTAB]); + const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]); + const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)]; + lookup_t result; + + /* Look up the target symbol. If the normal lookup rules are not + used don't look in the global scope. */ + if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL + && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0) + { + const struct r_found_version *version = NULL; + + if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW(Half) *vernum = + (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]); + ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff; + version = &l->l_versions[ndx]; + if (version->hash == 0) + version = NULL; + } + + result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, + l->l_scope, version, ELF_RTYPE_CLASS_PLT, + DL_LOOKUP_ADD_DEPENDENCY, NULL); + } + else + { + /* We already found the symbol. The module (and therefore its load + address) is also known. */ + result = l; + } + + if (! sym) + { + td->arg = (void*)reloc->r_addend; + td->entry = _dl_tlsdesc_undefweak; + } + else + { +# ifndef SHARED + CHECK_STATIC_TLS (l, result); +# else + if (!TRY_STATIC_TLS (l, result)) + { + td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value + + reloc->r_addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif + { + td->arg = (void*)(sym->st_value - result->l_tls_offset + + reloc->r_addend); + td->entry = _dl_tlsdesc_return; + } + } + + _dl_tlsdesc_wake_up_held_fixups (); +} + +/* This function is used to avoid busy waiting for other threads to + complete the lazy relocation. Once another thread wins the race to + relocate a TLS descriptor, it sets the descriptor up such that this + function is called to wait until the resolver releases the + lock. */ + +void +attribute_hidden +_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td, + void *caller) +{ + /* Maybe we're lucky and can return early. */ + if (caller != td->entry) + return; + + /* Locking here will stop execution until the running resolver runs + _dl_tlsdesc_wake_up_held_fixups(), releasing the lock. + + FIXME: We'd be better off waiting on a condition variable, such + that we didn't have to hold the lock throughout the relocation + processing. */ + __rtld_lock_lock_recursive (GL(dl_load_lock)); + __rtld_lock_unlock_recursive (GL(dl_load_lock)); +} + +/* Unmap the dynamic object, but also release its TLS descriptor table + if there is one. 
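+
+ (For reference, a descriptor is the pair the resolvers above fill
+ in -- as a C sketch:
+
+ struct tlsdesc { ptrdiff_t (*entry) (struct tlsdesc *); void *arg; };
+
+ and a TLSDESC-style access compiles to roughly
+
+ p = (char *) __builtin_thread_pointer () + td->entry (td);
+
+ where ENTRY initially points at the lazy trampoline that ends up in
+ _dl_tlsdesc_resolve_rela_fixup and is afterwards patched to
+ _dl_tlsdesc_return or _dl_tlsdesc_dynamic.)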
*/ + +void +internal_function +_dl_unmap (struct link_map *map) +{ + _dl_unmap_segments (map); + +#ifdef SHARED + /* _dl_unmap is only called for dlopen()ed libraries, for which + calling free() is safe, or before we've completed the initial + relocation, in which case calling free() is probably pointless, + but still safe. */ + if (map->l_mach.tlsdesc_table) + htab_delete (map->l_mach.tlsdesc_table); +#endif +} diff --git a/REORG.TODO/sysdeps/x86_64/tlsdesc.sym b/REORG.TODO/sysdeps/x86_64/tlsdesc.sym new file mode 100644 index 0000000000..33854975d0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tlsdesc.sym @@ -0,0 +1,17 @@ +#include <stddef.h> +#include <sysdep.h> +#include <tls.h> +#include <link.h> +#include <dl-tlsdesc.h> + +-- + +-- Abuse tls.h macros to derive offsets relative to the thread register. + +DTV_OFFSET offsetof(struct pthread, header.dtv) + +TLSDESC_ARG offsetof(struct tlsdesc, arg) + +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit.h b/REORG.TODO/sysdeps/x86_64/tst-audit.h new file mode 100644 index 0000000000..e3c780e42c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit.h @@ -0,0 +1,32 @@ +/* Definitions for testing PLT entry/exit auditing. x86_64 version. + + Copyright (C) 2012-2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef __ILP32__ +# define pltenter la_x86_64_gnu_pltenter +# define pltexit la_x86_64_gnu_pltexit +# define La_regs La_x86_64_regs +# define La_retval La_x86_64_retval +#else +# define pltenter la_x32_gnu_pltenter +# define pltexit la_x32_gnu_pltexit +# define La_regs La_x32_regs +# define La_retval La_x32_retval +#endif +#define int_retval lrv_rax diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit10-aux.c b/REORG.TODO/sysdeps/x86_64/tst-audit10-aux.c new file mode 100644 index 0000000000..4663136419 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit10-aux.c @@ -0,0 +1,41 @@ +/* Test case for preserved AVX512 registers in dynamic linker, -mavx512f part. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_audit10_aux (void) +{ +#ifdef __AVX512F__ + extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i, + __m512i, __m512i, __m512i, __m512i); + + __m512i zmm = _mm512_setzero_si512 (); + __m512i ret = audit_test (zmm, zmm, zmm, zmm, zmm, zmm, zmm, zmm); + + zmm = _mm512_set1_epi64 (0x12349876); + + if (memcmp (&zmm, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX512F__ */ + return 77; +#endif /* __AVX512F__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit10.c b/REORG.TODO/sysdeps/x86_64/tst-audit10.c new file mode 100644 index 0000000000..bda248ac7e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit10.c @@ -0,0 +1,57 @@ +/* Test case for preserved AVX512 registers in dynamic linker. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_audit10_aux (void); + +static int +avx512_enabled (void) +{ +#ifdef bit_AVX512F + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX512F)) + return 0; + + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + /* Verify that ZMM, YMM and XMM states are enabled. */ + return (eax & 0xe6) == 0xe6; +#else + return 0; +#endif +} + +static int +do_test (void) +{ + /* Run AVX512 test only if AVX512 is supported. */ + if (avx512_enabled ()) + return tst_audit10_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit3.c b/REORG.TODO/sysdeps/x86_64/tst-audit3.c new file mode 100644 index 0000000000..0602aa25db --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit3.c @@ -0,0 +1,23 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> + +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); +static int +do_test (void) +{ + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit4-aux.c b/REORG.TODO/sysdeps/x86_64/tst-audit4-aux.c new file mode 100644 index 0000000000..c78c51c747 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit4-aux.c @@ -0,0 +1,39 @@ +/* Test case for preserved AVX registers in dynamic linker, -mavx part. 
+ Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i, + __m256i, __m256i, __m256i, __m256i); + +int +tst_audit4_aux (void) +{ +#ifdef __AVX__ + __m256i ymm = _mm256_setzero_si256 (); + __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm); + ymm = _mm256_set1_epi32 (0x12349876); + if (memcmp (&ymm, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX__ */ + return 77; +#endif /* __AVX__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit4.c b/REORG.TODO/sysdeps/x86_64/tst-audit4.c new file mode 100644 index 0000000000..8178f2c6d2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit4.c @@ -0,0 +1,49 @@ +/* Test case for preserved AVX registers in dynamic linker. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_audit4_aux (void); + +static int +avx_enabled (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + /* Check the OS has AVX and SSE saving enabled. */ + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + return (eax & 6) == 6; +} + +static int +do_test (void) +{ + /* Run AVX test only if AVX is supported. */ + if (avx_enabled ()) + return tst_audit4_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit5.c b/REORG.TODO/sysdeps/x86_64/tst-audit5.c new file mode 100644 index 0000000000..225b4c86ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit5.c @@ -0,0 +1,24 @@ +/* Test case for x86-64 preserved registers in dynamic linker. 
*/ + +#include <stdlib.h> +#include <string.h> + +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); +static int +do_test (void) +{ + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + xmm = _mm_set1_epi32 (0x12349876); + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit6.c b/REORG.TODO/sysdeps/x86_64/tst-audit6.c new file mode 100644 index 0000000000..f2f6a487c4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit6.c @@ -0,0 +1,45 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <cpuid.h> +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); + + +static int +avx_enabled (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + /* Check the OS has AVX and SSE saving enabled. */ + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + return (eax & 6) == 6; +} + + +static int +do_test (void) +{ + /* Run AVX test only if AVX is supported. */ + if (avx_enabled ()) + { + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + xmm = _mm_set1_epi32 (0x98abcdef); + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + } + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-audit7.c b/REORG.TODO/sysdeps/x86_64/tst-audit7.c new file mode 100644 index 0000000000..1d2a7de439 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-audit7.c @@ -0,0 +1 @@ +#include "tst-audit6.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod10a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod10a.c new file mode 100644 index 0000000000..41c77e98a5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod10a.c @@ -0,0 +1,65 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Test case for x86-64 preserved registers in dynamic linker. 
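+
+   audit_test expects each incoming zmm argument to carry the pattern
+   the auditor installed: the pltenter hook in tst-auditmod10b
+   replaces the caller's all-zero arguments with
+   _mm512_set1_epi64 (1) through (8), so any mismatch below means a
+   register was damaged on its way through the PLT trampoline.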
*/ + +#ifdef __AVX512F__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m512i +audit_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3, + __m512i x4, __m512i x5, __m512i x6, __m512i x7) +{ + __m512i zmm; + + zmm = _mm512_set1_epi64 (1); + if (memcmp (&zmm, &x0, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (2); + if (memcmp (&zmm, &x1, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (3); + if (memcmp (&zmm, &x2, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (4); + if (memcmp (&zmm, &x3, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (5); + if (memcmp (&zmm, &x4, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (6); + if (memcmp (&zmm, &x5, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (7); + if (memcmp (&zmm, &x6, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi64 (8); + if (memcmp (&zmm, &x7, sizeof (zmm))) + abort (); + + return _mm512_setzero_si512 (); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod10b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod10b.c new file mode 100644 index 0000000000..5b9a985ca2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod10b.c @@ -0,0 +1,231 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Verify that changing AVX512 registers in audit library won't affect + function parameter passing/return. 
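+
+   End-to-end flow: tst-audit10-aux passes eight zeroed zmm
+   registers; pltenter below checks them, rewrites them to 1..8 and
+   clobbers zmm0-zmm7; audit_test in tst-auditmod10a verifies 1..8
+   and returns zero; pltexit verifies the zero return value, rewrites
+   it to 0x12349876 and clobbers registers again; the caller finally
+   checks that 0x12349876 arrived intact.  Storing 1024 into
+   *framesizep in pltenter is what requests that ld.so copy the
+   argument frame and invoke pltexit at all.  The test is run roughly
+   as
+
+     LD_AUDIT=.../tst-auditmod10b.so ./tst-audit10
+
+   (a sketch; the Makefile sets the real path via an -ENV variable).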
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX512F__ +#include <immintrin.h> +#include <cpuid.h> + +static int +check_avx512 (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX512F)) + return 0; + + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + /* Verify that ZMM, YMM and XMM states are enabled. 
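+
+     The mask 0xe6 (binary 11100110) selects the XCR0 bits for SSE
+     (bit 1), AVX (bit 2), AVX-512 opmask (bit 5), the upper halves
+     of ZMM0-ZMM15 (bit 6) and ZMM16-ZMM31 (bit 7); the OS must have
+     enabled all of them before AVX-512 state can be used.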
*/ + return (eax & 0xe6) == 0xe6; +} + +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX512F__ + if (check_avx512 () && strcmp (symname, "audit_test") == 0) + { + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (®s->lr_vector[0], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[1], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[2], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[3], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[4], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[5], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[6], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_vector[i].zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); + asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" ); + asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" ); + asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" ); + asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" ); + asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" ); + asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX512F__ + if (check_avx512 () && strcmp (symname, "audit_test") == 0) + { + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + __m512i zmm = _mm512_set1_epi64 (i + 1); + if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0) + abort (); + } + + outregs->lrv_vector0.zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod3a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod3a.c new file mode 100644 index 0000000000..9514aba505 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod3a.c @@ -0,0 +1,24 @@ +/* Test case for x86-64 preserved registers in dynamic linker. 
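+
+   All eight xmm arguments must still hold the zeros the caller put
+   in them even though the audit module tst-auditmod3b deliberately
+   fills xmm0-xmm7 with -1 from both its pltenter and pltexit hooks:
+   the dynamic linker trampoline has to save and restore the
+   argument registers around the audit callbacks.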
*/ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm = _mm_setzero_si128 (); + + if (memcmp (&xmm, &x0, sizeof (xmm)) + || memcmp (&xmm, &x1, sizeof (xmm)) + || memcmp (&xmm, &x2, sizeof (xmm)) + || memcmp (&xmm, &x3, sizeof (xmm)) + || memcmp (&xmm, &x4, sizeof (xmm)) + || memcmp (&xmm, &x5, sizeof (xmm)) + || memcmp (&xmm, &x6, sizeof (xmm)) + || memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return xmm; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod3b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod3b.c new file mode 100644 index 0000000000..7aad92382e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod3b.c @@ -0,0 +1,153 @@ +/* Verify that changing xmm registers in audit library won't affect + function parameter passing/return. */ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> +#include <emmintrin.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i 
xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + + __m128i xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod4a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod4a.c new file mode 100644 index 0000000000..c9c24c04a8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod4a.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#ifdef __AVX__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m256i +audit_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3, + __m256i x4, __m256i x5, __m256i x6, __m256i x7) +{ + __m256i ymm; + + ymm = _mm256_set1_epi32 (1); + if (memcmp (&ymm, &x0, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (2); + if (memcmp (&ymm, &x1, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (3); + if (memcmp (&ymm, &x2, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (4); + if (memcmp (&ymm, &x3, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (5); + if (memcmp (&ymm, &x4, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (6); + if (memcmp (&ymm, &x5, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (7); + if (memcmp (&ymm, &x6, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (8); + if (memcmp (&ymm, &x7, sizeof (ymm))) + abort (); + + return _mm256_setzero_si256 (); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod4b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod4b.c new file mode 100644 index 0000000000..1153ea442c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod4b.c @@ -0,0 +1,213 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
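+
+   This is the AVX analogue of the AVX-512 module: zeros go in,
+   pltenter rewrites the eight ymm arguments to 1..8 through the
+   lr_vector view, audit_test in tst-auditmod4a checks them, and
+   pltexit turns the zero return value into 0x12349876.  check_avx
+   caches its CPUID probe in a static variable so the hooks take the
+   AVX paths only on capable hardware.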
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (®s->lr_vector[0], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[1], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[2], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[3], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[4], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[5], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[6], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int 
i = 0; i < 8; i++) + regs->lr_vector[i].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 1); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + __m256i ymm = _mm256_set1_epi32 (i + 1); + if (memcmp (&inregs->lr_vector[i], &ymm, sizeof (ymm)) != 0) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod5a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod5a.c new file mode 100644 index 0000000000..8511a70747 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod5a.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (1); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (2); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (3); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (4); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (5); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (6); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (7); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (8); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_setzero_si128 (); +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod5b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod5b.c new file mode 100644 index 0000000000..6a280fd61b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod5b.c @@ -0,0 +1,185 @@ +/* Verify that changing xmm registers in audit library won't affect + function parameter passing/return. 
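+
+   This is the SSE-only counterpart of the AVX modules: arguments
+   and return value travel through La_regs.lr_xmm and
+   La_retval.lrv_xmm0 rather than the lr_vector view, and no CPUID
+   gate is needed because x86-64 guarantees SSE2.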
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> +#include <emmintrin.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i minusone = _mm_set1_epi32 (-1); + + if (strcmp (symname, "audit_test") == 0) + { + __m128i zero = _mm_setzero_si128 (); + if (memcmp (®s->lr_xmm[0], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[1], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[2], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[3], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[4], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[5], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[6], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1); + + *framesizep = 1024; + } + + asm volatile ("movdqa %0, %%xmm0" : : "x" (minusone) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (minusone) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (minusone) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (minusone) 
: "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (minusone) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (minusone) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (minusone) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (minusone) : "xmm7" ); + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + + __m128i xmm; + + if (strcmp (symname, "audit_test") == 0) + { + __m128i zero = _mm_setzero_si128 (); + if (memcmp (&outregs->lrv_xmm0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + xmm = _mm_set1_epi32 (i + 1); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) != 0) + abort (); + } + + outregs->lrv_xmm0 = (La_x86_64_xmm) _mm_set1_epi32 (0x12349876); + } + + xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod6a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod6a.c new file mode 100644 index 0000000000..c3a850ce98 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod6a.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (0x100); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x101); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x102); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x103); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x104); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x105); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x106); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (0x107); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_setzero_si128 (); +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod6b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod6b.c new file mode 100644 index 0000000000..3533602c07 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod6b.c @@ -0,0 +1,227 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + for (i = 0; i < 8; i++) + if (memcmp (®s->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (®s->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + for (i = 0; i < 8; i += 2) + { + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1); + regs->lr_vector[i].xmm[0] = regs->lr_xmm[i]; + regs->lr_vector[i + 1].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 2); + regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0]; + } + + __m256i ymm = 
_mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm)) + || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm))) + abort (); + + __m256i ymm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 0x100); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + ymm = _mm256_set1_epi32 (i + 0x101); + if (memcmp (&inregs->lr_xmm[i + 1], + &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm))) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876); + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod6c.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod6c.c new file mode 100644 index 0000000000..8000e89224 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod6c.c @@ -0,0 +1,232 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
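+
+   This module is apparently meant to run after tst-auditmod6b in
+   the LD_AUDIT chain: its pltenter validates the patterns 6b
+   installed (i + 1 in even slots, i + 2 in odd ones) and replaces
+   them with i + 0x100 and i + 0x101, which is what audit_test in
+   tst-auditmod6a expects.  Stacked audit modules therefore each
+   observe the previous module's modifications.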
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + __m128i xmm; + __m256i ymm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 1); + if (memcmp (®s->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (®s->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100); + regs->lr_vector[i].xmm[0] = regs->lr_xmm[i]; + + ymm = _mm256_set1_epi32 (i + 2); + if (memcmp (®s->lr_xmm[i + 1], + ®s->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (®s->lr_vector[i + 1], &ymm, sizeof 
(ymm))) + abort (); + regs->lr_vector[i + 1].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101); + regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0]; + } + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m256i ymm = _mm256_set1_epi32 (0x12349876);; + if (memcmp (&outregs->lrv_vector0, &ymm, sizeof (ymm))) + abort (); + + __m128i xmm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 0x100); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + ymm = _mm256_set1_epi32 (i + 0x101); + if (memcmp (&inregs->lr_xmm[i + 1], + &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm))) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef); + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod7a.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod7a.c new file mode 100644 index 0000000000..b379df75d6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod7a.c @@ -0,0 +1 @@ +#include "tst-auditmod6a.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-auditmod7b.c b/REORG.TODO/sysdeps/x86_64/tst-auditmod7b.c new file mode 100644 index 0000000000..5abe6d1bc9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-auditmod7b.c @@ -0,0 +1,225 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
*/ + +#include <dlfcn.h> +#include <link.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static inline int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + for (i = 0; i < 8; i++) + if (memcmp (®s->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (®s->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + for (i = 0; i < 8; i += 2) + { + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100); + regs->lr_vector[i + 1].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101); + } + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile 
("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + int i; + + __m128i xmm = _mm_setzero_si128 (); + if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm)) + || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm))) + abort (); + + __m256i ymm; + + for (i = 0; i < 8; i += 2) + { + xmm = _mm_set1_epi32 (i + 0x100); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) + || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm))) + abort (); + + ymm = _mm256_set1_epi32 (i + 0x101); + if (memcmp (&inregs->lr_xmm[i + 1], + &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm)) + || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm))) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef); + + ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx-aux.c b/REORG.TODO/sysdeps/x86_64/tst-avx-aux.c new file mode 100644 index 0000000000..e3807de7bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx-aux.c @@ -0,0 +1,47 @@ +/* Test case for preserved AVX registers in dynamic linker, -mavx part. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_avx_aux (void) +{ +#ifdef __AVX__ + extern __m256i avx_test (__m256i, __m256i, __m256i, __m256i, + __m256i, __m256i, __m256i, __m256i); + + __m256i ymm0 = _mm256_set1_epi32 (0); + __m256i ymm1 = _mm256_set1_epi32 (1); + __m256i ymm2 = _mm256_set1_epi32 (2); + __m256i ymm3 = _mm256_set1_epi32 (3); + __m256i ymm4 = _mm256_set1_epi32 (4); + __m256i ymm5 = _mm256_set1_epi32 (5); + __m256i ymm6 = _mm256_set1_epi32 (6); + __m256i ymm7 = _mm256_set1_epi32 (7); + __m256i ret = avx_test (ymm0, ymm1, ymm2, ymm3, + ymm4, ymm5, ymm6, ymm7); + ymm0 = _mm256_set1_epi32 (0x12349876); + if (memcmp (&ymm0, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX__ */ + return 77; +#endif /* __AVX__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx.c b/REORG.TODO/sysdeps/x86_64/tst-avx.c new file mode 100644 index 0000000000..ec2e3a79ff --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx.c @@ -0,0 +1,49 @@ +/* Test case for preserved AVX registers in dynamic linker. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_avx_aux (void); + +static int +avx_enabled (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + /* Check the OS has AVX and SSE saving enabled. */ + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + return (eax & 6) == 6; +} + +static int +do_test (void) +{ + /* Run AVX test only if AVX is supported. */ + if (avx_enabled ()) + return tst_avx_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx512-aux.c b/REORG.TODO/sysdeps/x86_64/tst-avx512-aux.c new file mode 100644 index 0000000000..6cebc523f2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx512-aux.c @@ -0,0 +1,48 @@ +/* Test case for preserved AVX512 registers in dynamic linker, + -mavx512 part. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
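+
+   As with the AVX pair, the -mavx512f code is isolated in this file,
+   and the return value 77 is the exit status the glibc test skeleton
+   treats as "unsupported", so runs on machines without AVX-512 are
+   reported as skipped rather than failed.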
*/ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_avx512_aux (void) +{ +#ifdef __AVX512F__ + extern __m512i avx512_test (__m512i, __m512i, __m512i, __m512i, + __m512i, __m512i, __m512i, __m512i); + + __m512i zmm0 = _mm512_set1_epi32 (0); + __m512i zmm1 = _mm512_set1_epi32 (1); + __m512i zmm2 = _mm512_set1_epi32 (2); + __m512i zmm3 = _mm512_set1_epi32 (3); + __m512i zmm4 = _mm512_set1_epi32 (4); + __m512i zmm5 = _mm512_set1_epi32 (5); + __m512i zmm6 = _mm512_set1_epi32 (6); + __m512i zmm7 = _mm512_set1_epi32 (7); + __m512i ret = avx512_test (zmm0, zmm1, zmm2, zmm3, + zmm4, zmm5, zmm6, zmm7); + zmm0 = _mm512_set1_epi32 (0x12349876); + if (memcmp (&zmm0, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX512F__ */ + return 77; +#endif /* __AVX512F__ */ +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx512.c b/REORG.TODO/sysdeps/x86_64/tst-avx512.c new file mode 100644 index 0000000000..a8e42ef553 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx512.c @@ -0,0 +1,57 @@ +/* Test case for preserved AVX512 registers in dynamic linker. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_avx512_aux (void); + +static int +avx512_enabled (void) +{ +#ifdef bit_AVX512F + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX512F)) + return 0; + + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + /* Verify that ZMM, YMM and XMM states are enabled. */ + return (eax & 0xe6) == 0xe6; +#else + return 0; +#endif +} + +static int +do_test (void) +{ + /* Run AVX512 test only if AVX512 is supported. */ + if (avx512_enabled ()) + return tst_avx512_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-avx512mod.c b/REORG.TODO/sysdeps/x86_64/tst-avx512mod.c new file mode 100644 index 0000000000..4cfb3a2c3d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avx512mod.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved AVX512 registers in dynamic linker. 
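tst-avx512.c above tightens the XCR0 test to (eax & 0xe6) == 0xe6: beyond the SSE and AVX bits it also requires the three AVX-512 state components. A small sketch reconstructing that constant from the individual XSAVE bits (bit names follow the Intel SDM; the listing is illustrative):

#include <stdio.h>

/* XCR0 state-component bits required by the test's 0xe6 mask.  */
static const struct
{
  unsigned int bit;
  const char *name;
} xcr0_bits[] = {
  { 1u << 1, "SSE (XMM registers)" },
  { 1u << 2, "AVX (YMM high halves)" },
  { 1u << 5, "AVX-512 opmask (k0-k7)" },
  { 1u << 6, "AVX-512 ZMM0-ZMM15 high halves" },
  { 1u << 7, "AVX-512 ZMM16-ZMM31" },
};

int
main (void)
{
  unsigned int mask = 0;
  for (unsigned int i = 0; i < sizeof xcr0_bits / sizeof xcr0_bits[0]; i++)
    {
      mask |= xcr0_bits[i].bit;
      printf ("%#4x  %s\n", xcr0_bits[i].bit, xcr0_bits[i].name);
    }
  printf ("combined mask: %#x\n", mask);	/* prints 0xe6 */
  return 0;
}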
*/ + +#ifdef __AVX512F__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m512i +avx512_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3, + __m512i x4, __m512i x5, __m512i x6, __m512i x7) +{ + __m512i zmm; + + zmm = _mm512_set1_epi32 (0); + if (memcmp (&zmm, &x0, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (1); + if (memcmp (&zmm, &x1, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (2); + if (memcmp (&zmm, &x2, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (3); + if (memcmp (&zmm, &x3, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (4); + if (memcmp (&zmm, &x4, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (5); + if (memcmp (&zmm, &x5, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (6); + if (memcmp (&zmm, &x6, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (7); + if (memcmp (&zmm, &x7, sizeof (zmm))) + abort (); + + return _mm512_set1_epi32 (0x12349876); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-avxmod.c b/REORG.TODO/sysdeps/x86_64/tst-avxmod.c new file mode 100644 index 0000000000..6e5b154997 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-avxmod.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved AVX registers in dynamic linker. */ + +#ifdef __AVX__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m256i +avx_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3, + __m256i x4, __m256i x5, __m256i x6, __m256i x7) +{ + __m256i ymm; + + ymm = _mm256_set1_epi32 (0); + if (memcmp (&ymm, &x0, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (1); + if (memcmp (&ymm, &x1, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (2); + if (memcmp (&ymm, &x2, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (3); + if (memcmp (&ymm, &x3, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (4); + if (memcmp (&ymm, &x4, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (5); + if (memcmp (&ymm, &x5, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (6); + if (memcmp (&ymm, &x6, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (7); + if (memcmp (&ymm, &x7, sizeof (ymm))) + abort (); + + return _mm256_set1_epi32 (0x12349876); +} +#endif diff --git a/REORG.TODO/sysdeps/x86_64/tst-mallocalign1.c b/REORG.TODO/sysdeps/x86_64/tst-mallocalign1.c new file mode 100644 index 0000000000..1221829b44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-mallocalign1.c @@ -0,0 +1,72 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdlib.h> + +/* Specified by x86-64 psABI. 
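avx_test above compares whole __m256i values bytewise with memcmp, which works at any ISA level the file is compiled for. Purely as a contrast, the in-register idiom — note this variant needs AVX2 (-mavx2), which these AVX-only test files intentionally do not assume:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m256i a = _mm256_set1_epi32 (7);
  __m256i b = _mm256_set1_epi32 (7);

  /* Lane-wise equality, then collapse to a 32-bit byte mask; all
     bytes are equal iff the mask is all ones (-1 as a signed int).  */
  __m256i eq = _mm256_cmpeq_epi32 (a, b);
  int mask = _mm256_movemask_epi8 (eq);
  puts (mask == -1 ? "vectors equal" : "vectors differ");
  return 0;
}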
*/ +#define ALIGN_MASK (16 - 1) + +void * +test (size_t s) +{ + void *p = malloc (s); + + printf ("malloc: %lu, %p: %lu\n", (unsigned long) s, p, + ((unsigned long) p) & ALIGN_MASK); + return p; +} + +static int +do_test (void) +{ + void *p; + int ret = 0; + + p = test (2); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (8); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (13); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (16); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (23); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (43); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + p = test (123); + ret |= (unsigned long) p & ALIGN_MASK; + free (p); + + return ret; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad1.c b/REORG.TODO/sysdeps/x86_64/tst-quad1.c new file mode 100644 index 0000000000..106bbac58b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad1.c @@ -0,0 +1,25 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern void foo (void); + +int +main (void) +{ + foo (); + return 0; +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad1pie.c b/REORG.TODO/sysdeps/x86_64/tst-quad1pie.c new file mode 100644 index 0000000000..f5fd45f9b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad1pie.c @@ -0,0 +1 @@ +#include "tst-quad1.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad2.c b/REORG.TODO/sysdeps/x86_64/tst-quad2.c new file mode 100644 index 0000000000..f5fd45f9b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad2.c @@ -0,0 +1 @@ +#include "tst-quad1.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quad2pie.c b/REORG.TODO/sysdeps/x86_64/tst-quad2pie.c new file mode 100644 index 0000000000..a15d8d36ac --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quad2pie.c @@ -0,0 +1 @@ +#include "tst-quad2.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod1.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod1.S new file mode 100644 index 0000000000..a2d9af87f3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod1.S @@ -0,0 +1,44 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details.
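tst-mallocalign1.c above checks the floor the x86-64 psABI sets for malloc (16-byte alignment); code that needs more than that should request it explicitly. A minimal C11 sketch, assuming nothing beyond the standard library:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  /* C11 aligned_alloc: the size must be a multiple of the alignment.  */
  void *p = aligned_alloc (64, 128);
  if (p == NULL)
    return 1;
  printf ("%p is %s64-byte aligned\n", p,
	  ((uintptr_t) p & 63) == 0 ? "" : "NOT ");
  free (p);
  return 0;
}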
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef BIAS +# define BIAS 0x7fffffff +#endif + + .section .data.rel,"aw",@progbits + .align 8 +.Ljmp: + .quad func + BIAS + .text + .globl func + .type func, @function +func: + .cfi_startproc + xorl %edi, %edi + jmp exit@PLT + .cfi_endproc + .size func, .-func + .globl foo + .type foo, @function +foo: + .cfi_startproc + .cfi_def_cfa_register 6 + movq .Ljmp(%rip), %rax + subq $BIAS, %rax + jmp *%rax + .cfi_endproc + .size foo, .-foo diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod1pie.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod1pie.S new file mode 100644 index 0000000000..c671d0cda9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod1pie.S @@ -0,0 +1,2 @@ +#define BIAS 0x7fff0000 +#include "tst-quadmod1.S" diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod2.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod2.S new file mode 100644 index 0000000000..78599cdeb0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod2.S @@ -0,0 +1,43 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef BIAS +# define BIAS 0x7fff0000 +#endif + + .section .data.rel.local,"aw",@progbits + .align 8 +.Ljmp: + .quad func + BIAS + .text + .type func, @function +func: + .cfi_startproc + xorl %edi, %edi + jmp exit@PLT + .cfi_endproc + .size func, .-func + .globl foo + .type foo, @function +foo: + .cfi_startproc + .cfi_def_cfa_register 6 + movq .Ljmp(%rip), %rax + subq $BIAS, %rax + jmp *%rax + .cfi_endproc + .size foo, .-foo diff --git a/REORG.TODO/sysdeps/x86_64/tst-quadmod2pie.S b/REORG.TODO/sysdeps/x86_64/tst-quadmod2pie.S new file mode 100644 index 0000000000..609183fe58 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-quadmod2pie.S @@ -0,0 +1 @@ +#include "tst-quadmod2.S" diff --git a/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.c b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.c new file mode 100644 index 0000000000..2f9e9b9477 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.c @@ -0,0 +1,28 @@ +/* This test will be used to create an executable with a specific + section layout in which .rela.dyn and .rela.plt are not contiguous. + For the x86 case, readelf will report something like: + + ... + [10] .rela.dyn RELA + [11] .bar PROGBITS + [12] .rela.plt RELA + ... + + This is important as this case was not correctly handled by the dynamic + linker in the bind-now case, and the second section was never + processed.
*/ + +#include <stdio.h> + +const int __attribute__ ((section(".bar"))) bar = 0x12345678; +static const char foo[] = "foo"; + +static int +do_test (void) +{ + printf ("%s %d\n", foo, bar); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.lds b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.lds new file mode 100644 index 0000000000..2229e698c9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-split-dynreloc.lds @@ -0,0 +1,5 @@ +SECTIONS +{ + .bar : { *(.bar) } +} +INSERT AFTER .rela.dyn; diff --git a/REORG.TODO/sysdeps/x86_64/tst-sse.c b/REORG.TODO/sysdeps/x86_64/tst-sse.c new file mode 100644 index 0000000000..dd1537cf27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-sse.c @@ -0,0 +1,46 @@ +/* Test case for preserved SSE registers in dynamic linker. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +extern __m128i sse_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); + +static int +do_test (void) +{ + __m128i xmm0 = _mm_set1_epi32 (0); + __m128i xmm1 = _mm_set1_epi32 (1); + __m128i xmm2 = _mm_set1_epi32 (2); + __m128i xmm3 = _mm_set1_epi32 (3); + __m128i xmm4 = _mm_set1_epi32 (4); + __m128i xmm5 = _mm_set1_epi32 (5); + __m128i xmm6 = _mm_set1_epi32 (6); + __m128i xmm7 = _mm_set1_epi32 (7); + __m128i ret = sse_test (xmm0, xmm1, xmm2, xmm3, + xmm4, xmm5, xmm6, xmm7); + xmm0 = _mm_set1_epi32 (0x12349876); + if (memcmp (&xmm0, &ret, sizeof (ret))) + abort (); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/tst-ssemod.c b/REORG.TODO/sysdeps/x86_64/tst-ssemod.c new file mode 100644 index 0000000000..907a64c69e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-ssemod.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved SSE registers in dynamic linker. 
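tst-split-dynreloc.c above places bar in a custom .bar section only so the linker script can splice it between .rela.dyn and .rela.plt. The same section attribute is more commonly paired with GNU ld's automatic boundary symbols; a sketch (the boundary symbols are emitted only for section names that are valid C identifiers, which is why this uses mysec instead of a dotted name):

#include <stdio.h>

/* Two objects collected into one custom section; GNU ld then defines
   __start_mysec/__stop_mysec around it automatically.  */
static const int a __attribute__ ((section ("mysec"), used)) = 1;
static const int b __attribute__ ((section ("mysec"), used)) = 2;

extern const int __start_mysec[], __stop_mysec[];

int
main (void)
{
  for (const int *p = __start_mysec; p < __stop_mysec; p++)
    printf ("mysec entry: %d\n", *p);
  return 0;
}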
*/ + +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m128i +sse_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (0); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (1); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (2); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (3); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (4); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (5); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (6); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (7); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_set1_epi32 (0x12349876); +} diff --git a/REORG.TODO/sysdeps/x86_64/tst-stack-align.h b/REORG.TODO/sysdeps/x86_64/tst-stack-align.h new file mode 100644 index 0000000000..abe14deb0f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/tst-stack-align.h @@ -0,0 +1,46 @@ +/* Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdint.h> + +#define TEST_STACK_ALIGN() \ + ({ \ + /* AMD64 ABI mandates a 16-byte aligned stack. \ + Unfortunately, current GCC doesn't support __int128 or __float128 \ + types, so use aligned attribute instead. */ \ + struct _S \ + { \ + int _i __attribute__((aligned (16))); \ + int _pad[3]; \ + } _s = { ._i = 18 }; \ + double _d = 12.0; \ + long double _ld = 15.0; \ + int _ret = 0; \ + printf ("__int128: %d %p %zu\n", _s._i, &_s, __alignof (_s)); \ + if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ + if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ + if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ + _ret = 1; \ + _ret; \ + }) diff --git a/REORG.TODO/sysdeps/x86_64/wcschr.S b/REORG.TODO/sysdeps/x86_64/wcschr.S new file mode 100644 index 0000000000..a3e7d67dec --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcschr.S @@ -0,0 +1,156 @@ +/* wcschr with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version.
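TEST_STACK_ALIGN above dates from a GCC that lacked __int128, so it fakes a 16-byte type with an aligned attribute. With C11 the same property can be asserted directly; a sketch:

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* The psABI's 16-byte stack alignment makes this trivially
     satisfiable for locals on x86-64.  */
  alignas (16) char buf[16];
  int misaligned = ((uintptr_t) buf & 15) != 0;
  printf ("buf=%p (%s)\n", (void *) buf,
	  misaligned ? "misaligned" : "16-byte aligned");
  return misaligned;
}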
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (__wcschr) + + movd %rsi, %xmm1 + pxor %xmm2, %xmm2 + mov %rdi, %rcx + punpckldq %xmm1, %xmm1 + punpckldq %xmm1, %xmm1 + + and $63, %rcx + cmp $48, %rcx + ja L(cross_cache) + + movdqu (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + and $-16, %rdi + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + jmp L(loop) + +L(cross_cache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + + sar %cl, %rdx + sar %cl, %rax + test %rax, %rax + je L(unaligned_no_match) + + bsf %rax, %rax + test %rdx, %rdx + je L(unaligned_match) + bsf %rdx, %rdx + cmp %rdx, %rax + ja L(return_null) + +L(unaligned_match): + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + test %rdx, %rdx + jne L(return_null) + pxor %xmm2, %xmm2 + + add $16, %rdi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + jmp L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %rdx + test %rax, %rax + jz L(return_null) + bsf %rax, %rax + test %rdx, %rdx + je L(match) + bsf %rdx, %rcx + cmp %rcx, %rax + ja L(return_null) +L(match): + sub $16, %rdi + add %rdi, %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +END (__wcschr) + +libc_hidden_def(__wcschr) +weak_alias (__wcschr, wcschr) +libc_hidden_weak (wcschr) diff --git a/REORG.TODO/sysdeps/x86_64/wcscmp.S b/REORG.TODO/sysdeps/x86_64/wcscmp.S new file mode 100644 index 0000000000..3ef3341cd0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcscmp.S @@ -0,0 +1,950 @@ +/* Optimized wcscmp for x86-64 with SSE2. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. */ + + .text +ENTRY (__wcscmp) +/* + * This implementation uses SSE to compare up to 16 bytes at a time. +*/ + mov %esi, %eax + mov %edi, %edx + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + mov %al, %ch + mov %dl, %cl + and $63, %eax /* rsi alignment in cache line */ + and $63, %edx /* rdi alignment in cache line */ + and $15, %cl + jz L(continue_00) + cmp $16, %edx + jb L(continue_0) + cmp $32, %edx + jb L(continue_16) + cmp $48, %edx + jb L(continue_32) + +L(continue_48): + and $15, %ch + jz L(continue_48_00) + cmp $16, %eax + jb L(continue_0_48) + cmp $32, %eax + jb L(continue_16_48) + cmp $48, %eax + jb L(continue_32_48) + + .p2align 4 +L(continue_48_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rdi), %xmm1 + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_48_48) + +L(continue_0): + and $15, %ch + jz L(continue_0_00) + cmp $16, %eax + jb L(continue_0_0) + cmp $32, %eax + jb L(continue_0_16) + cmp $48, %eax + jb L(continue_0_32) + + .p2align 4 +L(continue_0_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word?
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + mov 48(%rsi), %ecx + cmp %ecx, 48(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 52(%rsi), %ecx + cmp %ecx, 52(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 56(%rsi), %ecx + cmp %ecx, 56(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 60(%rsi), %ecx + cmp %ecx, 60(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + add $64, %rsi + add $64, %rdi + jmp L(continue_0_48) + + .p2align 4 +L(continue_00): + and $15, %ch + jz L(continue_00_00) + cmp $16, %eax + jb L(continue_00_0) + cmp $32, %eax + jb L(continue_00_16) + cmp $48, %eax + jb L(continue_00_32) + + .p2align 4 +L(continue_00_48): + pcmpeqd (%rdi), %xmm0 + mov (%rdi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%rsi), %eax + jne L(nequal) + + mov 4(%rdi), %eax + cmp 4(%rsi), %eax + jne L(nequal) + + mov 8(%rdi), %eax + cmp 8(%rsi), %eax + jne L(nequal) + + mov 12(%rdi), %eax + cmp 12(%rsi), %eax + jne L(nequal) + + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_32): + and $15, %ch + jz L(continue_32_00) + cmp $16, %eax + jb L(continue_0_32) + cmp $32, %eax + jb L(continue_16_32) + cmp $48, %eax + jb L(continue_32_32) + + .p2align 4 +L(continue_32_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 16(%rsi), %ecx + cmp %ecx, 16(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 20(%rsi), %ecx + cmp %ecx, 20(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 24(%rsi), %ecx + cmp %ecx, 24(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 28(%rsi), %ecx + cmp %ecx, 28(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rdi), %xmm1 + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_32_48) + + .p2align 4 +L(continue_16): + and $15, %ch + jz L(continue_16_00) + cmp $16, %eax + jb L(continue_0_16) + cmp $32, %eax + jb L(continue_16_16) + cmp $48, %eax + jb L(continue_16_32) + + .p2align 4 +L(continue_16_48): + mov (%rsi), %ecx + cmp %ecx, (%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + mov 32(%rsi), %ecx + cmp %ecx, 32(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 36(%rsi), %ecx + cmp %ecx, 36(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 40(%rsi), %ecx + cmp %ecx, 40(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 44(%rsi), %ecx + cmp %ecx, 44(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 48(%rdi), %xmm1 + movdqu 48(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_16_48) + + .p2align 4 +L(continue_00_00): + movdqa (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqa 16(%rdi), %xmm3 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqa 32(%rdi), %xmm5 + pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm5 /* packed sub of comparison results*/ + pmovmskb %xmm5, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqa 48(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_00_00) + + .p2align 4 +L(continue_00_32): + movdqu (%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_16): + movdqu (%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_0): + movdqu (%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %rsi + add $48, %rdi + jmp L(continue_00_48) + + .p2align 4 +L(continue_48_00): + pcmpeqd (%rsi), %xmm0 + mov (%rdi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%rsi), %eax + jne L(nequal) + + mov 4(%rdi), %eax + cmp 4(%rsi), %eax + jne L(nequal) + + mov 8(%rdi), %eax + cmp 8(%rsi), %eax + jne L(nequal) + + mov 12(%rdi), %eax + cmp 12(%rsi), %eax + jne L(nequal) + + movdqu 16(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %rsi + add $64, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_00): + movdqu (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_16_00): + movdqu (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_0_00): + movdqu (%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %rsi + add $48, %rdi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_32): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_48_48) + + .p2align 4 +L(continue_16_16): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm3 + movdqu 16(%rsi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_0): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm3 + movdqu 16(%rsi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%rdi), %xmm1 + movdqu 32(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %rsi + add $48, %rdi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_16): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %rsi + add $32, %rdi + jmp L(continue_32_48) + + .p2align 4 +L(continue_0_32): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_16_48) + + .p2align 4 +L(continue_16_32): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %rsi + add $16, %rdi + jmp L(continue_32_48) + + .p2align 4 +L(less4_double_words1): + cmp (%rsi), %eax + jne L(nequal) + test %eax, %eax + jz L(equal) + + mov 4(%rsi), %ecx + cmp %ecx, 4(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%rsi), %ecx + cmp %ecx, 8(%rdi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%rsi), %ecx + cmp %ecx, 12(%rdi) + jne L(nequal) + xor %eax, %eax + ret + + .p2align 4 +L(less4_double_words): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov (%rdi), %eax + cmp (%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word): + mov 4(%rdi), %eax + cmp 4(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov 8(%rdi), %eax + cmp 8(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word): + mov 12(%rdi), %eax + cmp 12(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(less4_double_words_16): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_16) + and $15, %dl + jz L(second_double_word_16) + mov 16(%rdi), %eax + cmp 16(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word_16): + mov 20(%rdi), %eax + cmp 20(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words_16): + and $15, %dh + jz L(fourth_double_word_16) + mov 24(%rdi), %eax + cmp 24(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word_16): + mov 28(%rdi), %eax + cmp 28(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(less4_double_words_32): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_32) + and $15, %dl + jz L(second_double_word_32) + mov 32(%rdi), %eax + cmp 32(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word_32): + mov 36(%rdi), %eax + cmp 36(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words_32): + and $15, %dh + jz L(fourth_double_word_32) + mov 40(%rdi), %eax + cmp 40(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word_32): + mov 44(%rdi), %eax + cmp 44(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(less4_double_words_48): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_48) + and $15, %dl + jz L(second_double_word_48) + mov 48(%rdi), %eax + cmp 48(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(second_double_word_48): + mov 52(%rdi), %eax + cmp 52(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(next_two_double_words_48): + and $15, %dh + jz L(fourth_double_word_48) + mov 56(%rdi), %eax + cmp 56(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(fourth_double_word_48): + mov 60(%rdi), %eax + cmp 60(%rsi), %eax + jne L(nequal) + ret + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + +L(nequal_bigger): + ret + + .p2align 4 +L(equal): + xor %rax, %rax + ret + +END (__wcscmp) +libc_hidden_def (__wcscmp) +weak_alias (__wcscmp, wcscmp) diff --git a/REORG.TODO/sysdeps/x86_64/wcslen.S b/REORG.TODO/sysdeps/x86_64/wcslen.S new file mode 100644 index 0000000000..c6081a482f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcslen.S @@ -0,0 +1,238 @@ +/* Optimized wcslen for x86-64 with SSE2. + Copyright (C) 2011-2017 Free Software Foundation, Inc. 
+ Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (__wcslen) + cmpl $0, (%rdi) + jz L(exit_tail0) + cmpl $0, 4(%rdi) + jz L(exit_tail1) + cmpl $0, 8(%rdi) + jz L(exit_tail2) + cmpl $0, 12(%rdi) + jz L(exit_tail3) + cmpl $0, 16(%rdi) + jz L(exit_tail4) + cmpl $0, 20(%rdi) + jz L(exit_tail5) + cmpl $0, 24(%rdi) + jz L(exit_tail6) + cmpl $0, 28(%rdi) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax + lea 16(%rdi), %rcx + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64_loop): + movaps (%rax), %xmm0 + movaps 16(%rax), %xmm1 + movaps 32(%rax), %xmm2 + movaps 48(%rax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %rcx, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %rax + ret + + .p2align 4 +L(exit_1): + add $1, %rax + ret + + 
.p2align 4 +L(exit_3): + add $3, %rax + ret + + .p2align 4 +L(exit_tail0): + xor %rax, %rax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %rax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %rax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %rax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %rax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %rax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %rax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %rax + ret + +END (__wcslen) + +weak_alias(__wcslen, wcslen) diff --git a/REORG.TODO/sysdeps/x86_64/wcsrchr.S b/REORG.TODO/sysdeps/x86_64/wcsrchr.S new file mode 100644 index 0000000000..a6c385c511 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wcsrchr.S @@ -0,0 +1,282 @@ +/* wcsrchr with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY (wcsrchr) + + movd %rsi, %xmm1 + mov %rdi, %rcx + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + and $63, %rcx + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rcx + pmovmskb %xmm0, %rax + add $16, %rdi + + test %rax, %rax + jnz L(unaligned_match1) + + test %rcx, %rcx + jnz L(return_null) + + and $-16, %rdi + xor %r8, %r8 + jmp L(loop) + + .p2align 4 +L(unaligned_match1): + test %rcx, %rcx + jnz L(prolog_find_zero_1) + + mov %rax, %r8 + mov %rdi, %rsi + and $-16, %rdi + jmp L(loop) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + pxor %xmm3, %xmm3 + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm3 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm3, %rdx + pmovmskb %xmm0, %rax + shr %cl, %rdx + shr %cl, %rax + add $16, %rdi + + test %rax, %rax + jnz L(unaligned_match) + + test %rdx, %rdx + jnz L(return_null) + + xor %r8, %r8 + jmp L(loop) + + .p2align 4 +L(unaligned_match): + test %rdx, %rdx + jnz L(prolog_find_zero) + + mov %rax, %r8 + lea (%rdi, %rcx), %rsi + +/* Loop start on aligned string. 
*/ + .p2align 4 +L(loop): + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rcx + pmovmskb %xmm0, %rax + or %rax, %rcx + jnz L(matches) + + movdqa (%rdi), %xmm3 + pcmpeqd %xmm3, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm2, %rcx + pmovmskb %xmm3, %rax + or %rax, %rcx + jnz L(matches) + + movdqa (%rdi), %xmm4 + pcmpeqd %xmm4, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm4 + pmovmskb %xmm2, %rcx + pmovmskb %xmm4, %rax + or %rax, %rcx + jnz L(matches) + + movdqa (%rdi), %xmm5 + pcmpeqd %xmm5, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm5 + pmovmskb %xmm2, %rcx + pmovmskb %xmm5, %rax + or %rax, %rcx + jz L(loop) + + .p2align 4 +L(matches): + test %rax, %rax + jnz L(match) +L(return_value): + test %r8, %r8 + jz L(return_null) + mov %r8, %rax + mov %rsi, %rdi + + test $15 << 4, %ah + jnz L(match_fourth_wchar) + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(match): + pmovmskb %xmm2, %rcx + test %rcx, %rcx + jnz L(find_zero) + mov %rax, %r8 + mov %rdi, %rsi + jmp L(loop) + + .p2align 4 +L(find_zero): + test $15, %cl + jnz L(find_zero_in_first_wchar) + test %cl, %cl + jnz L(find_zero_in_second_wchar) + test $15, %ch + jnz L(find_zero_in_third_wchar) + + and $1 << 13 - 1, %rax + jz L(return_value) + + test $15 << 4, %ah + jnz L(match_fourth_wchar) + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(find_zero_in_first_wchar): + test $1, %rax + jz L(return_value) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(find_zero_in_second_wchar): + and $1 << 5 - 1, %rax + jz L(return_value) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(find_zero_in_third_wchar): + and $1 << 9 - 1, %rax + jz L(return_value) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero): + add %rcx, %rdi + mov %rdx, %rcx +L(prolog_find_zero_1): + test $15, %cl + jnz L(prolog_find_zero_in_first_wchar) + test %cl, %cl + jnz L(prolog_find_zero_in_second_wchar) + test $15, %ch + jnz L(prolog_find_zero_in_third_wchar) + + and $1 << 13 - 1, %rax + jz L(return_null) + + test $15 << 4, %ah + jnz L(match_fourth_wchar) + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero_in_first_wchar): + test $1, %rax + jz L(return_null) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero_in_second_wchar): + and $1 << 5 - 1, %rax + jz L(return_null) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(prolog_find_zero_in_third_wchar): + and $1 << 9 - 1, %rax + jz L(return_null) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%rdi), %rax + ret + + .p2align 4 +L(match_second_wchar): + lea -12(%rdi), %rax + ret + + .p2align 4 +L(match_third_wchar): + lea -8(%rdi), %rax + ret + + .p2align 4 +L(match_fourth_wchar): + lea -4(%rdi), %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +END (wcsrchr) diff --git a/REORG.TODO/sysdeps/x86_64/wmemset.S b/REORG.TODO/sysdeps/x86_64/wmemset.S new file mode 100644 index 0000000000..f96d567fd8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wmemset.S @@ -0,0 +1 @@ +/* Implemented in memset.S. 
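In wcsrchr above, pmovmskb turns each pcmpeqd result into a 16-bit byte mask, so every 4-byte wchar_t owns a nibble — that is what all the %al/%ah nibble tests decode, with %r8 carrying the most recent match mask until a terminator shows up. A naive C reference of the contract it must preserve (the terminating null is part of the string), handy for cross-checking:

#include <assert.h>
#include <stddef.h>
#include <wchar.h>

static wchar_t *
wcsrchr_ref (const wchar_t *ws, wchar_t wc)
{
  wchar_t *last = NULL;
  for (;; ws++)
    {
      if (*ws == wc)
	last = (wchar_t *) ws;
      if (*ws == L'\0')
	return last;
    }
}

int
main (void)
{
  const wchar_t *s = L"abcabc";
  assert (wcsrchr_ref (s, L'b') == s + 4);
  assert (wcsrchr_ref (s, L'x') == NULL);
  assert (wcsrchr_ref (s, L'\0') == s + 6);	/* the null is findable */
  return 0;
}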
*/ diff --git a/REORG.TODO/sysdeps/x86_64/wmemset_chk.S b/REORG.TODO/sysdeps/x86_64/wmemset_chk.S new file mode 100644 index 0000000000..64c277413f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wmemset_chk.S @@ -0,0 +1,33 @@ +/* Checking wmemset for x86-64. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef SHARED + /* For libc.so this is defined in wmemset.S. + For libc.a, this is a separate source to avoid + wmemset bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__wmemset_chk) + cmpq %rdx, %rcx + jb __chk_fail + jmp wmemset +END (__wmemset_chk) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/wordcopy.c b/REORG.TODO/sysdeps/x86_64/wordcopy.c new file mode 100644 index 0000000000..590b6cb16b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/wordcopy.c @@ -0,0 +1 @@ +/* X86-64 doesn't use memory copy functions. */ diff --git a/REORG.TODO/sysdeps/x86_64/x32/Implies-after b/REORG.TODO/sysdeps/x86_64/x32/Implies-after new file mode 100644 index 0000000000..39a34c5f57 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/Implies-after @@ -0,0 +1 @@ +wordsize-32 diff --git a/REORG.TODO/sysdeps/x86_64/x32/Makefile b/REORG.TODO/sysdeps/x86_64/x32/Makefile new file mode 100644 index 0000000000..f2ebc24fb0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/Makefile @@ -0,0 +1,6 @@ +ifeq ($(subdir),math) +# Since x32 returns 32-bit long int and 64-bit long long int in the +# same 64-bit register, we make the 32-bit lround an alias of the +# 64-bit llround.  Add -fno-builtin-lround to silence the compiler. +CFLAGS-s_llround.c += -fno-builtin-lround +endif diff --git a/REORG.TODO/sysdeps/x86_64/x32/_itoa.h b/REORG.TODO/sysdeps/x86_64/x32/_itoa.h new file mode 100644 index 0000000000..0f9ed47726 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/_itoa.h @@ -0,0 +1,4 @@ +/* X32 uses 64-bit _itoa_word and _itoa is mapped to _itoa_word. */ +#define _ITOA_NEEDED 0 +#define _ITOA_WORD_TYPE unsigned long long int +#include_next <_itoa.h> diff --git a/REORG.TODO/sysdeps/x86_64/x32/divdi3.c b/REORG.TODO/sysdeps/x86_64/x32/divdi3.c new file mode 100644 index 0000000000..bc7b4c4441 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/divdi3.c @@ -0,0 +1 @@ +/* Fortunately nothing to do. */ diff --git a/REORG.TODO/sysdeps/x86_64/x32/dl-machine.h b/REORG.TODO/sysdeps/x86_64/x32/dl-machine.h new file mode 100644 index 0000000000..2c50688d94 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/dl-machine.h @@ -0,0 +1,86 @@ +/* Machine-dependent ELF dynamic relocation inline functions.  x32 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library.
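__wmemset_chk above is the _FORTIFY_SOURCE entry point: %rdx is the requested count and %rcx the destination's known size, both in wide characters, and an oversized request is diverted to __chk_fail before any store happens. A C model of that contract — the helper name here is made up, and glibc's real fallback is structured differently:

#include <stdlib.h>
#include <wchar.h>

/* Model only: DSTLEN plays the role of the object size the compiler
   derives via __builtin_object_size under _FORTIFY_SOURCE.  */
static wchar_t *
wmemset_chk_model (wchar_t *s, wchar_t c, size_t n, size_t dstlen)
{
  if (n > dstlen)
    abort ();			/* glibc calls __chk_fail here.  */
  return wmemset (s, c, n);
}

int
main (void)
{
  wchar_t buf[8];
  wmemset_chk_model (buf, L'x', 8, sizeof buf / sizeof buf[0]);
  return 0;
}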
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Must allow <sysdeps/x86_64/dl-machine.h> to be included more than once. + See #ifdef RESOLVE_MAP in sysdeps/x86_64/dl-machine.h. */ +#include <sysdeps/x86_64/dl-machine.h> + +#ifndef _X32_DL_MACHINE_H +#define _X32_DL_MACHINE_H + +#undef ARCH_LA_PLTENTER +#undef ARCH_LA_PLTEXIT +#undef RTLD_START + +/* Names of the architecture-specific auditing callback functions. */ +#define ARCH_LA_PLTENTER x32_gnu_pltenter +#define ARCH_LA_PLTEXIT x32_gnu_pltexit + +/* Initial entry point code for the dynamic linker. + The C function `_dl_start' is the real entry point; + its return value is the user program's entry point. */ +#define RTLD_START asm ("\n\ +.text\n\ + .p2align 4\n\ +.globl _start\n\ +.globl _dl_start_user\n\ +_start:\n\ + movl %esp, %edi\n\ + call _dl_start\n\ +_dl_start_user:\n\ + # Save the user entry point address in %r12.\n\ + movl %eax, %r12d\n\ + # See if we were run as a command with the executable file\n\ + # name as an extra leading argument.\n\ + movl _dl_skip_args(%rip), %eax\n\ + # Pop the original argument count.\n\ + movl (%rsp), %edx\n\ + # Adjust the stack pointer to skip _dl_skip_args words.\n\ + lea 4(%rsp,%rax,4), %esp\n\ + # Subtract _dl_skip_args from argc.\n\ + subl %eax, %edx\n\ + # Push argc back on the stack.\n\ + subl $4, %esp\n\ + movl %edx, (%rsp)\n\ + # Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\ + # argc -> rsi\n\ + movl %edx, %esi\n\ + # Save %rsp value in %r13.\n\ + movl %esp, %r13d\n\ + # And align stack for the _dl_init call.\n\ + and $-16, %esp\n\ + # _dl_loaded -> rdi\n\ + movl _rtld_local(%rip), %edi\n\ + # env -> rcx\n\ + lea 8(%r13,%rdx,4), %ecx\n\ + # argv -> rdx\n\ + lea 4(%r13), %edx\n\ + # Clear %rbp to mark outermost frame obviously even for constructors.\n\ + xorl %ebp, %ebp\n\ + # Call the function to run the initializers.\n\ + call _dl_init\n\ + # Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\ + lea _dl_fini(%rip), %edx\n\ + # And make sure %rsp points to argc stored on the stack.\n\ + movl %r13d, %esp\n\ + # Jump to the user's entry point.\n\ + jmp *%r12\n\ +.previous\n\ +"); + +#endif /* !_X32_DL_MACHINE_H */ diff --git a/REORG.TODO/sysdeps/x86_64/x32/ffs.c b/REORG.TODO/sysdeps/x86_64/x32/ffs.c new file mode 100644 index 0000000000..fa7de8b887 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/ffs.c @@ -0,0 +1,4 @@ +#define ffsl __something_else +#include <sysdeps/x86_64/ffs.c> +#undef ffsl +weak_alias (__ffs, ffsl) diff --git a/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrint.S b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrint.S new file mode 100644 index 0000000000..86d258c192 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrint.S @@ -0,0 +1,27 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2015-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__lrint) + cvtsd2si %xmm0,%eax + ret +END(__lrint) +weak_alias (__lrint, lrint) diff --git a/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintf.S b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintf.S new file mode 100644 index 0000000000..2e6f9aaf2b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintf.S @@ -0,0 +1,27 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__lrintf) + cvtss2si %xmm0,%eax + ret +END(__lrintf) +weak_alias (__lrintf, lrintf) diff --git a/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintl.S b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintl.S new file mode 100644 index 0000000000..623c6fcbc9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/fpu/s_lrintl.S @@ -0,0 +1,30 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + .text +ENTRY(__lrintl) + fldt 8(%rsp) + fistpl -4(%rsp) + fwait + movl -4(%rsp),%eax + ret +END(__lrintl) +weak_alias (__lrintl, lrintl) diff --git a/REORG.TODO/sysdeps/x86_64/x32/gmp-mparam.h b/REORG.TODO/sysdeps/x86_64/x32/gmp-mparam.h new file mode 100644 index 0000000000..1915bfc67a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/gmp-mparam.h @@ -0,0 +1,33 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 2012-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, see +<http://www.gnu.org/licenses/>. */ + +#if defined __GMP_H__ && ! defined _LONG_LONG_LIMB +#error "Included too late for _LONG_LONG_LIMB to take effect" +#endif + +#define _LONG_LONG_LIMB +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#define IEEE_DOUBLE_BIG_ENDIAN 0 diff --git a/REORG.TODO/sysdeps/x86_64/x32/symbol-hacks.h b/REORG.TODO/sysdeps/x86_64/x32/symbol-hacks.h new file mode 100644 index 0000000000..22aad04437 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/symbol-hacks.h @@ -0,0 +1 @@ +#include <sysdeps/generic/symbol-hacks.h> diff --git a/REORG.TODO/sysdeps/x86_64/x32/sysdep.h b/REORG.TODO/sysdeps/x86_64/x32/sysdep.h new file mode 100644 index 0000000000..034a3f04e9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/x32/sysdep.h @@ -0,0 +1,92 @@ +/* Assembler macros for x32. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdeps/x86_64/sysdep.h> + +#undef LP_SIZE +#undef LP_OP +#undef ASM_ADDR + +#undef RAX_LP +#undef RBP_LP +#undef RBX_LP +#undef RCX_LP +#undef RDI_LP +#undef RDX_LP +#undef RSP_LP +#undef RSI_LP +#undef R8_LP +#undef R9_LP +#undef R10_LP +#undef R11_LP +#undef R12_LP +#undef R13_LP +#undef R14_LP +#undef R15_LP + +#ifdef __ASSEMBLER__ + +# define LP_SIZE 4 + +# define LP_OP(insn) insn##l + +# define ASM_ADDR .long + +# define RAX_LP eax +# define RBP_LP ebp +# define RBX_LP ebx +# define RCX_LP ecx +# define RDI_LP edi +# define RDX_LP edx +# define RSI_LP esi +# define RSP_LP esp +# define R8_LP r8d +# define R9_LP r9d +# define R10_LP r10d +# define R11_LP r11d +# define R12_LP r12d +# define R13_LP r13d +# define R14_LP r14d +# define R15_LP r15d + +#else /* __ASSEMBLER__ */ + +# define LP_SIZE "4" + +# define LP_OP(insn) #insn "l" + +# define ASM_ADDR ".long" + +# define RAX_LP "eax" +# define RBP_LP "ebp" +# define RBX_LP "ebx" +# define RCX_LP "ecx" +# define RDI_LP "edi" +# define RDX_LP "edx" +# define RSI_LP "esi" +# define RSP_LP "esp" +# define R8_LP "r8d" +# define R9_LP "r9d" +# define R10_LP "r10d" +# define R11_LP "r11d" +# define R12_LP "r12d" +# define R13_LP "r13d" +# define R14_LP "r14d" +# define R15_LP "r15d" + +#endif /* __ASSEMBLER__ */
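
A note on the mask arithmetic in the wcsrchr.S loop earlier in this diff: pcmpeqd sets all four bytes of each matching wide character, and pmovmskb then packs one bit per byte, so wide character i of a 16-byte block owns bits 4*i through 4*i+3 of the 16-bit mask. That is why "test %ah, %ah" probes the third and fourth characters (bits 8-15), and why a mask such as $1 << 13 - 1 (bits 0-12) appears to keep only matches at or below the first byte of a zero found in the fourth character, which also covers a search for the terminator itself. A minimal C sketch of this bookkeeping; the helper name is hypothetical, not from glibc:

#include <stdint.h>

/* Index of the wide character holding the lowest set mask bit;
   MASK must be nonzero.  Four pmovmskb bits cover one wchar_t.  */
static inline int
lowest_match_wchar (uint16_t mask)
{
  return __builtin_ctz (mask) / 4;
}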
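
For reference, the three-instruction body of __wmemset_chk above implements the usual fortify contract: on entry %rdx holds the element count and %rcx the destination size, both counted in wide characters, so "jb __chk_fail" fires exactly when the buffer is too small. A hedged C equivalent of that logic (a sketch, not the glibc implementation):

#include <stddef.h>
#include <wchar.h>

extern void __chk_fail (void) __attribute__ ((__noreturn__));

/* Abort if the destination holds fewer than N wide characters,
   otherwise defer to the unchecked wmemset.  */
wchar_t *
wmemset_chk_sketch (wchar_t *s, wchar_t c, size_t n, size_t dstlen)
{
  if (dstlen < n)          /* cmpq %rdx, %rcx; jb __chk_fail */
    __chk_fail ();
  return wmemset (s, c, n);
}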
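
The x32 s_lrint.S and s_lrintf.S variants above are single register conversions because long int is 32 bits under the x32 ABI; the same fact lets the x32 Makefile alias the 32-bit lround to the 64-bit llround, since both results travel in the same 64-bit register. A small C sketch of the double case, assuming a GCC-style compiler targeting -mx32 (the function name is illustrative):

/* Round to long int using the current rounding mode; on x32 the
   destination register is 32 bits wide, mirroring ENTRY(__lrint).  */
long int
lrint_sketch (double x)
{
  long int r;
  __asm__ ("cvtsd2si %1, %0" : "=r" (r) : "x" (x));
  return r;
}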
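
Finally, the LP_* overrides in x32/sysdep.h above let assembly and inline asm shared with x86-64 stay pointer-width neutral. A hypothetical use, with the two relevant string macros reproduced so the sketch stands alone: under the x32 definitions this emits "movl %esp, %ebx", while the x86-64 sysdep.h would instead yield "movq %rsp, %rbx".

/* x32 branch of the string macros, copied from the header above.  */
#define LP_OP(insn) #insn "l"
#define RSP_LP "esp"
#define RBX_LP "ebx"

void
save_sp_sketch (void)
{
  /* String concatenation builds the template "movl %%esp, %%ebx".  */
  __asm__ volatile (LP_OP (mov) " %%" RSP_LP ", %%" RBX_LP
                    : /* no outputs */ : /* no inputs */ : "ebx");
}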