diff options
author | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-06-17 15:58:05 +0300 |
---|---|---|
committer | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-06-17 15:58:05 +0300 |
commit | 9c02f663f6b387b3905b629ffe584c9abf2030dc (patch) | |
tree | 587a88eca7b4c3abd7c5482c07c7a35778025785 | |
parent | 774488f88aeed6b838fe29c3c7561433c242a3c9 (diff) | |
download | glibc-9c02f663f6b387b3905b629ffe584c9abf2030dc.tar.gz |
Vector exp for x86_64 and tests.
Here is an implementation of vectorized exp containing SSE, AVX,
AVX2 and AVX512 versions according to Vector ABI
<https://groups.google.com/forum/#!topic/x86-64-abi/LmppCfN1rZ4>.
* bits/libm-simd-decl-stubs.h: Added stubs for exp.
* math/bits/mathcalls.h: Added exp declaration with __MATHCALL_VEC.
* sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added.
* sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration and asm
redirections for exp.
* sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files.
* sysdeps/x86_64/fpu/Versions: New versions added.
* sysdeps/x86_64/fpu/libm-test-ulps: Regenerated.
* sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added
build of SSE, AVX2 and AVX512 IFUNC versions.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S: New file.
* sysdeps/x86_64/fpu/svml_d_exp2_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_exp4_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S: New file.
* sysdeps/x86_64/fpu/svml_d_exp8_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_exp_data.S: New file.
* sysdeps/x86_64/fpu/svml_d_exp_data.h: New file.
* sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Added vector exp test.
* sysdeps/x86_64/fpu/test-double-vlen2.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-avx2.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen8.c: Likewise.
* NEWS: Mention addition of x86_64 vector exp.
30 files changed, 2329 insertions, 4 deletions
@@ -1,5 +1,37 @@ 2015-06-17 Andrew Senkevich <andrew.senkevich@intel.com> + * bits/libm-simd-decl-stubs.h: Added stubs for exp. + * math/bits/mathcalls.h: Added exp declaration with __MATHCALL_VEC. + * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added. + * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration and asm + redirections for exp. + * sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files. + * sysdeps/x86_64/fpu/Versions: New versions added. + * sysdeps/x86_64/fpu/libm-test-ulps: Regenerated. + * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added + build of SSE, AVX2 and AVX512 IFUNC versions. + * sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S: New file. + * sysdeps/x86_64/fpu/svml_d_exp2_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_exp4_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S: New file. + * sysdeps/x86_64/fpu/svml_d_exp8_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_exp_data.S: New file. + * sysdeps/x86_64/fpu/svml_d_exp_data.h: New file. + * sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Added vector exp test. + * sysdeps/x86_64/fpu/test-double-vlen2.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4-avx2.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen8.c: Likewise. + * NEWS: Mention addition of x86_64 vector exp. + * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New symbols added. 
* sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration and asm redirections for logf. @@ -53,7 +53,7 @@ Version 2.22 condition in some applications. * Added vector math library named libmvec with the following vectorized x86_64 - implementations: cos, cosf, sin, sinf, log, logf. + implementations: cos, cosf, sin, sinf, log, logf, exp. The library can be disabled with --disable-mathvec. Use of the functions is enabled with -fopenmp -ffast-math starting from -O1 for GCC version >= 4.9.0. The library is linked in as needed when using -lm (no need to specify -lmvec diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h index 6367b775a3..1a8bf6f262 100644 --- a/bits/libm-simd-decl-stubs.h +++ b/bits/libm-simd-decl-stubs.h @@ -45,4 +45,8 @@ #define __DECL_SIMD_logf #define __DECL_SIMD_logl +#define __DECL_SIMD_exp +#define __DECL_SIMD_expf +#define __DECL_SIMD_expl + #endif diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h index c41946fa0e..1d0c6bd347 100644 --- a/math/bits/mathcalls.h +++ b/math/bits/mathcalls.h @@ -97,7 +97,7 @@ __END_NAMESPACE_C99 _Mdouble_BEGIN_NAMESPACE /* Exponential function of X. */ -__MATHCALL (exp,, (_Mdouble_ __x)); +__MATHCALL_VEC (exp,, (_Mdouble_ __x)); /* Break VALUE into a normalized fraction and an integral power of 2. 
*/ __MATHCALL (frexp,, (_Mdouble_ __x, int *__exponent)); diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist index 3593edcbfa..ff9431fa10 100644 --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist @@ -1,18 +1,21 @@ GLIBC_2.22 GLIBC_2.22 A _ZGVbN2v_cos F + _ZGVbN2v_exp F _ZGVbN2v_log F _ZGVbN2v_sin F _ZGVbN4v_cosf F _ZGVbN4v_logf F _ZGVbN4v_sinf F _ZGVcN4v_cos F + _ZGVcN4v_exp F _ZGVcN4v_log F _ZGVcN4v_sin F _ZGVcN8v_cosf F _ZGVcN8v_logf F _ZGVcN8v_sinf F _ZGVdN4v_cos F + _ZGVdN4v_exp F _ZGVdN4v_log F _ZGVdN4v_sin F _ZGVdN8v_cosf F @@ -22,5 +25,6 @@ GLIBC_2.22 _ZGVeN16v_logf F _ZGVeN16v_sinf F _ZGVeN8v_cos F + _ZGVeN8v_exp F _ZGVeN8v_log F _ZGVeN8v_sin F diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h index 5c3e492ef9..9a353bc62a 100644 --- a/sysdeps/x86/fpu/bits/math-vector.h +++ b/sysdeps/x86/fpu/bits/math-vector.h @@ -40,6 +40,8 @@ # define __DECL_SIMD_log __DECL_SIMD_x86_64 # undef __DECL_SIMD_logf # define __DECL_SIMD_logf __DECL_SIMD_x86_64 +# undef __DECL_SIMD_exp +# define __DECL_SIMD_exp __DECL_SIMD_x86_64 /* Workaround to exclude unnecessary symbol aliases in libmvec while GCC creates the vector names based on scalar asm name. 
@@ -53,6 +55,10 @@ __asm__ ("_ZGVbN4v___logf_finite = _ZGVbN4v_logf"); __asm__ ("_ZGVcN8v___logf_finite = _ZGVcN8v_logf"); __asm__ ("_ZGVdN8v___logf_finite = _ZGVdN8v_logf"); __asm__ ("_ZGVeN16v___logf_finite = _ZGVeN16v_logf"); +__asm__ ("_ZGVbN2v___exp_finite = _ZGVbN2v_exp"); +__asm__ ("_ZGVcN4v___exp_finite = _ZGVcN4v_exp"); +__asm__ ("_ZGVdN4v___exp_finite = _ZGVdN4v_exp"); +__asm__ ("_ZGVeN8v___exp_finite = _ZGVeN8v_exp"); # endif #endif diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile index b610e3faf5..bd6d693d39 100644 --- a/sysdeps/x86_64/fpu/Makefile +++ b/sysdeps/x86_64/fpu/Makefile @@ -10,7 +10,8 @@ libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \ svml_d_log2_core svml_d_log4_core_avx svml_d_log4_core \ svml_d_log8_core svml_d_log_data svml_s_logf4_core \ svml_s_logf8_core_avx svml_s_logf8_core svml_s_logf16_core \ - svml_s_logf_data \ + svml_s_logf_data svml_d_exp2_core svml_d_exp4_core_avx \ + svml_d_exp4_core svml_d_exp8_core svml_d_exp_data \ init-arch endif diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions index ecd1b7023e..00e34e771b 100644 --- a/sysdeps/x86_64/fpu/Versions +++ b/sysdeps/x86_64/fpu/Versions @@ -3,6 +3,7 @@ libmvec { _ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos; _ZGVbN2v_sin; _ZGVcN4v_sin; _ZGVdN4v_sin; _ZGVeN8v_sin; _ZGVbN2v_log; _ZGVcN4v_log; _ZGVdN4v_log; _ZGVeN8v_log; + _ZGVbN2v_exp; _ZGVcN4v_exp; _ZGVdN4v_exp; _ZGVeN8v_exp; _ZGVbN4v_cosf; _ZGVcN8v_cosf; _ZGVdN8v_cosf; _ZGVeN16v_cosf; _ZGVbN4v_sinf; _ZGVcN8v_sinf; _ZGVdN8v_sinf; _ZGVeN16v_sinf; _ZGVbN4v_logf; _ZGVcN8v_logf; _ZGVdN8v_logf; _ZGVeN16v_logf; diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index 1812370ecc..45ebc04e58 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -1535,6 +1535,18 @@ idouble: 1 ildouble: 1 ldouble: 1 +Function: "exp_vlen2": +double: 1 + +Function: "exp_vlen4": +double: 1 + +Function: "exp_vlen4_avx2": +double: 
1 + +Function: "exp_vlen8": +double: 1 + Function: "expm1": double: 1 float: 1 diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index 5fc6ea3d23..d6355ae98d 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -62,5 +62,6 @@ libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \ svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \ svml_s_logf4_core_sse4 svml_s_logf8_core_avx2 \ - svml_s_logf16_core_avx512 + svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \ + svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S new file mode 100644 index 0000000000..ef3dc49a1c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S @@ -0,0 +1,38 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_exp) /* Selector for the 2-lane double exp: returns in %rax the address of the variant to use. */ + .type _ZGVbN2v_exp, @gnu_indirect_function /* mark the symbol as a GNU indirect function (IFUNC) */ + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f /* nonzero field at KIND_OFFSET: skip the init call (presumably the feature data is already set up) */ + call __init_cpu_features +1: leaq _ZGVbN2v_exp_sse4(%rip), %rax /* tentatively select the SSE4 variant */ + testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 2f /* SSE4_1 bit clear: use the fallback below */ + ret +2: leaq _ZGVbN2v_exp_sse2(%rip), %rax /* fallback: the _sse2 variant built from the include below */ + ret +END (_ZGVbN2v_exp) +libmvec_hidden_def (_ZGVbN2v_exp) + +#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2 /* rename so the included core file defines the _sse2 symbol */ +#include "../svml_d_exp2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S new file mode 100644 index 0000000000..1f5445924a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S @@ -0,0 +1,225 @@ +/* Function exp vectorized with SSE4. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVbN2v_exp_sse4) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial. + + The table lookup is skipped if k = 0. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_dexp_data@GOTPCREL(%rip), %r8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + pshufd $221, %xmm3, %xmm7 + movups __dbInvLn2(%r8), %xmm0 + +/* dK = X*dbInvLn2 */ + mulpd %xmm3, %xmm0 + movq __iAbsMask(%r8), %xmm5 + movq __iDomainRange(%r8), %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + pand %xmm5, %xmm7 + +/* iRangeMask = (iAbsX>iDomainRange) */ + pcmpgtd %xmm6, %xmm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + movmskps %xmm7, %eax + +/* dN = rint(X*2^k/Ln2) */ + xorps %xmm7, %xmm7 + movups __dbLn2hi(%r8), %xmm5 + movups __dbLn2lo(%r8), %xmm6 + roundpd $0, %xmm0, %xmm7 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + mulpd %xmm7, %xmm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + mulpd %xmm6, %xmm7 + movups __dbShifter(%r8), %xmm4 + +/* dM = X*dbInvLn2+dbShifter */ + addpd %xmm0, %xmm4 + movaps %xmm3, %xmm0 + subpd %xmm5, %xmm0 + subpd %xmm7, %xmm0 + movups __dPC2(%r8), %xmm5 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + mulpd %xmm0, %xmm5 + addpd __dPC1(%r8), %xmm5 + mulpd %xmm0, %xmm5 + movups __dPC0(%r8), %xmm6 + addpd %xmm6, %xmm5 + mulpd %xmm5, %xmm0 + movdqu __lIndexMask(%r8), %xmm2 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is 
the lower K bits of lM */ + movdqa %xmm2, %xmm1 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + pandn %xmm4, %xmm2 + pand %xmm4, %xmm1 + +/* lM = lM<<(52-K), 2^M */ + psllq $42, %xmm2 + +/* table lookup for dT[j] = 2^(j/2^k) */ + movd %xmm1, %edx + pextrw $4, %xmm1, %ecx + addpd %xmm0, %xmm6 + shll $3, %edx + shll $3, %ecx + movq (%r8,%rdx), %xmm0 + andl $3, %eax + movhpd (%r8,%rcx), %xmm0 + +/* 2^(j/2^k) * exp(r) */ + mulpd %xmm6, %xmm0 + +/* multiply by 2^M through integer add */ + paddq %xmm2, %xmm0 + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %cl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %eax, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %edx, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + 
jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call exp@PLT + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call exp@PLT + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_exp_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S new file mode 100644 index 0000000000..7f2ebdef67 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S @@ -0,0 +1,38 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_exp) /* Selector for the 4-lane double exp: returns in %rax the address of the variant to use. */ + .type _ZGVdN4v_exp, @gnu_indirect_function /* mark the symbol as a GNU indirect function (IFUNC) */ + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f /* nonzero field at KIND_OFFSET: skip the init call (presumably the feature data is already set up) */ + call __init_cpu_features +1: leaq _ZGVdN4v_exp_avx2(%rip), %rax /* tentatively select the AVX2 variant */ + testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) + jz 2f /* AVX2_Usable bit clear: use the fallback below */ + ret +2: leaq _ZGVdN4v_exp_sse_wrapper(%rip), %rax /* fallback: the _sse_wrapper variant built from the include below */ + ret +END (_ZGVdN4v_exp) +libmvec_hidden_def (_ZGVdN4v_exp) + +#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper /* rename so the included core file defines the _sse_wrapper symbol */ +#include "../svml_d_exp4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S new file mode 100644 index 0000000000..a34e267433 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S @@ -0,0 +1,212 @@ +/* Function exp vectorized with AVX2. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVdN4v_exp_avx2) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm2 + vmovupd __dbInvLn2(%rax), %ymm3 + vmovupd __dbShifter(%rax), %ymm1 + vmovupd __lIndexMask(%rax), %ymm4 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %ymm1, %ymm2, %ymm3 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vextracti128 $1, %ymm2, %xmm5 + vshufps $221, %xmm5, %xmm2, %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + vandps __iAbsMask(%rax), %xmm6, %xmm7 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %ymm1, %ymm3, %ymm6 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %xmm7, %xmm0 + vmovupd __dbLn2hi(%rax), %ymm1 + vmovupd __dPC0(%rax), %ymm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %xmm0, %ecx + vmovupd __dPC2(%rax), %ymm0 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovdqa %ymm2, %ymm5 + vfnmadd231pd %ymm6, %ymm1, %ymm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %ymm5, %ymm6 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K 
bits of lM */ + vandps %ymm4, %ymm3, %ymm1 + +/* table lookup for dT[j] = 2^(j/2^k) */ + vxorpd %ymm6, %ymm6, %ymm6 + vpcmpeqd %ymm5, %ymm5, %ymm5 + vgatherqpd %ymm5, (%rax,%ymm1,8), %ymm6 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandn %ymm3, %ymm4, %ymm3 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %ymm0, %ymm6, %ymm0 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %ymm3, %ymm4 + +/* multiply by 2^M through integer add */ + vpaddq %ymm4, %ymm0, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm2, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + 
+.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call exp@PLT + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call exp@PLT + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_exp_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S new file mode 100644 index 0000000000..8f837fbfb9 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S @@ -0,0 +1,39 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_exp) /* Selector for the 8-lane double exp: returns in %rax the address of the variant to use (SKX, then KNL, then the AVX2 wrapper). */ + .type _ZGVeN8v_exp, @gnu_indirect_function /* mark the symbol as a GNU indirect function (IFUNC) */ + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f /* forward reference to local label 1; a bare 1 would be an absolute address */ + call __init_cpu_features +1: leaq _ZGVeN8v_exp_skx(%rip), %rax /* first choice: SKX variant, kept when AVX512DQ_Usable is set */ + testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) + jnz 3f /* forward reference to local label 3 */ +2: leaq _ZGVeN8v_exp_knl(%rip), %rax /* second choice: KNL variant, kept when AVX512F_Usable is set */ + testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) + jnz 3f /* forward reference to local label 3 */ + leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax /* fallback: the _avx2_wrapper variant built from the include below */ +3: ret +END (_ZGVeN8v_exp) + +#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper /* rename so the included core file defines the _avx2_wrapper symbol */ +#include "../svml_d_exp8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S new file mode 100644 index 0000000000..049a7e49cd --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S @@ -0,0 +1,456 @@ +/* Function exp vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_exp_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_exp_knl) +#ifndef HAVE_AVX512_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + +/* iAbsX = iAbsX&iAbsMask */ + movl $255, %edx + vpmovqd %zmm1, %ymm2 + kmovw %edx, %k2 + +/* iRangeMask = (iAbsX>iDomainRange) */ + movl $-1, %ecx + +/* table lookup for dT[j] = 2^(j/2^k) */ + vpxord %zmm11, %zmm11, %zmm11 + vmovups __dbInvLn2(%rax), %zmm5 + vmovups __dbLn2hi(%rax), %zmm7 + kxnorw %k3, %k3, %k3 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd __dbShifter(%rax), %zmm0, %zmm5 + vmovups __dPC2(%rax), %zmm12 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd __dbShifter(%rax), %zmm5, %zmm9 + vmovups __lIndexMask(%rax), %zmm4 + vfnmadd231pd %zmm9, %zmm7, %zmm8 + vpandd __iAbsMask(%rax), %zmm2, %zmm2{%k2} + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm4, %zmm5, %zmm10 + vgatherqpd (%rax,%zmm10,8), %zmm11{%k3} + vpcmpgtd __iDomainRange(%rax), %zmm2, %k1{%k2} + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm5, %zmm4, %zmm6 + 
vpbroadcastd %ecx, %zmm3{%k1}{z} + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm6, %zmm14 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm8, %zmm9 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vptestmd %zmm3, %zmm3, %k0{%k2} + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm9, %zmm12 + kmovw %k0, %ecx + movzbl %cl, %ecx + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm12, %zmm11, %zmm13 + +/* multiply by 2^M through integer add */ + vpaddq %zmm14, %zmm13, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + 
+.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call exp@PLT + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call exp@PLT + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_exp_knl) + +ENTRY (_ZGVeN8v_exp_skx) +#ifndef HAVE_AVX512_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. 
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* table lookup for dT[j] = 2^(j/2^k) */ + kxnorw %k1, %k1, %k1 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + vmovups __dbInvLn2(%rax), %zmm7 + vmovups __dbShifter(%rax), %zmm5 + vmovups __lIndexMask(%rax), %zmm6 + vmovups __dbLn2hi(%rax), %zmm9 + vmovups __dPC0(%rax), %zmm12 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %zmm5, %zmm0, %zmm7 + vpmovqd %zmm1, %ymm2 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %zmm5, %zmm7, %zmm11 + +/* iAbsX = iAbsX&iAbsMask */ + vpand __iAbsMask(%rax), %ymm2, %ymm3 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm10 + vfnmadd231pd %zmm11, %zmm9, %zmm10 + vmovups __dPC2(%rax), %zmm9 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm10, %zmm11 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %ymm3, %ymm4 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %ymm4, %ecx + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm6, %zmm7, %zmm13 + vpmovqd %zmm13, %ymm14 + vpxord %zmm15, %zmm15, %zmm15 + vgatherdpd (%rax,%ymm14,8), %zmm15{%k1} + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm9, %zmm15, %zmm10 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm7, %zmm6, %zmm8 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm8, %zmm1 + +/* multiply by 2^M through integer add */ + vpaddq %zmm1, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + 
cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 
+ cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + call exp@PLT + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + call exp@PLT + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8v_exp_skx) diff --git a/sysdeps/x86_64/fpu/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/svml_d_exp2_core.S new file mode 100644 index 0000000000..ca3dd76364 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_exp2_core.S @@ -0,0 +1,29 @@ +/* Function exp vectorized with SSE2. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_exp) +WRAPPER_IMPL_SSE2 exp +END (_ZGVbN2v_exp) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_exp) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/svml_d_exp4_core.S new file mode 100644 index 0000000000..d497811980 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_exp4_core.S @@ -0,0 +1,29 @@ +/* Function exp vectorized with AVX2, wrapper version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_exp) +WRAPPER_IMPL_AVX _ZGVbN2v_exp +END (_ZGVdN4v_exp) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_exp) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S new file mode 100644 index 0000000000..5dd2f6cd17 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S @@ -0,0 +1,25 @@ +/* Function exp vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_exp) +WRAPPER_IMPL_AVX _ZGVbN2v_exp +END (_ZGVcN4v_exp) diff --git a/sysdeps/x86_64/fpu/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/svml_d_exp8_core.S new file mode 100644 index 0000000000..3e273a3e71 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_exp8_core.S @@ -0,0 +1,25 @@ +/* Function exp vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_exp) +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +END (_ZGVeN8v_exp) diff --git a/sysdeps/x86_64/fpu/svml_d_exp_data.S b/sysdeps/x86_64/fpu/svml_d_exp_data.S new file mode 100644 index 0000000000..66fa3b88d7 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_exp_data.S @@ -0,0 +1,1088 @@ +/* Data for vector function exp. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_exp_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function exp. + * The table may contain polynomial, reduction, lookup + * coefficients and other constants obtained through different + * methods of research and experimental work. 
*/ + .globl __svml_dexp_data +__svml_dexp_data: + +/* Lookup table of 2^(j/2^K): */ +.if .-__svml_dexp_data != __dbT +.err +.endif + .quad 0x3ff0000000000000 + .quad 0x3ff002c605e2e8cf + .quad 0x3ff0058c86da1c0a + .quad 0x3ff0085382faef83 + .quad 0x3ff00b1afa5abcbf + .quad 0x3ff00de2ed0ee0f5 + .quad 0x3ff010ab5b2cbd11 + .quad 0x3ff0137444c9b5b5 + .quad 0x3ff0163da9fb3335 + .quad 0x3ff019078ad6a19f + .quad 0x3ff01bd1e77170b4 + .quad 0x3ff01e9cbfe113ef + .quad 0x3ff02168143b0281 + .quad 0x3ff02433e494b755 + .quad 0x3ff027003103b10e + .quad 0x3ff029ccf99d720a + .quad 0x3ff02c9a3e778061 + .quad 0x3ff02f67ffa765e6 + .quad 0x3ff032363d42b027 + .quad 0x3ff03504f75ef071 + .quad 0x3ff037d42e11bbcc + .quad 0x3ff03aa3e170aafe + .quad 0x3ff03d7411915a8a + .quad 0x3ff04044be896ab6 + .quad 0x3ff04315e86e7f85 + .quad 0x3ff045e78f5640b9 + .quad 0x3ff048b9b35659d8 + .quad 0x3ff04b8c54847a28 + .quad 0x3ff04e5f72f654b1 + .quad 0x3ff051330ec1a03f + .quad 0x3ff0540727fc1762 + .quad 0x3ff056dbbebb786b + .quad 0x3ff059b0d3158574 + .quad 0x3ff05c866520045b + .quad 0x3ff05f5c74f0bec2 + .quad 0x3ff06233029d8216 + .quad 0x3ff0650a0e3c1f89 + .quad 0x3ff067e197e26c14 + .quad 0x3ff06ab99fa6407c + .quad 0x3ff06d92259d794d + .quad 0x3ff0706b29ddf6de + .quad 0x3ff07344ac7d9d51 + .quad 0x3ff0761ead925493 + .quad 0x3ff078f92d32085d + .quad 0x3ff07bd42b72a836 + .quad 0x3ff07eafa86a2771 + .quad 0x3ff0818ba42e7d30 + .quad 0x3ff084681ed5a462 + .quad 0x3ff0874518759bc8 + .quad 0x3ff08a22912465f2 + .quad 0x3ff08d0088f8093f + .quad 0x3ff08fdf00068fe2 + .quad 0x3ff092bdf66607e0 + .quad 0x3ff0959d6c2c830d + .quad 0x3ff0987d61701716 + .quad 0x3ff09b5dd646dd77 + .quad 0x3ff09e3ecac6f383 + .quad 0x3ff0a1203f067a63 + .quad 0x3ff0a402331b9715 + .quad 0x3ff0a6e4a71c726e + .quad 0x3ff0a9c79b1f3919 + .quad 0x3ff0acab0f3a1b9c + .quad 0x3ff0af8f03834e52 + .quad 0x3ff0b27378110974 + .quad 0x3ff0b5586cf9890f + .quad 0x3ff0b83de2530d11 + .quad 0x3ff0bb23d833d93f + .quad 0x3ff0be0a4eb2353b + .quad 0x3ff0c0f145e46c85 + 
.quad 0x3ff0c3d8bde0ce7a + .quad 0x3ff0c6c0b6bdae53 + .quad 0x3ff0c9a93091632a + .quad 0x3ff0cc922b7247f7 + .quad 0x3ff0cf7ba776bb94 + .quad 0x3ff0d265a4b520ba + .quad 0x3ff0d5502343de02 + .quad 0x3ff0d83b23395dec + .quad 0x3ff0db26a4ac0ed5 + .quad 0x3ff0de12a7b26300 + .quad 0x3ff0e0ff2c62d096 + .quad 0x3ff0e3ec32d3d1a2 + .quad 0x3ff0e6d9bb1be415 + .quad 0x3ff0e9c7c55189c6 + .quad 0x3ff0ecb6518b4874 + .quad 0x3ff0efa55fdfa9c5 + .quad 0x3ff0f294f0653b45 + .quad 0x3ff0f58503328e6d + .quad 0x3ff0f875985e389b + .quad 0x3ff0fb66affed31b + .quad 0x3ff0fe584a2afb21 + .quad 0x3ff1014a66f951ce + .quad 0x3ff1043d06807c2f + .quad 0x3ff1073028d7233e + .quad 0x3ff10a23ce13f3e2 + .quad 0x3ff10d17f64d9ef1 + .quad 0x3ff1100ca19ad92f + .quad 0x3ff11301d0125b51 + .quad 0x3ff115f781cae1fa + .quad 0x3ff118edb6db2dc1 + .quad 0x3ff11be46f5a032c + .quad 0x3ff11edbab5e2ab6 + .quad 0x3ff121d36afe70c9 + .quad 0x3ff124cbae51a5c8 + .quad 0x3ff127c4756e9e05 + .quad 0x3ff12abdc06c31cc + .quad 0x3ff12db78f613d5b + .quad 0x3ff130b1e264a0e9 + .quad 0x3ff133acb98d40a2 + .quad 0x3ff136a814f204ab + .quad 0x3ff139a3f4a9d922 + .quad 0x3ff13ca058cbae1e + .quad 0x3ff13f9d416e77af + .quad 0x3ff1429aaea92de0 + .quad 0x3ff14598a092ccb7 + .quad 0x3ff1489717425438 + .quad 0x3ff14b9612cec861 + .quad 0x3ff14e95934f312e + .quad 0x3ff1519598da9a9a + .quad 0x3ff154962388149e + .quad 0x3ff15797336eb333 + .quad 0x3ff15a98c8a58e51 + .quad 0x3ff15d9ae343c1f2 + .quad 0x3ff1609d83606e12 + .quad 0x3ff163a0a912b6ac + .quad 0x3ff166a45471c3c2 + .quad 0x3ff169a88594c157 + .quad 0x3ff16cad3c92df73 + .quad 0x3ff16fb279835224 + .quad 0x3ff172b83c7d517b + .quad 0x3ff175be85981992 + .quad 0x3ff178c554eaea89 + .quad 0x3ff17bccaa8d0888 + .quad 0x3ff17ed48695bbc0 + .quad 0x3ff181dce91c506a + .quad 0x3ff184e5d23816c9 + .quad 0x3ff187ef4200632b + .quad 0x3ff18af9388c8dea + .quad 0x3ff18e03b5f3f36b + .quad 0x3ff1910eba4df41f + .quad 0x3ff1941a45b1f487 + .quad 0x3ff1972658375d2f + .quad 0x3ff19a32f1f59ab4 + .quad 0x3ff19d4013041dc2 + 
.quad 0x3ff1a04dbb7a5b13 + .quad 0x3ff1a35beb6fcb75 + .quad 0x3ff1a66aa2fbebc7 + .quad 0x3ff1a979e2363cf8 + .quad 0x3ff1ac89a936440d + .quad 0x3ff1af99f8138a1c + .quad 0x3ff1b2aacee59c53 + .quad 0x3ff1b5bc2dc40bf0 + .quad 0x3ff1b8ce14c66e4c + .quad 0x3ff1bbe084045cd4 + .quad 0x3ff1bef37b95750b + .quad 0x3ff1c206fb91588f + .quad 0x3ff1c51b040fad15 + .quad 0x3ff1c82f95281c6b + .quad 0x3ff1cb44aef2547a + .quad 0x3ff1ce5a51860746 + .quad 0x3ff1d1707cfaeaed + .quad 0x3ff1d4873168b9aa + .quad 0x3ff1d79e6ee731d7 + .quad 0x3ff1dab6358e15e8 + .quad 0x3ff1ddce85752c71 + .quad 0x3ff1e0e75eb44027 + .quad 0x3ff1e400c1631fdb + .quad 0x3ff1e71aad999e82 + .quad 0x3ff1ea35236f9330 + .quad 0x3ff1ed5022fcd91d + .quad 0x3ff1f06bac594fa0 + .quad 0x3ff1f387bf9cda38 + .quad 0x3ff1f6a45cdf6085 + .quad 0x3ff1f9c18438ce4d + .quad 0x3ff1fcdf35c1137a + .quad 0x3ff1fffd7190241e + .quad 0x3ff2031c37bdf872 + .quad 0x3ff2063b88628cd6 + .quad 0x3ff2095b6395e1d2 + .quad 0x3ff20c7bc96ffc18 + .quad 0x3ff20f9cba08e483 + .quad 0x3ff212be3578a819 + .quad 0x3ff215e03bd7580c + .quad 0x3ff21902cd3d09b9 + .quad 0x3ff21c25e9c1d6aa + .quad 0x3ff21f49917ddc96 + .quad 0x3ff2226dc4893d64 + .quad 0x3ff2259282fc1f27 + .quad 0x3ff228b7cceeac25 + .quad 0x3ff22bdda27912d1 + .quad 0x3ff22f0403b385d2 + .quad 0x3ff2322af0b63bff + .quad 0x3ff2355269997062 + .quad 0x3ff2387a6e756238 + .quad 0x3ff23ba2ff6254f4 + .quad 0x3ff23ecc1c78903a + .quad 0x3ff241f5c5d05fe6 + .quad 0x3ff2451ffb82140a + .quad 0x3ff2484abda600ef + .quad 0x3ff24b760c547f15 + .quad 0x3ff24ea1e7a5eb35 + .quad 0x3ff251ce4fb2a63f + .quad 0x3ff254fb44931561 + .quad 0x3ff25828c65fa1ff + .quad 0x3ff25b56d530b9bc + .quad 0x3ff25e85711ece75 + .quad 0x3ff261b49a425645 + .quad 0x3ff264e450b3cb82 + .quad 0x3ff26814948bacc3 + .quad 0x3ff26b4565e27cdd + .quad 0x3ff26e76c4d0c2e5 + .quad 0x3ff271a8b16f0a30 + .quad 0x3ff274db2bd5e254 + .quad 0x3ff2780e341ddf29 + .quad 0x3ff27b41ca5f98cb + .quad 0x3ff27e75eeb3ab98 + .quad 0x3ff281aaa132b832 + .quad 0x3ff284dfe1f56381 + 
.quad 0x3ff28815b11456b1 + .quad 0x3ff28b4c0ea83f36 + .quad 0x3ff28e82fac9ceca + .quad 0x3ff291ba7591bb70 + .quad 0x3ff294f27f18bf72 + .quad 0x3ff2982b17779965 + .quad 0x3ff29b643ec70c27 + .quad 0x3ff29e9df51fdee1 + .quad 0x3ff2a1d83a9add08 + .quad 0x3ff2a5130f50d65c + .quad 0x3ff2a84e735a9eec + .quad 0x3ff2ab8a66d10f13 + .quad 0x3ff2aec6e9cd037b + .quad 0x3ff2b203fc675d1f + .quad 0x3ff2b5419eb90148 + .quad 0x3ff2b87fd0dad990 + .quad 0x3ff2bbbe92e5d3e3 + .quad 0x3ff2befde4f2e280 + .quad 0x3ff2c23dc71afbf7 + .quad 0x3ff2c57e39771b2f + .quad 0x3ff2c8bf3c203f5f + .quad 0x3ff2cc00cf2f6c18 + .quad 0x3ff2cf42f2bda93d + .quad 0x3ff2d285a6e4030b + .quad 0x3ff2d5c8ebbb8a15 + .quad 0x3ff2d90cc15d5346 + .quad 0x3ff2dc5127e277e3 + .quad 0x3ff2df961f641589 + .quad 0x3ff2e2dba7fb4e33 + .quad 0x3ff2e621c1c14833 + .quad 0x3ff2e9686ccf2e3b + .quad 0x3ff2ecafa93e2f56 + .quad 0x3ff2eff777277ef0 + .quad 0x3ff2f33fd6a454d2 + .quad 0x3ff2f688c7cded23 + .quad 0x3ff2f9d24abd886b + .quad 0x3ff2fd1c5f8c6b93 + .quad 0x3ff300670653dfe4 + .quad 0x3ff303b23f2d330b + .quad 0x3ff306fe0a31b715 + .quad 0x3ff30a4a677ac276 + .quad 0x3ff30d975721b004 + .quad 0x3ff310e4d93fdefb + .quad 0x3ff31432edeeb2fd + .quad 0x3ff3178195479413 + .quad 0x3ff31ad0cf63eeac + .quad 0x3ff31e209c5d33a0 + .quad 0x3ff32170fc4cd831 + .quad 0x3ff324c1ef4c560a + .quad 0x3ff3281375752b40 + .quad 0x3ff32b658ee0da54 + .quad 0x3ff32eb83ba8ea32 + .quad 0x3ff3320b7be6e633 + .quad 0x3ff3355f4fb45e20 + .quad 0x3ff338b3b72ae62d + .quad 0x3ff33c08b26416ff + .quad 0x3ff33f5e41798daa + .quad 0x3ff342b46484ebb4 + .quad 0x3ff3460b1b9fd712 + .quad 0x3ff3496266e3fa2d + .quad 0x3ff34cba466b03e1 + .quad 0x3ff35012ba4ea77d + .quad 0x3ff3536bc2a89cc4 + .quad 0x3ff356c55f929ff1 + .quad 0x3ff35a1f912671b1 + .quad 0x3ff35d7a577dd72b + .quad 0x3ff360d5b2b299fc + .quad 0x3ff36431a2de883b + .quad 0x3ff3678e281b7475 + .quad 0x3ff36aeb428335b4 + .quad 0x3ff36e48f22fa77c + .quad 0x3ff371a7373aa9cb + .quad 0x3ff3750611be211c + .quad 0x3ff3786581d3f669 + 
.quad 0x3ff37bc587961726 + .quad 0x3ff37f26231e754a + .quad 0x3ff3828754870746 + .quad 0x3ff385e91be9c811 + .quad 0x3ff3894b7960b71f + .quad 0x3ff38cae6d05d866 + .quad 0x3ff39011f6f3345f + .quad 0x3ff393761742d808 + .quad 0x3ff396dace0ed4e1 + .quad 0x3ff39a401b7140ef + .quad 0x3ff39da5ff8436bc + .quad 0x3ff3a10c7a61d55b + .quad 0x3ff3a4738c244064 + .quad 0x3ff3a7db34e59ff7 + .quad 0x3ff3ab4374c020bd + .quad 0x3ff3aeac4bcdf3ea + .quad 0x3ff3b215ba294f39 + .quad 0x3ff3b57fbfec6cf4 + .quad 0x3ff3b8ea5d318bef + .quad 0x3ff3bc559212ef89 + .quad 0x3ff3bfc15eaadfb1 + .quad 0x3ff3c32dc313a8e5 + .quad 0x3ff3c69abf679c2e + .quad 0x3ff3ca0853c10f28 + .quad 0x3ff3cd76803a5c00 + .quad 0x3ff3d0e544ede173 + .quad 0x3ff3d454a1f602d0 + .quad 0x3ff3d7c4976d27fa + .quad 0x3ff3db35256dbd67 + .quad 0x3ff3dea64c123422 + .quad 0x3ff3e2180b7501cc + .quad 0x3ff3e58a63b0a09b + .quad 0x3ff3e8fd54df8f5c + .quad 0x3ff3ec70df1c5175 + .quad 0x3ff3efe502816ee3 + .quad 0x3ff3f359bf29743f + .quad 0x3ff3f6cf152ef2b8 + .quad 0x3ff3fa4504ac801c + .quad 0x3ff3fdbb8dbcb6d2 + .quad 0x3ff40132b07a35df + .quad 0x3ff404aa6cffa0e5 + .quad 0x3ff40822c367a024 + .quad 0x3ff40b9bb3cce07c + .quad 0x3ff40f153e4a136a + .quad 0x3ff4128f62f9ef0e + .quad 0x3ff4160a21f72e2a + .quad 0x3ff419857b5c901f + .quad 0x3ff41d016f44d8f5 + .quad 0x3ff4207dfdcad153 + .quad 0x3ff423fb2709468a + .quad 0x3ff42778eb1b0a8b + .quad 0x3ff42af74a1af3f1 + .quad 0x3ff42e764423ddfd + .quad 0x3ff431f5d950a897 + .quad 0x3ff4357609bc3850 + .quad 0x3ff438f6d5817663 + .quad 0x3ff43c783cbb50b4 + .quad 0x3ff43ffa3f84b9d4 + .quad 0x3ff4437cddf8a8fe + .quad 0x3ff4470018321a1a + .quad 0x3ff44a83ee4c0dbd + .quad 0x3ff44e086061892d + .quad 0x3ff4518d6e8d965b + .quad 0x3ff4551318eb43ec + .quad 0x3ff458995f95a532 + .quad 0x3ff45c2042a7d232 + .quad 0x3ff45fa7c23ce7a4 + .quad 0x3ff4632fde7006f4 + .quad 0x3ff466b8975c563e + .quad 0x3ff46a41ed1d0057 + .quad 0x3ff46dcbdfcd34c8 + .quad 0x3ff471566f8827d0 + .quad 0x3ff474e19c691265 + .quad 0x3ff4786d668b3237 + 
.quad 0x3ff47bf9ce09c9ab + .quad 0x3ff47f86d3001fe5 + .quad 0x3ff48314758980bf + .quad 0x3ff486a2b5c13cd0 + .quad 0x3ff48a3193c2a96c + .quad 0x3ff48dc10fa920a1 + .quad 0x3ff491512990013f + .quad 0x3ff494e1e192aed2 + .quad 0x3ff4987337cc91a5 + .quad 0x3ff49c052c5916c4 + .quad 0x3ff49f97bf53affd + .quad 0x3ff4a32af0d7d3de + .quad 0x3ff4a6bec100fdba + .quad 0x3ff4aa532feaada6 + .quad 0x3ff4ade83db0687a + .quad 0x3ff4b17dea6db7d7 + .quad 0x3ff4b514363e2a20 + .quad 0x3ff4b8ab213d5283 + .quad 0x3ff4bc42ab86c8f1 + .quad 0x3ff4bfdad5362a27 + .quad 0x3ff4c3739e6717aa + .quad 0x3ff4c70d073537ca + .quad 0x3ff4caa70fbc35a1 + .quad 0x3ff4ce41b817c114 + .quad 0x3ff4d1dd00638ed8 + .quad 0x3ff4d578e8bb586b + .quad 0x3ff4d915713adc1e + .quad 0x3ff4dcb299fddd0d + .quad 0x3ff4e05063202327 + .quad 0x3ff4e3eeccbd7b2a + .quad 0x3ff4e78dd6f1b6a6 + .quad 0x3ff4eb2d81d8abff + .quad 0x3ff4eecdcd8e3669 + .quad 0x3ff4f26eba2e35f0 + .quad 0x3ff4f61047d48f73 + .quad 0x3ff4f9b2769d2ca7 + .quad 0x3ff4fd5546a3fc17 + .quad 0x3ff500f8b804f127 + .quad 0x3ff5049ccadc0412 + .quad 0x3ff508417f4531ee + .quad 0x3ff50be6d55c7ca9 + .quad 0x3ff50f8ccd3deb0d + .quad 0x3ff51333670588bf + .quad 0x3ff516daa2cf6642 + .quad 0x3ff51a8280b798f4 + .quad 0x3ff51e2b00da3b14 + .quad 0x3ff521d423536bbe + .quad 0x3ff5257de83f4eef + .quad 0x3ff529284fba0d84 + .quad 0x3ff52cd359dfd53d + .quad 0x3ff5307f06ccd8ba + .quad 0x3ff5342b569d4f82 + .quad 0x3ff537d8496d75fc + .quad 0x3ff53b85df598d78 + .quad 0x3ff53f34187ddc28 + .quad 0x3ff542e2f4f6ad27 + .quad 0x3ff5469274e05078 + .quad 0x3ff54a4298571b06 + .quad 0x3ff54df35f7766a3 + .quad 0x3ff551a4ca5d920f + .quad 0x3ff55556d92600f1 + .quad 0x3ff559098bed1bdf + .quad 0x3ff55cbce2cf505b + .quad 0x3ff56070dde910d2 + .quad 0x3ff564257d56d4a2 + .quad 0x3ff567dac1351819 + .quad 0x3ff56b90a9a05c72 + .quad 0x3ff56f4736b527da + .quad 0x3ff572fe68900573 + .quad 0x3ff576b63f4d854c + .quad 0x3ff57a6ebb0a3c6d + .quad 0x3ff57e27dbe2c4cf + .quad 0x3ff581e1a1f3bd60 + .quad 0x3ff5859c0d59ca07 + 
.quad 0x3ff589571e31939f + .quad 0x3ff58d12d497c7fd + .quad 0x3ff590cf30a919ed + .quad 0x3ff5948c32824135 + .quad 0x3ff59849da3ffa96 + .quad 0x3ff59c0827ff07cc + .quad 0x3ff59fc71bdc2f8e + .quad 0x3ff5a386b5f43d92 + .quad 0x3ff5a746f664028b + .quad 0x3ff5ab07dd485429 + .quad 0x3ff5aec96abe0d1f + .quad 0x3ff5b28b9ee20d1e + .quad 0x3ff5b64e79d138d8 + .quad 0x3ff5ba11fba87a03 + .quad 0x3ff5bdd62484bf56 + .quad 0x3ff5c19af482fc8f + .quad 0x3ff5c5606bc02a6d + .quad 0x3ff5c9268a5946b7 + .quad 0x3ff5cced506b543a + .quad 0x3ff5d0b4be135acc + .quad 0x3ff5d47cd36e6747 + .quad 0x3ff5d84590998b93 + .quad 0x3ff5dc0ef5b1de9e + .quad 0x3ff5dfd902d47c65 + .quad 0x3ff5e3a3b81e85ec + .quad 0x3ff5e76f15ad2148 + .quad 0x3ff5eb3b1b9d799a + .quad 0x3ff5ef07ca0cbf0f + .quad 0x3ff5f2d5211826e8 + .quad 0x3ff5f6a320dceb71 + .quad 0x3ff5fa71c9784c0b + .quad 0x3ff5fe411b078d26 + .quad 0x3ff6021115a7f849 + .quad 0x3ff605e1b976dc09 + .quad 0x3ff609b306918c13 + .quad 0x3ff60d84fd15612a + .quad 0x3ff611579d1fb925 + .quad 0x3ff6152ae6cdf6f4 + .quad 0x3ff618feda3d829f + .quad 0x3ff61cd3778bc944 + .quad 0x3ff620a8bed63d1f + .quad 0x3ff6247eb03a5585 + .quad 0x3ff628554bd58ee5 + .quad 0x3ff62c2c91c56acd + .quad 0x3ff6300482276fe8 + .quad 0x3ff633dd1d1929fd + .quad 0x3ff637b662b829f5 + .quad 0x3ff63b90532205d8 + .quad 0x3ff63f6aee7458cd + .quad 0x3ff6434634ccc320 + .quad 0x3ff647222648ea3d + .quad 0x3ff64afec30678b7 + .quad 0x3ff64edc0b231e41 + .quad 0x3ff652b9febc8fb7 + .quad 0x3ff656989df08719 + .quad 0x3ff65a77e8dcc390 + .quad 0x3ff65e57df9f096b + .quad 0x3ff6623882552225 + .quad 0x3ff66619d11cdc5f + .quad 0x3ff669fbcc140be7 + .quad 0x3ff66dde735889b8 + .quad 0x3ff671c1c70833f6 + .quad 0x3ff675a5c740edf5 + .quad 0x3ff6798a7420a036 + .quad 0x3ff67d6fcdc5386a + .quad 0x3ff68155d44ca973 + .quad 0x3ff6853c87d4eb62 + .quad 0x3ff68923e87bfb7a + .quad 0x3ff68d0bf65fdc34 + .quad 0x3ff690f4b19e9538 + .quad 0x3ff694de1a563367 + .quad 0x3ff698c830a4c8d4 + .quad 0x3ff69cb2f4a86cca + .quad 0x3ff6a09e667f3bcd + 
.quad 0x3ff6a48a86475795 + .quad 0x3ff6a877541ee718 + .quad 0x3ff6ac64d0241683 + .quad 0x3ff6b052fa75173e + .quad 0x3ff6b441d3301fee + .quad 0x3ff6b8315a736c75 + .quad 0x3ff6bc21905d3df0 + .quad 0x3ff6c012750bdabf + .quad 0x3ff6c404089d8e7d + .quad 0x3ff6c7f64b30aa09 + .quad 0x3ff6cbe93ce38381 + .quad 0x3ff6cfdcddd47645 + .quad 0x3ff6d3d12e21e2fb + .quad 0x3ff6d7c62dea2f8a + .quad 0x3ff6dbbbdd4bc720 + .quad 0x3ff6dfb23c651a2f + .quad 0x3ff6e3a94b549e71 + .quad 0x3ff6e7a10a38cee8 + .quad 0x3ff6eb9979302bdd + .quad 0x3ff6ef9298593ae5 + .quad 0x3ff6f38c67d286dd + .quad 0x3ff6f786e7ba9fef + .quad 0x3ff6fb8218301b90 + .quad 0x3ff6ff7df9519484 + .quad 0x3ff7037a8b3daadb + .quad 0x3ff70777ce1303f6 + .quad 0x3ff70b75c1f04a84 + .quad 0x3ff70f7466f42e87 + .quad 0x3ff71373bd3d6551 + .quad 0x3ff71773c4eaa988 + .quad 0x3ff71b747e1abb24 + .quad 0x3ff71f75e8ec5f74 + .quad 0x3ff72378057e611a + .quad 0x3ff7277ad3ef9011 + .quad 0x3ff72b7e545ec1a8 + .quad 0x3ff72f8286ead08a + .quad 0x3ff733876bb29cb8 + .quad 0x3ff7378d02d50b8f + .quad 0x3ff73b934c7107c7 + .quad 0x3ff73f9a48a58174 + .quad 0x3ff743a1f7916e05 + .quad 0x3ff747aa5953c849 + .quad 0x3ff74bb36e0b906d + .quad 0x3ff74fbd35d7cbfd + .quad 0x3ff753c7b0d785e8 + .quad 0x3ff757d2df29ce7c + .quad 0x3ff75bdec0edbb6b + .quad 0x3ff75feb564267c9 + .quad 0x3ff763f89f46f40f + .quad 0x3ff768069c1a861d + .quad 0x3ff76c154cdc4937 + .quad 0x3ff77024b1ab6e09 + .quad 0x3ff77434caa72aa7 + .quad 0x3ff7784597eeba8f + .quad 0x3ff77c5719a15ea6 + .quad 0x3ff780694fde5d3f + .quad 0x3ff7847c3ac50219 + .quad 0x3ff7888fda749e5d + .quad 0x3ff78ca42f0c88a5 + .quad 0x3ff790b938ac1cf6 + .quad 0x3ff794cef772bcc9 + .quad 0x3ff798e56b7fcf03 + .quad 0x3ff79cfc94f2bfff + .quad 0x3ff7a11473eb0187 + .quad 0x3ff7a52d08880ad9 + .quad 0x3ff7a94652e958aa + .quad 0x3ff7ad60532e6d20 + .quad 0x3ff7b17b0976cfdb + .quad 0x3ff7b59675e20def + .quad 0x3ff7b9b2988fb9ec + .quad 0x3ff7bdcf719f6bd7 + .quad 0x3ff7c1ed0130c132 + .quad 0x3ff7c60b47635cf9 + .quad 0x3ff7ca2a4456e7a3 + 
.quad 0x3ff7ce49f82b0f24 + .quad 0x3ff7d26a62ff86f0 + .quad 0x3ff7d68b84f407f8 + .quad 0x3ff7daad5e2850ac + .quad 0x3ff7decfeebc24fe + .quad 0x3ff7e2f336cf4e62 + .quad 0x3ff7e71736819bcd + .quad 0x3ff7eb3bedf2e1b9 + .quad 0x3ff7ef615d42fa24 + .quad 0x3ff7f3878491c491 + .quad 0x3ff7f7ae63ff260a + .quad 0x3ff7fbd5fbab091f + .quad 0x3ff7fffe4bb55dec + .quad 0x3ff80427543e1a12 + .quad 0x3ff80851156538be + .quad 0x3ff80c7b8f4abaa9 + .quad 0x3ff810a6c20ea617 + .quad 0x3ff814d2add106d9 + .quad 0x3ff818ff52b1ee50 + .quad 0x3ff81d2cb0d1736a + .quad 0x3ff8215ac84fb2a6 + .quad 0x3ff82589994cce13 + .quad 0x3ff829b923e8ed53 + .quad 0x3ff82de968443d9a + .quad 0x3ff8321a667ef1b2 + .quad 0x3ff8364c1eb941f7 + .quad 0x3ff83a7e91136c5d + .quad 0x3ff83eb1bdadb46d + .quad 0x3ff842e5a4a8634a + .quad 0x3ff8471a4623c7ad + .quad 0x3ff84b4fa24035ea + .quad 0x3ff84f85b91e07f1 + .quad 0x3ff853bc8add9d4c + .quad 0x3ff857f4179f5b21 + .quad 0x3ff85c2c5f83ac35 + .quad 0x3ff8606562ab00ec + .quad 0x3ff8649f2135cf48 + .quad 0x3ff868d99b4492ed + .quad 0x3ff86d14d0f7cd1d + .quad 0x3ff87150c27004c2 + .quad 0x3ff8758d6fcdc666 + .quad 0x3ff879cad931a436 + .quad 0x3ff87e08febc3608 + .quad 0x3ff88247e08e1957 + .quad 0x3ff886877ec7f144 + .quad 0x3ff88ac7d98a6699 + .quad 0x3ff88f08f0f627cb + .quad 0x3ff8934ac52be8f7 + .quad 0x3ff8978d564c63e7 + .quad 0x3ff89bd0a478580f + .quad 0x3ff8a014afd08a94 + .quad 0x3ff8a4597875c644 + .quad 0x3ff8a89efe88dba1 + .quad 0x3ff8ace5422aa0db + .quad 0x3ff8b12c437bf1d4 + .quad 0x3ff8b574029db01e + .quad 0x3ff8b9bc7fb0c302 + .quad 0x3ff8be05bad61778 + .quad 0x3ff8c24fb42ea033 + .quad 0x3ff8c69a6bdb5598 + .quad 0x3ff8cae5e1fd35c4 + .quad 0x3ff8cf3216b5448c + .quad 0x3ff8d37f0a248b7f + .quad 0x3ff8d7ccbc6c19e6 + .quad 0x3ff8dc1b2dad04c4 + .quad 0x3ff8e06a5e0866d9 + .quad 0x3ff8e4ba4d9f60a1 + .quad 0x3ff8e90afc931857 + .quad 0x3ff8ed5c6b04b9f6 + .quad 0x3ff8f1ae99157736 + .quad 0x3ff8f60186e68793 + .quad 0x3ff8fa553499284b + .quad 0x3ff8fea9a24e9c5c + .quad 0x3ff902fed0282c8a + 
.quad 0x3ff90754be472760 + .quad 0x3ff90bab6ccce12c + .quad 0x3ff91002dbdab403 + .quad 0x3ff9145b0b91ffc6 + .quad 0x3ff918b3fc142a19 + .quad 0x3ff91d0dad829e70 + .quad 0x3ff921681ffece05 + .quad 0x3ff925c353aa2fe2 + .quad 0x3ff92a1f48a640dc + .quad 0x3ff92e7bff148396 + .quad 0x3ff932d977168083 + .quad 0x3ff93737b0cdc5e5 + .quad 0x3ff93b96ac5be7d1 + .quad 0x3ff93ff669e2802b + .quad 0x3ff94456e9832ead + .quad 0x3ff948b82b5f98e5 + .quad 0x3ff94d1a2f996a33 + .quad 0x3ff9517cf65253d1 + .quad 0x3ff955e07fac0ccd + .quad 0x3ff95a44cbc8520f + .quad 0x3ff95ea9dac8e658 + .quad 0x3ff9630faccf9243 + .quad 0x3ff9677641fe2446 + .quad 0x3ff96bdd9a7670b3 + .quad 0x3ff97045b65a51ba + .quad 0x3ff974ae95cba768 + .quad 0x3ff9791838ec57ab + .quad 0x3ff97d829fde4e50 + .quad 0x3ff981edcac37d05 + .quad 0x3ff98659b9bddb5b + .quad 0x3ff98ac66cef66c8 + .quad 0x3ff98f33e47a22a2 + .quad 0x3ff993a220801829 + .quad 0x3ff9981121235681 + .quad 0x3ff99c80e685f2b5 + .quad 0x3ff9a0f170ca07ba + .quad 0x3ff9a562c011b66d + .quad 0x3ff9a9d4d47f2598 + .quad 0x3ff9ae47ae3481ed + .quad 0x3ff9b2bb4d53fe0d + .quad 0x3ff9b72fb1ffd285 + .quad 0x3ff9bba4dc5a3dd3 + .quad 0x3ff9c01acc858463 + .quad 0x3ff9c49182a3f090 + .quad 0x3ff9c908fed7d2aa + .quad 0x3ff9cd81414380f2 + .quad 0x3ff9d1fa4a09579d + .quad 0x3ff9d674194bb8d5 + .quad 0x3ff9daeeaf2d0cb8 + .quad 0x3ff9df6a0bcfc15e + .quad 0x3ff9e3e62f564ad5 + .quad 0x3ff9e86319e32323 + .quad 0x3ff9ece0cb98ca4b + .quad 0x3ff9f15f4499c647 + .quad 0x3ff9f5de8508a311 + .quad 0x3ff9fa5e8d07f29e + .quad 0x3ff9fedf5cba4ce0 + .quad 0x3ffa0360f4424fcb + .quad 0x3ffa07e353c29f50 + .quad 0x3ffa0c667b5de565 + .quad 0x3ffa10ea6b36d1fe + .quad 0x3ffa156f23701b15 + .quad 0x3ffa19f4a42c7ca9 + .quad 0x3ffa1e7aed8eb8bb + .quad 0x3ffa2301ffb99757 + .quad 0x3ffa2789dacfe68c + .quad 0x3ffa2c127ef47a74 + .quad 0x3ffa309bec4a2d33 + .quad 0x3ffa352622f3def6 + .quad 0x3ffa39b1231475f7 + .quad 0x3ffa3e3ceccede7c + .quad 0x3ffa42c980460ad8 + .quad 0x3ffa4756dd9cf36e + .quad 0x3ffa4be504f696b1 + 
.quad 0x3ffa5073f675f924 + .quad 0x3ffa5503b23e255d + .quad 0x3ffa599438722c03 + .quad 0x3ffa5e25893523d4 + .quad 0x3ffa62b7a4aa29a1 + .quad 0x3ffa674a8af46052 + .quad 0x3ffa6bde3c36f0e6 + .quad 0x3ffa7072b8950a73 + .quad 0x3ffa75080031e22b + .quad 0x3ffa799e1330b358 + .quad 0x3ffa7e34f1b4bf62 + .quad 0x3ffa82cc9be14dca + .quad 0x3ffa876511d9ac32 + .quad 0x3ffa8bfe53c12e59 + .quad 0x3ffa909861bb2e1d + .quad 0x3ffa95333beb0b7e + .quad 0x3ffa99cee2742c9d + .quad 0x3ffa9e6b5579fdbf + .quad 0x3ffaa308951ff14d + .quad 0x3ffaa7a6a1897fd2 + .quad 0x3ffaac457ada2803 + .quad 0x3ffab0e521356eba + .quad 0x3ffab58594bedefa + .quad 0x3ffaba26d59a09ee + .quad 0x3ffabec8e3ea86ee + .quad 0x3ffac36bbfd3f37a + .quad 0x3ffac80f6979f340 + .quad 0x3ffaccb3e100301e + .quad 0x3ffad159268a5a1c + .quad 0x3ffad5ff3a3c2774 + .quad 0x3ffadaa61c395493 + .quad 0x3ffadf4dcca5a413 + .quad 0x3ffae3f64ba4dec6 + .quad 0x3ffae89f995ad3ad + .quad 0x3ffaed49b5eb5803 + .quad 0x3ffaf1f4a17a4735 + .quad 0x3ffaf6a05c2b82e9 + .quad 0x3ffafb4ce622f2ff + .quad 0x3ffafffa3f84858c + .quad 0x3ffb04a868742ee4 + .quad 0x3ffb09576115e994 + .quad 0x3ffb0e07298db666 + .quad 0x3ffb12b7c1ff9c61 + .quad 0x3ffb17692a8fa8cd + .quad 0x3ffb1c1b6361ef31 + .quad 0x3ffb20ce6c9a8952 + .quad 0x3ffb2582465d973c + .quad 0x3ffb2a36f0cf3f3a + .quad 0x3ffb2eec6c13addd + .quad 0x3ffb33a2b84f15fb + .quad 0x3ffb3859d5a5b0b1 + .quad 0x3ffb3d11c43bbd62 + .quad 0x3ffb41ca843581ba + .quad 0x3ffb468415b749b1 + .quad 0x3ffb4b3e78e56786 + .quad 0x3ffb4ff9ade433c6 + .quad 0x3ffb54b5b4d80d4a + .quad 0x3ffb59728de5593a + .quad 0x3ffb5e303930830c + .quad 0x3ffb62eeb6ddfc87 + .quad 0x3ffb67ae07123dc3 + .quad 0x3ffb6c6e29f1c52a + .quad 0x3ffb712f1fa1177b + .quad 0x3ffb75f0e844bfc6 + .quad 0x3ffb7ab384014f76 + .quad 0x3ffb7f76f2fb5e47 + .quad 0x3ffb843b35578a51 + .quad 0x3ffb89004b3a7804 + .quad 0x3ffb8dc634c8d228 + .quad 0x3ffb928cf22749e4 + .quad 0x3ffb9754837a96b7 + .quad 0x3ffb9c1ce8e77680 + .quad 0x3ffba0e62292ad7d + .quad 0x3ffba5b030a1064a + 
.quad 0x3ffbaa7b133751e3 + .quad 0x3ffbaf46ca7a67a7 + .quad 0x3ffbb413568f255a + .quad 0x3ffbb8e0b79a6f1f + .quad 0x3ffbbdaeedc12f82 + .quad 0x3ffbc27df9285775 + .quad 0x3ffbc74dd9f4de4f + .quad 0x3ffbcc1e904bc1d2 + .quad 0x3ffbd0f01c520628 + .quad 0x3ffbd5c27e2cb5e5 + .quad 0x3ffbda95b600e20b + .quad 0x3ffbdf69c3f3a207 + .quad 0x3ffbe43ea82a13b5 + .quad 0x3ffbe91462c95b60 + .quad 0x3ffbedeaf3f6a3c2 + .quad 0x3ffbf2c25bd71e09 + .quad 0x3ffbf79a9a9001d2 + .quad 0x3ffbfc73b0468d30 + .quad 0x3ffc014d9d2004aa + .quad 0x3ffc06286141b33d + .quad 0x3ffc0b03fcd0ea5c + .quad 0x3ffc0fe06ff301f4 + .quad 0x3ffc14bdbacd586a + .quad 0x3ffc199bdd85529c + .quad 0x3ffc1e7ad8405be6 + .quad 0x3ffc235aab23e61e + .quad 0x3ffc283b56556999 + .quad 0x3ffc2d1cd9fa652c + .quad 0x3ffc31ff36385e29 + .quad 0x3ffc36e26b34e065 + .quad 0x3ffc3bc679157e38 + .quad 0x3ffc40ab5fffd07a + .quad 0x3ffc45912019768c + .quad 0x3ffc4a77b9881650 + .quad 0x3ffc4f5f2c715c31 + .quad 0x3ffc544778fafb22 + .quad 0x3ffc59309f4aac9f + .quad 0x3ffc5e1a9f8630ad + .quad 0x3ffc630579d34ddd + .quad 0x3ffc67f12e57d14b + .quad 0x3ffc6cddbd398ea4 + .quad 0x3ffc71cb269e601f + .quad 0x3ffc76b96aac2686 + .quad 0x3ffc7ba88988c933 + .quad 0x3ffc8098835a3611 + .quad 0x3ffc8589584661a1 + .quad 0x3ffc8a7b087346f4 + .quad 0x3ffc8f6d9406e7b5 + .quad 0x3ffc9460fb274c22 + .quad 0x3ffc99553dfa8313 + .quad 0x3ffc9e4a5ca6a1f8 + .quad 0x3ffca3405751c4db + .quad 0x3ffca8372e220e61 + .quad 0x3ffcad2ee13da7cb + .quad 0x3ffcb22770cac0f9 + .quad 0x3ffcb720dcef9069 + .quad 0x3ffcbc1b25d25337 + .quad 0x3ffcc1164b994d23 + .quad 0x3ffcc6124e6ac88b + .quad 0x3ffccb0f2e6d1675 + .quad 0x3ffcd00cebc68e87 + .quad 0x3ffcd50b869d8f0f + .quad 0x3ffcda0aff187d02 + .quad 0x3ffcdf0b555dc3fa + .quad 0x3ffce40c8993d63d + .quad 0x3ffce90e9be12cb9 + .quad 0x3ffcee118c6c4709 + .quad 0x3ffcf3155b5bab74 + .quad 0x3ffcf81a08d5e6ec + .quad 0x3ffcfd1f95018d17 + .quad 0x3ffd022600053845 + .quad 0x3ffd072d4a07897c + .quad 0x3ffd0c35732f2870 + .quad 0x3ffd113e7ba2c38c + 
.quad 0x3ffd164863890fee + .quad 0x3ffd1b532b08c968 + .quad 0x3ffd205ed248b287 + .quad 0x3ffd256b596f948c + .quad 0x3ffd2a78c0a43f72 + .quad 0x3ffd2f87080d89f2 + .quad 0x3ffd34962fd2517a + .quad 0x3ffd39a638197a3c + .quad 0x3ffd3eb72109ef21 + .quad 0x3ffd43c8eacaa1d6 + .quad 0x3ffd48db95828ac7 + .quad 0x3ffd4def2158a91f + .quad 0x3ffd53038e7402ce + .quad 0x3ffd5818dcfba487 + .quad 0x3ffd5d2f0d16a1c3 + .quad 0x3ffd62461eec14be + .quad 0x3ffd675e12a31e7f + .quad 0x3ffd6c76e862e6d3 + .quad 0x3ffd7190a0529c51 + .quad 0x3ffd76ab3a99745b + .quad 0x3ffd7bc6b75eab1f + .quad 0x3ffd80e316c98398 + .quad 0x3ffd86005901478f + .quad 0x3ffd8b1e7e2d479d + .quad 0x3ffd903d8674db2b + .quad 0x3ffd955d71ff6075 + .quad 0x3ffd9a7e40f43c89 + .quad 0x3ffd9f9ff37adb4a + .quad 0x3ffda4c289baaf6e + .quad 0x3ffda9e603db3285 + .quad 0x3ffdaf0a6203e4f5 + .quad 0x3ffdb42fa45c4dfd + .quad 0x3ffdb955cb0bfbb6 + .quad 0x3ffdbe7cd63a8315 + .quad 0x3ffdc3a4c60f7fea + .quad 0x3ffdc8cd9ab294e4 + .quad 0x3ffdcdf7544b6b92 + .quad 0x3ffdd321f301b460 + .quad 0x3ffdd84d76fd269e + .quad 0x3ffddd79e065807d + .quad 0x3ffde2a72f628712 + .quad 0x3ffde7d5641c0658 + .quad 0x3ffded047eb9d12d + .quad 0x3ffdf2347f63c159 + .quad 0x3ffdf7656641b78c + .quad 0x3ffdfc97337b9b5f + .quad 0x3ffe01c9e7395b56 + .quad 0x3ffe06fd81a2ece1 + .quad 0x3ffe0c3202e04c5d + .quad 0x3ffe11676b197d17 + .quad 0x3ffe169dba768949 + .quad 0x3ffe1bd4f11f8220 + .quad 0x3ffe210d0f3c7fba + .quad 0x3ffe264614f5a129 + .quad 0x3ffe2b8002730c71 + .quad 0x3ffe30bad7dcee90 + .quad 0x3ffe35f6955b7b78 + .quad 0x3ffe3b333b16ee12 + .quad 0x3ffe4070c9378842 + .quad 0x3ffe45af3fe592e8 + .quad 0x3ffe4aee9f495ddc + .quad 0x3ffe502ee78b3ff6 + .quad 0x3ffe557018d3970b + .quad 0x3ffe5ab2334ac7ee + .quad 0x3ffe5ff537193e75 + .quad 0x3ffe653924676d76 + .quad 0x3ffe6a7dfb5dceca + .quad 0x3ffe6fc3bc24e350 + .quad 0x3ffe750a66e532eb + .quad 0x3ffe7a51fbc74c83 + .quad 0x3ffe7f9a7af3c60b + .quad 0x3ffe84e3e4933c7e + .quad 0x3ffe8a2e38ce53df + .quad 0x3ffe8f7977cdb740 + 
.quad 0x3ffe94c5a1ba18bd + .quad 0x3ffe9a12b6bc3181 + .quad 0x3ffe9f60b6fcc1c7 + .quad 0x3ffea4afa2a490da + .quad 0x3ffea9ff79dc6d14 + .quad 0x3ffeaf503ccd2be5 + .quad 0x3ffeb4a1eb9fa9d1 + .quad 0x3ffeb9f4867cca6e + .quad 0x3ffebf480d8d786d + .quad 0x3ffec49c80faa594 + .quad 0x3ffec9f1e0ed4ac2 + .quad 0x3ffecf482d8e67f1 + .quad 0x3ffed49f67070435 + .quad 0x3ffed9f78d802dc2 + .quad 0x3ffedf50a122f9e6 + .quad 0x3ffee4aaa2188510 + .quad 0x3ffeea059089f2d0 + .quad 0x3ffeef616ca06dd6 + .quad 0x3ffef4be368527f6 + .quad 0x3ffefa1bee615a27 + .quad 0x3ffeff7a945e4487 + .quad 0x3fff04da28a52e59 + .quad 0x3fff0a3aab5f6609 + .quad 0x3fff0f9c1cb6412a + .quad 0x3fff14fe7cd31c7b + .quad 0x3fff1a61cbdf5be7 + .quad 0x3fff1fc60a046a84 + .quad 0x3fff252b376bba97 + .quad 0x3fff2a91543ec595 + .quad 0x3fff2ff860a70c22 + .quad 0x3fff35605cce1613 + .quad 0x3fff3ac948dd7274 + .quad 0x3fff403324feb781 + .quad 0x3fff459df15b82ac + .quad 0x3fff4b09ae1d78a1 + .quad 0x3fff50765b6e4540 + .quad 0x3fff55e3f9779ba5 + .quad 0x3fff5b5288633625 + .quad 0x3fff60c2085ad652 + .quad 0x3fff6632798844f8 + .quad 0x3fff6ba3dc155226 + .quad 0x3fff7116302bd526 + .quad 0x3fff768975f5ac86 + .quad 0x3fff7bfdad9cbe14 + .quad 0x3fff8172d74af6e1 + .quad 0x3fff86e8f32a4b45 + .quad 0x3fff8c600164b6dc + .quad 0x3fff91d802243c89 + .quad 0x3fff9750f592e677 + .quad 0x3fff9ccadbdac61d + .quad 0x3fffa245b525f439 + .quad 0x3fffa7c1819e90d8 + .quad 0x3fffad3e416ec354 + .quad 0x3fffb2bbf4c0ba54 + .quad 0x3fffb83a9bbeabd1 + .quad 0x3fffbdba3692d514 + .quad 0x3fffc33ac5677ab8 + .quad 0x3fffc8bc4866e8ad + .quad 0x3fffce3ebfbb7237 + .quad 0x3fffd3c22b8f71f1 + .quad 0x3fffd9468c0d49cc + .quad 0x3fffdecbe15f6314 + .quad 0x3fffe4522bb02e6e + .quad 0x3fffe9d96b2a23d9 + .quad 0x3fffef619ff7c2b3 + .quad 0x3ffff4eaca4391b6 + .quad 0x3ffffa74ea381efc + +/* Range reduction coefficients: + * log(2) inverted = 2^k/ln2 */ +double_vector __dbInvLn2 0x40971547652b82fe + +/* right-shifter value = 3*2^51 (= 1.5*2^52) */ +double_vector __dbShifter 
0x4338000000000000 + +/* log(2) high part = ln2/2^k(52-k-9 hibits) */ +double_vector __dbLn2hi 0x3f462e42fec00000 + +/* log(2) low part = ln2/2^k(52-k-9..104-k-9 lobits) */ +double_vector __dbLn2lo 0x3d5d1cf79abc9e3b + +/* Polynomial coefficients (k=10, deg=3): */ +double_vector __dPC0 0x3ff0000000000000 +double_vector __dPC1 0x3fe0000001ebfbe0 +double_vector __dPC2 0x3fc5555555555556 + +/* Other constants: + * index mask = 2^k-1 */ +double_vector __lIndexMask 0x00000000000003ff + +/* absolute value mask (SP) */ +float_vector __iAbsMask 0x7fffffff + +/* domain range (SP) (>=4086232B) */ +float_vector __iDomainRange 0x4086232a + .type __svml_dexp_data,@object + .size __svml_dexp_data,.-__svml_dexp_data diff --git a/sysdeps/x86_64/fpu/svml_d_exp_data.h b/sysdeps/x86_64/fpu/svml_d_exp_data.h new file mode 100644 index 0000000000..71ebdb799e --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_exp_data.h @@ -0,0 +1,52 @@ +/* Offsets for data table for function exp. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#ifndef D_EXP_DATA_H +#define D_EXP_DATA_H + +#define __dbT 0 +#define __dbInvLn2 8192 +#define __dbShifter 8256 +#define __dbLn2hi 8320 +#define __dbLn2lo 8384 +#define __dPC0 8448 +#define __dPC1 8512 +#define __dPC2 8576 +#define __lIndexMask 8640 +#define __iAbsMask 8704 +#define __iDomainRange 8768 + +.macro double_vector offset value +.if .-__svml_dexp_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +.macro float_vector offset value +.if .-__svml_dexp_data != \offset +.err +.endif +.rept 16 +.long \value +.endr +.endm + +#endif diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c index dfbc3d3708..946a8f690a 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c @@ -25,3 +25,4 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp) diff --git a/sysdeps/x86_64/fpu/test-double-vlen2.c b/sysdeps/x86_64/fpu/test-double-vlen2.c index a119bfc33a..1b72748a3d 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen2.c +++ b/sysdeps/x86_64/fpu/test-double-vlen2.c @@ -21,5 +21,6 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 #define TEST_VECTOR_log 1 +#define TEST_VECTOR_exp 1 #include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c index 6e01a8945f..40c3e25a86 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c @@ -28,3 +28,4 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVdN4v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp) diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c 
b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c index ef6e1c2a42..45d6ed6f43 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c @@ -24,6 +24,7 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 #define TEST_VECTOR_log 1 +#define TEST_VECTOR_exp 1 #define REQUIRE_AVX2 diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c index e9f890573f..094c9bfb14 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c @@ -25,3 +25,4 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVcN4v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp) diff --git a/sysdeps/x86_64/fpu/test-double-vlen4.c b/sysdeps/x86_64/fpu/test-double-vlen4.c index 71ea85cc0e..b89e77f5f3 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4.c @@ -21,5 +21,6 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 #define TEST_VECTOR_log 1 +#define TEST_VECTOR_exp 1 #include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c index 290d59c74c..0b4398a8c5 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c @@ -25,3 +25,4 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVeN8v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log) +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp) diff --git a/sysdeps/x86_64/fpu/test-double-vlen8.c b/sysdeps/x86_64/fpu/test-double-vlen8.c index e2f2cfef2e..277b31241f 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8.c @@ -21,6 +21,7 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 #define TEST_VECTOR_log 1 +#define TEST_VECTOR_exp 1 #define 
REQUIRE_AVX512F |