/* Function erff vectorized with AVX-512.
Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
https://www.gnu.org/licenses/. */
/*
* ALGORITHM DESCRIPTION:
*
* erf(x) is computed as higher precision simple polynomial
* with no lookup table:
*
* R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));
* erf(x) = R * R * x;
*
* Special cases:
*
* erf(0) = 0
* erf(+INF) = +1
* erf(-INF) = -1
* erf(QNaN) = QNaN
* erf(SNaN) = QNaN
*
*/
/* Offsets for data table __svml_serf_data_internal
 */
/* Byte offsets into the table; consecutive entries are 64 bytes apart
   because each entry is a full 64-byte zmm vector (one constant value
   broadcast across all lanes).  */
#define _AbsMask 0
#define _One 64
#define _gf_MaxThreshold_LA 128
#define _gf_la_poly_0 192
#define _gf_la_poly_1 256
#define _gf_la_poly_2 320
#define _gf_la_poly_3 384
#define _gf_la_poly_4 448
#define _gf_la_poly_5 512
#define _gf_la_poly_6 576
#define _gf_la_poly_7 640
#define _gf_la_poly_8 704
#define _gf_la_poly_9 768
#define _gf_la_poly_10 832
#define _gf_la_poly_11 896
#define _gf_la_poly_12 960
#include <sysdep.h>
.text
.section .text.exex512,"ax",@progbits
/* Vector entry point: 16-lane single-precision erf (AVX-512).
   In:   zmm0 = x[0..15]
   Out:  zmm0 = erf(x)[0..15]
   Uses only zmm0-zmm15 and mask k1; no stack usage, no calls.  */
ENTRY(_ZGVeN16v_erff_skx)
/* Save the input; zmm0 is rebuilt below as the result register.  */
vmovaps %zmm0, %zmm8
/* zmm11 = x*x in single precision -- used only for the range test.  */
vmulps {rn-sae}, %zmm8, %zmm8, %zmm11
/* Preload polynomial coefficients P2..P12 (each a double broadcast
   to every lane; P0/P1 are loaded later when registers free up).  */
vmovups _gf_la_poly_11+__svml_serf_data_internal(%rip), %zmm15
vmovups _gf_la_poly_12+__svml_serf_data_internal(%rip), %zmm10
vmovups _gf_la_poly_10+__svml_serf_data_internal(%rip), %zmm9
vmovups _gf_la_poly_9+__svml_serf_data_internal(%rip), %zmm7
vmovups _gf_la_poly_8+__svml_serf_data_internal(%rip), %zmm0
vmovups _gf_la_poly_7+__svml_serf_data_internal(%rip), %zmm1
vmovups _gf_la_poly_6+__svml_serf_data_internal(%rip), %zmm2
vmovups _gf_la_poly_5+__svml_serf_data_internal(%rip), %zmm3
vmovups _gf_la_poly_4+__svml_serf_data_internal(%rip), %zmm4
vmovups _gf_la_poly_3+__svml_serf_data_internal(%rip), %zmm5
vmovups _gf_la_poly_2+__svml_serf_data_internal(%rip), %zmm6
/* Widen x into two halves of 8 doubles each; the polynomial is
   evaluated in double precision for accuracy.  */
vextractf32x8 $1, %zmm8, %ymm13
vcvtps2pd {sae}, %ymm8, %zmm12
vcvtps2pd {sae}, %ymm13, %zmm14
/* zmm12 = x^2 (low 8 lanes), zmm13 = x^2 (high 8 lanes), as doubles.  */
vmulpd {rn-sae}, %zmm12, %zmm12, %zmm12
vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13
/* R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12)); */
/* Two Horner chains run interleaved: the low half accumulates into
   zmm14 (fmadd213 forms), while the high half accumulates by
   overwriting each coefficient register in turn (fmadd231 forms),
   which frees those registers as the recurrence proceeds.  */
vmovaps %zmm15, %zmm14
vfmadd231pd {rn-sae}, %zmm12, %zmm10, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm10, %zmm15
vmovups _gf_la_poly_1+__svml_serf_data_internal(%rip), %zmm10
vfmadd213pd {rn-sae}, %zmm9, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm15, %zmm9
vfmadd213pd {rn-sae}, %zmm7, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm9, %zmm7
vfmadd213pd {rn-sae}, %zmm0, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm7, %zmm0
vmovups _gf_MaxThreshold_LA+__svml_serf_data_internal(%rip), %zmm7
vfmadd213pd {rn-sae}, %zmm1, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm0, %zmm1
vmovups _gf_la_poly_0+__svml_serf_data_internal(%rip), %zmm0
/* k1 = lanes where MaxThreshold is not-<= x^2 (predicate 22 =
   NLE_UQ), i.e. x^2 < threshold or x is NaN.  Only those lanes take
   the polynomial result; the rest saturate to +-1 below.  NaN lanes
   deliberately take the polynomial path so the NaN propagates.  */
vcmpps $22, {sae}, %zmm11, %zmm7, %k1
vfmadd213pd {rn-sae}, %zmm2, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm1, %zmm2
vfmadd213pd {rn-sae}, %zmm3, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm2, %zmm3
vfmadd213pd {rn-sae}, %zmm4, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm3, %zmm4
vfmadd213pd {rn-sae}, %zmm5, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm4, %zmm5
vfmadd213pd {rn-sae}, %zmm6, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm5, %zmm6
vmovups _AbsMask+__svml_serf_data_internal(%rip), %zmm5
vfmadd213pd {rn-sae}, %zmm10, %zmm12, %zmm14
vfmadd231pd {rn-sae}, %zmm13, %zmm6, %zmm10
/* zmm6 = sign bit of x (andn with the |x| mask keeps only the sign).  */
vandnps %zmm8, %zmm5, %zmm6
/* Final Horner step for each half: R = P0 + x^2 * poly.  */
vfmadd213pd {rn-sae}, %zmm0, %zmm14, %zmm12
vfmadd213pd {rn-sae}, %zmm0, %zmm10, %zmm13
/* zmm0 = copysign(1.0f, x): the saturated result for large |x|.  */
vorps _One+__svml_serf_data_internal(%rip), %zmm6, %zmm0
/* Square R (still in double), then narrow both halves back to
   single precision and reassemble the 16-lane vector.  */
vmulpd {rn-sae}, %zmm12, %zmm12, %zmm1
vmulpd {rn-sae}, %zmm13, %zmm13, %zmm3
vcvtpd2ps {rn-sae}, %zmm1, %ymm2
vcvtpd2ps {rn-sae}, %zmm3, %ymm4
vinsertf32x8 $1, %ymm4, %zmm2, %zmm9
/* erf(x) = R * R * x; */
/* Masked merge: in-range (and NaN) lanes get R*R*x; masked-off lanes
   keep the +-1.0 already in zmm0.  */
vmulps {rn-sae}, %zmm8, %zmm9, %zmm0{%k1}
ret
END(_ZGVeN16v_erff_skx)
.section .rodata, "a"
.align 64
/* C-style description of the table layout below.  The guard macro is
   normally undefined, so this block is documentation only: every field
   spans one 64-byte cache line, i.e. one full zmm register's worth of
   data.  */
#ifdef __svml_serf_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
__declspec(align(64)) VUINT32 _AbsMask[16][1];
__declspec(align(64)) VUINT32 _One[16][1];
__declspec(align(64)) VUINT32 _gf_MaxThreshold_LA[16][1];
__declspec(align(64)) VUINT32 _gf_la_poly_0[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_1[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_2[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_3[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_4[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_5[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_6[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_7[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_8[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_9[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_10[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_11[8][2];
__declspec(align(64)) VUINT32 _gf_la_poly_12[8][2];
} __svml_serf_data_internal;
#endif
/* Constant table; offsets match the #defines at the top of the file.
   Float constants are repeated 16x and double constants 8x, so every
   label names a ready-to-load 64-byte zmm vector.  */
__svml_serf_data_internal:
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _AbsMask */
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _One */
.align 64
/* 0x41558c5a ~= 13.35f; compared against x^2, so the polynomial path
   covers |x| up to ~3.65 -- beyond that erff saturates to +-1.  */
.long 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a /* _gf_MaxThreshold_LA */
.align 64
/* Minimax polynomial coefficients P0..P12 in x^2, double precision.  */
.quad 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903 /* _gf_la_poly_0 */
.align 64
.quad 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367 /* _gf_la_poly_1 */
.align 64
.quad 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b /* _gf_la_poly_2 */
.align 64
.quad 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc /* _gf_la_poly_3 */
.align 64
.quad 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392 /* _gf_la_poly_4 */
.align 64
.quad 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede /* _gf_la_poly_5 */
.align 64
.quad 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0 /* _gf_la_poly_6 */
.align 64
.quad 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f /* _gf_la_poly_7 */
.align 64
.quad 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523 /* _gf_la_poly_8 */
.align 64
.quad 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47 /* _gf_la_poly_9 */
.align 64
.quad 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03 /* _gf_la_poly_10 */
.align 64
.quad 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb /* _gf_la_poly_11 */
.align 64
.quad 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1 /* _gf_la_poly_12 */
.align 64
.type __svml_serf_data_internal,@object
.size __svml_serf_data_internal,.-__svml_serf_data_internal