1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
|
/* strrchr/wcsrchr optimized with AVX2.
Copyright (C) 2017-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# include <sysdep.h>
/* Symbol name of the routine.  May be predefined before this point to
   build the same code under another name.  */
# ifndef STRRCHR
# define STRRCHR __strrchr_avx2
# endif
/* Element-width selection.  With USE_AS_WCSRCHR defined the routine
   works on 4-byte elements (dword broadcast / compare / unsigned
   min); otherwise it works on single bytes.  CHAR_SIZE is the element
   size in bytes.  */
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
# define VPMIN vpminud
# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
# define VPMIN vpminub
# define CHAR_SIZE 1
# endif
/* Instruction used to clear the upper halves of the vector registers;
   may be predefined to something else.  */
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
/* Output section name: the given prefix with `.avx` appended, unless
   SECTION is predefined.  */
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
/* VEC_SIZE is the number of bytes handled per ymm vector.  PAGE_SIZE
   is the page size assumed by the page-crossing check at function
   entry.  */
# define VEC_SIZE 32
# define PAGE_SIZE 4096
/* STRRCHR: find the last occurrence of CHAR in a zero-terminated
   string (elements are CHAR_SIZE bytes wide).

   In:   rdi = string, esi = search CHAR.
   Out:  rax = address of the last element equal to CHAR that lies at
         or before the terminator, or 0 if there is none.  A zero
         CHAR matches the terminator itself.
   Writes: rax, rcx, rdx, rsi, rdi, r8, ymm0-ymm10, flags.

   Register roles:
     ymm7  search CHAR broadcast to every element.
     ymm0  all zero, used to detect the terminator.
     ymm1  VEC 0: the first VEC_SIZE bytes, loaded from the original
           pointer (kept in r8 once the code moves past it).
     ymm2 / ymm3
           first phase: raw data of VEC 1 / VEC 2, the two aligned
           vectors following VEC 0;
           second loop: CHAR match masks of the most recent
           2x VEC block that contained a match.
     rsi   base pointer that goes with ymm2 / ymm3.

   Search CHAR compares are delayed until a terminator has been found
   wherever possible.  ENTRY, END, L() and the *_RETURN macros are not
   defined in this file (presumably they come from sysdep.h).  */
.section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
vmovd %esi, %xmm7
movl %edi, %eax
/* Broadcast CHAR to every element of YMM7. */
VPBROADCAST %xmm7, %ymm7
/* ymm0 = 0, compared against data to find the terminator. */
vpxor %xmm0, %xmm0, %xmm0
/* Shift here instead of `andl` to save code size (saves a fetch
   block). The shift moves the page offset (low 12 address bits) to
   the top of eax, so the unsigned compare branches when the offset is
   above PAGE_SIZE - VEC_SIZE, i.e. when a VEC_SIZE load from rdi
   would run into the next page. */
sall $20, %eax
cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
ja L(cross_page)
L(page_cross_continue):
/* VEC 0: first VEC_SIZE bytes starting at the (unaligned) pointer. */
vmovdqu (%rdi), %ymm1
/* Check end of string match. */
VPCMPEQ %ymm1, %ymm0, %ymm6
vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
jz L(aligned_more)
/* Only check match with search CHAR if needed. */
VPCMPEQ %ymm1, %ymm7, %ymm1
vpmovmskb %ymm1, %eax
/* Check if match before first zero. blsmsk keeps every bit up to
   and including the lowest set bit, so the terminator position itself
   stays in range. */
blsmskl %ecx, %ecx
andl %ecx, %eax
/* No match in range: eax is 0 here and is returned as is. */
jz L(ret0)
/* Highest remaining bit = last match; turn it into an address. */
bsrl %eax, %eax
addq %rdi, %rax
/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
   search CHAR is zero we are correct. Either way `andq
   -CHAR_SIZE, %rax` gets the correct result. */
# ifdef USE_AS_WCSRCHR
andq $-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
/* Returns for first vec x1/x2 have hard coded backward search
   path for earlier matches. */
.p2align 4,, 10
/* Terminator found in VEC 1 (ymm2); ecx = its zero mask and rdi + 1
   is the address of VEC 1. */
L(first_vec_x1):
VPCMPEQ %ymm2, %ymm7, %ymm6
vpmovmskb %ymm6, %eax
blsmskl %ecx, %ecx
andl %ecx, %eax
jnz L(first_vec_x1_return)
.p2align 4,, 4
/* Fall back to VEC 0 (ymm1, based at r8). Every path that gets here
   passed through L(aligned_more), which is only entered when VEC 0
   holds no terminator, so the match mask needs no range masking. */
L(first_vec_x0_test):
VPCMPEQ %ymm1, %ymm7, %ymm6
vpmovmskb %ymm6, %eax
testl %eax, %eax
/* No match anywhere: return eax = 0. */
jz L(ret1)
bsrl %eax, %eax
addq %r8, %rax
# ifdef USE_AS_WCSRCHR
andq $-CHAR_SIZE, %rax
# endif
L(ret1):
VZEROUPPER_RETURN
.p2align 4,, 10
/* Reached from L(first_vec_x2) when VEC 2 has no in-range match. The
   terminator is in VEC 2, so all of VEC 1 is in range. */
L(first_vec_x0_x1_test):
VPCMPEQ %ymm2, %ymm7, %ymm6
vpmovmskb %ymm6, %eax
/* Check ymm2 for search CHAR match. If no match then check ymm1
   before returning. */
testl %eax, %eax
jz L(first_vec_x0_test)
.p2align 4,, 4
/* eax = in-range match mask for VEC 1, which starts at rdi + 1. */
L(first_vec_x1_return):
bsrl %eax, %eax
leaq 1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
andq $-CHAR_SIZE, %rax
# endif
VZEROUPPER_RETURN
.p2align 4,, 10
/* Terminator found in VEC 2 (ymm3); ecx = its zero mask and
   rdi + VEC_SIZE + 1 is the address of VEC 2. */
L(first_vec_x2):
VPCMPEQ %ymm3, %ymm7, %ymm6
vpmovmskb %ymm6, %eax
blsmskl %ecx, %ecx
/* If no in-range search CHAR match in ymm3 then need to check
   ymm1/ymm2 for an earlier match (we delay checking search
   CHAR matches until needed). */
andl %ecx, %eax
jz L(first_vec_x0_x1_test)
bsrl %eax, %eax
leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
andq $-CHAR_SIZE, %rax
# endif
VZEROUPPER_RETURN
.p2align 4
/* No terminator in VEC 0. */
L(aligned_more):
/* Save original pointer if match was in VEC 0. */
movq %rdi, %r8
/* Align src. After the `orq` rdi addresses the last byte of its
   VEC_SIZE-aligned block, so rdi + 1 is the next aligned address. */
orq $(VEC_SIZE - 1), %rdi
/* VEC 1. */
vmovdqu 1(%rdi), %ymm2
VPCMPEQ %ymm2, %ymm0, %ymm6
vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
jnz L(first_vec_x1)
/* VEC 2. */
vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
VPCMPEQ %ymm3, %ymm0, %ymm6
vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
jnz L(first_vec_x2)
/* Save pointer again before realigning. rsi + 1 stays the address
   of VEC 1. rdi becomes the address of VEC 2 rounded down to a
   2 * VEC_SIZE boundary, so the first loop iteration re-reads VEC 2
   (and VEC 1 as well when the rounding moves rdi back). */
movq %rdi, %rsi
addq $(VEC_SIZE + 1), %rdi
andq $-(VEC_SIZE * 2), %rdi
.p2align 4
/* First loop: runs until a 2x VEC block contains either a search CHAR
   match or a terminator. */
L(first_aligned_loop):
/* Do 2x VEC at a time. Any more and the cost of finding the
   match outweighs loop benefit. */
vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
/* ymm6 / ymm10 = CHAR matches in the low / high VEC, ymm5 = their
   union. ymm8 = zero test of the element-wise unsigned minimum of
   both VECs, i.e. set where either VEC has a zero in that lane.
   ymm9 = anything of interest at all. */
VPCMPEQ %ymm4, %ymm7, %ymm6
VPMIN %ymm4, %ymm5, %ymm8
VPCMPEQ %ymm5, %ymm7, %ymm10
vpor %ymm6, %ymm10, %ymm5
VPCMPEQ %ymm8, %ymm0, %ymm8
vpor %ymm5, %ymm8, %ymm9
vpmovmskb %ymm9, %eax
/* rdi is advanced before the test, so on loop exit it points
   2 * VEC_SIZE past the block just examined. */
addq $(VEC_SIZE * 2), %rdi
/* No zero or search CHAR. */
testl %eax, %eax
jz L(first_aligned_loop)
/* If no zero CHAR then go to second loop (this allows us to
   throw away all prior work). */
vpmovmskb %ymm8, %ecx
testl %ecx, %ecx
jz L(second_aligned_loop_prep)
/* Search char could be zero so we need to get the true match.
 */
vpmovmskb %ymm5, %eax
testl %eax, %eax
jnz L(first_aligned_loop_return)
.p2align 4,, 4
/* Terminator found in the loop without a usable match in the same
   block: look back at VEC 2 / VEC 1 (raw data in ymm3 / ymm2, based
   at rsi + 1), then at VEC 0. */
L(first_vec_x1_or_x2):
VPCMPEQ %ymm3, %ymm7, %ymm3
VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
vpmovmskb %ymm2, %edx
/* Use add for macro-fusion. The 64-bit sum of the two
   zero-extended masks is zero only if both masks are zero. */
addq %rax, %rdx
jz L(first_vec_x0_test)
/* NB: We could move this shift to before the branch and save a
   bit of code size / performance on the fall through. The
   branch leads to the null case which generally seems hotter
   than char in first 3x VEC. */
/* NOTE(review): rax ends up as (mask3 << 32) + mask3 + mask2 rather
   than a plain concatenation of the two masks; bsrq only yields the
   last match if that addition does not carry past the highest set
   bit. TODO confirm which masks can be non-zero on this path (the
   first loop iteration already re-reads VEC 2). */
salq $32, %rax
addq %rdx, %rax
bsrq %rax, %rax
leaq 1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
andq $-CHAR_SIZE, %rax
# endif
VZEROUPPER_RETURN
.p2align 4,, 8
/* The final 2x VEC block of the first loop has both a terminator and
   a CHAR match. ecx still holds the combined zero mask from ymm8. */
L(first_aligned_loop_return):
/* rcx = 64-bit zero mask: exact zero mask of the low VEC in bits
   0-31, combined mask in bits 32-63. The high half is only looked
   at by blsmsk when the low VEC has no zero, in which case the
   combined mask is exactly the high VEC zero mask. */
VPCMPEQ %ymm4, %ymm0, %ymm4
vpmovmskb %ymm4, %edx
salq $32, %rcx
orq %rdx, %rcx
/* rax = 64-bit CHAR match mask (high VEC in bits 32-63). */
vpmovmskb %ymm10, %eax
vpmovmskb %ymm6, %edx
salq $32, %rax
orq %rdx, %rax
/* Keep only matches at or before the first zero. */
blsmskq %rcx, %rcx
andq %rcx, %rax
jz L(first_vec_x1_or_x2)
bsrq %rax, %rax
/* rdi was already advanced past this block, hence the negative
   displacement. */
leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
andq $-CHAR_SIZE, %rax
# endif
VZEROUPPER_RETURN
/* Search char cannot be zero. */
.p2align 4
L(second_aligned_loop_set_furthest_match):
/* Save VEC and pointer from most recent match. From here on
   ymm2 / ymm3 hold CHAR match masks (not raw data) for the low / high
   VEC of that block, and rsi points 2 * VEC_SIZE past its start. */
L(second_aligned_loop_prep):
movq %rdi, %rsi
vmovdqu %ymm6, %ymm2
vmovdqu %ymm10, %ymm3
.p2align 4
/* Second loop: at least one match is already recorded; keep scanning
   for later matches until the terminator shows up. */
L(second_aligned_loop):
/* Search 2x at a time. */
vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
/* Same mask construction as in the first loop, except that the
   combined zero mask goes to ymm1 (VEC 0 is no longer needed). */
VPCMPEQ %ymm4, %ymm7, %ymm6
VPMIN %ymm4, %ymm5, %ymm1
VPCMPEQ %ymm5, %ymm7, %ymm10
vpor %ymm6, %ymm10, %ymm5
VPCMPEQ %ymm1, %ymm0, %ymm1
vpor %ymm5, %ymm1, %ymm9
vpmovmskb %ymm9, %eax
addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
jz L(second_aligned_loop)
/* Match but no terminator: record this block as the newest match
   and continue. */
vpmovmskb %ymm1, %ecx
testl %ecx, %ecx
jz L(second_aligned_loop_set_furthest_match)
/* Terminator present: does this last block also hold a match? */
vpmovmskb %ymm5, %eax
testl %eax, %eax
jnz L(return_new_match)
/* This is the hot path. We know CHAR is inbounds and that
   ymm3/ymm2 have latest match. */
.p2align 4,, 4
L(return_old_match):
vpmovmskb %ymm3, %eax
vpmovmskb %ymm2, %edx
salq $32, %rax
orq %rdx, %rax
bsrq %rax, %rax
/* Search char cannot be zero so safe to just use lea for
   wcsrchr. rsi points 2 * VEC_SIZE past the saved block, and bsrq
   returns the highest byte index of the matching element, so
   CHAR_SIZE - 1 is subtracted to reach its first byte. */
leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
/* Last iteration also potentially has a match. */
.p2align 4,, 8
L(return_new_match):
/* rcx = 64-bit zero mask, built like in
   L(first_aligned_loop_return): exact low VEC mask in bits 0-31,
   combined mask (still in ecx) in bits 32-63. */
VPCMPEQ %ymm4, %ymm0, %ymm4
vpmovmskb %ymm4, %edx
salq $32, %rcx
orq %rdx, %rcx
/* rax = 64-bit CHAR match mask of the last block. */
vpmovmskb %ymm10, %eax
vpmovmskb %ymm6, %edx
salq $32, %rax
orq %rdx, %rax
/* Drop matches located after the terminator; if none remain the
   previously saved match is the answer. */
blsmskq %rcx, %rcx
andq %rcx, %rax
jz L(return_old_match)
bsrq %rax, %rax
/* Search char cannot be zero so safe to just use lea for
   wcsrchr. */
leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
.p2align 4,, 4
/* A VEC_SIZE load from rdi would cross into the next page. Load the
   VEC_SIZE-aligned block that contains rdi instead and discard the
   bytes in front of the string. */
L(cross_page):
movq %rdi, %rsi
andq $-VEC_SIZE, %rsi
vmovdqu (%rsi), %ymm1
VPCMPEQ %ymm1, %ymm0, %ymm6
vpmovmskb %ymm6, %ecx
/* Shift out zero CHAR matches that are before the beginning of
   src (rdi). For a 32-bit shrx only the low 5 bits of edi are used
   as the count, i.e. the offset of rdi inside the aligned block. */
shrxl %edi, %ecx, %ecx
testl %ecx, %ecx
/* No terminator between rdi and the end of the block: the string
   continues past it, so resume with the normal unaligned load. */
jz L(page_cross_continue)
VPCMPEQ %ymm1, %ymm7, %ymm1
vpmovmskb %ymm1, %eax
/* Shift out search CHAR matches that are before the beginning of
   src (rdi). After the shift bit 0 corresponds to the byte at
   rdi. */
shrxl %edi, %eax, %eax
blsmskl %ecx, %ecx
/* Check if any search CHAR match in range. */
andl %ecx, %eax
/* No match in range: return eax = 0. */
jz L(ret2)
bsrl %eax, %eax
addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
andq $-CHAR_SIZE, %rax
# endif
L(ret2):
VZEROUPPER_RETURN
END(STRRCHR)
#endif
|