1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
/* { dg-do assemble } */
/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
/* Define mscatter_store1<OBJTYPE><STRIDETYPE><STRIDE>, a masked store with a
   constant stride supplied as a macro argument.  For each I in [0, COUNT),
   SRC[I] is copied to DST[I * STRIDE] only when MASKS[I * STRIDE] is nonzero.
   SRC is read contiguously; DST and MASKS are both indexed with the same
   stride.  OBJTYPE is the element type of DST and SRC, MASKTYPE the element
   type of MASKS, and STRIDETYPE the type of COUNT and of the loop index.
   STRIDE is pasted into the function name, so it must be a token that forms
   a valid identifier suffix (the file uses integer literals).  */
#define MASK_SCATTER_STORE1(OBJTYPE,MASKTYPE,STRIDETYPE,STRIDE)\
void mscatter_store1##OBJTYPE##STRIDETYPE##STRIDE (OBJTYPE * restrict dst,\
OBJTYPE * restrict src,\
MASKTYPE * restrict masks,\
STRIDETYPE count)\
{\
for (STRIDETYPE i=0; i<count; i++)\
if (masks[i * STRIDE])\
dst[i * STRIDE] = src[i];\
}
/* Define mscatter_store2<OBJTYPE><STRIDETYPE>, a masked store whose stride
   is a function parameter rather than a macro argument.  For each I in
   [0, COUNT), SRC[I] is copied to DST[I * STRIDE] only when
   MASKS[I * STRIDE] is nonzero.  STRIDETYPE is the type of STRIDE, COUNT and
   the loop index.  */
#define MASK_SCATTER_STORE2(OBJTYPE,MASKTYPE,STRIDETYPE)\
void mscatter_store2##OBJTYPE##STRIDETYPE (OBJTYPE * restrict dst,\
OBJTYPE * restrict src,\
MASKTYPE * restrict masks,\
STRIDETYPE stride,\
STRIDETYPE count)\
{\
for (STRIDETYPE i=0; i<count; i++)\
if (masks[i * stride])\
dst[i * stride] = src[i];\
}
/* Define mscatter_store3s5<OBJTYPE><STRIDETYPE>, which interleaves five
   source arrays into DST under a mask.  The stride is a local constant equal
   to 5.  For each I in [0, COUNT) where MASKS[I * 5] is nonzero, S1[I] to
   S5[I] are written to the five consecutive elements DST[I * 5 + 0] to
   DST[I * 5 + 4].  A single mask test guards all five stores of a group.  */
#define MASK_SCATTER_STORE3(OBJTYPE,MASKTYPE,STRIDETYPE)\
void mscatter_store3s5##OBJTYPE##STRIDETYPE\
(OBJTYPE * restrict dst, OBJTYPE * restrict s1, OBJTYPE * restrict s2,\
OBJTYPE * restrict s3, OBJTYPE * restrict s4, OBJTYPE * restrict s5,\
MASKTYPE * restrict masks, STRIDETYPE count)\
{\
const STRIDETYPE STRIDE = 5;\
for (STRIDETYPE i=0; i<count; i++)\
if (masks[i * STRIDE])\
{\
dst[0 + (i * STRIDE)] = s1[i];\
dst[1 + (i * STRIDE)] = s2[i];\
dst[2 + (i * STRIDE)] = s3[i];\
dst[3 + (i * STRIDE)] = s4[i];\
dst[4 + (i * STRIDE)] = s5[i];\
}\
}
/* Define mscatter_store4<OBJTYPE><STRIDETYPE><STRIDE>, the pointer-bumping
   counterpart of MASK_SCATTER_STORE1.  Instead of indexing DST and SRC with
   I, the loop stores *SRC to *DST when MASKS[I * STRIDE] is nonzero and then
   advances DST by STRIDE elements and SRC by one element.  Both pointers
   advance on every iteration, whether or not the store was performed.  The
   mask is still addressed through the index expression I * STRIDE.  */
#define MASK_SCATTER_STORE4(OBJTYPE,MASKTYPE,STRIDETYPE,STRIDE)\
void mscatter_store4##OBJTYPE##STRIDETYPE##STRIDE (OBJTYPE * restrict dst,\
OBJTYPE * restrict src,\
MASKTYPE * restrict masks,\
STRIDETYPE count)\
{\
for (STRIDETYPE i=0; i<count; i++)\
{\
if (masks[i * STRIDE])\
*dst = *src;\
dst += STRIDE;\
src += 1;\
}\
}
/* Define mscatter_store5<OBJTYPE><STRIDETYPE>, the pointer-bumping form with
   the stride passed as a function parameter.  On each of the COUNT
   iterations, *SRC is stored to *DST when MASKS[I * STRIDE] is nonzero; DST
   then advances by STRIDE elements and SRC by one element, regardless of
   whether the store was performed.  */
#define MASK_SCATTER_STORE5(OBJTYPE,MASKTYPE,STRIDETYPE)\
void mscatter_store5##OBJTYPE##STRIDETYPE (OBJTYPE * restrict dst,\
OBJTYPE * restrict src,\
MASKTYPE * restrict masks,\
STRIDETYPE stride,\
STRIDETYPE count)\
{\
for (STRIDETYPE i=0; i<count; i++)\
{\
if (masks[i * stride])\
*dst = *src;\
dst += stride;\
src += 1;\
}\
}
/* Forms in which STRIDETYPE is the same type as MASKTYPE: long indices for
   double data and int indices for float data.  The constant strides cover a
   small odd value, a power of two, and two larger odd values.  */
MASK_SCATTER_STORE1 (double, long, long, 5)
MASK_SCATTER_STORE1 (double, long, long, 8)
MASK_SCATTER_STORE1 (double, long, long, 21)
MASK_SCATTER_STORE1 (double, long, long, 1009)
MASK_SCATTER_STORE1 (float, int, int, 5)
MASK_SCATTER_STORE1 (float, int, int, 8)
MASK_SCATTER_STORE1 (float, int, int, 21)
MASK_SCATTER_STORE1 (float, int, int, 1009)
MASK_SCATTER_STORE2 (double, long, long)
MASK_SCATTER_STORE2 (float, int, int)
MASK_SCATTER_STORE3 (double, long, long)
MASK_SCATTER_STORE3 (float, int, int)
MASK_SCATTER_STORE4 (double, long, long, 5)
/* NOTE: We can't vectorize MASK_SCATTER_STORE4 (float, int, int, 3) because
   we can't prove that the offsets used for the gather load won't overflow.
   For that reason no (float, int, int) instance of MASK_SCATTER_STORE4 is
   instantiated here.
   NOTE(review): the stride of 3 quoted above differs from the 5 used by the
   other MASK_SCATTER_STORE4 instances; confirm which was intended.  */
MASK_SCATTER_STORE5 (double, long, long)
MASK_SCATTER_STORE5 (float, int, int)
/* Widened forms: STRIDETYPE (int or short) differs from MASKTYPE (long or
   int).  Presumably the index type is narrower than the data on the target,
   so the index has to be widened before it is used as an offset; confirm
   against the target ABI.  */
MASK_SCATTER_STORE1 (double, long, int, 5)
MASK_SCATTER_STORE1 (double, long, int, 8)
MASK_SCATTER_STORE1 (double, long, short, 5)
MASK_SCATTER_STORE1 (double, long, short, 8)
MASK_SCATTER_STORE1 (float, int, short, 5)
MASK_SCATTER_STORE1 (float, int, short, 8)
MASK_SCATTER_STORE2 (double, long, int)
MASK_SCATTER_STORE2 (float, int, short)
MASK_SCATTER_STORE4 (double, long, int, 5)
MASK_SCATTER_STORE4 (float, int, short, 5)
MASK_SCATTER_STORE5 (double, long, int)
/* The gather loads matched below are for the strided reads of the masks;
   the scatter stores are for the strided writes to dst.  */
/* { dg-final { scan-assembler-times "ld1d\\tz\[0-9\]+.d, p\[0-9\]+/z, \\\[x\[0-9\]+, z\[0-9\]+.d\\\]" 15 } } */
/* { dg-final { scan-assembler-times "ld1w\\tz\[0-9\]+.s, p\[0-9\]+/z, \\\[x\[0-9\]+, z\[0-9\]+.s, sxtw 2\\\]" 8 } } */
/* { dg-final { scan-assembler-times "ld1w\\tz\[0-9\]+.s, p\[0-9\]+/z, \\\[x\[0-9\]+, z\[0-9\]+.s, sxtw\\\]" 3 } } */
/* { dg-final { scan-assembler-times "st1d\\tz\[0-9\]+.d, p\[0-9\]+, \\\[x\[0-9\]+, z\[0-9\]+.d\\\]" 19 } } */
/* { dg-final { scan-assembler-times "st1w\\tz\[0-9\]+.s, p\[0-9\]+, \\\[x\[0-9\]+, z\[0-9\]+.s, sxtw 2\\\]" 12 } } */
/* { dg-final { scan-assembler-times "st1w\\tz\[0-9\]+.s, p\[0-9\]+, \\\[x\[0-9\]+, z\[0-9\]+.s, sxtw\\\]" 3 } } */
|