1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
|
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
// LSC Loads
//
// Prototypes only; the definitions are not visible in this section.
// Every function takes a global-address-space pointer `it` plus an int
// `offset` and returns one value of the type named in the function.
// The *_to_uint variants take a uchar/ushort pointer but return a uint.
//
// Each type is declared with the same seven suffixes:
//   L1UC_L3UC, L1UC_L3C, L1C_L3UC, L1C_L3C, L1S_L3UC, L1S_L3C, L1IAR_L3C
// NOTE(review): the suffix presumably selects the L1/L3 cache policy of the
// load (UC = uncached, C = cached, S = streaming, IAR = invalidate after
// read) -- inferred from the names only, confirm against the definitions.
// NOTE(review): whether `offset` counts bytes or elements of *it cannot be
// determined from these declarations -- confirm before use.

// uchar pointer, uint result
uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset);
uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset);
uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset);
uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset);
uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset);
uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset);
uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset);
// ushort pointer, uint result
uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset);
uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset);
uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset);
uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset);
uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset);
uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset);
uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset);
// uint
uint load_uint_L1UC_L3UC(global uint* it, int offset);
uint load_uint_L1UC_L3C(global uint* it, int offset);
uint load_uint_L1C_L3UC(global uint* it, int offset);
uint load_uint_L1C_L3C(global uint* it, int offset);
uint load_uint_L1S_L3UC(global uint* it, int offset);
uint load_uint_L1S_L3C(global uint* it, int offset);
uint load_uint_L1IAR_L3C(global uint* it, int offset);
// uint2
uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1UC_L3C(global uint2* it, int offset);
uint2 load_uint2_L1C_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1C_L3C(global uint2* it, int offset);
uint2 load_uint2_L1S_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1S_L3C(global uint2* it, int offset);
uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset);
// uint3
uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1UC_L3C(global uint3* it, int offset);
uint3 load_uint3_L1C_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1C_L3C(global uint3* it, int offset);
uint3 load_uint3_L1S_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1S_L3C(global uint3* it, int offset);
uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset);
// uint4
uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1UC_L3C(global uint4* it, int offset);
uint4 load_uint4_L1C_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1C_L3C(global uint4* it, int offset);
uint4 load_uint4_L1S_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1S_L3C(global uint4* it, int offset);
uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset);
// uint8
uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1UC_L3C(global uint8* it, int offset);
uint8 load_uint8_L1C_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1C_L3C(global uint8* it, int offset);
uint8 load_uint8_L1S_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1S_L3C(global uint8* it, int offset);
uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset);
// ulong
ulong load_ulong_L1UC_L3UC(global ulong* it, int offset);
ulong load_ulong_L1UC_L3C(global ulong* it, int offset);
ulong load_ulong_L1C_L3UC(global ulong* it, int offset);
ulong load_ulong_L1C_L3C(global ulong* it, int offset);
ulong load_ulong_L1S_L3UC(global ulong* it, int offset);
ulong load_ulong_L1S_L3C(global ulong* it, int offset);
ulong load_ulong_L1IAR_L3C(global ulong* it, int offset);
// ulong2
ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset);
// ulong3
ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset);
// ulong4
ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset);
// ulong8
ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset);
// LSC Stores
//
// Prototypes only; the definitions are not visible in this section.
// Every function takes a global-address-space pointer `it`, an int `offset`
// and the `value` to write, and returns nothing.
// The *_from_uint variants take a uchar/ushort pointer but a uint `value`.
//
// Each type is declared with the same seven suffixes:
//   L1UC_L3UC, L1UC_L3WB, L1WT_L3UC, L1WT_L3WB, L1S_L3UC, L1S_L3WB, L1WB_L3WB
// NOTE(review): the suffix presumably selects the L1/L3 cache policy of the
// store (UC = uncached, WT = write-through, WB = write-back, S = streaming)
// -- inferred from the names only, confirm against the definitions.
// NOTE(review): whether `offset` counts bytes or elements of *it cannot be
// determined from these declarations -- confirm before use.

// uchar pointer, uint value
void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value);
// ushort pointer, uint value
void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value);
// uint
void store_uint_L1UC_L3UC(global uint* it, int offset, uint value);
void store_uint_L1UC_L3WB(global uint* it, int offset, uint value);
void store_uint_L1WT_L3UC(global uint* it, int offset, uint value);
void store_uint_L1WT_L3WB(global uint* it, int offset, uint value);
void store_uint_L1S_L3UC(global uint* it, int offset, uint value);
void store_uint_L1S_L3WB(global uint* it, int offset, uint value);
void store_uint_L1WB_L3WB(global uint* it, int offset, uint value);
// uint2
void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value);
// uint3
void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value);
// uint4
void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value);
// uint8
void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value);
// ulong
void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value);
// ulong2
void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value);
// ulong3
void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value);
// ulong4
void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value);
// ulong8
void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value);
// LSC Fence support
//
// Prototypes only; the definitions are not visible in this section.
// Every fence takes no arguments and returns nothing. The parameter lists
// are spelled `(void)` so that each declaration is a real prototype: with
// empty parentheses, plain C rules leave the arguments unchecked.
// NOTE(review): the names suggest a fence scope (gpu / workgroup) and a
// cache action (default / invalidate / evict / evict to memory); the exact
// semantics are not established here -- confirm against the definitions.
void mem_fence_gpu_default(void);
void mem_fence_workgroup_default(void);
void mem_fence_gpu_invalidate(void);
void mem_fence_gpu_evict(void);
void mem_fence_evict_to_memory(void);
|