//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "bvh_build_refit.h"
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"
/*
POSTSORT PHASE2:
Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very large value.
1. parallel_build_phase2_refit - performs the refit using global synchronization and mem_fence_gpu_invalidate.
                                 This kernel should be used only for very large BVHs; it is faster than the
                                 non-SLM fallback in parallel_build_phase2_refit_local.
2. parallel_build_phase2_refit_local - should be used in most cases; the nodes allocated in phase0 usually
                                 fit into SLM. A non-SLM fallback exists here as well, because the decision
                                 on which kernel to run is made on the host side from node-count estimates.
*/
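/*
Illustrative host-side dispatch (a minimal sketch only; the real host code lives outside
this file, and estimatedP0Nodes / numStartpoints / Enqueue1D are assumed names, not actual API):

    if (estimatedP0Nodes <= MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD)
        Enqueue1D(queue, "parallel_build_phase2_refit_local", 1, 512);       // 1 group, wg size 512
    else
        Enqueue1D(queue, "parallel_build_phase2_refit", numStartpoints, 16); // one subgroup per startpoint
*/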
GRL_INLINE void refit_bottom_up_global_sync(
global char* bvh_mem,
global uint* global_refit_startpoints,
uniform uint nodeId,
uniform ushort lane)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
// Get the node idx that was put here in phase1
const uint innerNodeIdx = global_refit_startpoints[nodeId];
// Get the qnode and backpointer
uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
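// Backpointer bit layout, as decoded throughout this file:
//   bits [31:6] parent node index
//   bits [5:3]  total number of children
//   bits [2:0]  count of children already refitted (used as an atomic counter)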
varying struct AABB childrenAABB; // one child AABB per lane
AABB_init(&childrenAABB);
uniform uint numChildren = (backPointer >> 3) & 0x7;
if(numChildren == 0) return;
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
varying ushort child_idx = (lane < numChildren) ? lane : 0;
childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
#if MORTON_VERBOSE_LOG
if(lane == 0)
printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
#endif
struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);
uint children_mask = qnode_child[child_idx].instMask;
qnode->instMask = sub_group_reduce_or_N6(children_mask);
SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
}
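// Launched with one 16-lane work group (a single subgroup) per phase-1 startpoint;
// get_group_id(0) selects which startpoint to refit from.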
__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel
parallel_build_phase2_refit( global char* bvh_mem,
global uint* global_refit_startpoints )
{
refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
}
GRL_INLINE void SUBGROUP_refit_bottom_up_global(
uniform global struct QBVHNodeN* globalNodeData,
uniform struct BackPointers* backPointers,
varying ushort lane,
varying uint curNodeIndex)
{
uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
const uint head_lane = 0;
uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
while (curNodeIndex != 0)
{
global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
uint numChildren = BackPointer_GetNumChildren(backpointer);
varying ushort child_idx = (lane < numChildren) ? lane : 0;
child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
/* get bounds of all children from child nodes directly */
subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);
uchar childrenMask = qnode_child[child_idx].instMask;
qnode->instMask = sub_group_reduce_or_N6(childrenMask);
uint parentIndex = BackPointer_GetParentIndex(backpointer);
mem_fence_gpu_invalidate();
if (lane == 0)
{
backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));
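// Worked example: a parent with 3 children starts as (parentIdx << 6) | (3 << 3) | 0;
// each arriving child subgroup bumps the low 3 bits, and the subgroup whose increment
// brings the counter up to numChildren is the last one and continues the walk upward.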
uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);
/* set global back pointer */
*InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;
#if MORTON_VERBOSE_LOG
printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
#endif
}
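// atomic_inc returned the pre-increment value on lane 0; broadcast it and add 1
// so that every lane holds the backpointer value as stored after the increment.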
backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
curNodeIndex = parentIndex;
/* continue upward only when all of the parent's children have been refitted;
otherwise stop here and let the last arriving subgroup carry on */
uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
if (numChildrenRefitted != numChildrenTotal)
return;
}
// process root of the treelet
{
#if MORTON_DEBUG_CHECKS
if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_global: this should be node index 0\n");
#endif
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
uint numChildren = BackPointer_GetNumChildren(backpointer);
varying ushort child_idx = (lane < numChildren) ? lane : 0;
child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
/* get bounds of all children from child nodes directly */
subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);
uchar childrenMask = qnode_child[child_idx].instMask;
globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);
/* reset refit counter for next refit */
if (lane == 0)
{
/* set global back pointer */
*InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);
#if MORTON_VERBOSE_LOG
printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren);
#endif
}
}
}
// TODO: Check why 512 wg size has worse performance than 256
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase2_refit_local( global struct Globals* globals,
global char* bvh_mem,
global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
// Number of nodes created in P0, to be refitted in this stage
uint p0_created_num = globals->p0_created_num;
// Return immediately if the host enqueued this kernel but there is nothing to do
if(p0_created_num == 0)
return;
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
varying ushort lane = get_sub_group_local_id();
// Hardcode SLM to the maximum here, as we do not know upfront how much memory will be needed
local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */
// Number of allocated nodes in phase0 (p0_created_num + children)
uint p0_allocated_num = globals->p0_allocated_num;
// array that will keep 2x8 short indices
varying uint sg_fatleaf_array = 0x0;
uniform uint8_t sg_bu_startpoints_cnt = 0;
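// sg_fatleaf_array acts as a small subgroup-wide stack: set_2xSG_arr_first_write()
// pushes a node index at slot sg_bu_startpoints_cnt, and get_from_2xSG_arr() pops
// indices back off for the bottom-up refit loops below.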
// Determine whether all the nodes allocated in phase0 fit into SLM.
// There are two paths here:
// 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse the local bottom-up pass,
//    which performs the refit and creates the qnodes in the bvh.
// 2. If we do not fit into SLM, first create the qnodes in the bvh, then perform the bottom-up refit
//    with global-atomic synchronization. This is not performant and is kept only as a guardrail:
//    on the host side we instead fall back to the old separated refit path, with wg_size 8 and
//    better EU reuse.
if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
{
for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
{
MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
uint current_id = boxless_node.binary_hierarchy_index >> 6;
// The low 6 bits of binary_hierarchy_index are unused at this point, so they carry
// the mask of children that are subtree roots
uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
if(lane == 0)
perNodeData[current_id].boxlessNode = boxless_node;
// When no children are subtree roots, we are done and skip to the next iteration
if(children_root_mask == 0x0)
{
continue;
}
// When all children are subtree roots, put them to sg_fatleaf_array
else if(children_root_mask == 0x3F)
{
set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
}
uniform global struct QBVHNodeN* qnode = nodeData + current_id;
uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
varying ushort child_idx = (lane < numChildren) ? lane : 0;
varying struct AABB childrenAABB; // one child AABB per lane
AABB_init(&childrenAABB);
uint lead_child_global_id = current_id + lead_child_offset;
uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
// Get only AABBs of children that are p1 subtree roots
bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
if(lane_active)
{
uint child_global_id = lead_child_global_id + child_idx;
perNodeData[child_global_id].box = childrenAABB;
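// Stash the child's instance mask in the otherwise-unused .w component of the
// SLM box (presumably read back by the local bottom-up pass below).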
perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask);
}
#if MORTON_VERBOSE_LOG
if(lane == 0)
printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, backPointer >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
#endif
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
}
else
{
for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
{
MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
uint current_id = boxless_node.binary_hierarchy_index >> 6;
// The low 6 bits of binary_hierarchy_index are unused at this point, so they carry
// the mask of children that are subtree roots
uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
uniform global struct QBVHNodeN* qnode = nodeData + current_id;
uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
if(lane == 0)
{
QBVH6Node_set_type( qnode, nodeType );
qnode->offset = lead_child_offset;
}
// When no children are subtree roots, we are done and skip to the next iteration
if(children_root_mask == 0x0)
{
continue;
}
// When all children are subtree roots, put them to sg_fatleaf_array
else if(children_root_mask == 0x3F)
{
set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
}
#if MORTON_VERBOSE_LOG
if(lane == 0)
printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, backPointer >> 6: %d, numChildren: %d, qnode->offset: %d\n", ID, lead_child_offset, boxless_node.backPointer >> 6, numChildren, qnode->offset);
#endif
}
while (sg_bu_startpoints_cnt > 0)
{
uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);
SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
}
}
}