summaryrefslogtreecommitdiff
path: root/src/intel/vulkan/grl/gpu/morton/morton_common.h
blob: 2beb7a1aff306ad6a6daf3d3b3defb12b4b07d91 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "common.h"

#define MORTON_DEBUG_CHECKS 0
#define MORTON_VERBOSE_LOG 0

// Returns how many leading LSB radix-sort iterations may be skipped for the
// given key shift.  The skip optimization is currently disabled because the
// hierarchy build consumes fully sorted morton codes, so this always
// returns 0.  The disabled branch is kept for reference.
GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift )
{
#if 0 // turn off, because current hierarchy build requires full sort
    // Difference between max iterations needed for LSB sorting and
    // number of iterations needed for LSB sorting without primIDs
    // This indicates how many of first iterations would be skipped in LSB
    return 8 - (8 - (shift >> 3));
#else
    return 0;
#endif
}

// Work record for the local (SLM-resident) morton flattening phase.
// Stored in a UPerNodeData slot, so it must stay within 16 bytes.
typedef struct BuildRecordLocalMortonFlattener
{
    unsigned int leftChild;  // global
    unsigned int rightChild; // global
    unsigned int rangeStart; // global
    unsigned int local_parent_index__numItems; // packed: parent index within the treelet + item count — bit split not visible here, TODO confirm at pack site
} BuildRecordLocalMortonFlattener;

// TODO: Currently sizeof UPerNodeData is 32, AABB struct allocates more data than needed and can be reduced
// Per-node SLM slot.  Over the build's lifetime the same bytes are viewed as
// a flattening record, then a boxless node, and finally (after bottom-up
// refit overwrites the slot) the node's reduced AABB.
typedef union UPerNodeData {
    float4                           four_DWs;
    BuildRecordLocalMortonFlattener  buildRecord;
    MortonFlattenedBoxlessNode             boxlessNode;
    struct AABB                      box;
} UPerNodeData;

// Extracts the child offset packed into bits [31:6] of childOffset_type.
GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)
{
    uint packed = bn.childOffset_type;
    return packed >> 6;
}

// Extracts the node type packed into the low 6 bits of childOffset_type.
GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)
{
    return bn.childOffset_type & 0x3Fu;
}

// Writes entry 'index' of a subgroup-distributed array that keeps
// 2 x sub-group-size 16-bit values: each lane's private 'arr' word holds two
// entries, selected by index / sgSize (shift is 0 or 16 for SIMD16 — TODO
// confirm for other sub-group sizes).  "first write" variant: assumes the
// destination half-word is still zero, so a plain OR is sufficient.
GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
{
    short owner_lane = index % get_sub_group_size();
    short bit_shift = (index / get_sub_group_size()) * get_sub_group_size();
    if (lane == owner_lane)
        *arr |= (val << bit_shift);
}

// Reads entry 'index' of a subgroup-distributed 2x array: selects the 16-bit
// half-word in the owning lane and broadcasts it to every lane.
// Note: the 'lane' parameter is part of the common signature but unused here.
GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane)
{
    short owner_lane = index % get_sub_group_size();
    short bit_shift = (index / get_sub_group_size()) * get_sub_group_size();
    short half_word = (short)(arr >> bit_shift);
    return sub_group_broadcast(half_word, owner_lane);
}

// Scatters a subgroup-distributed 2x array into a flat ushort buffer: lane L
// writes element L (low half-word) and element L + sgSize (high half-word),
// each store guarded against 'count'.
GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst)
{
    if (lane >= count)
        return;

    dst[lane] = (ushort)(arr & 0xFFFF);

    short hi_idx = lane + get_sub_group_size();
    if (hi_idx < count)
        dst[hi_idx] = (ushort)(arr >> 16);
}


// Inverse of unpack_from_2xSG_arr: gathers up to 2 x sgSize ushorts from
// 'src' into per-lane packed words — element 'lane' in the low half-word,
// element lane + sgSize in the high half-word.  Lanes >= count leave *arr
// untouched.
GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane)
{
    if (lane >= count)
        return;

    uint packed = src[lane];

    short hi_idx = lane + get_sub_group_size();
    if (hi_idx < count)
        packed |= ((uint)(src[hi_idx])) << 16u;

    *arr = packed;
}

// Overwrites entry 'index' of a subgroup-distributed 2x array with 'val'.
// Unlike set_2xSG_arr_first_write, the destination half-word may already be
// non-zero, so the stale half is masked out before inserting the new value.
// NOTE(review): a negative 'val' sign-extends before the shift and would
// corrupt the other half-word — presumably callers only pass values
// >= 0; confirm.
GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane)
{
    short owner_lane = index % get_sub_group_size();
    short bit_shift = (index / get_sub_group_size()) * get_sub_group_size();
    if (lane != owner_lane)
        return;
    uint kept_half = (*arr) & (0xFFFF0000 >> bit_shift); // keep the other 16-bit half of the word
    *arr = (val << bit_shift) | kept_half;
}

// Sub-group cooperative bottom-up AABB refit over a treelet whose boxless
// nodes live in SLM ('local_nodes').
//
// Starting from the bottom-most startpoints packed in 'sg_bu_startpoints'
// (a subgroup-distributed 2x array, read via get_from_2xSG_arr), each loop
// turn:
//   1. gathers the children's AABBs from SLM, one child per lane;
//   2. sub-group-reduces them and overwrites the current node's SLM slot
//      with the merged box (turning the slot from boxless node into a box);
//   3. emits the node as a global QBVHNodeN plus its global backpointer;
//   4. atomically bumps the parent's refit counter (low 3 bits of its local
//      backpointer) and ascends only if this was the parent's last
//      outstanding child — otherwise it switches to the next startpoint or
//      returns when none remain.
// The treelet root (local node index 0) is finalized in the epilogue, which
// also clears the root's refit counter for the next refit pass.
//
// globalNodeData             - base of the global QBVH node array
// backPointers               - global backpointer storage
// treeletRootGlobalIndex     - global node index of this treelet's root
// globalBaseForInternalNodes - global index base for local internal nodes
// lane                       - sub-group lane id of the calling work-item
// local_nodes                - per-node SLM slots (boxless node / AABB view)
// sg_bu_startpoints          - packed startpoint indices (2x sub-group array)
// sg_bu_startpoints_cnt      - number of valid startpoints
GRL_INLINE void SUBGROUP_refit_bottom_up_local(
    uniform struct QBVHNodeN* globalNodeData,
    uniform struct BackPointers* backPointers,
    uniform uint treeletRootGlobalIndex,
    uniform uint globalBaseForInternalNodes,
    varying ushort lane,
    uniform local union UPerNodeData* local_nodes,
    varying uint sg_bu_startpoints,
    uniform uint sg_bu_startpoints_cnt)
{
    if(sg_bu_startpoints_cnt == 0)
        return;

    const uint head_lane = 0;
    // Pop the first startpoint (consumes one entry from the packed array).
    uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);

    uniform uint prev_loc_index = 0;
    uniform struct AABB child_aabb; // this carries reduced aabb between loop turns

    // Local backpointer of the current node: low 3 bits count already
    // refitted children, bits [5:3] hold the child count, bits [31:6] the
    // parent's local index.
    uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;

    // Climb toward the treelet root (local index 0), which is handled in
    // the epilogue below.
    while (curNodeIndex != 0)
    {
        uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode);
        uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
        varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane;

        uint numChildren = BackPointer_GetNumChildren(backpointer);
        // One child box per lane.  The lane whose child slot is the node we
        // just finished keeps the reduced box carried in 'child_aabb' from
        // the previous turn instead of re-reading SLM.
        if (child_loc_idx != prev_loc_index &&
            lane < numChildren)
        {
            child_aabb = local_nodes[child_loc_idx].box;
        }
        else if (lane >= numChildren) {
            // Neutral box + zero instance mask so inactive lanes don't
            // affect the reductions below.
            AABB_init(&child_aabb);
            child_aabb.lower.w = as_float(0u);
        }

        // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 );

        // The instance mask rides in lower.w as raw uint bits; OR it across
        // all children.
        uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
        reduced_bounds.lower.w = as_float((uint)instMask);
        // Overwrite this node's SLM slot with the reduced AABB (8 dwords,
        // one per lane) so the parent pass can read it through the .box view.
        uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0);
        local uint* pbox = (local uint*)(local_nodes+ curNodeIndex);
        if (lane < 8)
        {
            pbox[lane] = reduce_bounds_lane;
        }

        uint global_node_idx = globalBaseForInternalNodes + curNodeIndex;
        /* get bounds of all children from child nodes directly */
        struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
        subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false);
        child_aabb = reduced_bounds; // carried into the next turn (see gather above)
        uint parentIndex = BackPointer_GetParentIndex(backpointer);

        // Make the SLM box write visible before the parent's pass reads it.
        write_mem_fence(CLK_LOCAL_MEM_FENCE);

        if (lane == 0)
        {
            // Bump the parent's refit counter; atomic_inc returns the
            // PRE-increment value.
            backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer));
            uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex;
            uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3);

            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer;

#if MORTON_VERBOSE_LOG
            printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n",
                   global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx);
#endif
        }

        // Broadcast lane 0's pre-increment value to all lanes; +1 turns it
        // into the parent's post-increment backpointer.
        backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
        prev_loc_index = curNodeIndex;
        curNodeIndex = parentIndex;

        /* if all children got refitted, then continue */
        uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
        uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
        if (numChildrenRefitted != numChildrenTotal)
        {
            // Parent still has outstanding children: another startpoint's
            // climb will be the last one to arrive.  Move on to the next
            // startpoint, or bail out if none remain for this sub-group.
            if(sg_bu_startpoints_cnt)
            {
                curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
                backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
            }
            else
                return;
        }
    }

    // process root of the treelet
    {

#if MORTON_DEBUG_CHECKS
        if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
#endif

        uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode);
        varying uint child_loc_idx = lead_child_loc_offset + 0 + lane;
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        // Same per-lane child gather as in the main loop, with the same
        // carried-box reuse via prev_loc_index.
        if (child_loc_idx != prev_loc_index &&
            lane < numChildren)
        {
            child_aabb = local_nodes[child_loc_idx].box;
        }
        else if (lane >= numChildren) {
            AABB_init(&child_aabb);
            child_aabb.lower.w = as_float(0u);
        }

        // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
        uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
        uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
        uint global_node_idx = treeletRootGlobalIndex;
        uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset;

        /* get bounds of all children from child nodes directly */
        struct QBVHNodeN* qnode = globalNodeData + global_node_idx;

        // The root lives at treeletRootGlobalIndex rather than in the
        // contiguous internal-node range, so re-base its child offset.
        subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, instMask, qnode, false);

        /* reset refit counter for next refit */
        if (lane == 0)
        {
            /* set global back pointer (low 3 counter bits cleared) */
            *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u);

            // TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes

#if MORTON_VERBOSE_LOG
            printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
                   curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
#endif
        }
    }
}