// // Copyright (C) 2009-2021 Intel Corporation // // SPDX-License-Identifier: MIT // // #include "intrinsics.h" #include "AABB3f.h" #include "AABB.h" #include "GRLGen12.h" #include "quad.h" #include "common.h" #include "instance.h" #include "api_interface.h" #include "binned_sah_shared.h" #if 0 #define LOOP_TRIPWIRE_INIT uint _loop_trip=0; #define LOOP_TRIPWIRE_INCREMENT(max_iterations) \ _loop_trip++;\ if ( _loop_trip > max_iterations )\ {\ if( get_local_id(0) == 0 )\ printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!! group=%u\n", get_group_id(0) );\ break;\ } #else #define LOOP_TRIPWIRE_INIT #define LOOP_TRIPWIRE_INCREMENT(max_iterations) #endif // ========================================================= // DFS // ========================================================= // there are 128 threads x SIMD16 == 2048 lanes in a DSS // There is 128KB of SLM. Upper limit of 64KB per WG, so target is 2 groups of 1024 lanes @ 64K each // --> Full occupancy requires using less than 64B per lane // // Groups of 256 lanes gives us 16KB per group // // We use subgroups very heavily here in order to avoid // use of per-thread scratch space for intermediate values #define DFS_WG_SIZE 256 #define DFS_NUM_SUBGROUPS 16 #define DFS_BVH2_NODE_COUNT (2*(DFS_WG_SIZE)-1) #define TREE_ARITY 6 // FlatTree node limits: // these are the derivations if we always collapse to one primitive and pack nodes as tightly as possible // If BVH2 construction is allowed to terminate early and place multiple prims in a leaf, these numbers will be too low #if 0 // maximum flattree size is the number of inner nodes in a full M-ary tree with one leaf per primitive // This is given by I = (L-1)/(M-1) // For a 256 thread workgroup, L=256, M=6, this gives: 51 #define DFS_MAX_FLATTREE_NODES 51 // A flattree leaf is a node which contains only primitives. // // The maximum number of leaves is related to the number of nodes as: // L(N) = ((M-1)*N + 1) / M // #define DFS_MAX_FLATTREE_LEAFS 43 // = 43 for 256 thread WG (L=256, M=6) #else // This is the result of estimate_qbvh6_nodes(256) #define DFS_MAX_FLATTREE_LEAFS 256 #define DFS_MAX_FLATTREE_NODES 307 // 256 fat-leaves + 51 inner nodes. 
51 = ceil(256/5) #define DFS_MAX_FLATTREE_DEPTH 52 // number of inner nodes in the worst-case tree #endif #define uniform #define varying struct DFSArgs { global struct BVHBase* bvh_base; global PrimRef* primref_buffer; ushort leaf_node_type; ushort inner_node_type; ushort leaf_size_in_bytes; bool need_backpointers; bool need_masks; ushort num_primrefs; global uint* primref_index_buffer; }; struct DFSPrimRefAABB { half lower[3]; half upper[3]; }; GRL_INLINE void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) { bb->lower[0] = 1; bb->lower[1] = 1; bb->lower[2] = 1; bb->upper[0] = 0; bb->upper[1] = 0; bb->upper[2] = 0; } GRL_INLINE void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) { aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); } GRL_INLINE float DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) { const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); return fma( d.x, (d.y + d.z), d.y * d.z ); } GRL_INLINE struct DFSPrimRefAABB DFSPrimRefAABB_sub_group_reduce( struct DFSPrimRefAABB* aabb ) { struct DFSPrimRefAABB bounds; bounds.lower[0] = sub_group_reduce_min( aabb->lower[0] ); bounds.lower[1] = sub_group_reduce_min( aabb->lower[1] ); bounds.lower[2] = sub_group_reduce_min( aabb->lower[2] ); bounds.upper[0] = sub_group_reduce_max( aabb->upper[0] ); bounds.upper[1] = sub_group_reduce_max( aabb->upper[1] ); bounds.upper[2] = sub_group_reduce_max( aabb->upper[2] ); return bounds; } struct DFSPrimRef { struct DFSPrimRefAABB aabb; uint2 meta; }; struct PrimRefMeta { uchar2 meta; }; GRL_INLINE uint PrimRefMeta_GetInputIndex( struct PrimRefMeta* it ) { return it->meta.x; } GRL_INLINE uint PrimRefMeta_GetInstanceMask( struct PrimRefMeta* it ) { return it->meta.y; } struct PrimRefSet { struct AABB3f root_aabb; struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; uint2 meta[DFS_WG_SIZE]; }; GRL_INLINE local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) { return &refs->AABB[id]; } GRL_INLINE float PrimRefSet_GetMaxAABBArea( local struct PrimRefSet* refs ) { float3 root_l = AABB3f_load_lower( &refs->root_aabb ); float3 root_u = AABB3f_load_upper( &refs->root_aabb ); float3 d = root_u - root_l; float scale = 1.0f / max( d.x, max( d.y, d.z ) ); half3 dh = convert_half3_rtp( d * scale ); return fma( dh.x, (dh.y + dh.z), dh.y * dh.z ); } GRL_INLINE float3 ulp3( float3 v ) { return fabs(v) * FLT_EPSILON; } GRL_INLINE struct AABB PrimRefSet_ConvertAABB( local struct PrimRefSet* refs, struct DFSPrimRefAABB* box ) { float3 root_l = AABB3f_load_lower( &refs->root_aabb ); float3 root_u = AABB3f_load_upper( &refs->root_aabb ); float3 d = root_u - root_l; float scale = max( d.x, max( d.y, d.z ) ); float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); l = l * scale + root_l ; u = u * scale + root_l ; // clamping is necessary in case that a vertex lies exactly in the upper AABB plane. // If we use unclamped values, roundoff error in the scale factor calculation can cause us // to snap to a flattened AABB that lies outside of the original one, resulting in missed geometry. 
u = min( u, root_u ); l = min( l, root_u ); struct AABB r; r.lower.xyz = l.xyz; r.upper.xyz = u.xyz; return r; } GRL_INLINE PrimRef PrimRefSet_GetFullPrecisionAABB( local struct PrimRefSet* refs, ushort id ) { struct AABB r; r = PrimRefSet_ConvertAABB( refs, &refs->AABB[id] ); r.lower.w = 0; r.upper.w = 0; return r; } GRL_INLINE uint PrimRefSet_GetInputIndex( local struct PrimRefSet* refs, ushort id ) { return refs->meta[id].x; } GRL_INLINE uint PrimRefSet_GetInstanceMask( local struct PrimRefSet* refs, ushort id ) { return refs->meta[id].y; } GRL_INLINE struct PrimRefMeta PrimRefSet_GetMeta( local struct PrimRefSet* refs, ushort id ) { struct PrimRefMeta meta; meta.meta.x = refs->meta[id].x; meta.meta.y = refs->meta[id].y; return meta; } GRL_INLINE struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) { struct DFSPrimRef r; r.aabb = refs->AABB[id]; r.meta = refs->meta[id]; return r; } GRL_INLINE void PrimRefSet_SetPrimRef_FullPrecision( local struct PrimRefSet* refs, PrimRef ref, ushort id ) { float3 root_l = AABB3f_load_lower( &refs->root_aabb ); float3 root_u = AABB3f_load_upper( &refs->root_aabb ); float3 d = root_u - root_l; float scale = 1.0f / max(d.x, max(d.y,d.z)); float3 l = ref.lower.xyz; float3 u = ref.upper.xyz; half3 lh = convert_half3_rtz( (l - root_l) * scale ); half3 uh = convert_half3_rtp( (u - root_l) * scale ); refs->AABB[id].lower[0] = lh.x; refs->AABB[id].lower[1] = lh.y; refs->AABB[id].lower[2] = lh.z; refs->AABB[id].upper[0] = uh.x; refs->AABB[id].upper[1] = uh.y; refs->AABB[id].upper[2] = uh.z; refs->meta[id].x = id; refs->meta[id].y = PRIMREF_instanceMask(&ref); } GRL_INLINE void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) { refs->AABB[id] = ref.aabb; refs->meta[id] = ref.meta; } GRL_INLINE struct AABB3f PrimRefSet_GetRootAABB( local struct PrimRefSet* refs ) { return refs->root_aabb; } GRL_INLINE void SUBGROUP_PrimRefSet_Initialize( local struct PrimRefSet* refs ) { if ( get_sub_group_local_id() == 0 ) AABB3f_init( &refs->root_aabb ); // TODO_OPT: subgroup-vectorized version of AABB3f_init } GRL_INLINE void PrimRefSet_Printf( local struct PrimRefSet* refs, ushort num_prims ) { barrier( CLK_LOCAL_MEM_FENCE ); if ( get_local_id( 0 ) == 0 ) { printf( "Scene AABB:\n" ); struct AABB3f rootBox = PrimRefSet_GetRootAABB( refs ); AABB3f_print( &rootBox ); float ma = PrimRefSet_GetMaxAABBArea( refs ); for ( uint i = 0; i < num_prims; i++ ) { printf( "Ref: %u\n", i ); struct AABB r = PrimRefSet_GetFullPrecisionAABB( refs, i ); AABB_print( &r ); float a = DFSPrimRefAABB_halfArea( PrimRefSet_GetAABBPointer( refs, i ) ); printf( "Scaled Area: %f / %f = %f \n", a, ma, a / ma ); } } barrier( CLK_LOCAL_MEM_FENCE ); } GRL_INLINE void PrimRefSet_CheckBounds( local struct PrimRefSet* refs, ushort num_prims, PrimRef* primref_buffer ) { barrier( CLK_LOCAL_MEM_FENCE ); if ( get_local_id( 0 ) == 0 ) { for ( uint i = 0; i < num_prims; i++ ) { PrimRef ref = primref_buffer[i]; struct AABB r2 = PrimRefSet_GetFullPrecisionAABB( refs, i ); struct DFSPrimRefAABB* box = &refs->AABB[i]; float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); printf( " halfs:{%x,%x,%x}{%x,%x,%x}\n", as_uint(l.x), as_uint(l.y), as_uint(l.z), as_uint(u.x), as_uint(u.y), as_uint(u.z) ); printf( " {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%u,%u,%u,%u,%u,%u}\n", ref.lower.x, ref.lower.y, ref.lower.z, r2.lower.x, r2.lower.y, 
r2.lower.z, ref.upper.x, ref.upper.y, ref.upper.z, r2.upper.x, r2.upper.y, r2.upper.z, r2.lower.x <= ref.lower.x, r2.lower.y <= ref.lower.y, r2.lower.z <= ref.lower.z, r2.upper.x >= ref.upper.x, r2.upper.y >= ref.upper.y, r2.upper.z >= ref.upper.z ); } } barrier( CLK_LOCAL_MEM_FENCE ); } struct LocalBVH2 { uint num_nodes; uint nodes[DFS_BVH2_NODE_COUNT]; // nodes are a bitfield: // bits 8:0 (9b) ==> number of primrefs in this subtree // // bits 17:9 (9b) ==> for an inner node: contains offset to a pair of children // ==> for a leaf node: contains index of the first primref in this leaf // // bits 30:18 (13b) ==> quantized AABB area (relative to root box) // bit 31 (1b) ==> is_inner flag // // NOTE: The left child offset of any node is always odd.. therefore, it is possible to recover a bit if we need it // by storing only the 8 MSBs }; #define DFS_BVH2_AREA_QUANT 8191.0f GRL_INLINE void SUBGROUP_LocalBVH2_Initialize( local struct LocalBVH2* tree, ushort num_prims ) { tree->num_nodes = 1; // include the root node tree->nodes[0] = num_prims; // initialize root node as a leaf containing the full subtree } GRL_INLINE void LocalBVH2_CreateInnerNode( local struct LocalBVH2* tree, ushort node_index, ushort start_left, ushort start_right, ushort quantized_left_area, ushort quantized_right_area ) { uint child_pos = atomic_add_local( &tree->num_nodes, 2 ); // set the inner node flag and child position in the parent // leave the other bits intact uint parent_node = tree->nodes[node_index]; parent_node |= 0x80000000; parent_node = (parent_node & ~(0x1ff<<9)) | (child_pos << 9); tree->nodes[node_index] = parent_node; // setup children as leaf nodes with prim-count zero uint left_child = (convert_uint(start_left) << 9) | (convert_uint( quantized_left_area ) << 18); uint right_child = (convert_uint(start_right) << 9) | (convert_uint( quantized_right_area ) << 18); tree->nodes[child_pos] = left_child; tree->nodes[child_pos + 1] = right_child; } GRL_INLINE ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* tree, ushort node_index ) { // increment only the lower bits. 
Given correct tree construction algorithm this will not overflow into MSBs return (atomic_inc_local( &tree->nodes[node_index] )) & 0x1ff; } GRL_INLINE ushort LocalBVH2_GetNodeArea( local struct LocalBVH2* tree, ushort nodeID ) { return (tree->nodes[nodeID] >> 18) & 0x1FFF; } GRL_INLINE bool LocalBVH2_IsInnerNode( local struct LocalBVH2* tree, ushort nodeID ) { return (tree->nodes[nodeID] & 0x80000000) != 0; } GRL_INLINE ushort2 LocalBVH2_GetChildIndices( local struct LocalBVH2* tree, ushort nodeID ) { ushort idx = ((tree->nodes[nodeID] >> 9) & 0x1FF); return (ushort2)(idx, idx + 1); } GRL_INLINE ushort LocalBVH2_GetSubtreePrimCount( local struct LocalBVH2* tree, ushort node ) { return tree->nodes[node] & 0x1FF; } GRL_INLINE ushort LocalBVH2_GetLeafPrimStart( local struct LocalBVH2* tree, ushort node ) { return ((tree->nodes[node] >> 9) & 0x1FF); } GRL_INLINE void LocalBVH2_Printf( local struct LocalBVH2* tree ) { barrier( CLK_LOCAL_MEM_FENCE ); if ( get_local_id( 0 ) == 0 ) { printf( "Nodes: %u\n", tree->num_nodes ); for ( uint i = 0; i < tree->num_nodes; i++ ) { uint num_prims = LocalBVH2_GetSubtreePrimCount( tree, i ); printf( "%3u : 0x%08x %3u 0x%04x ", i, tree->nodes[i], num_prims, LocalBVH2_GetNodeArea(tree,i) ); if ( LocalBVH2_IsInnerNode( tree, i ) ) { ushort2 kids = LocalBVH2_GetChildIndices( tree, i ); printf( " INNER ( %3u %3u )\n", kids.x, kids.y ); } else { printf( " LEAF {" ); for ( uint j = 0; j < num_prims; j++ ) printf( " %3u ", LocalBVH2_GetLeafPrimStart( tree, i ) + j ); printf( "}\n" ); } } } barrier( CLK_LOCAL_MEM_FENCE ); } struct FlatTreeInnerNode { uint DW0; // lower 16b are index of corresponding LocalBVH2 node.. Bits 30:16 are an atomic flag used during refit. Bit 31 is a leaf marker ushort parent_index; ushort first_child; uchar index_in_parent; uchar num_children; //struct DFSPrimRefAABB AABB; }; struct FlatTree { uint num_nodes; uint qnode_byte_offset; // byte offset from the BVHBase to the flat-tree's first QNode uint qnode_base_index; struct FlatTreeInnerNode nodes[DFS_MAX_FLATTREE_NODES]; uchar primref_back_pointers[DFS_WG_SIZE]; }; GRL_INLINE void FlatTree_Printf( local struct FlatTree* flat_tree ) { barrier( CLK_LOCAL_MEM_FENCE ); if ( get_local_id( 0 ) == 0 ) { printf( "NumNodes: %u\n", flat_tree->num_nodes ); for ( uint i = 0; i < flat_tree->num_nodes; i++ ) { ushort bvh2_node = flat_tree->nodes[i].DW0 & 0xffff; printf( "%2u Parent: %2u Index_in_parent: %u, NumKids: %u FirstKid: %3u bvh2: %3u DW0: 0x%x\n", i, flat_tree->nodes[i].parent_index, flat_tree->nodes[i].index_in_parent, flat_tree->nodes[i].num_children, flat_tree->nodes[i].first_child, bvh2_node, flat_tree->nodes[i].DW0 ); } } barrier( CLK_LOCAL_MEM_FENCE ); } GRL_INLINE ushort FlatTree_GetNodeCount( local struct FlatTree* flat_tree ) { return flat_tree->num_nodes; } GRL_INLINE uint FlatTree_GetParentIndex( local struct FlatTree* flat_tree, ushort id ) { return flat_tree->nodes[id].parent_index; } GRL_INLINE ushort FlatTree_GetBVH2Root( local struct FlatTree* flat_tree, ushort node_index ) { return (flat_tree->nodes[node_index].DW0) & 0xffff; } GRL_INLINE ushort FlatTree_GetNumChildren( local struct FlatTree* flat_tree, ushort node_index ) { return flat_tree->nodes[node_index].num_children; } GRL_INLINE bool FlatTree_IsLeafNode( local struct FlatTree* flat_tree, ushort node_index ) { return (flat_tree->nodes[node_index].DW0 & 0x80000000) != 0; } GRL_INLINE uint FlatTree_GetQNodeByteOffset( struct FlatTree* flat_tree, ushort node_index ) { return flat_tree->qnode_byte_offset + node_index * 
sizeof(struct QBVHNodeN); } GRL_INLINE uint FlatTree_GetQNodeIndex( struct FlatTree* flat_tree, ushort node_index ) { return flat_tree->qnode_base_index + node_index; } GRL_INLINE void FlatTree_AllocateQNodes( struct FlatTree* flat_tree, struct DFSArgs args ) { uint node_base = 64*allocate_inner_nodes( args.bvh_base, flat_tree->num_nodes ); flat_tree->qnode_base_index = (node_base - BVH_ROOT_NODE_OFFSET) / sizeof( struct QBVHNodeN ); flat_tree->qnode_byte_offset = node_base; } GRL_INLINE ushort FlatTree_GetFirstChild( struct FlatTree* flat_tree, ushort node_index ) { return flat_tree->nodes[node_index].first_child; } GRL_INLINE ushort FlatTree_GetPrimRefStart( struct FlatTree* flat_tree, ushort node_index ) { return flat_tree->nodes[node_index].first_child; } GRL_INLINE ushort FlatTree_GetPrimRefCount( struct FlatTree* flat_tree, ushort node_index ) { return flat_tree->nodes[node_index].num_children; } GRL_INLINE uint FlatTree_BuildBackPointer( local struct FlatTree* flat_tree, ushort node_index ) { uint parent_index = flat_tree->nodes[node_index].parent_index + flat_tree->qnode_base_index; parent_index = (parent_index << 6) | (FlatTree_GetNumChildren( flat_tree, node_index ) << 3); return parent_index; } GRL_INLINE void SUBGROUP_FlatTree_Initialize( uniform local struct FlatTree* flat_tree, struct DFSArgs args ) { if ( get_sub_group_local_id() == 0 ) { flat_tree->num_nodes = 1; flat_tree->nodes[0].DW0 = 0; // point first node at BVH2 root node, which is assumed to be at index zero } } /* GRL_INLINE void SUBGROUP_FlatTree_ReduceAndSetAABB( uniform local struct FlatTree* flat_tree, uniform ushort node_index, varying local struct DFSPrimRefAABB* box ) { // TODO_OPT: Replace this with an optimized reduction which exploits the fact that we only ever have 6 active lanes // Try using the "negated max" trick here to compute min/max simultaneously, with max in top 6 lanes // This will replace 6 reductions with 3 // TODO_OPT: This only utilizes up to 6 SIMD lanes. We can use up to 12 of them by putting // min into even lanes, and -max into odd lanes, and using a manual min-reduction on pairs of lanes struct DFSPrimRefAABB bb = DFSPrimRefAABB_sub_group_reduce( box ); if( get_sub_group_local_id() ) flat_tree->nodes[node_index].AABB = bb; } */ GRL_INLINE void SUBGROUP_FlatTree_CreateInnerNode( uniform local struct FlatTree* flat_tree, uniform ushort flat_tree_root, varying ushort sg_child_bvh2_root, uniform ushort num_children ) { uniform uint lane = get_sub_group_local_id(); // increment counter to allocate new nodes.. 
set required root node fields uniform uint child_base; if ( lane == 0 ) { child_base = atomic_add_local( &flat_tree->num_nodes, num_children ); flat_tree->nodes[flat_tree_root].first_child = (uchar) child_base; flat_tree->nodes[flat_tree_root].num_children = num_children; // initialize mask bits for this node's live children uint child_mask = ((1 << num_children) - 1) << 16; flat_tree->nodes[flat_tree_root].DW0 |= child_mask; } child_base = sub_group_broadcast( child_base, 0 ); // initialize child nodes if ( lane < num_children ) { varying uint child = child_base + lane; flat_tree->nodes[child].DW0 = sg_child_bvh2_root; flat_tree->nodes[child].index_in_parent = lane; flat_tree->nodes[child].parent_index = flat_tree_root; } } GRL_INLINE void SUBGROUP_FlatTree_CreateLeafNode( uniform local struct FlatTree* flat_tree, uniform ushort flat_tree_root, uniform ushort primref_start, uniform ushort num_prims ) { ushort lane = get_sub_group_local_id(); if ( lane < num_prims ) { flat_tree->primref_back_pointers[primref_start + lane] = (uchar) flat_tree_root; if ( lane == 0 ) { flat_tree->nodes[flat_tree_root].first_child = (uchar) primref_start; flat_tree->nodes[flat_tree_root].num_children = (uchar) num_prims; flat_tree->nodes[flat_tree_root].DW0 |= 0x80000000; } } } GRL_INLINE uniform bool SUBGROUP_FlatTree_SignalRefitComplete( uniform local struct FlatTree* flat_tree, uniform ushort* p_node_index ) { uniform ushort node_index = *p_node_index; uniform ushort parent = flat_tree->nodes[node_index].parent_index; uniform ushort index_in_parent = flat_tree->nodes[node_index].index_in_parent; // clear the corresponding mask bit in the parent node uniform uint child_mask = (0x10000 << index_in_parent); uniform uint old_mask_bits = 0; if( get_sub_group_local_id() == 0 ) old_mask_bits = atomic_xor( &flat_tree->nodes[parent].DW0, child_mask ); old_mask_bits = sub_group_broadcast( old_mask_bits, 0 ); // if we cleared the last mask bit, this subgroup proceeds up the tree and refits the next node // otherwise, it looks for something else to do if ( ((old_mask_bits^child_mask) & 0xffff0000) == 0 ) { *p_node_index = parent; return true; } return false; } /* GRL_INLINE local struct DFSPrimRefAABB* FlatTree_GetChildAABB( local struct FlatTree* flat_tree, local struct PrimRefSet* prim_refs, ushort node_index, ushort child_index ) { ushort child_id = FlatTree_GetFirstChild( flat_tree, node_index ) + child_index; if( !FlatTree_IsLeafNode( flat_tree, node_index ) ) return &flat_tree->nodes[child_id].AABB; else return PrimRefSet_GetAABBPointer( prim_refs, child_id ); } */ GRL_INLINE uint FlatTree_GetPrimRefBackPointer( local struct FlatTree* flat_tree, ushort primref_index ) { return flat_tree->primref_back_pointers[primref_index] * sizeof(struct QBVHNodeN) + flat_tree->qnode_byte_offset; } GRL_INLINE void FlatTree_check_boxes(local struct FlatTree* flat_tree, global struct AABB* primref_buffer, local struct AABB3f* boxes, local struct PrimRefMeta* meta ) { barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) == 0) { printf("checking flattree bounds...\n"); for (uint i = 0; i < flat_tree->num_nodes; i++) { struct AABB rb; rb.lower.xyz = AABB3f_load_lower(&boxes[i]); rb.upper.xyz = AABB3f_load_upper(&boxes[i]); uint offs = FlatTree_GetFirstChild( flat_tree, i ); uint count = FlatTree_GetNumChildren( flat_tree, i ); for (uint c = 0; c < count; c++) { struct AABB lb; if (FlatTree_IsLeafNode( flat_tree, i )) { lb = primref_buffer[ PrimRefMeta_GetInputIndex( &meta[offs+c] ) ]; } else { lb.lower.xyz = AABB3f_load_lower(&boxes[ 
offs+c ]); lb.upper.xyz = AABB3f_load_upper(&boxes[ offs+c ]); } if( !AABB_subset( &lb, &rb ) ) printf("Bad bounds!! child %u of %u %f : %f %f : %f %f : %f %f : %f %f : %f %f : %f \n", c, i , rb.lower.x, rb.upper.x, rb.lower.y, rb.upper.y, rb.lower.z, rb.upper.z, lb.lower.x, lb.upper.x, lb.lower.y, lb.upper.y, lb.lower.z, lb.upper.z ); } } } barrier(CLK_LOCAL_MEM_FENCE); } struct FlatTreeScheduler { int num_leafs; uint writeout_produce_count; uint writeout_consume_count; uint active_subgroups; uint num_built_nodes; uint num_levels; // number of depth levels in the tree //uchar leaf_indices[DFS_MAX_FLATTREE_LEAFS]; // indices of leaf FlatTree nodes to be refitted //uchar writeout_indices[DFS_MAX_FLATTREE_NODES]; // indices of flattree nodes to be written out or collapsed ushort level_ordered_nodes[DFS_MAX_FLATTREE_NODES]; // node indices sorted by depth (pre-order, high depth before low depth) ushort level_start[DFS_MAX_FLATTREE_DEPTH]; // first node at given level in the level-ordered node array uint level_count[DFS_MAX_FLATTREE_DEPTH]; // number of nodes at given level }; GRL_INLINE void SUBGROUP_FlatTreeScheduler_Initialize( uniform local struct FlatTreeScheduler* scheduler ) { scheduler->num_built_nodes = 0; scheduler->num_leafs = 0; scheduler->writeout_produce_count = 0; scheduler->writeout_consume_count = 0; scheduler->active_subgroups = DFS_NUM_SUBGROUPS; } /* GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueLeafForRefit( uniform local struct FlatTreeScheduler* scheduler, uniform ushort leaf ) { if ( get_sub_group_local_id() == 0 ) scheduler->leaf_indices[atomic_inc( &scheduler->num_leafs )] = leaf; }*/ GRL_INLINE void SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) { if ( get_sub_group_local_id() == 0 ) atomic_inc_local( &scheduler->num_built_nodes ); } GRL_INLINE uint FlatTreeScheduler_GetNumBuiltNodes( uniform local struct FlatTreeScheduler* scheduler ) { return scheduler->num_built_nodes; } /* GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) { if ( get_sub_group_local_id() == 0 ) scheduler->writeout_indices[atomic_inc( &scheduler->writeout_produce_count )] = node; }*/ /* GRL_INLINE bool SUBGROUP_FlatTreeScheduler_GetRefitTask( uniform local struct FlatTreeScheduler* scheduler, uniform ushort* leaf_idx ) { // schedule the leaves in reverse order to ensure that later leaves // complete before earlier ones.. This prevents contention during the WriteOut stage // // There is a barrier between this function and 'QueueLeafForRefit' so we can safely decrement the same counter // that we incremented earlier varying int idx = 0; if( get_sub_group_local_id() == 0 ) idx = atomic_dec( &scheduler->num_leafs ); sub_group_barrier( CLK_LOCAL_MEM_FENCE ); idx = sub_group_broadcast( idx, 0 ); if ( idx <= 0 ) return false; *leaf_idx = scheduler->leaf_indices[idx-1]; return true; }*/ /* // Signal the scheduler that a subgroup has reached the DONE state. // Return true if this is the last subgroup to be done void SUBGROUP_FlatTreeScheduler_SubGroupDone( local struct FlatTreeScheduler* scheduler ) { if ( get_sub_group_local_id() == 0 ) atomic_dec( &scheduler->active_subgroups ); } */ /* #define STATE_SCHEDULE_REFIT 0x1234 #define STATE_SCHEDULE_WRITEOUT 0x5679 #define STATE_REFIT 0xabcd #define STATE_WRITEOUT 0xefef #define STATE_DONE 0xaabb // Get a flattree node to write out. 
Returns the new scheduler state GRL_INLINE ushort SUBGROUP_FlatTreeScheduler_GetWriteOutTask( uniform local struct FlatTreeScheduler* scheduler, uniform ushort num_nodes, uniform ushort* node_idx ) { uniform ushort return_state = STATE_WRITEOUT; uniform ushort idx = 0; if ( get_sub_group_local_id() == 0 ) { idx = atomic_inc( &scheduler->writeout_consume_count ); if ( idx >= scheduler->writeout_produce_count ) { // more consumers than there are produced tasks.... if ( scheduler->writeout_produce_count == num_nodes ) { // if all nodes have been written out, flattening is done return_state = STATE_DONE; } else { // some writeout tasks remain, and have not been produced by refit threads yet // we need to put this one back atomic_dec( &scheduler->writeout_consume_count ); return_state = STATE_SCHEDULE_WRITEOUT; } } else { // scheduled successfully idx = scheduler->writeout_indices[idx]; } } *node_idx = sub_group_broadcast( idx, 0 ); return sub_group_broadcast( return_state, 0 ); } */ /* GRL_INLINE void FlatTreeScheduler_Printf( local struct FlatTreeScheduler* scheduler ) { barrier( CLK_LOCAL_MEM_FENCE ); if ( get_local_id( 0 ) == 0 ) { printf( "***SCHEDULER***\n" ); printf( "built_nodes=%u active_sgs=%u leafs=%u wo_p=%u wo_c=%u\n", scheduler->num_built_nodes, scheduler->active_subgroups, scheduler->num_leafs, scheduler->writeout_produce_count, scheduler->writeout_consume_count ); printf( "leafs for refit: {" ); int nleaf = max( scheduler->num_leafs, 0 ); for ( uint i = 0; i < nleaf; i++ ) printf( "%u ", scheduler->leaf_indices[i] ); printf( "}\n" ); printf( "writeout queue: %u:%u {", scheduler->writeout_produce_count, scheduler->writeout_consume_count ); for ( uint i = 0; i < scheduler->writeout_produce_count; i++ ) printf( "%u ", scheduler->writeout_indices[i] ); printf( "}\n" ); } barrier( CLK_LOCAL_MEM_FENCE ); } */ GRL_INLINE void SUBGROUP_BuildFlatTreeNode( local struct LocalBVH2* bvh2, local struct FlatTree* flat_tree, local struct FlatTreeScheduler* scheduler, uniform ushort flat_tree_root ) { varying ushort lane = get_sub_group_local_id(); varying ushort bvh2_root = FlatTree_GetBVH2Root( flat_tree, flat_tree_root ); if ( !LocalBVH2_IsInnerNode( bvh2, bvh2_root ) ) { uniform ushort num_prims = LocalBVH2_GetSubtreePrimCount( bvh2, bvh2_root ); uniform ushort primref_start = LocalBVH2_GetLeafPrimStart( bvh2, bvh2_root ); SUBGROUP_FlatTree_CreateLeafNode( flat_tree, flat_tree_root, primref_start, num_prims ); } else { // collapse BVH2 into BVH6. // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough uniform ushort num_children = 2; uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); varying ushort sg_bvh2_node = kids.x; if ( lane == 1 ) sg_bvh2_node = kids.y; do { // choose the inner node with maximum area to replace. // Its left child goes in its old location. Its right child goes in a new lane varying ushort sg_area = LocalBVH2_GetNodeArea( bvh2, sg_bvh2_node ); varying bool sg_is_inner = LocalBVH2_IsInnerNode( bvh2, sg_bvh2_node ); sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf uniform ushort max_area = sub_group_reduce_max( sg_area ); varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; uniform uint mask = intel_sub_group_ballot( sg_reducable ); // TODO_OPT: Some of these ops seem redundant.. look at trimming further // TODO_OPT: sub_group_reduce_max results in too many instructions...... 
//               unroll the loop and specialize it..
//               or ask IGC to give us a version that declares a static maximum number of subgroups to use

            if ( mask == 0 )
                break;

            // choose the inner node with maximum area to replace
            uniform ushort victim_child = ctz( mask );
            uniform ushort victim_node  = sub_group_broadcast( sg_bvh2_node, victim_child );
            uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, victim_node );

            if ( lane == victim_child )
                sg_bvh2_node = kids.x;
            else if ( lane == num_children )
                sg_bvh2_node = kids.y;

            num_children++;

        } while ( num_children < TREE_ARITY );

        SUBGROUP_FlatTree_CreateInnerNode( flat_tree, flat_tree_root, sg_bvh2_node, num_children );
    }
}

GRL_INLINE void SUBGROUP_DFS_BuildFlatTree( uniform local struct LocalBVH2* bvh2,
                                            uniform local struct FlatTree* flat_tree,
                                            uniform local struct FlatTreeScheduler* scheduler )
{
    uniform ushort flat_tree_node_index = get_sub_group_id();
    uniform ushort num_nodes = 1;
    uniform ushort num_built = 0;

    uint tid = get_local_id(0);
    if (tid < DFS_MAX_FLATTREE_DEPTH)
    {
        scheduler->level_start[tid] = DFS_MAX_FLATTREE_NODES;
        scheduler->level_count[tid] = 0;
        scheduler->num_levels = 0;
    }

    LOOP_TRIPWIRE_INIT;

    do
    {
        // process one flat tree node per sub group, as many as are available
        //
        // The first pass will only run one sub-group, the second up to 6, the third up to 36, and so on
        // nodes will be processed in breadth-first order, but they are not guaranteed to be stored in this order
        //  due to use of atomic counters for node allocation
        //
        if ( flat_tree_node_index < num_nodes )
        {
            SUBGROUP_BuildFlatTreeNode( bvh2, flat_tree, scheduler, flat_tree_node_index );
            SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( scheduler, flat_tree_node_index );
            flat_tree_node_index += get_num_sub_groups();
        }

        barrier( CLK_LOCAL_MEM_FENCE );

        // bump up the node count if new nodes were created
        // stop as soon as all flattree nodes have been processed
        num_nodes = FlatTree_GetNodeCount( flat_tree );
        num_built = FlatTreeScheduler_GetNumBuiltNodes( scheduler );

        barrier( CLK_LOCAL_MEM_FENCE );

        LOOP_TRIPWIRE_INCREMENT( 300 );

    } while ( num_built < num_nodes );

    barrier( CLK_LOCAL_MEM_FENCE );

    // determine depth of each node, compute node ranges and counts for each depth level,
    //  and prepare a depth-ordered node index array
    uint depth = 0;
    uint level_pos = 0;
    for( uint i=tid; i < num_nodes; i += get_local_size(0) )
    {
        // walk up the tree to determine this node's depth
        uint n = i;
        while( n != 0 )
        {
            n = FlatTree_GetParentIndex( flat_tree, n );
            depth++;
        }

        // count nodes at each level and record this node's position within its level
        level_pos = atomic_inc_local( &scheduler->level_count[depth] );

        // compute total number of levels
        atomic_max_local( &scheduler->num_levels, depth+1 );
    }

    barrier( CLK_LOCAL_MEM_FENCE );

    for( uint i=tid; i < num_nodes; i += get_local_size(0) )
    {
        // compute the start of this node's level by summing the counts of all shallower levels
        uint level_start = 0;
        for( uint d=0; d < depth; d++ )
            level_start += scheduler->level_count[d];
        scheduler->level_start[depth] = level_start;

        // scatter node indices into level-ordered node array
        scheduler->level_ordered_nodes[level_start + level_pos] = i;
    }

    barrier( CLK_LOCAL_MEM_FENCE );
}

/*
GRL_INLINE bool SUBGROUP_RefitNode( uniform local struct FlatTree* flat_tree,
                                    uniform local struct PrimRefSet* prim_refs,
                                    uniform ushort* p_node_index )
{
    // fetch and reduce child AABBs across the subgroup
    uniform ushort node_index = *p_node_index;
    uniform ushort num_kids   = FlatTree_GetNumChildren( flat_tree, node_index );

    varying ushort sg_child_index = (get_sub_group_local_id() < num_kids) ? get_sub_group_local_id() : 0;
    varying local struct DFSPrimRefAABB* box = FlatTree_GetChildAABB( flat_tree, prim_refs, node_index, sg_child_index );

    SUBGROUP_FlatTree_ReduceAndSetAABB( flat_tree, node_index, box );

    if ( node_index == 0 )
        return false; // if we just refitted the root, we can stop now

    // signal the parent node that this node was refitted.
If this was the last child to be refitted // returns true and sets 'node_index' to the parent node, so that this thread can continue refitting return SUBGROUP_FlatTree_SignalRefitComplete( flat_tree, p_node_index ); }*/ GRL_INLINE struct QBVHNodeN* qnode_ptr( BVHBase* bvh_mem, uint byte_offset ) { return (struct QBVHNodeN*)(((char*)bvh_mem) + byte_offset); } GRL_INLINE void SUBGROUP_WriteQBVHNode( uniform local struct FlatTree* flat_tree, uniform local struct PrimRefMeta* primref_meta, uniform local struct AABB3f* boxes, uniform ushort flat_tree_root, uniform struct DFSArgs args, uniform local uchar* masks ) { uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); varying ushort lane = get_sub_group_local_id(); varying ushort sg_child_index = (lane < num_children) ? lane : 0; uniform ushort child_base = FlatTree_GetFirstChild( flat_tree, flat_tree_root ); varying struct AABB sg_box4; if (FlatTree_IsLeafNode( flat_tree, flat_tree_root )) { // fetch AABBs for primrefs sg_box4 = args.primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_base + sg_child_index] ) ]; } else { // fetch AABBs for child nodes sg_box4.lower.xyz = AABB3f_load_lower( &boxes[child_base+sg_child_index] ); sg_box4.upper.xyz = AABB3f_load_upper( &boxes[child_base+sg_child_index] ); } struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); uniform int offset; uniform uint child_type; if ( is_leaf ) { char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; offset = (int)(leaf_mem - (char*)qnode); child_type = args.leaf_node_type; } else { struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); offset = (int) ((char*)kid - (char*)qnode); child_type = args.inner_node_type; } offset = offset >> 6; if (child_type == NODE_TYPE_INSTANCE) { uint instanceMask = PrimRefMeta_GetInstanceMask( &primref_meta[child_base + sg_child_index] ); subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? instanceMask : 0 ); } else { uint mask = BVH_NODE_DEFAULT_MASK; if( args.need_masks ) mask = masks[flat_tree_root]; subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode, mask ); } if ( args.need_backpointers ) { global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); back_pointers[idx] = bp; } /* // TODO_OPT: Eventually this section should also handle leaf splitting due to mixed primref types // For now this is done by the leaf creation pipeline, but that path should probably be refactored // such that all inner node creation is done in one place uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); varying ushort lane = get_sub_group_local_id(); varying ushort sg_child_index = (lane < num_children) ? 
lane : 0; varying local struct DFSPrimRefAABB* sg_box = FlatTree_GetChildAABB( flat_tree, prim_refs, flat_tree_root, sg_child_index ); varying struct AABB sg_box4 = PrimRefSet_ConvertAABB( prim_refs, sg_box ); struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); uniform int offset; uniform uint child_type; if ( is_leaf ) { char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; offset = (int)(leaf_mem - (char*)qnode); child_type = args.leaf_node_type; } else { struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); offset = (int) ((char*)kid - (char*)qnode); child_type = args.inner_node_type; } offset = offset >> 6; if (child_type == NODE_TYPE_INSTANCE) { uint instanceMask = PrimRefSet_GetInstanceMask( prim_refs, FlatTree_GetPrimRefStart(flat_tree, flat_tree_root) + lane ); subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? instanceMask : 0 ); } else subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode ); if ( args.need_backpointers ) { global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); back_pointers[idx] = bp; } */ } /* GRL_INLINE void SUBGROUP_DFS_RefitAndWriteOutFlatTree( uniform local struct FlatTree* flat_tree, uniform local struct PrimRefSet* prim_refs, uniform local struct FlatTreeScheduler* scheduler, uniform struct DFSArgs args) { uniform ushort state = STATE_SCHEDULE_REFIT; uniform ushort node_index = 0; uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); { LOOP_TRIPWIRE_INIT; bool active = true; bool continue_refit = false; while (1) { if (active) { if (continue_refit || SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) { continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); } else { active = false; if (get_sub_group_local_id() == 0) atomic_dec(&scheduler->active_subgroups); sub_group_barrier(CLK_LOCAL_MEM_FENCE); } } barrier(CLK_LOCAL_MEM_FENCE); // finish all atomics if (scheduler->active_subgroups == 0) break; barrier(CLK_LOCAL_MEM_FENCE); // finish all checks.. prevent race between thread which loops around and thread which doesn't LOOP_TRIPWIRE_INCREMENT(200); } } for (uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups()) SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, i, args); barrier(CLK_LOCAL_MEM_FENCE); // JDB: Version below attempts to interleave refit and qnode write-out // This could theoretically reduce thread idle time, but it is more complex and does more atomics for scheduling #if 0 // after we've constructed the flat tree (phase 1), there are two things that need to happen: // PHASE 2: Refit the flat tree, computing all of the node ABBs // PHASE 3: Write the nodes out to memory // // all of this is sub-group centric. Different subgroups can execute phases 2 and 3 concurrently // // TODO_OPT: The scheduling algorithm might need to be re-thought. // Fused EUs are very hard to reason about. It's possible that by scheduling independent // SGs in this way we would lose a lot of performance due to fused EU serialization. 
// Needs to be tested experimentally if such a thing is possible uniform ushort state = STATE_SCHEDULE_REFIT; uniform ushort node_index = 0; uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); LOOP_TRIPWIRE_INIT; do { // barrier necessary to protect access to scheduler->active_subgroups barrier(CLK_LOCAL_MEM_FENCE); if (state == STATE_SCHEDULE_REFIT) { if (SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) state = STATE_REFIT; else state = STATE_SCHEDULE_WRITEOUT; // fallthrough } if (state == STATE_SCHEDULE_WRITEOUT) { state = SUBGROUP_FlatTreeScheduler_GetWriteOutTask(scheduler, num_nodes, &node_index); if (state == STATE_DONE) SUBGROUP_FlatTreeScheduler_SubGroupDone(scheduler); } // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' // Note that in theory we could have the write-out tasks spin until the refit tasks clear, which would make this barrier unnecessary // However, we cannot do this safely on SKUs which do not support independent subgroup forward progress. barrier(CLK_LOCAL_MEM_FENCE); if (state == STATE_REFIT) { uniform ushort prev_node = node_index; uniform bool continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut(scheduler, prev_node); if (!continue_refit) state = STATE_SCHEDULE_REFIT; } else if (state == STATE_WRITEOUT) { SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, node_index, args); state = STATE_SCHEDULE_WRITEOUT; } // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' barrier(CLK_LOCAL_MEM_FENCE); LOOP_TRIPWIRE_INCREMENT(200); } while (scheduler->active_subgroups > 0); #endif } */ GRL_INLINE void DFS_CreatePrimRefSet( struct DFSArgs args, local struct PrimRefSet* prim_refs ) { ushort id = get_local_id( 0 ); ushort num_primrefs = args.num_primrefs; PrimRef ref; struct AABB3f local_aabb; if ( id < num_primrefs ) { ref = args.primref_buffer[id]; AABB3f_set_lower( &local_aabb, ref.lower.xyz ); AABB3f_set_upper( &local_aabb, ref.upper.xyz ); } else { AABB3f_init( &local_aabb ); } AABB3f_atomic_merge_localBB_nocheck( &prim_refs->root_aabb, &local_aabb ); barrier( CLK_LOCAL_MEM_FENCE ); if ( id < num_primrefs ) PrimRefSet_SetPrimRef_FullPrecision( prim_refs, ref, id ); } struct BVHBuildLocals { float Al[DFS_WG_SIZE]; float Ar[DFS_WG_SIZE]; uchar2 axis_and_left_count[ DFS_WG_SIZE ]; uint sah[DFS_WG_SIZE]; uint num_active_threads; }; GRL_INLINE void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, local struct PrimRefSet* prim_refs, ushort num_prims, local struct BVHBuildLocals* locals ) { ushort tid = get_local_id( 0 ); ushort bvh2_root = 0; ushort prim_range_start = 0; ushort primref_position = tid; bool active_thread = tid < num_prims; float root_area = PrimRefSet_GetMaxAABBArea( prim_refs ); float area_scale = DFS_BVH2_AREA_QUANT / root_area; locals->num_active_threads = num_prims; barrier( CLK_LOCAL_MEM_FENCE ); LOOP_TRIPWIRE_INIT; do { if(active_thread && prim_range_start == primref_position) locals->sah[primref_position] = UINT_MAX; if ( active_thread ) { local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); // each thread evaluates a possible split candidate. 
Scan primrefs and compute sah cost // do this axis-by-axis to keep register pressure low float best_sah = INFINITY; ushort best_axis = 3; ushort best_count = 0; float best_al = INFINITY; float best_ar = INFINITY; struct DFSPrimRefAABB box_left[3]; struct DFSPrimRefAABB box_right[3]; float CSplit[3]; ushort count_left[3]; for ( ushort axis = 0; axis < 3; axis++ ) { DFSPrimRefAABB_init( &box_left[axis] ); DFSPrimRefAABB_init( &box_right[axis] ); CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; count_left[axis] = 0; } // scan primrefs in our subtree and partition using this thread's prim as a split plane { struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); for ( ushort p = 1; p < num_prims; p++ ) { struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration for( ushort axis = 0; axis < 3; axis++ ) { float c = box.lower[axis] + box.upper[axis]; if ( c < CSplit[axis] ) { // this primitive is to our left. DFSPrimRefAABB_extend( &box_left[axis], &box ); count_left[axis]++; } else { // this primitive is to our right DFSPrimRefAABB_extend( &box_right[axis], &box ); } } box = next_box; } // last iteration without preloading box for( ushort axis = 0; axis < 3; axis++ ) { float c = box.lower[axis] + box.upper[axis]; if ( c < CSplit[axis] ) { // this primitive is to our left. DFSPrimRefAABB_extend( &box_left[axis], &box ); count_left[axis]++; } else { // this primitive is to our right DFSPrimRefAABB_extend( &box_right[axis], &box ); } } } for ( ushort axis = 0; axis < 3; axis++ ) { float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); // Avoid NANs in SAH calculation in the corner case where all prims go right // In this case we set Al=Ar, because such a split will only be selected if all primrefs // are co-incident.. 
In that case, we will fall back to split-in-the-middle and both subtrees // should store the same quantized area value if ( count_left[axis] == 0 ) Al = Ar; // compute sah cost ushort count_right = num_prims - count_left[axis]; float sah = Ar * count_right + Al * count_left[axis]; // keep this split if it is better than the previous one, or if the previous one was a corner-case if ( sah < best_sah || best_count == 0 ) { // yes, keep it best_axis = axis; best_sah = sah; best_count = count_left[axis]; best_al = Al; best_ar = Ar; } } // write split information to SLM locals->Al[primref_position] = best_al; locals->Ar[primref_position] = best_ar; locals->axis_and_left_count[primref_position].x = best_axis; locals->axis_and_left_count[primref_position].y = best_count; uint sah = as_uint(best_sah); // break ties by axis to ensure deterministic split selection // otherwise builder can produce non-deterministic tree structure run to run // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) // Embed split axis and index into sah value; compute min over sah and max over axis sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | primref_position ); // reduce on split candidates in our local subtree and decide the best one atomic_min_local( &locals->sah[ prim_range_start ], sah); } barrier( CLK_LOCAL_MEM_FENCE ); ushort split_index = locals->sah[ prim_range_start ] & 255; ushort split_axis = locals->axis_and_left_count[split_index].x; ushort split_left_count = locals->axis_and_left_count[split_index].y; float split_al = locals->Al[split_index]; float split_ar = locals->Ar[split_index]; if ( (primref_position == prim_range_start) && active_thread ) { // first thread in a given subtree creates the inner node ushort quantized_left_area = convert_ushort_rtn( split_al * area_scale ); ushort quantized_right_area = convert_ushort_rtn( split_ar * area_scale ); ushort start_left = prim_range_start; ushort start_right = prim_range_start + split_left_count; if ( split_left_count == 0 ) start_right = start_left + (num_prims / 2); // handle split-in-the-middle case LocalBVH2_CreateInnerNode( bvh2, bvh2_root, start_left, start_right, quantized_left_area, quantized_right_area ); } barrier( CLK_LOCAL_MEM_FENCE ); struct DFSPrimRef ref; ushort new_primref_position; if ( active_thread ) { ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); bool go_left; if ( split_left_count == 0 ) { // We chose a split with no left-side prims // This will only happen if all primrefs are located in the exact same position // In that case, fall back to split-in-the-middle split_left_count = (num_prims / 2); go_left = (primref_position - prim_range_start < split_left_count); } else { // determine what side of the split this thread's primref belongs on local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); float c = my_box->lower[split_axis] + my_box->upper[split_axis]; float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; go_left = c < Csplit; } // adjust state variables for next loop iteration bvh2_root = (go_left) ? kids.x : kids.y; num_prims = (go_left) ? split_left_count : (num_prims - split_left_count); prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; // determine the new primref position by incrementing a counter in the destination subtree new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); // load our primref from its previous position ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); } barrier( CLK_LOCAL_MEM_FENCE ); if ( active_thread ) { // write our primref into its sorted position PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); primref_position = new_primref_position; // deactivate all threads whose subtrees are small enough to form a leaf if ( num_prims <= TREE_ARITY ) { active_thread = false; atomic_dec_local( &locals->num_active_threads ); } } barrier( CLK_LOCAL_MEM_FENCE ); LOOP_TRIPWIRE_INCREMENT( 50 ); } while ( locals->num_active_threads > 0 ); } // fast path for #prims <= TREE_ARITY GRL_INLINE void Trivial_DFS( struct DFSArgs args ) { ushort tid = get_local_id( 0 ); PrimRef myRef; AABB_init( &myRef ); if( tid < args.num_primrefs ) myRef = args.primref_buffer[tid]; uint node_offset; if ( tid == 0 ) node_offset = 64*allocate_inner_nodes( args.bvh_base, 1 ); node_offset = sub_group_broadcast(node_offset,0); char* bvh_mem = (char*) args.bvh_base; struct QBVHNodeN* qnode = (struct QBVHNodeN*) (bvh_mem + node_offset); uint child_type = args.leaf_node_type; uint prim_base = args.bvh_base->quadLeafStart*64 ; char* leaf_mem = bvh_mem + prim_base; int offset = (int)( leaf_mem - (char*)qnode ); if (child_type == NODE_TYPE_INSTANCE) { subgroup_setInstanceQBVHNodeN( offset >> 6, &myRef, args.num_primrefs, qnode, tid < args.num_primrefs ? PRIMREF_instanceMask(&myRef) : 0 ); } else subgroup_setQBVHNodeN( offset >> 6, child_type, &myRef, args.num_primrefs, qnode, BVH_NODE_DEFAULT_MASK ); if ( tid < args.num_primrefs ) { global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs; uint bp = node_offset; // TODO_OPT: Leaf creation pipeline can be made simpler by having a sideband buffer containing // fatleaf index + position in fatleaf for each primref, instead of forcing leaf creation shader to reconstruct it // should also probably do the fat-leaf splitting here args.primref_buffer[tid] = myRef; args.primref_index_buffer[tid] = tid; primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN); if ( tid == 0 && args.need_backpointers ) { uint bp = ((uint)-1) << 6; bp |= (args.num_primrefs) << 3; *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) = bp; } } } void SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( uniform local struct FlatTree* flat_tree, uniform local struct FlatTreeScheduler* flat_scheduler, uniform local struct AABB3f* boxes, uniform local struct PrimRefMeta* primref_meta, uniform global struct AABB* primref_buffer, uniform local uchar* masks, bool need_masks ) { uniform int num_levels = (int) flat_scheduler->num_levels; varying ushort lane = get_sub_group_local_id(); // iterate over depth levels in the tree... 
//  deepest to shallowest
    for (uniform int level = num_levels - 1; level >= 0; level--)
    {
        // loop over a range of flattree nodes at this level, one node per sub-group
        // TODO_OPT: Try and enable this code to process two nodes in a SIMD16 subgroup
        uniform ushort level_start      = flat_scheduler->level_start[level];
        uniform ushort level_node_count = flat_scheduler->level_count[level];

        for (uniform ushort i = get_sub_group_id(); i < level_node_count; i += get_num_sub_groups())
        {
            uniform ushort node_index = flat_scheduler->level_ordered_nodes[ level_start + i ];

            varying struct AABB box;
            AABB_init(&box);

            uniform uint child_base   = FlatTree_GetFirstChild( flat_tree, node_index );
            uniform uint num_children = FlatTree_GetNumChildren( flat_tree, node_index );
            varying uint child_index  = child_base + ((lane < num_children) ? lane : 0);

            if (FlatTree_IsLeafNode( flat_tree, node_index ))
            {
                // children are primrefs.. fetch their full-precision AABBs
                if (lane < num_children)
                    box = primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_index] ) ];
            }
            else
            {
                // children are inner nodes whose boxes were produced at the previous (deeper) level
                if (lane < num_children)
                {
                    box.lower.xyz = AABB3f_load_lower( &boxes[child_index] );
                    box.upper.xyz = AABB3f_load_upper( &boxes[child_index] );
                }
            }

            // min/max reduce the child boxes across the subgroup and store this node's box
            struct AABB node_box;
            node_box.lower.x = sub_group_reduce_min( box.lower.x );
            node_box.lower.y = sub_group_reduce_min( box.lower.y );
            node_box.lower.z = sub_group_reduce_min( box.lower.z );
            node_box.upper.x = sub_group_reduce_max( box.upper.x );
            node_box.upper.y = sub_group_reduce_max( box.upper.y );
            node_box.upper.z = sub_group_reduce_max( box.upper.z );

            if (lane == 0)
            {
                AABB3f_set_lower( &boxes[node_index], node_box.lower.xyz );
                AABB3f_set_upper( &boxes[node_index], node_box.upper.xyz );

                // accumulate instance masks (at most TREE_ARITY children, so do it serially)
                if (need_masks)
                {
                    uchar m = 0;
                    for (uint c = 0; c < num_children; c++)
                    {
                        if (FlatTree_IsLeafNode( flat_tree, node_index ))
                            m |= (uchar) PrimRefMeta_GetInstanceMask( &primref_meta[child_base + c] );
                        else
                            m |= masks[child_base + c];
                    }
                    masks[node_index] = m;
                }
            }
        }

        // make this level's boxes and masks visible before the next (shallower) level reads them
        barrier( CLK_LOCAL_MEM_FENCE );
    }
}

GRL_INLINE void SUBGROUP_DFS_WriteNodes( uniform local struct FlatTree* flat_tree,
                                         uniform local struct AABB3f* boxes,
                                         uniform local struct PrimRefMeta* primref_meta,
                                         uniform struct DFSArgs args,
                                         uniform local uchar* masks )
{
    // write out one QBVH6 node per sub-group
    for (uniform ushort i = get_sub_group_id(); i < FlatTree_GetNodeCount( flat_tree ); i += get_num_sub_groups())
        SUBGROUP_WriteQBVHNode( flat_tree, primref_meta, boxes, i, args, masks );
}

struct Single_WG_build_SLM
{
    union
    {
        struct
        {
            // used while building the BVH2 over the quantized primrefs
            struct PrimRefSet     prim_refs;
            struct LocalBVH2      bvh2;
            struct BVHBuildLocals bvh2_locals;
        } s1;
        struct
        {
            // used while refitting and writing out the flattened QBVH6
            struct AABB3f boxes[DFS_MAX_FLATTREE_NODES];
            uchar         masks[DFS_MAX_FLATTREE_NODES];
        } s2;
    } u;

    struct FlatTree          flat_tree;
    struct FlatTreeScheduler flat_scheduler;
    struct PrimRefMeta       primitive_meta[DFS_WG_SIZE];
};

GRL_INLINE void execute_single_WG_build( struct DFSArgs args,
                                         local struct Single_WG_build_SLM* slm )
{
    uint tid = get_local_id( 0 );

    // initialize the SLM structures, spread across different sub-groups
    if ( get_sub_group_id() == 0 )
        SUBGROUP_FlatTree_Initialize( &slm->flat_tree, args );
    else if ( get_sub_group_id() == 2 )
        SUBGROUP_LocalBVH2_Initialize( &slm->u.s1.bvh2, args.num_primrefs );
    else if ( get_sub_group_id() == 4 )
        SUBGROUP_FlatTreeScheduler_Initialize( &slm->flat_scheduler );
    else if ( get_sub_group_id() == 6 )
        SUBGROUP_PrimRefSet_Initialize( &slm->u.s1.prim_refs );

    barrier( CLK_LOCAL_MEM_FENCE );

    // load the PrimRefs
    DFS_CreatePrimRefSet( args, &slm->u.s1.prim_refs );

    // build the BVH2
    DFS_ConstructBVH2( &slm->u.s1.bvh2, &slm->u.s1.prim_refs, args.num_primrefs, &slm->u.s1.bvh2_locals );

    // copy out metadata for primrefs now that they have been sorted
    if( tid < args.num_primrefs )
    {
        slm->primitive_meta[tid] = PrimRefSet_GetMeta( &slm->u.s1.prim_refs, tid );
    }

    barrier( CLK_LOCAL_MEM_FENCE );

    // collapse into a FlatTree
    SUBGROUP_DFS_BuildFlatTree( &slm->u.s1.bvh2, &slm->flat_tree, &slm->flat_scheduler );

    // allocate output QBVH6 nodes
    if ( get_local_id( 0 ) == 0 )
        FlatTree_AllocateQNodes( &slm->flat_tree, args );

    barrier( CLK_LOCAL_MEM_FENCE );

    SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( &slm->flat_tree, &slm->flat_scheduler, &slm->u.s2.boxes[0], slm->primitive_meta,
                                               args.primref_buffer, slm->u.s2.masks, args.need_masks );

    //FlatTree_Printf( &slm->flat_tree );
    //FlatTree_check_boxes ( &slm->flat_tree, args.primref_buffer, &slm->u.s2.boxes[0], slm->primitive_meta );

    SUBGROUP_DFS_WriteNodes( &slm->flat_tree, &slm->u.s2.boxes[0], slm->primitive_meta, args, slm->u.s2.masks );

    // generate sorted primref index buffer and backpointers to feed the leaf creation pipeline
    if ( tid < args.num_primrefs )
    {
        uint input_index = PrimRefMeta_GetInputIndex(&slm->primitive_meta[tid]);
        uint bp = FlatTree_GetPrimRefBackPointer( &slm->flat_tree, tid );

        global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs;

        args.primref_index_buffer[tid] = input_index;
        primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN);

        if ( tid == 0 && args.need_backpointers )
        {
            *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) |= ((uint)-1) << 6;
        }
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) )
__attribute__( (intel_reqd_sub_group_size( 16 )) )
kernel void DFS( global struct Globals* globals,
                 global char* bvh_mem,
                 global PrimRef* primref_buffer,
                 global uint* primref_index_buffer,
                 uint alloc_backpointers )
{
    struct DFSArgs args;
    args.bvh_base             = (global struct BVHBase*) bvh_mem;
    args.leaf_node_type       = globals->leafPrimType;
    args.inner_node_type      = NODE_TYPE_INTERNAL;
    args.leaf_size_in_bytes   = globals->leafSize;
    args.primref_buffer       = primref_buffer;
    args.need_backpointers    = alloc_backpointers != 0;
    args.num_primrefs         = globals->numPrimitives;
    args.primref_index_buffer = primref_index_buffer;
    args.need_masks =
args.leaf_node_type == NODE_TYPE_INSTANCE; if ( args.num_primrefs <= TREE_ARITY ) { // TODO_OPT: This decision should be made using indirect dispatch if( get_sub_group_id() == 0 ) Trivial_DFS( args ); return; } local struct Single_WG_build_SLM slm; execute_single_WG_build( args, &slm ); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) ) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void DFS_single_wg( global struct Globals* globals, global char* bvh_mem, global PrimRef* primref_buffer, global uint* primref_index_buffer, uint sah_flags ) { struct DFSArgs args; args.bvh_base = (global struct BVHBase*) bvh_mem; args.leaf_node_type = globals->leafPrimType; args.inner_node_type = NODE_TYPE_INTERNAL; args.leaf_size_in_bytes = globals->leafSize; args.primref_buffer = primref_buffer; args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; args.num_primrefs = globals->numPrimitives; args.primref_index_buffer = primref_index_buffer; args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; local struct Single_WG_build_SLM slm; execute_single_WG_build( args, &slm ); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( 16, 1, 1 )) ) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void DFS_trivial( global struct Globals* globals, global char* bvh_mem, global PrimRef* primref_buffer, global uint* primref_index_buffer, uint sah_flags ) { struct DFSArgs args; args.bvh_base = (global struct BVHBase*) bvh_mem; args.leaf_node_type = globals->leafPrimType; args.inner_node_type = NODE_TYPE_INTERNAL; args.leaf_size_in_bytes = globals->leafSize; args.primref_buffer = primref_buffer; args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; args.num_primrefs = globals->numPrimitives; args.primref_index_buffer = primref_index_buffer; args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; Trivial_DFS( args ); } struct DFSArgs dfs_args_from_sah_globals( global struct SAHBuildGlobals* sah_globals ) { struct DFSArgs args; args.bvh_base = (global struct BVHBase*) sah_globals->p_bvh_base; args.leaf_node_type = sah_globals->leaf_type; args.inner_node_type = NODE_TYPE_INTERNAL; args.leaf_size_in_bytes = sah_globals->leaf_size; args.primref_buffer = (global PrimRef*) sah_globals->p_primrefs_buffer; args.need_backpointers = sah_globals->flags & SAH_FLAG_NEED_BACKPOINTERS; args.num_primrefs = sah_globals->num_primrefs; args.primref_index_buffer = (global uint*) sah_globals->p_primref_index_buffers; args.need_masks = sah_globals->flags & SAH_FLAG_NEED_MASKS; return args; } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(DFS_WG_SIZE, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void DFS_single_wg_batchable( global struct SAHBuildGlobals* globals_buffer, global struct VContextScheduler* scheduler ) { global struct SAHBuildGlobals* sah_globals = globals_buffer + scheduler->num_trivial_builds + get_group_id(0); struct DFSArgs args = dfs_args_from_sah_globals( sah_globals ); local struct Single_WG_build_SLM slm; execute_single_WG_build(args, &slm); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(16, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void DFS_trivial_batchable( global struct SAHBuildGlobals* globals_buffer ) { global struct SAHBuildGlobals* sah_globals = globals_buffer + get_group_id(0); struct DFSArgs args = dfs_args_from_sah_globals(sah_globals); Trivial_DFS(args); }
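
// ====================================================================================
// Illustrative sketches.  The helpers below are documentation-only: they are not part
// of the original builder and are not referenced by any kernel above.  Names prefixed
// with 'example_' are introduced here purely for illustration.
// ====================================================================================

// Sketch of the arithmetic behind the FlatTree sizing constants near the top of this file.
// 'estimate_qbvh6_nodes' is defined elsewhere in GRL; this only restates the bound its
// result implies for a 256-primitive workgroup:
//   - at most one fat-leaf per primitive                            --> 256 leaves
//   - inner nodes over L leaves of an M-ary tree: ceil((L-1)/(M-1)) = ceil(255/5) = 51
//     (the "51 = ceil(256/5)" comment above is shorthand for this)
//   - total: 256 + 51 = 307  (DFS_MAX_FLATTREE_NODES)
GRL_INLINE uint example_max_flattree_nodes( uint num_prims, uint arity )
{
    uint max_leaves = num_prims;                                     // worst case: one primitive per fat-leaf
    uint max_inner  = (max_leaves - 1 + (arity - 2)) / (arity - 1);  // ceil((L-1)/(M-1))
    return max_leaves + max_inner;                                   // == 307 for (256,6)
}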
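
// Sketch of how the LocalBVH2 node bitfield documented above is packed.  The builder
// composes these fields inline (LocalBVH2_CreateInnerNode / LocalBVH2_IncrementPrimCount);
// this helper only restates the layout that the accessors decode:
//   bits  8:0  -> primref count of the subtree
//   bits 17:9  -> child-pair offset (inner node) or first primref (leaf)
//   bits 30:18 -> quantized AABB area
//   bit  31    -> is_inner flag
GRL_INLINE uint example_LocalBVH2_pack_node( uint prim_count, uint offset_or_start, uint quantized_area, bool is_inner )
{
    return (prim_count & 0x1ff)
         | ((offset_or_start & 0x1ff) << 9)
         | ((quantized_area & 0x1fff) << 18)
         | (is_inner ? 0x80000000u : 0);
}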
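
// Sketch of the 13-bit area quantization that feeds the field above.  Areas are expressed
// relative to the largest (root) box area, so the root maps to DFS_BVH2_AREA_QUANT
// (8191 == 2^13 - 1) and every other box to a smaller value that fits bits 30:18.
// This mirrors the scale computed in DFS_ConstructBVH2.
GRL_INLINE ushort example_quantize_area( float area, float root_area )
{
    float area_scale = DFS_BVH2_AREA_QUANT / root_area;   // same scale as DFS_ConstructBVH2
    return convert_ushort_rtn( area * area_scale );       // 0..8191
}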
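
// Sketch of the backpointer encoding produced by FlatTree_BuildBackPointer and Trivial_DFS:
// bits 31:6 hold the parent QBVH node index (the root's parent field is set to all ones),
// bits 5:3 hold the child count, and bits 2:0 are left zero here.
GRL_INLINE uint example_backpointer_parent( uint bp )       { return bp >> 6; }
GRL_INLINE uint example_backpointer_num_children( uint bp ) { return (bp >> 3) & 0x7; }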
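
// Sketch of the split-candidate key used by DFS_ConstructBVH2's atomic_min reduction.
// The float SAH cost is reinterpreted as uint and its low 10 bits are replaced with a
// tie-break: bits 9:8 hold (2 - axis) and bits 7:0 the candidate's primref position.
// Because the costs are non-negative, their bit patterns order the same way as the floats,
// so atomic_min_local on the packed keys selects the lowest cost, and equal costs resolve
// deterministically; the winner's index is recovered with '& 255'.
GRL_INLINE uint example_pack_split_key( float sah_cost, uint axis, uint primref_position )
{
    return ( as_uint( sah_cost ) & ~1023u ) | ((2 - axis) << 8) | primref_position;
}
GRL_INLINE uint example_split_key_position( uint key ) { return key & 255; }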
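
// Sketch of the refit-completion mask kept in the upper half of FlatTreeInnerNode::DW0,
// used by the (currently disabled) SLM refit path.  SUBGROUP_FlatTree_CreateInnerNode seeds
// bits 16..(16+num_children-1); each child clears its bit with an atomic XOR in
// SUBGROUP_FlatTree_SignalRefitComplete, and the subgroup that clears the last bit is the
// one that continues refitting upward.
GRL_INLINE bool example_refit_last_child( uint old_DW0_bits, uint index_in_parent )
{
    uint child_bit = 0x10000u << index_in_parent;                 // same bit as SignalRefitComplete
    return ((old_DW0_bits ^ child_bit) & 0xffff0000u) == 0;       // true -> all siblings done
}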