// // Copyright (C) 2009-2021 Intel Corporation // // SPDX-License-Identifier: MIT // // #include "binned_sah_shared.h" #include "libs/lsc_intrinsics.h" #include "intrinsics.h" #include "AABB.h" #include "AABB3f.h" #include "qbvh6.h" #include "common.h" #include "libs/lsc_intrinsics.h" #define SGPRINT_16x(prefix,fmt,type,val) {\ type v0 = sub_group_broadcast( val, 0 );\ type v1 = sub_group_broadcast( val, 1 );\ type v2 = sub_group_broadcast( val, 2 );\ type v3 = sub_group_broadcast( val, 3 );\ type v4 = sub_group_broadcast( val, 4 );\ type v5 = sub_group_broadcast( val, 5 );\ type v6 = sub_group_broadcast( val, 6 );\ type v7 = sub_group_broadcast( val, 7 );\ type v8 = sub_group_broadcast( val, 8 );\ type v9 = sub_group_broadcast( val, 9 );\ type v10 = sub_group_broadcast( val, 10 );\ type v11 = sub_group_broadcast( val, 11 );\ type v12 = sub_group_broadcast( val, 12 );\ type v13 = sub_group_broadcast( val, 13 );\ type v14 = sub_group_broadcast( val, 14 );\ type v15 = sub_group_broadcast( val, 15 );\ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ if( get_sub_group_local_id() == 0 ) { \ printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \ fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \ v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}} #define SGPRINT_6x(prefix,fmt,type,val) {\ type v0 = sub_group_broadcast( val, 0 );\ type v1 = sub_group_broadcast( val, 1 );\ type v2 = sub_group_broadcast( val, 2 );\ type v3 = sub_group_broadcast( val, 3 );\ type v4 = sub_group_broadcast( val, 4 );\ type v5 = sub_group_broadcast( val, 5 );\ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ if( get_sub_group_local_id() == 0 ) { \ printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \ v0,v1,v2,v3,v4,v5);}} #define BFS_WG_SIZE 512 #define BFS_NUM_VCONTEXTS 256 // must be multiple of 64 #define TREE_ARITY 6 #define DFS_WG_SIZE 256 #define DFS_THRESHOLD 256 void BFSDispatchQueue_print(struct BFSDispatchQueue* q, uint n) { for (uint i = 0; i < q->num_dispatches; i++) printf(" %u,ctx=%u,batch=%u\n", q->wg_count[i], q->records[i].context_id, q->records[i].batch_index); } void VContextScheduler_print(struct VContextScheduler* scheduler) { if (get_local_id(0) == 0) { printf("SCHEDULER:\n"); printf(" bfs=%u dfs=%u\n", scheduler->num_bfs_wgs, scheduler->num_dfs_wgs); printf("BFS QUEUE:\n"); BFSDispatchQueue_print(&scheduler->bfs_queue, scheduler->num_bfs_wgs); printf("DFS QUEUE\n"); for (uint i = 0; i < scheduler->num_dfs_wgs; i++) { struct DFSDispatchRecord* r = &scheduler->dfs_queue.records[i]; printf(" (%u-%u) root=%u depth=%u batch_index=%u\n", r->primref_base, r->primref_base + r->num_primrefs, r->bvh2_base, r->tree_depth, r->batch_index); } printf("CONTEXTS:\n"); for (uint i = 0; i < BFS_NUM_VCONTEXTS; i++) { if (scheduler->vcontext_state[i] != VCONTEXT_STATE_UNALLOCATED) { printf(" context: %u state=%u\n", i, scheduler->vcontext_state[i]); printf(" prims: %u-%u\n", scheduler->contexts[i].dispatch_primref_begin, scheduler->contexts[i].dispatch_primref_end); printf(" depth: %u\n", scheduler->contexts[i].tree_depth); printf(" root: %u\n", scheduler->contexts[i].bvh2_root); printf(" batch: %u\n", scheduler->contexts[i].batch_index); } } } } inline float3 select_min(float3 v, bool mask) { return (float3)(mask ? v.x : (float)(INFINITY), mask ? v.y : (float)(INFINITY), mask ? v.z : (float)(INFINITY)); } inline float3 select_max(float3 v, bool mask) { return (float3)(mask ? v.x : -(float)(INFINITY), mask ? v.y : -(float)(INFINITY), mask ? 
v.z : -(float)(INFINITY)); }

///////////////////////////////////////////////////////////////////////////

// The 'LRBounds' structure stores negated max values, so that both the min and the max
//  merges can be issued as atomic_min and fused into one message

struct AABB3f LRBounds_get_left_centroid( LRBounds* b )
{
    struct AABB3f* pbox = &b->boxes.left_centroid_bounds;
    return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
}

struct AABB3f LRBounds_get_right_centroid( LRBounds* b )
{
    struct AABB3f* pbox = &b->boxes.right_centroid_bounds;
    return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
}

struct AABB3f LRBounds_get_left_geom( LRBounds* b )
{
    struct AABB3f* pbox = &b->boxes.left_geom_bounds;
    return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
}

struct AABB3f LRBounds_get_right_geom( LRBounds* b )
{
    struct AABB3f* pbox = &b->boxes.right_geom_bounds;
    return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
}

void LRBounds_merge_left( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax )
{
    // All of the input vectors have come from sub-group reductions and are thus uniform
    // Using atomic_min calls as in the commented-out code below results in IGC generating 12 atomic_min messages and a large stack of movs
    // The code below should result in 1 atomic_min message and a similarly large stack of movs
    float mergeVal0 = INFINITY;
    float mergeVal1 = INFINITY;
    uint i = get_sub_group_local_id();

    // insert the various merge values into one register
    // We use two parallel variables here to enable some ILP
    uint imod = (i>=6) ? (i-6) : i;
    mergeVal0 = (imod==0) ? CMin.x : mergeVal0;    mergeVal1 = (imod==0) ? GMin.x : mergeVal1;
    mergeVal0 = (imod==1) ? CMin.y : mergeVal0;    mergeVal1 = (imod==1) ? GMin.y : mergeVal1;
    mergeVal0 = (imod==2) ? CMin.z : mergeVal0;    mergeVal1 = (imod==2) ? GMin.z : mergeVal1;
    mergeVal0 = (imod==3) ? -CMax.x : mergeVal0;   mergeVal1 = (imod==3) ? -GMax.x : mergeVal1;
    mergeVal0 = (imod==4) ? -CMax.y : mergeVal0;   mergeVal1 = (imod==4) ? -GMax.y : mergeVal1;
    mergeVal0 = (imod==5) ? -CMax.z : mergeVal0;   mergeVal1 = (imod==5) ? -GMax.z : mergeVal1;

    float merge = (i<6) ? mergeVal0 : mergeVal1;
    if( i < 12 )
        atomic_min( &b->scalars.Array[i], merge );

    //atomic_min( &b->boxes.left_centroid_bounds.lower[0], CMin.x );
    //atomic_min( &b->boxes.left_centroid_bounds.lower[1], CMin.y );
    //atomic_min( &b->boxes.left_centroid_bounds.lower[2], CMin.z );
    //atomic_min( &b->boxes.left_centroid_bounds.upper[0], -CMax.x );
    //atomic_min( &b->boxes.left_centroid_bounds.upper[1], -CMax.y );
    //atomic_min( &b->boxes.left_centroid_bounds.upper[2], -CMax.z );
    //atomic_min( &b->boxes.left_geom_bounds.lower[0], GMin.x );
    //atomic_min( &b->boxes.left_geom_bounds.lower[1], GMin.y );
    //atomic_min( &b->boxes.left_geom_bounds.lower[2], GMin.z );
    //atomic_min( &b->boxes.left_geom_bounds.upper[0], -GMax.x );
    //atomic_min( &b->boxes.left_geom_bounds.upper[1], -GMax.y );
    //atomic_min( &b->boxes.left_geom_bounds.upper[2], -GMax.z );
}
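// LRBounds_merge_left/right pack all 12 scalar merges (centroid + geom bounds, mins and negated maxes)
// into per-lane values so the whole merge is issued as a single 12-lane atomic_min; the negated upper
// bounds are undone again by the LRBounds_get_* accessors above.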
void LRBounds_merge_right( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax )
{
    // All of the input vectors have come from sub-group reductions and are thus uniform
    // Using atomic_min calls as in the commented-out code below results in IGC generating 12 atomic_min messages and a large stack of movs
    // The code below should result in 1 atomic_min message and a similarly large stack of movs
    float mergeVal0 = INFINITY;
    float mergeVal1 = INFINITY;
    uint i = get_sub_group_local_id();

    // insert the various merge values into one register
    // We use two parallel variables here to enable some ILP
    uint imod = (i>=6) ? (i-6) : i;
    mergeVal0 = (imod==0) ? CMin.x : mergeVal0;    mergeVal1 = (imod==0) ? GMin.x : mergeVal1;
    mergeVal0 = (imod==1) ? CMin.y : mergeVal0;    mergeVal1 = (imod==1) ? GMin.y : mergeVal1;
    mergeVal0 = (imod==2) ? CMin.z : mergeVal0;    mergeVal1 = (imod==2) ? GMin.z : mergeVal1;
    mergeVal0 = (imod==3) ? -CMax.x : mergeVal0;   mergeVal1 = (imod==3) ? -GMax.x : mergeVal1;
    mergeVal0 = (imod==4) ? -CMax.y : mergeVal0;   mergeVal1 = (imod==4) ? -GMax.y : mergeVal1;
    mergeVal0 = (imod==5) ? -CMax.z : mergeVal0;   mergeVal1 = (imod==5) ? -GMax.z : mergeVal1;

    float merge = (i<6) ?
mergeVal0 : mergeVal1; if( i < 12 ) atomic_min( &b->scalars.Array[i+12], merge ); //atomic_min( &b->boxes.right_centroid_bounds.lower[0], CMin.x ); //atomic_min( &b->boxes.right_centroid_bounds.lower[1], CMin.y ); //atomic_min( &b->boxes.right_centroid_bounds.lower[2], CMin.z ); //atomic_min( &b->boxes.right_centroid_bounds.upper[0], -CMax.x ); //atomic_min( &b->boxes.right_centroid_bounds.upper[1], -CMax.y ); //atomic_min( &b->boxes.right_centroid_bounds.upper[2], -CMax.z ); //atomic_min( &b->boxes.right_geom_bounds.lower[0], GMin.x ); //atomic_min( &b->boxes.right_geom_bounds.lower[1], GMin.y ); //atomic_min( &b->boxes.right_geom_bounds.lower[2], GMin.z ); //atomic_min( &b->boxes.right_geom_bounds.upper[0], -GMax.x ); //atomic_min( &b->boxes.right_geom_bounds.upper[1], -GMax.y ); //atomic_min( &b->boxes.right_geom_bounds.upper[2], -GMax.z ); } void LRBounds_merge( global LRBounds* globalBounds, local LRBounds* localBounds ) { uint i = get_local_id(0); if( i < 24 ) atomic_min(&globalBounds->scalars.Array[i], localBounds->scalars.Array[i] ); } void LRBounds_init( LRBounds* bounds ) { uint i = get_local_id(0) * 4; if( i < 24 ) { // compiler should merge it into a 4xdword send bounds->scalars.Array[i+0] = INFINITY; bounds->scalars.Array[i+1] = INFINITY; bounds->scalars.Array[i+2] = INFINITY; bounds->scalars.Array[i+3] = INFINITY; } } inline void LRBounds_init_subgroup( LRBounds* bounds) { uint sg_size = get_sub_group_size(); uint lane = get_sub_group_local_id(); for (uint i = lane * 4; i < 24; i += sg_size * 4) { // compiler should merge it into a 4xdword send bounds->scalars.Array[i+0] = INFINITY; bounds->scalars.Array[i+1] = INFINITY; bounds->scalars.Array[i+2] = INFINITY; bounds->scalars.Array[i+3] = INFINITY; } } /////////////////////////////////////////////////////////////////////////// inline void BinInfo_init(struct BFS_BinInfo* bin_info) { for (uint id = get_local_id(0) * 4; id < 18 * BFS_NUM_BINS; id += get_local_size(0) * 4) { float inf = INFINITY; // compiler should merge it into a 4xdword send bin_info->min_max[id+0] = inf; bin_info->min_max[id+1] = inf; bin_info->min_max[id+2] = inf; bin_info->min_max[id+3] = inf; } for (uint id = get_local_id(0) * 4; id < 3 * BFS_NUM_BINS; id += get_local_size(0) * 4) { // compiler should merge it into a 4xdword send bin_info->counts[id+0] = 0; bin_info->counts[id+1] = 0; bin_info->counts[id+2] = 0; bin_info->counts[id+3] = 0; } } // copy global to local inline void BinInfo_copy( local struct BFS_BinInfo* local_bin_info, global struct BFS_BinInfo* global_bin_info ) { for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0)) { float inf = INFINITY ; float f = global_bin_info->min_max[id]; local_bin_info->min_max[id] = f; } for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0)) { local_bin_info->counts[id] = global_bin_info->counts[id]; } } inline void BinInfo_init_subgroup(struct BFS_BinInfo* bin_info) { uint sg_size = get_sub_group_size(); uint lane = get_sub_group_local_id(); for (uint i = lane * 4; i < 3 * BFS_NUM_BINS; i += sg_size * 4) { // compiler should merge it into a 4xdword send bin_info->counts[i+0] = 0; bin_info->counts[i+1] = 0; bin_info->counts[i+2] = 0; bin_info->counts[i+3] = 0; } for (uint i = lane * 4; i < 18 * BFS_NUM_BINS; i += sg_size * 4) { // compiler should merge it into a 4xdword send bin_info->min_max[i+0] = INFINITY; bin_info->min_max[i+1] = INFINITY; bin_info->min_max[i+2] = INFINITY; bin_info->min_max[i+3] = INFINITY; } } float3 shuffle_down_float3( float3 a, float3 b, 
uint delta )
{
    return (float3)( intel_sub_group_shuffle_down( a.x, b.x, delta ),
                     intel_sub_group_shuffle_down( a.y, b.y, delta ),
                     intel_sub_group_shuffle_down( a.z, b.z, delta ) );
}

void BinInfo_primref_ballot_loop( local struct BFS_BinInfo* bin_info, uint axis, uint bin, float3 lower, float3 upper, bool active_lane )
{
    local float* bins_min = &bin_info->min_max[0];
    local float* bins_max = &bin_info->min_max[3];

    varying uint place = (bin + axis*BFS_NUM_BINS);
    varying uint lane = get_sub_group_local_id();
    uniform uint active_mask = intel_sub_group_ballot(active_lane);

    while( active_mask )
    {
        uniform uint leader = ctz( active_mask );
        uniform uint lead_place = intel_sub_group_shuffle( place, leader );
        varying bool matching_bin = lead_place == place && active_lane;

        varying float3 lo = (float3)(INFINITY,INFINITY,INFINITY);
        varying float3 hi = (float3)(-INFINITY,-INFINITY,-INFINITY);
        if (matching_bin)
        {
            lo = lower.xyz;
            hi = upper.xyz;
        }

        lo = sub_group_reduce_min_float3( lo );
        hi = sub_group_reduce_max_float3( hi );

        {
            // atomic min operation vectorized across 6 lanes
            //  [ lower.xyz ][-][upper.xyz][-]
            //
            // Lanes 3 and 7 are inactive
            uint lmod = lane % 4;
            uint ldiv = lane / 4;
            float vlo = lo.x;
            float vhi = hi.x;
            vlo = (lmod == 1) ? lo.y : vlo;    vhi = (lmod == 1) ? hi.y : vhi;
            vlo = (lmod == 2) ? lo.z : vlo;    vhi = (lmod == 2) ? hi.z : vhi;
            float v = (ldiv == 0) ? vlo : -vhi;

            if( (1<<lane) & 0x77 ) // lanes 0..2 write the mins, lanes 4..6 write the negated maxes
                atomic_min( &bin_info->min_max[ 6*lead_place + lmod + 3*ldiv ], v );
        }

        //if( lane == 0 )
        //    atomic_add_local(&bin_info->counts[lead_place], popcount(active_mask & intel_sub_group_ballot(matching_bin)) );

        active_mask = active_mask & intel_sub_group_ballot(!matching_bin);
    }
}

inline void BinInfo_add_primref(struct BinMapping* binMapping, local struct BFS_BinInfo* bin_info, PrimRef* primref, bool active_lane )
{
    const float4 lower = primref->lower;
    const float4 upper = primref->upper;
    const float4 p = lower + upper;
    const uint4 i = convert_uint4( (p - binMapping->ofs) * binMapping->scale );

    BinInfo_primref_ballot_loop( bin_info, 0, i.x, lower.xyz, upper.xyz, active_lane );
    BinInfo_primref_ballot_loop( bin_info, 1, i.y, lower.xyz, upper.xyz, active_lane );
    BinInfo_primref_ballot_loop( bin_info, 2, i.z, lower.xyz, upper.xyz, active_lane );

    if (active_lane)
    {
        atomic_inc_local( &bin_info->counts[i.x + 0 * BFS_NUM_BINS] );
        atomic_inc_local( &bin_info->counts[i.y + 1 * BFS_NUM_BINS] );
        atomic_inc_local( &bin_info->counts[i.z + 2 * BFS_NUM_BINS] );
    }
}

inline void BinInfo_merge(global struct BFS_BinInfo* global_info, local struct BFS_BinInfo* local_info)
{
    for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0))
    {
        float v = local_info->min_max[id];
        if( v != INFINITY )
            atomic_min(&global_info->min_max[id], v);
    }
    for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0))
    {
        uint c = local_info->counts[id];
        if( c )
            atomic_add_global(&global_info->counts[id], c);
    }
}

inline struct AABB3f BinInfo_get_AABB(struct BFS_BinInfo* bin_info, ushort bin, ushort axis)
{
    float* min = &bin_info->min_max[6*(bin + axis*BFS_NUM_BINS)];
    float* max = min + 3;
    struct AABB3f box;
    for (uint i = 0; i < 3; i++)
    {
        box.lower[i] = min[i];
        box.upper[i] = -max[i];
    }
    return box;
}

inline uint3 BinInfo_get_counts(struct BFS_BinInfo* bin_info, ushort bin)
{
    uint3 counts;
    counts.x = bin_info->counts[bin + 0 * BFS_NUM_BINS]; // TODO: block load these
    counts.y = bin_info->counts[bin + 1 * BFS_NUM_BINS];
    counts.z = bin_info->counts[bin + 2 * BFS_NUM_BINS];
    return counts;
}

inline uint BinInfo_get_count(struct BFS_BinInfo*
bin_info, ushort bin, ushort axis) { return bin_info->counts[bin + axis * BFS_NUM_BINS]; } void BVH2_Initialize( struct BVH2* bvh ) { bvh->num_nodes = 1; } inline bool BVH2_IsInnerNode( global struct BVH2* bvh, uint node_index ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; return (n->meta_ss & 0x10000) != 0; } inline uint BVH2_GetRoot( struct BVH2* bvh ) { return 0; } ////////////////////////////////////////////// // BVH2NodeMetaData funcs ////////////////////////////////////////////// struct BVH2NodeMetaData { uint meta_u; // leaf: primref start. inner: offset from node to its first child uint meta_ss; }; inline struct BVH2NodeMetaData BVH2_GetNodeMetaData( global struct BVH2* bvh, uint node_index ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; struct BVH2NodeMetaData meta; meta.meta_u = n->meta_u; meta.meta_ss = n->meta_ss; return meta; } inline bool BVH2NodeMetaData_IsInnerNode( struct BVH2NodeMetaData* meta ) { return (meta->meta_ss & 0x10000) != 0; } inline ushort BVH2NodeMetaData_GetLeafPrimCount( struct BVH2NodeMetaData* meta ) { return meta->meta_ss & 0xffff; } inline uint BVH2NodeMetaData_GetLeafPrimStart( struct BVH2NodeMetaData* meta ) { return meta->meta_u; } inline uint BVH2NodeMetaData_GetMask( struct BVH2NodeMetaData* meta ) { return (meta->meta_ss>>24); } ////////////////////////////////////////////// inline ushort BVH2_GetLeafPrimCount( struct BVH2* bvh, uint node_index ) { struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; return n->meta_ss & 0xffff; } inline uint BVH2_GetLeafPrimStart( struct BVH2* bvh, uint node_index ) { struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; return n->meta_u; } inline uint2 BVH2_GetChildIndices( struct BVH2* bvh, uint node_index ) { struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; uint2 idx; idx.x = n->meta_u; idx.y = idx.x + (n->meta_ss & 0xffff); return idx; } inline float BVH2_GetNodeArea( global struct BVH2* bvh, uint node_index ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; return AABB3f_halfArea( &n->box ); } inline struct AABB3f BVH2_GetNodeBox( global struct BVH2* bvh, uint node_index ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; return n->box; } inline void BVH2_SetNodeBox( global struct BVH2* bvh, uint node_index, struct AABB3f* box ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; n->box = *box; } inline void BVH2_SetNodeBox_lu( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; AABB3f_set( &n->box, lower, upper ); } inline void BVH2_InitNodeBox( struct BVH2* bvh, uint node_index ) { struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; AABB3f_init( &n->box ); } inline struct AABB BVH2_GetAABB( global struct BVH2* bvh, uint node_index ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; struct AABB r; r.lower.xyz = AABB3f_load_lower( &n->box ); r.upper.xyz = AABB3f_load_upper( &n->box ); return r; } inline void BVH2_WriteInnerNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint2 child_offsets, uint mask ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; n->box = *box; n->meta_u = child_offsets.x; n->meta_ss = 0x10000 + (child_offsets.y - child_offsets.x) + (mask<<24); // n->is_inner = true; } inline void 
BVH2_WriteLeafNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint prim_start, uint prim_count, uint mask ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; n->box = *box; n->meta_u = prim_start; n->meta_ss = prim_count + (mask<<24); // n->is_inner = true; } inline uint BVH2_GetMask( global struct BVH2* bvh, uint node_index ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; return (n->meta_ss>>24); } uint BVH2_AllocateNodes( global struct BVH2* bvh, uint num_nodes ) { return atomic_add_global( &bvh->num_nodes, num_nodes ); } inline void BVH2_AtomicMergeNodeBox( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) { global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; AABB3f_atomic_merge_global_lu( &n->box, lower, upper ); } void BVH2_print( global struct BVH2* bvh, uint start_node ) { if ( get_local_id( 0 ) == 0 && get_sub_group_id() == 0 ) { uint num_nodes = bvh->num_nodes; uint2 stack[BFS_MAX_DEPTH * 2]; uint sp = 0; printf( "allocated_nodes=%u\n", num_nodes ); stack[sp++] = (uint2)(start_node, 0); while ( sp > 0 ) { uint2 data = stack[--sp]; uint node = data.x; uint depth = data.y; for ( uint i = 0; i < depth; i++ ) printf( " " ); if ( BVH2_IsInnerNode( bvh, node ) ) { uint2 kids = BVH2_GetChildIndices( bvh, node ); printf( " %5u: inner: %u %u \n", node, kids.x, kids.y ); stack[sp++] = (uint2)(kids.y, depth + 1); stack[sp++] = (uint2)(kids.x, depth + 1); struct AABB3f l = BVH2_GetNodeBox( bvh, kids.x ); struct AABB3f r = BVH2_GetNodeBox( bvh, kids.y ); struct AABB3f p = BVH2_GetNodeBox( bvh, node ); float3 pl = AABB3f_load_lower( &p ); float3 pu = AABB3f_load_upper( &p ); float3 ll = AABB3f_load_lower( &l ); float3 lu = AABB3f_load_upper( &l ); float3 rl = AABB3f_load_lower( &r ); float3 ru = AABB3f_load_upper( &r ); if ( any( ll < pl ) || any( rl < pl ) || any( lu > pu ) || any( ru > pu ) ) { for ( uint i = 0; i < depth; i++ ) printf( " " ); printf( "BAD_BOUNDS!!!!!!!! %u\n", node ); } } else { uint start = BVH2_GetLeafPrimStart( bvh, node ); uint count = BVH2_GetLeafPrimCount( bvh, node ); printf( " %5u: leaf: start=%u count=%u\n ",node,start,count ); } } } barrier( CLK_LOCAL_MEM_FENCE ); } global uint* SAHBuildGlobals_GetPrimrefIndices_In( struct SAHBuildGlobals* globals, bool odd_pass ) { uint num_refs = globals->num_primrefs; global uint* ib = (global uint*) globals->p_primref_index_buffers; return ib + (odd_pass ? num_refs : 0); } global uint* SAHBuildGlobals_GetPrimrefIndices_Out( struct SAHBuildGlobals* globals, bool odd_pass ) { uint num_refs = globals->num_primrefs; global uint* ib = (global uint*) globals->p_primref_index_buffers; return ib + (odd_pass ? 
0 : num_refs); } global PrimRef* SAHBuildGlobals_GetPrimrefs( struct SAHBuildGlobals* globals ) { return (global PrimRef*) globals->p_primrefs_buffer; } global struct BVH2* SAHBuildGlobals_GetBVH2( struct SAHBuildGlobals* globals ) { return (global struct BVH2*)globals->p_bvh2; } uint SAHBuildGlobals_GetLeafSizeInBytes( struct SAHBuildGlobals* globals ) { return globals->leaf_size; } uint SAHBuildGlobals_GetLeafType( struct SAHBuildGlobals* globals ) { return globals->leaf_type; } uint SAHBuildGlobals_GetInternalNodeType( struct SAHBuildGlobals* globals ) { return NODE_TYPE_INTERNAL; } global struct BVHBase* SAHBuildGlobals_GetBVHBase( struct SAHBuildGlobals* globals ) { return (global struct BVHBase*) globals->p_bvh_base; } uint SAHBuildGlobals_GetTotalPrimRefs( struct SAHBuildGlobals* globals ) { return globals->num_primrefs; } inline bool SAHBuildGlobals_NeedBackPointers( struct SAHBuildGlobals* globals ) { return globals->flags & SAH_FLAG_NEED_BACKPOINTERS; } inline bool SAHBuildGlobals_NeedMasks( struct SAHBuildGlobals* globals ) { return globals->flags & SAH_FLAG_NEED_MASKS; } void SAHBuildGlobals_print( struct SAHBuildGlobals* globals ) { if ( get_local_id( 0 ) == 0 ) { printf( "SAHBuildGlobals: %p\n", globals ); printf( " p_primref_index_buffers =%p\n", globals->p_primref_index_buffers ); printf( " p_primrefs_buffer =%p\n", globals->p_primrefs_buffer ); printf( " p_bvh2 =%p\n", globals->p_bvh2 ); printf( " p_globals =%p\n", globals->p_globals ); printf( " p_bvh_base =%p\n", globals->p_bvh_base ); printf( " num_primrefs = %u\n", globals->num_primrefs ); printf( " leaf_size = %u\n", globals->leaf_size ); printf( " leaf_type = %u\n", globals->leaf_type ); printf( " p_qnode_buffer = %p\n", globals->p_qnode_root_buffer); } barrier( CLK_LOCAL_MEM_FENCE ); } uint get_num_wgs(uint thread_count, uint WG_SIZE) { return (thread_count + WG_SIZE - 1) / WG_SIZE; } struct BFSDispatchArgs { global struct VContextScheduler* scheduler; global struct VContext* context; global struct BVH2* bvh2; global uint* primref_index_in; global uint* primref_index_out; global PrimRef* primref_buffer; uint wg_primref_begin; uint wg_primref_end; uint dispatch_primref_begin; uint dispatch_primref_end; uint context_id; uint num_wgs; uint bvh2_root; uint global_num_primrefs; bool do_mask_processing; }; // TODO_OPT: Enable larger WGs // We need a way to do this in a portable fashion. 
// Gen12 can support larger WGs than Gen9 can
//
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
kernel void begin(
    global struct VContextScheduler* scheduler,
    dword leaf_size,
    dword leaf_type,
    global uint* primref_index_buffers,
    global PrimRef* primref_buffer,
    global struct BVH2* bvh2,
    global struct BVHBase* bvh_base,
    global struct Globals* globals,
    global struct SAHBuildGlobals* sah_globals,
    global uint2* qnode_root_buffer,
    dword sah_globals_flags )
{
    dword num_primrefs = globals->numPrimitives;

    if ( get_local_id( 0 ) == 0 )
    {
        sah_globals->p_primrefs_buffer = (qword) primref_buffer;
        sah_globals->p_primref_index_buffers = (qword) primref_index_buffers;
        sah_globals->p_bvh2 = (qword) bvh2;
        sah_globals->p_bvh_base = (qword) bvh_base;
        sah_globals->leaf_size = leaf_size;
        sah_globals->leaf_type = leaf_type;
        sah_globals->num_primrefs = num_primrefs;
        sah_globals->p_globals = (qword) globals;
        sah_globals->p_qnode_root_buffer = (gpuva_t) qnode_root_buffer;
        sah_globals->flags = sah_globals_flags;

        // initialize the spill stack
        scheduler->bfs2_spill_stack.size = 0;

        // initialize BVH2 node counter
        BVH2_Initialize( bvh2 );

        // configure first vcontext for first build
        scheduler->contexts[0].dispatch_primref_begin = 0;
        scheduler->contexts[0].dispatch_primref_end = num_primrefs;
        scheduler->contexts[0].bvh2_root = BVH2_GetRoot( bvh2 );
        scheduler->contexts[0].tree_depth = 0;
        scheduler->contexts[0].batch_index = 0;
        scheduler->bfs_queue.records[0].context_id = 0;
        scheduler->contexts[0].num_left = 0;
        scheduler->contexts[0].num_right = 0;
        scheduler->contexts[0].lr_mask = 0;

        // copy centroid bounds into the BVH2 root node
        BVH2_SetNodeBox_lu( bvh2, BVH2_GetRoot( bvh2 ), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz );

        // zero the trivial build counters.. these are only used by the batch-build path
        // but single-wg QNode path (if used) depends on them
        scheduler->num_trivial_builds = 0;
        scheduler->num_single_builds = 0;

        // initialize the root-buffer counters
        sah_globals->root_buffer_num_produced = 0;
        sah_globals->root_buffer_num_produced_hi = 0;
        sah_globals->root_buffer_num_consumed = 0;
        sah_globals->root_buffer_num_consumed_hi = 0;
    }

    // initialize vcontext states
    for ( uint i = get_local_id( 0 ); i < BFS_NUM_VCONTEXTS; i += get_local_size( 0 ) )
        scheduler->vcontext_state[i] = (i==0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED;

    // initialize global bin info in vcontext - only context[0] will be used in first iteration
    BinInfo_init( &scheduler->contexts[0].global_bin_info );
    LRBounds_init( &scheduler->contexts[0].lr_bounds );

    // barrier( CLK_GLOBAL_MEM_FENCE ); // lsc flush ... driver now does these as part of COMPUTE_WALKER
}
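// categorize_builds_and_init_scheduler sorts a batch of builds by size: trivial builds
// (num_primrefs <= TRIVIAL_BUILD_THRESHOLD), single-WG builds (<= SINGLE_WG_BUILD_THRESHOLD),
// and full builds, which are the only ones whose BVH2 is initialized here and which go through
// the vcontext BFS/DFS scheduler below.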
// TODO_OPT: Enable larger WGs
// We need a way to do this in a portable fashion.
// Gen12 can support larger WGs than Gen9 can
//
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(512, 1, 1)))
kernel void categorize_builds_and_init_scheduler(
    global struct VContextScheduler* scheduler,
    global gpuva_t* globals_ptrs,  // OCL-C does not allow kernel parameters to be pointer-to-pointer, so we trick it...
    global struct SAHBuildBuffersInfo* buffers_info,
    global struct SAHBuildGlobals* builds_out,
    dword num_builds )
{
    local uint num_trivial;
    local uint num_single;
    local uint num_full;

    if (get_group_id(0) == 0) // first workgroup performs build categorization
    {
        if (get_local_id(0) == 0)
        {
            num_trivial = 0;
            num_single = 0;
            num_full = 0;
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        // first pass, count builds of each type
        uint triv = 0;
        uint single = 0;
        uint full = 0;
        for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0))
        {
            global struct Globals* globals = (global struct Globals*) globals_ptrs[i];
            dword num_refs = globals->numPrimitives;
            if (num_refs <= TRIVIAL_BUILD_THRESHOLD)
                triv++;
            else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD)
                single++;
            else
                full++;
        }

        // merge counts across work-group. These variables are now offsets into this thread's ranges
        triv = atomic_add_local(&num_trivial, triv);
        single = atomic_add_local(&num_single, single);
        full = atomic_add_local(&num_full, full);

        barrier(CLK_LOCAL_MEM_FENCE);

        global struct SAHBuildGlobals* trivial_builds_out = builds_out;
        global struct SAHBuildGlobals* single_builds_out = builds_out + num_trivial;
        global struct SAHBuildGlobals* full_builds_out = builds_out + num_trivial + num_single;

        for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0))
        {
            global struct Globals* globals = (global struct Globals*) globals_ptrs[i];
            global struct SAHBuildBuffersInfo* buffers = &buffers_info[i];
            dword num_refs = globals->numPrimitives;
            dword leaf_type = globals->leafPrimType;
            dword leaf_size = globals->leafSize;

            global struct SAHBuildGlobals* place;
            if (num_refs <= TRIVIAL_BUILD_THRESHOLD)
                place = trivial_builds_out + (triv++);
            else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD)
                place = single_builds_out + (single++);
            else
                place = full_builds_out + (full++);

            place->p_primref_index_buffers = buffers->p_primref_index_buffers;
            place->p_primrefs_buffer = buffers->p_primrefs_buffer;
            place->p_bvh2 = buffers->p_bvh2;
            place->p_bvh_base = buffers->p_bvh_base;
            place->p_globals = (gpuva_t)globals;
            place->num_primrefs = num_refs;
            place->leaf_size = leaf_size;
            place->leaf_type = leaf_type;
            place->flags = buffers->sah_globals_flags;
            place->p_qnode_root_buffer = buffers->p_qnode_root_buffer;

            // only initialize BVH2 if it will actually be used by the build
            // trivial passes will not use it
            if( num_refs > SINGLE_WG_BUILD_THRESHOLD )
            {
                // initialize BVH2 node counter
                global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(place);
                BVH2_Initialize(bvh2);

                // copy centroid bounds into the BVH2 root node
                BVH2_SetNodeBox_lu(bvh2, BVH2_GetRoot(bvh2), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz);
            }
        }

        if (get_local_id(0) == 0)
        {
            scheduler->num_trivial_builds = num_trivial;
            scheduler->num_single_builds = num_single;
            scheduler->batched_build_offset = num_trivial + num_single;
            scheduler->batched_build_count = num_full;
        }
    }
    else // second workgroup initializes the scheduler
    {
        // initialize vcontext states
        for (uint i = get_local_id(0); i < BFS_NUM_VCONTEXTS; i += get_local_size(0))
            scheduler->vcontext_state[i] = (i == 0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED;

        // initialize global bin info in vcontexts
        for (uint i = get_sub_group_id(); i < BFS_NUM_VCONTEXTS; i += get_num_sub_groups())
            BinInfo_init_subgroup(&scheduler->contexts[i].global_bin_info);

        // initialize the spill stack
        if (get_local_id(0) == 0)
            scheduler->bfs2_spill_stack.size = 0;
    }

    //barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );// lsc flush ...
driver now does these as part of COMPUTE_WALKER } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(BFS_NUM_VCONTEXTS, 1, 1))) kernel void begin_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) { ushort scheduler_build_offset = scheduler->batched_build_offset; ushort scheduler_num_builds = scheduler->batched_build_count; ushort num_builds = min( scheduler_num_builds, (ushort)BFS_NUM_VCONTEXTS ); uint num_wgs = 0; ushort tid = get_local_id(0); if ( tid < num_builds ) { ushort batch_index = scheduler_build_offset + tid; uint num_primrefs = sah_globals[batch_index].num_primrefs; // configure first vcontext for first build scheduler->contexts[tid].dispatch_primref_begin = 0; scheduler->contexts[tid].dispatch_primref_end = num_primrefs; scheduler->contexts[tid].bvh2_root = BVH2_GetRoot( SAHBuildGlobals_GetBVH2(&sah_globals[batch_index]) ); scheduler->contexts[tid].tree_depth = 0; scheduler->contexts[tid].batch_index = batch_index; scheduler->vcontext_state[tid] = VCONTEXT_STATE_EXECUTING; scheduler->contexts[tid].num_left = 0; scheduler->contexts[tid].num_right = 0; scheduler->contexts[tid].lr_mask = 0; num_wgs = get_num_wgs( num_primrefs, BFS_WG_SIZE ); scheduler->bfs_queue.wg_count[tid] = num_wgs; scheduler->bfs_queue.records[tid].batch_index = batch_index; scheduler->bfs_queue.records[tid].context_id = tid; } num_wgs = work_group_reduce_add(num_wgs); if (tid == 0) { // write out build count and offset for next BFS iteration scheduler->batched_build_offset = scheduler_build_offset + num_builds; scheduler->batched_build_count = scheduler_num_builds - num_builds; // write out initial WG count and loop termination mask for command streamer to consume scheduler->batched_build_wg_count = num_wgs; scheduler->batched_build_loop_mask = (scheduler_num_builds > num_builds) ? 
1 : 0; scheduler->bfs_queue.num_dispatches = num_builds; } for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); } bool is_leaf( uint num_refs ) { return num_refs <= TREE_ARITY; } bool is_dfs( uint num_refs ) { return num_refs > TREE_ARITY&& num_refs <= DFS_THRESHOLD; } bool is_bfs( uint num_refs ) { return num_refs > DFS_THRESHOLD; } int2 is_leaf_2( uint2 num_refs ) { return num_refs.xy <= TREE_ARITY; } int2 is_bfs_2( uint2 num_refs ) { return num_refs.xy > DFS_THRESHOLD; } int2 is_dfs_2( uint2 num_refs ) { return num_refs.xy > TREE_ARITY && num_refs.xy <= DFS_THRESHOLD; } #if 0 GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(16, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void sg_scheduler( global struct VContextScheduler* scheduler ) { local struct BFS1SpillStackEntry SLM_local_spill_stack[BFS_NUM_VCONTEXTS]; local uchar SLM_context_state[BFS_NUM_VCONTEXTS]; local vcontext_id_t SLM_free_list[BFS_NUM_VCONTEXTS]; local vcontext_id_t SLM_exec_list[BFS_NUM_VCONTEXTS]; varying ushort lane = get_sub_group_local_id(); uniform uint free_list_size = 0; uniform uint exec_list_size = 0; // read context states, build lists of free and executing contexts for (varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size()) { uchar state = scheduler->vcontext_state[i]; SLM_context_state[i] = state; uniform ushort exec_mask = intel_sub_group_ballot(state == VCONTEXT_STATE_EXECUTING); varying ushort prefix_exec = subgroup_bit_prefix_exclusive(exec_mask); varying ushort prefix_free = lane - prefix_exec; varying ushort exec_list_pos = exec_list_size + prefix_exec; varying ushort free_list_pos = free_list_size + prefix_free; if (state == VCONTEXT_STATE_EXECUTING) SLM_exec_list[exec_list_pos] = i; else SLM_free_list[free_list_pos] = i; uniform ushort num_exec = popcount(exec_mask); exec_list_size += num_exec; free_list_size += get_sub_group_size() - num_exec; } uniform uint total_bfs_dispatches = 0; uniform uint total_dfs_dispatches = 0; uniform uint bfs_spill_stack_size = 0; uniform uint total_bfs_wgs = 0; // process executing context. accumulate bfs/dfs dispatches and free-list entries for (uint i = 0; i < exec_list_size; i+= get_sub_group_size() ) { varying ushort num_dfs_dispatches = 0; varying ushort num_bfs_spills = 0; varying ushort num_bfs_children; varying ushort context_id; struct VContext* context; varying uint num_left ; varying uint num_right ; varying uint primref_begin ; varying uint primref_end ; varying uint depth ; bool active_lane = (i + lane) < exec_list_size; if ( active_lane ) { context_id = SLM_exec_list[i + lane]; context = &scheduler->contexts[context_id]; num_left = context->num_left; num_right = context->num_right; primref_begin = context->dispatch_primref_begin; primref_end = context->dispatch_primref_end; depth = context->tree_depth; // get dispatch counts num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); num_bfs_children = is_bfs(num_left) + is_bfs(num_right); num_bfs_spills = (num_bfs_children == 2) ? 
1 : 0; } // allocate space for DFS, BFS dispatches, and BFS spills varying uint dfs_pos = total_dfs_dispatches + sub_group_scan_exclusive_add(num_dfs_dispatches); varying ushort mask_bfs_spills = intel_sub_group_ballot(num_bfs_children & 2); // spill if #children == 2 varying ushort mask_bfs_dispatches = intel_sub_group_ballot(num_bfs_children & 3); // dispatch if #children == 1 or 2 varying uint bfs_spill_pos = bfs_spill_stack_size + subgroup_bit_prefix_exclusive(mask_bfs_spills); varying uint bfs_dispatch_pos = total_bfs_dispatches + subgroup_bit_prefix_exclusive(mask_bfs_dispatches); total_dfs_dispatches += sub_group_reduce_add(num_dfs_dispatches); bfs_spill_stack_size += popcount(mask_bfs_spills); total_bfs_dispatches += popcount(mask_bfs_dispatches); varying uint num_bfs_wgs = 0; if (active_lane) { if (num_dfs_dispatches) { if (is_dfs(num_left)) { scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->left_bvh2_root; scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; dfs_pos++; } if (is_dfs(num_right)) { scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->right_bvh2_root; scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; } } uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); if (num_bfs_children == 2) { // spill the right child.. push an entry onto local spill stack SLM_local_spill_stack[bfs_spill_pos].primref_begin = primref_begin + num_left; SLM_local_spill_stack[bfs_spill_pos].primref_end = primref_end; SLM_local_spill_stack[bfs_spill_pos].bvh2_root = context->right_bvh2_root; SLM_local_spill_stack[bfs_spill_pos].tree_depth = depth + 1; // setup BFS1 dispatch for left child context->dispatch_primref_end = primref_begin + num_left; context->bvh2_root = context->left_bvh2_root; context->tree_depth = depth + 1; num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; } else if (num_bfs_children == 1) { // setup BFS1 dispatch for whichever child wants it if (is_bfs(num_left)) { // bfs on left child context->dispatch_primref_end = context->dispatch_primref_begin + num_left; context->bvh2_root = context->left_bvh2_root; context->tree_depth = depth + 1; num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); } else { // bfs on right child context->dispatch_primref_begin = context->dispatch_primref_begin + num_left; context->bvh2_root = context->right_bvh2_root; context->tree_depth = depth + 1; num_bfs_wgs = get_num_wgs(num_right, BFS_WG_SIZE); } scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; } else { // no bfs dispatch.. 
// this context is now free
                SLM_context_state[context_id] = VCONTEXT_STATE_UNALLOCATED;
            }
        }

        // count bfs work groups
        total_bfs_wgs += sub_group_reduce_add(num_bfs_wgs);

        // add newly deallocated contexts to the free list
        uniform uint free_mask = intel_sub_group_ballot( active_lane && num_bfs_children == 0);
        varying uint free_list_pos = free_list_size + subgroup_bit_prefix_exclusive(free_mask);
        free_list_size += popcount(free_mask);
        if ( free_mask & (1<<lane) )
            SLM_free_list[free_list_pos] = context_id;
    }

    // if there are more free contexts than locally spilled tasks, refill from the global spill stack
    uniform uint memory_spill_stack_size = scheduler->bfs2_spill_stack.size;
    if(bfs_spill_stack_size < free_list_size && memory_spill_stack_size > 0 )
    {
        uniform uint read_count = min(free_list_size - bfs_spill_stack_size, memory_spill_stack_size);
        for (varying uint i = lane; i < read_count; i+= get_sub_group_size())
            SLM_local_spill_stack[bfs_spill_stack_size + i] = scheduler->bfs2_spill_stack.entries[memory_spill_stack_size - 1 - i];

        bfs_spill_stack_size += read_count;
        memory_spill_stack_size -= read_count;
    }

    // steal pending BFS work and assign it to free contexts
    uniform uint num_steals = min(bfs_spill_stack_size, free_list_size);
    for (uniform uint i = 0; i < num_steals; i += get_sub_group_size())
    {
        varying uint num_bfs_wgs = 0;
        if (i + lane < num_steals)
        {
            uint context_id = SLM_free_list[i+lane];
            struct VContext* context = &scheduler->contexts[context_id];
            struct BFS1SpillStackEntry entry = SLM_local_spill_stack[i+lane];
            context->dispatch_primref_begin = entry.primref_begin;
            context->dispatch_primref_end = entry.primref_end;
            context->bvh2_root = entry.bvh2_root;
            context->tree_depth = entry.tree_depth;
            num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE);

            scheduler->bfs_queue.wg_count[total_bfs_dispatches + i + lane] = num_bfs_wgs;
            scheduler->bfs_queue.records[total_bfs_dispatches + i + lane].context_id = context_id;
            SLM_context_state[context_id] = VCONTEXT_STATE_EXECUTING;
        }

        total_bfs_wgs += sub_group_reduce_add( num_bfs_wgs );
    }

    total_bfs_dispatches += num_steals;

    // write out excess spills to global spill stack
    uniform uint extra_spills = bfs_spill_stack_size - num_steals;
    for (varying uint i = lane; i < extra_spills; i += get_sub_group_size())
    {
        scheduler->bfs2_spill_stack.entries[memory_spill_stack_size + i] = SLM_local_spill_stack[num_steals+i];
    }

    // write out modified context states
    for ( varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size())
        scheduler->vcontext_state[i] = SLM_context_state[i];

    if (get_local_id(0) == 0)
    {
        // write out new memory stack size
        scheduler->bfs2_spill_stack.size = memory_spill_stack_size + extra_spills;

        // store workgroup counters
        scheduler->bfs_queue.num_dispatches = total_bfs_dispatches;
        scheduler->num_bfs_wgs = total_bfs_wgs;
        scheduler->num_dfs_wgs = total_dfs_dispatches;
    }

    // barrier(CLK_GLOBAL_MEM_FENCE); // make memory writes globally visible// lsc flush ...
driver now does these as part of COMPUTE_WALKER } #endif #define SCHEDULER_SG_SIZE 16 #define SCHEDULER_WG_SIZE BFS_NUM_VCONTEXTS #define SCHEDULER_NUM_SGS (SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE) struct BFSDispatchArgs get_bfs_args_from_record_batchable( struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ); GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(SCHEDULER_WG_SIZE, 1, 1))) __attribute__((intel_reqd_sub_group_size(SCHEDULER_SG_SIZE))) kernel void scheduler(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) { local struct BFS1SpillStackEntry SLM_local_spill_stack[2 * BFS_NUM_VCONTEXTS]; local uint SLM_local_spill_stack_size; local uint SLM_dfs_dispatch_count; if (get_local_id(0) == 0) { SLM_local_spill_stack_size = 0; SLM_dfs_dispatch_count = 0; } uint context_id = get_local_id(0); uint state = scheduler->vcontext_state[context_id]; uint initial_state = state; uint batch_index = 0; global struct VContext* context = &scheduler->contexts[context_id]; barrier(CLK_LOCAL_MEM_FENCE); uint global_spill_stack_size = scheduler->bfs2_spill_stack.size; if (state == VCONTEXT_STATE_EXECUTING) { uint left_bvh2_root; uint right_bvh2_root; uint num_left = context->num_left; uint num_right = context->num_right; uint primref_begin = context->dispatch_primref_begin; uint primref_end = context->dispatch_primref_end; uint depth = context->tree_depth; uint batch_index = context->batch_index; struct BFSDispatchRecord record; record.context_id = context_id; record.batch_index = context->batch_index; struct BFSDispatchArgs args = get_bfs_args_from_record_batchable( &record, scheduler, sah_globals); // do cleanup of bfs_pass2 { // compute geom bounds struct AABB3f left_geom_bounds; struct AABB3f right_geom_bounds; struct AABB3f left_centroid_bounds; struct AABB3f right_centroid_bounds; uint2 lr_counts = (uint2)(num_left, num_right); { left_centroid_bounds = LRBounds_get_left_centroid( &context->lr_bounds ); left_geom_bounds = LRBounds_get_left_geom( &context->lr_bounds ); right_centroid_bounds = LRBounds_get_right_centroid( &context->lr_bounds ); right_geom_bounds = LRBounds_get_right_geom( &context->lr_bounds ); } int2 v_is_leaf = is_leaf_2( lr_counts ); int2 v_is_dfs = is_dfs_2( lr_counts ); int2 v_is_bfs = is_bfs_2( lr_counts ); uint left_mask = args.do_mask_processing ? context->lr_mask & 0xff : 0xff; uint right_mask = args.do_mask_processing ? (context->lr_mask & 0xff00) >> 8 : 0xff; // how many BVH2 nodes do we need to allocate? For DFS, we need to pre-allocate full subtree uint2 lr_node_counts = select( (uint2)(1,1), (2*lr_counts-1), v_is_dfs ); uint left_node_count = lr_node_counts.x; uint right_node_count = lr_node_counts.y; // allocate the nodes uint first_node = BVH2_AllocateNodes( args.bvh2, left_node_count + right_node_count ); // point our root node at its children left_bvh2_root = first_node; right_bvh2_root = first_node + left_node_count; // store combined geom bounds in the root node's AABB.. 
we previously stored centroid bounds there // but node creation requires geom bounds struct AABB3f geom_bounds = left_geom_bounds; AABB3f_extend(&geom_bounds, &right_geom_bounds); BVH2_WriteInnerNode( args.bvh2, args.bvh2_root, &geom_bounds, (uint2)(left_bvh2_root,right_bvh2_root), left_mask | right_mask ); // printf(" node: %u mask: %x\n", args.bvh2_root, left_mask|right_mask ); // store the appropriate AABBs in the child nodes // - BFS passes need centroid bounds // - DFS passes need geom bounds // Here we also write leaf connectivity information (prim start+count) // this will be overwritten later if we are creating an inner node struct AABB3f left_box, right_box; left_box = AABB3f_select( left_geom_bounds, left_centroid_bounds, v_is_bfs.xxx ); right_box = AABB3f_select( right_geom_bounds, right_centroid_bounds, v_is_bfs.yyy ); uint left_start = primref_begin; uint right_start = primref_begin + num_left; BVH2_WriteLeafNode( args.bvh2, left_bvh2_root, &left_box, left_start, num_left, left_mask ); BVH2_WriteLeafNode( args.bvh2, right_bvh2_root, &right_box, right_start, num_right, right_mask ); // make input and output primref index buffers consistent in the event we're creating a leaf // There should only ever be one leaf created, otherwise we'd have done a DFS pass sooner if (any( v_is_leaf.xy )) { uint start = v_is_leaf.x ? left_start : right_start; uint num_refs = v_is_leaf.x ? num_left : num_right; for(uint i = 0; i < num_refs; i++) { args.primref_index_in[start + i] = args.primref_index_out[start + i]; } } } // when BFS2 finishes, we need to dispatch two child tasks. // DFS dispatches can run free and do not need a context // BFS dispatches need a context. // In the case where both of the child nodes are BFS, the current context can immediately run one of the child dispatches // and the other is spilled for an unallocated context to pick up uint num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); if (num_dfs_dispatches) { uint dfs_pos = atomic_add_local(&SLM_dfs_dispatch_count, num_dfs_dispatches); if (is_dfs(num_left)) { scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; scheduler->dfs_queue.records[dfs_pos].bvh2_base = left_bvh2_root; scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; dfs_pos++; } if (is_dfs(num_right)) { scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; scheduler->dfs_queue.records[dfs_pos].bvh2_base = right_bvh2_root; scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; } } uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); if (num_bfs_children) { uint place = atomic_add_local(&SLM_local_spill_stack_size, num_bfs_children); if (is_bfs(num_left)) { SLM_local_spill_stack[place].primref_begin = primref_begin; SLM_local_spill_stack[place].primref_end = primref_begin + num_left; SLM_local_spill_stack[place].bvh2_root = left_bvh2_root; SLM_local_spill_stack[place].tree_depth = depth + 1; SLM_local_spill_stack[place].batch_index = batch_index; place++; } if (is_bfs(num_right)) { SLM_local_spill_stack[place].primref_begin = primref_begin + num_left; SLM_local_spill_stack[place].primref_end = primref_end; SLM_local_spill_stack[place].bvh2_root = right_bvh2_root; SLM_local_spill_stack[place].tree_depth = depth + 1; 
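// batch_index ties the spilled subtree back to its SAHBuildGlobals entry, so whichever context
                // later picks this entry up can locate the correct build in a batched dispatch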
SLM_local_spill_stack[place].batch_index = batch_index; place++; } } } barrier(CLK_LOCAL_MEM_FENCE); uint local_spill_stack_size = SLM_local_spill_stack_size; struct BFS1SpillStackEntry entry; state = VCONTEXT_STATE_UNALLOCATED; if (context_id < local_spill_stack_size) { // pull BFS work from the local spill stack if there's enough work there entry = SLM_local_spill_stack[context_id]; state = VCONTEXT_STATE_EXECUTING; } else if ((context_id - local_spill_stack_size) < (global_spill_stack_size)) { // if there isn't enough work on the local stack, consume from the global one uint global_pos = (global_spill_stack_size - 1) - (context_id - local_spill_stack_size); entry = scheduler->bfs2_spill_stack.entries[global_pos]; state = VCONTEXT_STATE_EXECUTING; } // contexts which received work set themselves up for the next BFS1 dispatch uint num_bfs_wgs = 0; uint num_bfs_dispatches = 0; if (state == VCONTEXT_STATE_EXECUTING) { context->dispatch_primref_begin = entry.primref_begin; context->dispatch_primref_end = entry.primref_end; context->bvh2_root = entry.bvh2_root; context->tree_depth = entry.tree_depth; context->batch_index = entry.batch_index; context->num_left = 0; context->num_right = 0; context->lr_mask = 0; batch_index = entry.batch_index; num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE); num_bfs_dispatches = 1; } if (local_spill_stack_size > BFS_NUM_VCONTEXTS) { // write out additional spills if we produced more work than we can consume uint excess_spills = local_spill_stack_size - BFS_NUM_VCONTEXTS; uint write_base = global_spill_stack_size; uint lid = get_local_id(0); if (lid < excess_spills) scheduler->bfs2_spill_stack.entries[write_base + lid] = SLM_local_spill_stack[BFS_NUM_VCONTEXTS + lid]; if (lid == 0) scheduler->bfs2_spill_stack.size = global_spill_stack_size + excess_spills; } else if (global_spill_stack_size > 0) { // otherwise, if we consumed any spills from the global stack, update the stack size if (get_local_id(0) == 0) { uint global_spills_consumed = min(global_spill_stack_size, BFS_NUM_VCONTEXTS - local_spill_stack_size); scheduler->bfs2_spill_stack.size = global_spill_stack_size - global_spills_consumed; } } // Do various WG reductions.. the code below is a hand-written version of the following: // // uint bfs_dispatch_queue_pos = work_group_scan_exclusive_add( num_bfs_dispatches ); // uint reduce_num_bfs_wgs = work_group_reduce_add(num_bfs_wgs); // uint reduce_num_bfs_dispatches = work_group_reduce_add(num_bfs_dispatches); uint bfs_dispatch_queue_pos; uint reduce_num_bfs_dispatches; uint reduce_num_bfs_wgs; local uint partial_dispatches[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; local uint partial_wgs[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; { partial_dispatches[get_sub_group_id()] = sub_group_reduce_add(num_bfs_dispatches); partial_wgs[get_sub_group_id()] = sub_group_reduce_add(num_bfs_wgs); uint sg_prefix = sub_group_scan_exclusive_add(num_bfs_dispatches); uint prefix_dispatches = 0; uint total_dispatches = 0; uint total_wgs = 0; ushort lane = get_sub_group_local_id(); barrier(CLK_LOCAL_MEM_FENCE); for (ushort i = 0; i < SCHEDULER_NUM_SGS; i += SCHEDULER_SG_SIZE) // this loop is intended to be fully unrolled after compilation { uint p_dispatch = partial_dispatches[i + lane]; uint p_wg = partial_wgs[i + lane]; prefix_dispatches += (i + lane < get_sub_group_id()) ? 
p_dispatch : 0; total_dispatches += p_dispatch; total_wgs += p_wg; } bfs_dispatch_queue_pos = sg_prefix + sub_group_reduce_add(prefix_dispatches); reduce_num_bfs_dispatches = sub_group_reduce_add(total_dispatches); reduce_num_bfs_wgs = sub_group_reduce_add(total_wgs); } // insert records into BFS queue if (num_bfs_dispatches) { scheduler->bfs_queue.wg_count[bfs_dispatch_queue_pos] = num_bfs_wgs; scheduler->bfs_queue.records[bfs_dispatch_queue_pos].context_id = context_id; scheduler->bfs_queue.records[bfs_dispatch_queue_pos].batch_index = batch_index; } // store modified vcontext state if it has changed if (initial_state != state) scheduler->vcontext_state[context_id] = state; // store workgroup counters if (get_local_id(0) == 0) { scheduler->bfs_queue.num_dispatches = reduce_num_bfs_dispatches; scheduler->num_bfs_wgs = reduce_num_bfs_wgs; scheduler->num_dfs_wgs = SLM_dfs_dispatch_count; } const uint contexts_to_clear = min( (uint)BFS_NUM_VCONTEXTS, (uint)(local_spill_stack_size+global_spill_stack_size) ); for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); } #if 0 uint record_search( struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue ) { uint group = get_group_id(0); ushort lane = get_sub_group_local_id(); uint num_dispatches = queue->num_dispatches; uint base = 0; for (uint i = 0; i < num_dispatches; i += get_sub_group_size()) { uint counts = intel_sub_group_block_read(&queue->wg_count[i]); for (uint j = 0; j < get_sub_group_size(); j++) { uint n = sub_group_broadcast(counts, j); if (group < n) { *record_out = queue->records[i + j]; return group; } group -= n; } } return 0; // NOTE: unreachable in practice } #endif uint record_search(struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue) { uint group = get_group_id(0); uint num_dispatches = queue->num_dispatches; uint dispatch_id = 0; uint local_id = 0; uint i = 0; do { uint counts = intel_sub_group_block_read(&queue->wg_count[i]); uint prefix = sub_group_scan_exclusive_add(counts); uint g = group - prefix; uint ballot = intel_sub_group_ballot(g < counts); if (ballot) { uint lane = ctz(ballot); dispatch_id = i + lane; local_id = intel_sub_group_shuffle(g, lane); break; } group -= sub_group_broadcast(prefix + counts, get_sub_group_size() - 1); i += get_sub_group_size(); } while (i < num_dispatches); *record_out = queue->records[dispatch_id]; return local_id; } struct BFSDispatchArgs get_bfs_args(struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals, uint local_group_id) { uint context_id = record->context_id; struct VContext* context = &scheduler->contexts[context_id]; bool odd_pass = context->tree_depth & 1; struct BFSDispatchArgs args; args.scheduler = scheduler; args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, odd_pass ); args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, odd_pass ); args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); args.wg_primref_begin = context->dispatch_primref_begin + local_group_id * BFS_WG_SIZE; args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, context->dispatch_primref_end ); args.dispatch_primref_begin = context->dispatch_primref_begin; args.dispatch_primref_end = 
context->dispatch_primref_end; args.context_id = context_id; args.context = &scheduler->contexts[context_id]; args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; args.bvh2_root = context->bvh2_root; args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); return args; } struct BFSDispatchArgs get_bfs_args_queue( global struct BFSDispatchQueue* queue, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals ) { // TODO_OPT: Load this entire prefix array into SLM instead of searching.. // Or use sub-group ops struct BFSDispatchRecord record; uint local_group_id = record_search(&record, queue); return get_bfs_args(&record, scheduler, globals, local_group_id); } struct BFSDispatchArgs get_bfs_args_from_record( struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals ) { return get_bfs_args(record, scheduler, globals, 0); } struct BFSDispatchArgs get_bfs_args_batchable( global struct BFSDispatchQueue* queue, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) { // TODO_OPT: Load this entire prefix array into SLM instead of searching.. // Or use sub-group ops struct BFSDispatchRecord record; uint local_group_id = record_search(&record, queue); global struct SAHBuildGlobals* globals = globals_buffer + record.batch_index; return get_bfs_args(&record, scheduler, globals, local_group_id); } struct BFSDispatchArgs get_bfs_args_from_record_batchable( struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) { global struct SAHBuildGlobals* globals = globals_buffer + record->batch_index; return get_bfs_args(record, scheduler, globals, 0); } struct BFSDispatchArgs get_bfs_args_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals ) { uint context_id = 0; uint num_refs = SAHBuildGlobals_GetTotalPrimRefs( globals ); struct BFSDispatchArgs args; args.scheduler = scheduler; args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, false ); args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, false ); args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); args.wg_primref_begin = get_group_id(0) * BFS_WG_SIZE; args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, num_refs ); args.dispatch_primref_begin = 0; args.dispatch_primref_end = num_refs; args.context_id = context_id; args.context = &scheduler->contexts[context_id]; args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); args.bvh2_root = BVH2_GetRoot( args.bvh2 ); args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); args.do_mask_processing = SAHBuildGlobals_NeedMasks(globals); return args; } inline void BinMapping_init( struct BinMapping* binMapping, struct AABB3f* centBounds, const uint bins ) { const float4 eps = 1E-34f; const float4 omega = 1E+34f; float3 l = AABB3f_load_lower( centBounds ); float3 u = AABB3f_load_upper( centBounds ); float4 diag; diag.xyz = max( eps.xyz, u - l ); diag.w = 0; float4 scale = (float4)(0.99f * (float)bins) / diag; scale = select( (float4)(0.0f), scale, diag > eps ); scale = select( (float4)(0.0f), scale, diag < omega ); binMapping->scale = scale; 
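    // dimensions whose scale was zeroed above (degenerate or out-of-range extents) are ignored
    // later in getBestSplit(), which substitutes the default (infinite-SAH) split for them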
binMapping->ofs.xyz = l.xyz; binMapping->ofs.w = 0; } inline ulong getBestSplit( float3 sah, uint ID, const float4 scale, const ulong defaultSplit ) { ulong splitX = (((ulong)as_uint( sah.x )) << 32) | ((uint)ID << 2) | 0; ulong splitY = (((ulong)as_uint( sah.y )) << 32) | ((uint)ID << 2) | 1; ulong splitZ = (((ulong)as_uint( sah.z )) << 32) | ((uint)ID << 2) | 2; /* ignore zero sized dimensions */ splitX = select( splitX, defaultSplit, (ulong)(scale.x == 0) ); splitY = select( splitY, defaultSplit, (ulong)(scale.y == 0) ); splitZ = select( splitZ, defaultSplit, (ulong)(scale.z == 0) ); ulong bestSplit = min( min( splitX, splitY ), splitZ ); bestSplit = sub_group_reduce_min( bestSplit ); return bestSplit; } inline float left_to_right_area16( struct AABB3f* low ) { struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); return halfArea_AABB3f( &low_prefix ); } inline uint left_to_right_counts16( uint low ) { return sub_group_scan_exclusive_add( low ); } inline float right_to_left_area16( struct AABB3f* low ) { const uint subgroupLocalID = get_sub_group_local_id(); const uint subgroup_size = get_sub_group_size(); const uint ID = subgroup_size - 1 - subgroupLocalID; struct AABB3f low_reverse = AABB3f_sub_group_shuffle( low, ID ); struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); return low_area; } inline uint right_to_left_counts16( uint low ) { const uint subgroupLocalID = get_sub_group_local_id(); const uint subgroup_size = get_sub_group_size(); const uint ID = subgroup_size - 1 - subgroupLocalID; const uint low_reverse = intel_sub_group_shuffle( low, ID ); const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); return intel_sub_group_shuffle( low_prefix, ID ); } inline float2 left_to_right_area32( struct AABB3f* low, struct AABB3f* high ) { struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); struct AABB3f low_reduce = AABB3f_sub_group_reduce( low ); struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max( high ); AABB3f_extend( &high_prefix, &low_reduce ); const float low_area = halfArea_AABB3f( &low_prefix ); const float high_area = halfArea_AABB3f( &high_prefix ); return (float2)(low_area, high_area); } inline uint2 left_to_right_counts32( uint low, uint high ) { const uint low_prefix = sub_group_scan_exclusive_add( low ); const uint low_reduce = sub_group_reduce_add( low ); const uint high_prefix = sub_group_scan_exclusive_add( high ); return (uint2)(low_prefix, low_reduce + high_prefix); } inline float2 right_to_left_area32( struct AABB3f* low, struct AABB3f* high ) { const uint subgroupLocalID = get_sub_group_local_id(); const uint subgroup_size = get_sub_group_size(); const uint ID = subgroup_size - 1 - subgroupLocalID; struct AABB3f low_reverse = AABB3f_sub_group_shuffle( high, ID ); struct AABB3f high_reverse = AABB3f_sub_group_shuffle( low, ID ); struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); struct AABB3f low_reduce = AABB3f_sub_group_reduce( &low_reverse ); struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max( &high_reverse ); AABB3f_extend( &high_prefix, &low_reduce ); const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &high_prefix ), ID ); const float high_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); return (float2)(low_area, high_area); } inline uint2 right_to_left_counts32( uint low, uint high ) { 
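    // Same reverse-scan-unreverse idiom as right_to_left_area32 above: lane order is flipped,
    // an inclusive prefix sum then produces suffix sums, and the result is shuffled back.
    // The suffix counts for the lower 16 bins also fold in the total of the upper 16, so for
    // lane i the return value is roughly (.x = prims in bins [i..31], .y = prims in bins [16+i..31]).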
const uint subgroupLocalID = get_sub_group_local_id(); const uint subgroup_size = get_sub_group_size(); const uint ID = subgroup_size - 1 - subgroupLocalID; const uint low_reverse = intel_sub_group_shuffle( high, ID ); const uint high_reverse = intel_sub_group_shuffle( low, ID ); const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); const uint low_reduce = sub_group_reduce_add( low_reverse ); const uint high_prefix = sub_group_scan_inclusive_add( high_reverse ) + low_reduce; return (uint2)(intel_sub_group_shuffle( high_prefix, ID ), intel_sub_group_shuffle( low_prefix, ID )); } inline uint fastDivideBy6_uint( uint v ) { #if 1 const ulong u = (ulong)v >> 1; return (uint)((u * 0x55555556ul) >> 32); #else return v / 6; #endif } inline uint3 fastDivideBy6_uint3( uint3 v ) { return (uint3)(fastDivideBy6_uint( v.x ), fastDivideBy6_uint( v.y ), fastDivideBy6_uint( v.z )); } #define SAH_LOG_BLOCK_SHIFT 2 inline struct BFS_Split BinInfo_reduce( struct BFS_BinInfo* binInfo, const float4 scale ) { const uint subgroupLocalID = get_sub_group_local_id(); const uint subgroup_size = get_sub_group_size(); struct AABB3f boundsX = BinInfo_get_AABB( binInfo, subgroupLocalID, 0 ); const float lr_areaX = left_to_right_area16( &boundsX ); const float rl_areaX = right_to_left_area16( &boundsX ); struct AABB3f boundsY = BinInfo_get_AABB( binInfo, subgroupLocalID, 1 ); const float lr_areaY = left_to_right_area16( &boundsY ); const float rl_areaY = right_to_left_area16( &boundsY ); struct AABB3f boundsZ = BinInfo_get_AABB( binInfo, subgroupLocalID, 2 ); const float lr_areaZ = left_to_right_area16( &boundsZ ); const float rl_areaZ = right_to_left_area16( &boundsZ ); const uint3 counts = BinInfo_get_counts( binInfo, subgroupLocalID ); const uint lr_countsX = left_to_right_counts16( counts.x ); const uint rl_countsX = right_to_left_counts16( counts.x ); const uint lr_countsY = left_to_right_counts16( counts.y ); const uint rl_countsY = right_to_left_counts16( counts.y ); const uint lr_countsZ = left_to_right_counts16( counts.z ); const uint rl_countsZ = right_to_left_counts16( counts.z ); const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); const uint3 lr_count = fastDivideBy6_uint3( (uint3)(lr_countsX, lr_countsY, lr_countsZ) + 6 - 1 ); const uint3 rl_count = fastDivideBy6_uint3( (uint3)(rl_countsX, rl_countsY, rl_countsZ) + 6 - 1 ); float3 sah = fma( lr_area, convert_float3( lr_count ), rl_area * convert_float3( rl_count ) ); /* first bin is invalid */ sah.x = select( (float)(INFINITY), sah.x, subgroupLocalID != 0 ); sah.y = select( (float)(INFINITY), sah.y, subgroupLocalID != 0 ); sah.z = select( (float)(INFINITY), sah.z, subgroupLocalID != 0 ); const ulong defaultSplit = (((ulong)as_uint( (float)(INFINITY) )) << 32); const ulong bestSplit = getBestSplit( sah, subgroupLocalID, scale, defaultSplit ); struct BFS_Split split; split.sah = as_float( (uint)(bestSplit >> 32) ); split.dim = (uint)bestSplit & 3; split.pos = (uint)bestSplit >> 2; return split; } struct BFS_BinInfoReduce3_SLM { uint sah[3*BFS_NUM_BINS]; }; inline struct BFS_Split BinInfo_reduce3( local struct BFS_BinInfoReduce3_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale ) { // process each bin/axis combination across sub-groups for (uint i = get_sub_group_id(); i < 3 * BFS_NUM_BINS; i += get_num_sub_groups()) { uint my_bin = i % BFS_NUM_BINS; uint my_axis = i / BFS_NUM_BINS; float3 left_lower = (float3)(INFINITY,INFINITY,INFINITY); float3 left_upper = 
-left_lower; float3 right_lower = (float3)(INFINITY,INFINITY,INFINITY); float3 right_upper = -right_lower; // load the other bins and assign them to the left or to the right // of this subgroup's bin uint lane = get_sub_group_local_id(); struct AABB3f sg_bins = BinInfo_get_AABB(binInfo,lane,my_axis); bool is_left = (lane < my_bin); float3 lower = AABB3f_load_lower(&sg_bins); float3 upper = AABB3f_load_upper(&sg_bins); float3 lower_l = select_min( lower, is_left ); float3 upper_l = select_max( upper, is_left ); float3 lower_r = select_min( lower, !is_left ); float3 upper_r = select_max( upper, !is_left ); lower_l = sub_group_reduce_min_float3( lower_l ); lower_r = sub_group_reduce_min_float3( lower_r ); upper_l = sub_group_reduce_max_float3( upper_l ); upper_r = sub_group_reduce_max_float3( upper_r ); float3 dl = upper_l - lower_l; float3 dr = upper_r - lower_r; float area_l = dl.x* (dl.y + dl.z) + (dl.y * dl.z); float area_r = dr.x* (dr.y + dr.z) + (dr.y * dr.z); // get the counts uint sg_bin_count = BinInfo_get_count(binInfo, lane, my_axis); uint count_l = (is_left) ? sg_bin_count : 0; uint count_r = (is_left) ? 0 : sg_bin_count; count_l = sub_group_reduce_add(count_l); count_r = sub_group_reduce_add(count_r); // compute sah count_l = fastDivideBy6_uint(count_l + 6 - 1); count_r = fastDivideBy6_uint(count_r + 6 - 1); float lr_partial = area_l * count_l; float rl_partial = area_r * count_r; float sah = lr_partial + rl_partial; // first bin is invalid sah = select((float)(INFINITY), sah, my_bin != 0); // ignore zero sized dimensions sah = select( sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); sah = select( sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); sah = select( sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); // tuck the axis into the bottom bits of sah cost. 
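        // (The (count + 6 - 1)/6 above rounds each side's prim count up to TREE_ARITY-sized leaf
        //  blocks; e.g. fastDivideBy6_uint(8 + 5) = 2, matching ceil(8/6).)
        // For finite, non-negative floats the IEEE-754 bit pattern is monotonic, so an integer min
        // over the packed values also minimizes the float cost; clearing the two low bits perturbs
        // the cost by at most a few ULPs and frees room for the axis, e.g. a cost of 1.0f
        // (0x3F800000) on axis 2 packs to 0x3F800002.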
// The result is an integer between 0 and +inf (7F800000) // If we have 3 axes with infinite sah cost, we will select axis 0 slm->sah[i] = (as_uint(sah)&~0x3) | my_axis; } barrier( CLK_LOCAL_MEM_FENCE ); // reduce split candidates down to one subgroup // sah is strictly positive, so integer compares can be used // which results in a faster sub_group_reduce_min() // uint best_sah = 0xffffffff; uint lid = get_sub_group_local_id(); if (lid < BFS_NUM_BINS) { best_sah = slm->sah[lid]; lid += BFS_NUM_BINS; best_sah = min( best_sah, slm->sah[lid] ); lid += BFS_NUM_BINS; best_sah = min( best_sah, slm->sah[lid] ); } uint reduced_bestsah = sub_group_reduce_min( best_sah ); uint best_bin = ctz(intel_sub_group_ballot(best_sah == reduced_bestsah)); uint best_axis = as_uint(reduced_bestsah) & 0x3; struct BFS_Split ret; ret.sah = as_float(reduced_bestsah); ret.dim = best_axis; ret.pos = best_bin; return ret; } struct BFS_BinInfoReduce_SLM { struct { float sah; uint bin; } axisInfo[3]; }; inline struct BFS_Split BinInfo_reduce2( local struct BFS_BinInfoReduce_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale, uint num_primrefs) { ushort my_axis = get_sub_group_id(); ushort my_bin = get_sub_group_local_id(); if (my_axis < 3) { struct AABB3f aabb = BinInfo_get_AABB(binInfo, my_bin, my_axis); uint count = BinInfo_get_count(binInfo, my_bin, my_axis); float lr_area = left_to_right_area16(&aabb); float rl_area = right_to_left_area16(&aabb); uint lr_count = sub_group_scan_exclusive_add(count); uint rl_count = num_primrefs - lr_count; lr_count = fastDivideBy6_uint(lr_count + 6 - 1); rl_count = fastDivideBy6_uint(rl_count + 6 - 1); float lr_partial = lr_area * lr_count; float rl_partial = rl_area * rl_count; float sah = lr_partial + rl_partial; // first bin is invalid sah = select((float)(INFINITY), sah, my_bin != 0); float best_sah = sub_group_reduce_min( sah ); uint best_bin = ctz(intel_sub_group_ballot(sah == best_sah)); // ignore zero sized dimensions best_sah = select( best_sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); best_sah = select( best_sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); best_sah = select( best_sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); if (get_sub_group_local_id() == 0) { slm->axisInfo[my_axis].sah = best_sah; slm->axisInfo[my_axis].bin = best_bin; } } barrier( CLK_LOCAL_MEM_FENCE ); float sah = (float)(INFINITY); if( get_sub_group_local_id() < 3 ) sah = slm->axisInfo[get_sub_group_local_id()].sah; float bestsah = min(sub_group_broadcast(sah, 0), min(sub_group_broadcast(sah, 1), sub_group_broadcast(sah, 2))); uint bestAxis = ctz( intel_sub_group_ballot(bestsah == sah) ); struct BFS_Split split; split.sah = bestsah; split.dim = bestAxis; split.pos = slm->axisInfo[bestAxis].bin; return split; } inline bool is_left( struct BinMapping* binMapping, struct BFS_Split* split, struct AABB* primref ) { const uint dim = split->dim; const float lower = primref->lower[dim]; const float upper = primref->upper[dim]; const float c = lower + upper; const uint pos = convert_uint_rtz( (c - binMapping->ofs[dim]) * binMapping->scale[dim] ); return pos < split->pos; } struct BFS_Pass1_SLM { struct BFS_BinInfo bin_info; // struct BFS_BinInfoReduce3_SLM reduce3; }; void DO_BFS_pass1( local struct BFS_Pass1_SLM* slm, uint thread_primref_id, bool thread_primref_valid, struct BFSDispatchArgs args ) { local struct BFS_BinInfo* local_bin_info = &slm->bin_info; global struct VContext* context = args.context; struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, 
args.bvh2_root ); // root AABB is initialized to centroid bounds struct BinMapping bin_mapping; BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); // fetch this thread's primref PrimRef ref; if ( thread_primref_valid ) ref = args.primref_buffer[thread_primref_id]; // init bin info BinInfo_init( local_bin_info ); // fence on local bin-info init barrier( CLK_LOCAL_MEM_FENCE ); // merge this thread's primref into local bin info BinInfo_add_primref( &bin_mapping, local_bin_info, &ref, thread_primref_valid ); // fence on local bin-info update barrier( CLK_LOCAL_MEM_FENCE ); BinInfo_merge(&context->global_bin_info, local_bin_info); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size(BFS_WG_SIZE,1,1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void BFS_pass1_indexed( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) { local struct BFS_Pass1_SLM slm; struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; uint thread_primref_id = 0; if ( thread_primref_valid ) thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); } __attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) __attribute__((intel_reqd_sub_group_size(16))) kernel void BFS_pass1_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) { local struct BFS_Pass1_SLM slm; struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); bool thread_primref_valid = thread_primref_id < args.wg_primref_end; DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void BFS_pass1_indexed_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) { local struct BFS_Pass1_SLM slm; struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; uint thread_primref_id = 0; if (thread_primref_valid) thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void BFS_pass1_initial_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) { local struct BFS_Pass1_SLM slm; struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); uint thread_primref_id = args.wg_primref_begin + get_local_id(0); bool thread_primref_valid = thread_primref_id < args.wg_primref_end; DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); } ///////////////////////////////////////////////////////////////////////////////////////////////// /// /// BVH2 construction -- BFS Phase Pass2 /// ///////////////////////////////////////////////////////////////////////////////////////////////// struct BFS_Pass2_SLM { struct BFS_BinInfoReduce3_SLM reduce3; //struct AABB3f left_centroid_bounds; 
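    // (the adjacent commented-out boxes are what 'lr_bounds' now carries: left/right centroid and
    //  geometry bounds stored with negated maxima, so both bound updates can go out as atomic_min)
    // left_count/right_count:   primrefs this workgroup sends to each side
    // left/right_primref_base:  ranges the workgroup reserves in the output primref-index buffer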
//struct AABB3f right_centroid_bounds; //struct AABB3f left_geom_bounds; //struct AABB3f right_geom_bounds; LRBounds lr_bounds; uint left_count; uint right_count; uint lr_mask; uint left_primref_base; uint right_primref_base; // uint num_wgs; // uint output_indices[BFS_WG_SIZE]; }; void DO_BFS_pass2( local struct BFS_Pass2_SLM* slm, uint thread_primref_id, bool thread_primref_valid, struct BFSDispatchArgs args ) { global struct VContext* context = args.context; struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); // load the thread's primref PrimRef ref; if ( thread_primref_valid ) ref = args.primref_buffer[thread_primref_id]; struct BinMapping bin_mapping; BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); // initialize working SLM space LRBounds_init(&slm->lr_bounds); if(get_local_id(0) == 0) { slm->left_count = 0; slm->right_count = 0; if( args.do_mask_processing ) slm->lr_mask = 0; } // compute split - every workgroup does the same computation // local barrier inside BinInfo_reduce3 struct BFS_Split split = BinInfo_reduce3( &slm->reduce3, &context->global_bin_info,bin_mapping.scale ); uint wg_prim_count = args.wg_primref_end - args.wg_primref_begin; // partition primrefs into L/R subsets... bool go_left = false; if (split.sah == (float)(INFINITY)) // no valid split, split in the middle.. This can happen due to floating-point limit cases in huge scenes go_left = get_local_id(0) < (wg_prim_count / 2); else go_left = is_left( &bin_mapping, &split, &ref ); // assign this primref a position in the output array, and expand corresponding centroid-bounds uint local_index; { float3 centroid = ref.lower.xyz + ref.upper.xyz; uint l_ballot = intel_sub_group_ballot( go_left && thread_primref_valid ); uint r_ballot = intel_sub_group_ballot( !go_left && thread_primref_valid ); if (l_ballot) { bool active_lane = l_ballot & (1 << get_sub_group_local_id()); float3 Cmin, Cmax, Gmin, Gmax; Cmin = select_min( centroid.xyz, active_lane ); Cmax = select_max( centroid.xyz, active_lane ); Gmin = select_min( ref.lower.xyz, active_lane ); Gmax = select_max( ref.upper.xyz, active_lane ); Cmin = sub_group_reduce_min_float3( Cmin ); Cmax = sub_group_reduce_max_float3( Cmax ); Gmin = sub_group_reduce_min_float3( Gmin ); Gmax = sub_group_reduce_max_float3( Gmax ); LRBounds_merge_left( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); } if (r_ballot) { bool active_lane = r_ballot & (1 << get_sub_group_local_id()); float3 Cmin, Cmax, Gmin, Gmax; Cmin = select_min(centroid.xyz, active_lane); Cmax = select_max(centroid.xyz, active_lane); Gmin = select_min(ref.lower.xyz, active_lane); Gmax = select_max(ref.upper.xyz, active_lane); Cmin = sub_group_reduce_min_float3(Cmin); Cmax = sub_group_reduce_max_float3(Cmax); Gmin = sub_group_reduce_min_float3(Gmin); Gmax = sub_group_reduce_max_float3(Gmax); LRBounds_merge_right( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); } if( args.do_mask_processing ) { uint mask =0; if (thread_primref_valid) { mask = PRIMREF_instanceMask(&ref) ; mask = go_left ? 
mask : mask<<8; } // TODO OPT: there is no 'sub_group_reduce_or' and IGC does not do the reduction trick // for atomics on sub-group uniform addresses for( uint i= get_sub_group_size()/2; i>0; i/= 2) mask = mask | intel_sub_group_shuffle_down(mask,mask,i); if( get_sub_group_local_id() == 0 ) atomic_or_local( &slm->lr_mask, mask ); } uint l_base = 0; uint r_base = 0; if( get_sub_group_local_id() == 0 && l_ballot ) l_base = atomic_add_local( &slm->left_count, popcount(l_ballot) ); if( get_sub_group_local_id() == 0 && r_ballot ) r_base = atomic_add_local( &slm->right_count, popcount(r_ballot) ); sub_group_barrier( CLK_LOCAL_MEM_FENCE ); l_base = sub_group_broadcast(l_base,0); r_base = sub_group_broadcast(r_base,0); l_base = l_base + subgroup_bit_prefix_exclusive( l_ballot ); r_base = r_base + subgroup_bit_prefix_exclusive( r_ballot ); local_index = (go_left) ? l_base : r_base; } barrier( CLK_LOCAL_MEM_FENCE ); // merge local into global // TODO_OPT: Look at spreading some of this across subgroups if ( get_sub_group_id() == 0 ) { // allocate primref space for this wg and merge local/global centroid bounds uint num_left = slm->left_count; { if (num_left && get_sub_group_local_id() == 0) { num_left = atomic_add_global( &context->num_left, num_left ); slm->left_primref_base = args.dispatch_primref_begin + num_left; } } uint num_right = slm->right_count; { if (num_right && get_sub_group_local_id() == 0) { num_right = atomic_add_global( &context->num_right, num_right ); slm->right_primref_base = (args.dispatch_primref_end - 1) - num_right; } } if( args.do_mask_processing && get_sub_group_local_id() == 0 ) atomic_or_global( &context->lr_mask, slm->lr_mask ); } barrier( CLK_LOCAL_MEM_FENCE ); LRBounds_merge( &context->lr_bounds, &slm->lr_bounds ); // move thread's primref ID into correct position in output index buffer if (thread_primref_valid) { uint pos = go_left ? 
slm->left_primref_base + local_index : slm->right_primref_base - local_index; args.primref_index_out[pos] = thread_primref_id; } } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void BFS_pass2_indexed( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) { local struct BFS_Pass2_SLM slm; struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; uint thread_primref_id = 0; if ( thread_primref_valid ) thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void BFS_pass2_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) { local struct BFS_Pass2_SLM slm; struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); bool thread_primref_valid = thread_primref_id < args.wg_primref_end; DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); } __attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void BFS_pass2_indexed_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) { local struct BFS_Pass2_SLM slm; struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; uint thread_primref_id = 0; if (thread_primref_valid) thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void BFS_pass2_initial_batchable(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer) { local struct BFS_Pass2_SLM slm; struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); uint thread_primref_id = args.wg_primref_begin + get_local_id(0); bool thread_primref_valid = thread_primref_id < args.wg_primref_end; DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); } ///////////////////////////////////////////////////////////////////////////////////////////////// /// /// BVH2 construction -- DFS Phase /// ///////////////////////////////////////////////////////////////////////////////////////////////// struct DFSArgs { uint primref_base; uint global_bvh2_base; bool do_mask_processing; ushort num_primrefs; global uint* primref_indices_in; global uint* primref_indices_out; global PrimRef* primref_buffer; global struct BVH2* global_bvh2; }; struct DFSPrimRefAABB { half lower[3]; half upper[3]; }; void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) { bb->lower[0] = 1; bb->lower[1] = 1; bb->lower[2] = 1; bb->upper[0] = 0; bb->upper[1] = 0; bb->upper[2] = 0; } void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) { aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); 
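    // (boxes start "inverted" -- lower = 1, upper = 0 in the root-relative quantized space used
    //  below -- so min/max against any real box simply adopts that box)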
aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); } half DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) { const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); return fma( d.x, (d.y + d.z), d.y * d.z ); } struct DFSPrimRef { struct DFSPrimRefAABB aabb; ushort2 meta; }; void DFSPrimRef_SetBVH2Root( struct DFSPrimRef* ref, ushort root ) { ref->meta.y = root; } uint DFSPrimRef_GetInputIndex( struct DFSPrimRef* ref ) { return ref->meta.x; } uint DFSPrimRef_GetBVH2Parent( struct DFSPrimRef* ref ) { return ref->meta.y; } struct PrimRefSet { struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; ushort2 meta[DFS_WG_SIZE]; uint input_indices[DFS_WG_SIZE]; }; local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) { return &refs->AABB[id]; } struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) { struct DFSPrimRef r; r.aabb = refs->AABB[id]; r.meta = refs->meta[id]; return r; } void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) { refs->AABB[id] = ref.aabb; refs->meta[id] = ref.meta; } void PrimRefSet_SetPrimRef_FullPrecision( struct AABB3f* root_aabb, local struct PrimRefSet* refs, PrimRef ref, ushort id ) { float3 root_l = AABB3f_load_lower( root_aabb ); float3 root_u = AABB3f_load_upper( root_aabb ); float3 d = root_u - root_l; float scale = 1.0f / max( d.x, max( d.y, d.z ) ); float3 l = ref.lower.xyz; float3 u = ref.upper.xyz; half3 lh = convert_half3_rtz( (l - root_l) * scale ); half3 uh = convert_half3_rtp( (u - root_l) * scale ); refs->AABB[id].lower[0] = lh.x; refs->AABB[id].lower[1] = lh.y; refs->AABB[id].lower[2] = lh.z; refs->AABB[id].upper[0] = uh.x; refs->AABB[id].upper[1] = uh.y; refs->AABB[id].upper[2] = uh.z; refs->meta[id].x = id; refs->meta[id].y = 0; } void DFS_CreatePrimRefSet( struct DFSArgs args, local struct PrimRefSet* prim_refs ) { ushort id = get_local_id( 0 ); ushort num_primrefs = args.num_primrefs; struct AABB3f box = BVH2_GetNodeBox( args.global_bvh2, args.global_bvh2_base ); if ( id < num_primrefs ) { PrimRef ref = args.primref_buffer[args.primref_indices_in[id]]; prim_refs->input_indices[id] = args.primref_indices_in[id]; PrimRefSet_SetPrimRef_FullPrecision( &box, prim_refs, ref, id ); } } struct ThreadRangeInfo { uchar start; uchar local_num_prims; uchar bvh2_root; bool active; }; struct BVHBuildLocals // size: ~3.8K { uchar2 axis_and_left_count[ DFS_WG_SIZE ]; struct ThreadRangeInfo range[ DFS_WG_SIZE ]; uint sah[ DFS_WG_SIZE ]; }; #define LOCAL_BVH2_NODE_COUNT (2*(DFS_WG_SIZE) -1) struct LocalBVH2 { uint nodes[LOCAL_BVH2_NODE_COUNT]; uint num_nodes; // bit layout is for a node is // uchar child_ptr; // this is right_child_index >> 1. right child's msb is always 0 // uchar primref_base; // index of the node's first primref. will be 0 at the root // uchar parent_dist; // distance in nodes from this node to its parent // uchar prim_counter; // number of prims in this subtree. For a complete tree (256 prims), the root may be off by 1 // for a WG size of 256, 8b is enough for parent distance, because the tree is built in level order // the maximum distance between parent and child occurs for a complete tree. 
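    // Packing example: if node 4 allocates its children at slots 9 and 10, child_ptr stores
    // (9+1)>>1 = 5 in bits [31:24]; LocalBVH2_GetChildIndices recovers the pair as
    // (node & 0xff000000) >> 23 = 10 for the right child and 9 for the left, while the children
    // record parent_dist = 5 and 6 in bits [15:8].
    // Regarding the worst-case parent distance mentioned above: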
// in this scenario the left-most leaf has index 255, its parent has index 127, the deltas to the children are 128 and 129
};

void LocalBVH2_Initialize( struct LocalBVH2* bvh2, ushort num_prims )
{
    bvh2->num_nodes = 1;
    bvh2->nodes[0] = min(num_prims, (ushort)255);
}

void LocalBVH2_Initialize_Presplit( struct LocalBVH2* bvh2, ushort num_prims, ushort left_count, ushort right_count )
{
    bvh2->num_nodes = 3;
    bvh2->nodes[0] = min(num_prims, (ushort)255);

    ushort bvh2_root   = 0;
    ushort child_place = 1;
    uint child_ptr = (child_place + 1) >> 1;
    bvh2->nodes[bvh2_root] |= (child_ptr) << 24;
    uint parent_dist = child_place - bvh2_root;

    // initialize child nodes
    ushort primref_base_left  = 0;
    ushort primref_base_right = left_count;
    uint left  = (primref_base_left << 16)  + ((parent_dist << 8))     + left_count;
    uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8) + right_count;
    bvh2->nodes[child_place]     = left;
    bvh2->nodes[child_place + 1] = right;
}

void LocalBVH2_CreateInnerNode( local struct LocalBVH2* bvh2, ushort bvh2_root, uint primref_base_left, uint primref_base_right )
{
    ushort child_place = atomic_add_local( &(bvh2->num_nodes), 2 );
    uint child_ptr = (child_place + 1) >> 1;
    bvh2->nodes[bvh2_root] |= (child_ptr) << 24;
    uint parent_dist = child_place - bvh2_root;

    // initialize child nodes
    uint left  = (primref_base_left << 16)  + ((parent_dist << 8));
    uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8);
    bvh2->nodes[child_place]     = left;
    bvh2->nodes[child_place + 1] = right;
}

ushort2 LocalBVH2_GetChildIndices( struct LocalBVH2* bvh2, ushort bvh2_root )
{
    ushort right_idx = (bvh2->nodes[bvh2_root] & 0xff000000) >> 23;
    return (ushort2)(right_idx - 1, right_idx);
}

ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* bvh2, ushort bvh2_root )
{
    // increment only the lower 8 bits.  Algorithm will not overflow by design
    return atomic_inc_local( &bvh2->nodes[bvh2_root] ) & 0xff;
}

ushort LocalBVH2_SetLeafPrimCount( local struct LocalBVH2* bvh2, ushort bvh2_root, ushort count )
{
    return bvh2->nodes[bvh2_root] |= (count & 0xff);
}

bool LocalBVH2_IsRoot( struct LocalBVH2* bvh2, ushort node_id )
{
    return node_id == 0;
}

ushort LocalBVH2_GetLeafPrimrefStart( struct LocalBVH2* bvh2, ushort bvh2_node_id )
{
    return (bvh2->nodes[bvh2_node_id] >> 16) & 255;
}

bool LocalBVH2_IsLeftChild( struct LocalBVH2* bvh2, ushort parent_node, ushort current_node )
{
    return (current_node & 1); // nodes are allocated in pairs.
first node is root, left child is an odd index } ushort LocalBVH2_GetParent( struct LocalBVH2* bvh2, ushort node ) { return node - ((bvh2->nodes[node] >> 8) & 255); } uint LocalBVH2_GetNodeCount( struct LocalBVH2* bvh2 ) { return bvh2->num_nodes; } bool LocalBVH2_IsLeaf( struct LocalBVH2* bvh2, ushort node_index ) { return (bvh2->nodes[node_index] & 255) <= TREE_ARITY; } ushort LocalBVH2_GetLeafPrimCount( struct LocalBVH2* bvh2, ushort node_index ) { return (bvh2->nodes[node_index] & 255); } void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, local struct PrimRefSet* prim_refs, ushort bvh2_root, ushort prim_range_start, ushort local_num_prims, ushort global_num_prims, local struct BVHBuildLocals* locals, local uint* num_active_threads ) { ushort tid = get_local_id( 0 ); ushort primref_position = tid; bool active_thread = tid < global_num_prims; // Handle cases where initial binner creates leaves if ( active_thread && local_num_prims <= TREE_ARITY ) { struct DFSPrimRef ref = PrimRefSet_GetPrimRef(prim_refs, primref_position); DFSPrimRef_SetBVH2Root(&ref, bvh2_root); PrimRefSet_SetPrimRef(prim_refs, ref, primref_position); active_thread = false; if (primref_position == prim_range_start) atomic_sub_local(num_active_threads, local_num_prims); } barrier( CLK_LOCAL_MEM_FENCE ); locals->range[ tid ].start = prim_range_start; locals->range[ tid ].local_num_prims = local_num_prims; locals->range[ tid ].bvh2_root = bvh2_root; locals->range[ tid ].active = active_thread; do { if(active_thread && prim_range_start == primref_position) locals->sah[primref_position] = UINT_MAX; barrier( CLK_LOCAL_MEM_FENCE ); if ( active_thread ) { local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); // each thread evaluates a possible split candidate. Scan primrefs and compute sah cost // do this axis-by-axis to keep register pressure low float best_sah = INFINITY; ushort best_axis = 3; ushort best_count = 0; struct DFSPrimRefAABB box_left[3]; struct DFSPrimRefAABB box_right[3]; float CSplit[3]; ushort count_left[3]; for ( ushort axis = 0; axis < 3; axis++ ) { DFSPrimRefAABB_init( &box_left[axis] ); DFSPrimRefAABB_init( &box_right[axis] ); CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; count_left[axis] = 0; } // scan primrefs in our subtree and partition using this thread's prim as a split plane { struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); for ( ushort p = 1; p < local_num_prims; p++ ) { struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration for( ushort axis = 0; axis < 3; axis++ ) { float c = box.lower[axis] + box.upper[axis]; if ( c < CSplit[axis] ) { // this primitive is to our left. DFSPrimRefAABB_extend( &box_left[axis], &box ); count_left[axis]++; } else { // this primitive is to our right DFSPrimRefAABB_extend( &box_right[axis], &box ); } } box = next_box; } // last iteration without preloading box for( ushort axis = 0; axis < 3; axis++ ) { float c = box.lower[axis] + box.upper[axis]; if ( c < CSplit[axis] ) { // this primitive is to our left. 
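                    // (the split plane is the owning thread's own primref; both sides compare
                    //  lower+upper sums, so the 0.5 centroid factor cancels and is never applied)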
DFSPrimRefAABB_extend( &box_left[axis], &box ); count_left[axis]++; } else { // this primitive is to our right DFSPrimRefAABB_extend( &box_right[axis], &box ); } } } for ( ushort axis = 0; axis < 3; axis++ ) { float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); // Avoid NANs in SAH calculation in the corner case where all prims go right // In this case we set Al=Ar, because such a split will only be selected if all primrefs // are co-incident.. In that case, we will fall back to split-in-the-middle and both subtrees // should store the same quantized area value if ( count_left[axis] == 0 ) Al = Ar; // compute sah cost ushort count_right = local_num_prims - count_left[axis]; float sah = Ar * count_right + Al * count_left[axis]; // keep this split if it is better than the previous one, or if the previous one was a corner-case if ( sah < best_sah || best_count == 0 ) { // yes, keep it best_axis = axis; best_sah = sah; best_count = count_left[axis]; } } // write split information to SLM locals->axis_and_left_count[primref_position].x = best_axis; locals->axis_and_left_count[primref_position].y = best_count; uint sah = as_uint(best_sah); // break ties by axis to ensure deterministic split selection // otherwise builder can produce non-deterministic tree structure run to run // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) // Embed split axis and index into sah value; compute min over sah and max over axis sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | tid ); // reduce on split candidates in our local subtree and decide the best one atomic_min_local( &locals->sah[ prim_range_start ], sah); } barrier( CLK_LOCAL_MEM_FENCE ); ushort split_index = locals->sah[ prim_range_start ] & 255; ushort split_axis = locals->axis_and_left_count[split_index].x; ushort split_left_count = locals->axis_and_left_count[split_index].y; if ( (primref_position == split_index) && active_thread ) { // first thread in a given subtree creates the inner node ushort start_left = prim_range_start; ushort start_right = prim_range_start + split_left_count; if ( split_left_count == 0 ) start_right = start_left + (local_num_prims / 2); // handle split-in-the-middle case LocalBVH2_CreateInnerNode( bvh2, bvh2_root, start_left, start_right ); } barrier( CLK_LOCAL_MEM_FENCE ); struct DFSPrimRef ref; ushort new_primref_position; if ( active_thread ) { ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); bool go_left; if ( split_left_count == 0 ) { // We chose a split with no left-side prims // This will only happen if all primrefs are located in the exact same position // In that case, fall back to split-in-the-middle split_left_count = (local_num_prims / 2); go_left = (primref_position - prim_range_start < split_left_count); } else { // determine what side of the split this thread's primref belongs on local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); float c = my_box->lower[split_axis] + my_box->upper[split_axis]; float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; go_left = c < Csplit; } // adjust state variables for next loop iteration bvh2_root = (go_left) ? kids.x : kids.y; local_num_prims = (go_left) ? split_left_count : (local_num_prims - split_left_count); prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; // determine the new primref position by incrementing a counter in the destination subtree new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); // load our primref from its previous position ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); } barrier( CLK_LOCAL_MEM_FENCE ); if ( active_thread ) { // write our primref into its sorted position and note which node it went in DFSPrimRef_SetBVH2Root( &ref, bvh2_root ); PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); primref_position = new_primref_position; // deactivate all threads whose subtrees are small enough to form a leaf if ( local_num_prims <= TREE_ARITY ) { active_thread = false; if( primref_position == prim_range_start ) atomic_sub_local( num_active_threads, local_num_prims ); } locals->range[ primref_position ].start = prim_range_start; locals->range[ primref_position ].local_num_prims = local_num_prims; locals->range[ primref_position ].bvh2_root = bvh2_root; locals->range[ primref_position ].active = active_thread; } barrier( CLK_LOCAL_MEM_FENCE ); // if we'll have next iteration then load from SLM if(*num_active_threads) { prim_range_start = locals->range[ tid ].start; local_num_prims = locals->range[ tid ].local_num_prims; bvh2_root = locals->range[ tid ].bvh2_root; active_thread = locals->range[ tid ].active; primref_position = tid; } else { break; } } while ( true ); } #define REFIT_BIT_DWORDS (LOCAL_BVH2_NODE_COUNT - DFS_WG_SIZE)/32 struct RefitBits { uint bits[REFIT_BIT_DWORDS]; }; struct DFS_SLM { union { struct LocalBVH2 bvh2; struct { struct AABB3f centroid_bounds; uint left_count; uint right_count; struct BFS_BinInfo bins; struct BFS_BinInfoReduce3_SLM reduce3; } binning; } u1; union { struct { struct PrimRefSet prim_refs; struct BVHBuildLocals locals; } pass0; struct AABB3f node_boxes[LOCAL_BVH2_NODE_COUNT]; } u2; union { uchar bytes[DFS_WG_SIZE]; uint dwords[DFS_WG_SIZE/4]; } mask_info; struct RefitBits refit_bits; }; void DFS_InitialBinningPass( local struct BFS_BinInfo* bins, local struct BFS_BinInfoReduce3_SLM* reduce3, uniform local struct AABB3f* centroid_bounds, local struct PrimRefSet* refs, local uint* left_counter, local uint* right_counter, ushort num_refs ) { uint tid = get_local_id(0); // initialize SLM structures if (tid == 0) { AABB3f_init(centroid_bounds); *left_counter = 0; *right_counter = 0; } BinInfo_init(bins); PrimRef ref; struct DFSPrimRef dfs_ref; if (tid < num_refs) { dfs_ref = PrimRefSet_GetPrimRef(refs, tid); struct DFSPrimRefAABB box = dfs_ref.aabb; ref.lower.xyz = (float3)(box.lower[0], box.lower[1], box.lower[2]); ref.upper.xyz = (float3)(box.upper[0], box.upper[1], box.upper[2]); } barrier(CLK_LOCAL_MEM_FENCE); // compute centroid bounds so that we can bin if (tid < num_refs) { float3 centroid = ref.lower.xyz + ref.upper.xyz; Uniform_AABB3f_atomic_merge_local_sub_group_lu(centroid_bounds, centroid, centroid); } barrier(CLK_LOCAL_MEM_FENCE); // add primrefs to bins struct BinMapping mapping; BinMapping_init(&mapping, centroid_bounds, BFS_NUM_BINS); BinInfo_add_primref( &mapping, bins, &ref, tidu1.bvh2; global struct BVH2* global_bvh2 = args.global_bvh2; PrimRef ref; uint parent_node; { local struct BVHBuildLocals* locals = &slm->u2.pass0.locals; local struct PrimRefSet* prim_refs = &slm->u2.pass0.prim_refs; DFS_CreatePrimRefSet(args, prim_refs); uint local_id = get_local_id(0); ushort bvh2_root = 0; ushort prim_range_start = 0; ushort local_num_prims = args.num_primrefs; 
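    // Rough flow of this phase: every thread owns one primref; threads repeatedly pick the best
    // split plane inside their current subtree range, partition the range in SLM, and descend
    // until the range fits in a TREE_ARITY leaf.  num_active_threads counts prims still being
    // partitioned; each range that becomes a leaf subtracts its size exactly once (through the
    // thread sitting at the range start), so DFS_ConstructBVH2 can stop when the counter hits zero.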
if(local_id == 0) *num_active_threads = local_num_prims; // barrier for DFS_CreatePrimRefSet and num_active_threads barrier(CLK_LOCAL_MEM_FENCE); // initial binning pass if number of primrefs is large if( args.num_primrefs > 32 ) { DFS_InitialBinningPass(&slm->u1.binning.bins, &slm->u1.binning.reduce3, &slm->u1.binning.centroid_bounds, prim_refs, &slm->u1.binning.left_count, &slm->u1.binning.right_count, args.num_primrefs); barrier(CLK_LOCAL_MEM_FENCE); ushort left_count = slm->u1.binning.left_count; ushort right_count = args.num_primrefs - left_count; if (get_local_id(0) == 0) LocalBVH2_Initialize_Presplit(bvh2, args.num_primrefs, left_count, right_count); bvh2_root = (local_id < left_count) ? 1 : 2; local_num_prims = (local_id < left_count) ? left_count : right_count; prim_range_start = (local_id < left_count) ? 0 : left_count; } else { if (get_local_id(0) == 0) LocalBVH2_Initialize(bvh2, args.num_primrefs); } DFS_ConstructBVH2( bvh2, prim_refs, bvh2_root, prim_range_start, local_num_prims, args.num_primrefs, locals, num_active_threads); // move the prim refs into their sorted position // keep this thread's primref around for later use if ( local_id < args.num_primrefs ) { struct DFSPrimRef dfs_ref = PrimRefSet_GetPrimRef( prim_refs, local_id ); uint input_id = DFSPrimRef_GetInputIndex( &dfs_ref ); parent_node = DFSPrimRef_GetBVH2Parent( &dfs_ref ); uint primref_index = prim_refs->input_indices[input_id]; ref = args.primref_buffer[primref_index]; args.primref_indices_out[local_id] = primref_index; args.primref_indices_in[local_id] = primref_index; // these buffers are not read again until the end of kernel } barrier( CLK_LOCAL_MEM_FENCE ); } // initialize flags for determining when subtrees are done refit if ( get_local_id( 0 ) < REFIT_BIT_DWORDS ) slm->refit_bits.bits[get_local_id( 0 )] = 0; // stash full-precision primref AABBs in slm storage local struct AABB3f* slm_boxes = &slm->u2.node_boxes[0]; bool active_thread = get_local_id( 0 ) < args.num_primrefs; if( active_thread ) { AABB3f_set( &slm_boxes[get_local_id( 0 )], ref.lower.xyz, ref.upper.xyz ); // stash instance masks in SLM storage if( args.do_mask_processing ) slm->mask_info.bytes[get_local_id(0)] = PRIMREF_instanceMask( &ref ); } barrier( CLK_LOCAL_MEM_FENCE ); // Refit leaf nodes uint box_index; if ( active_thread ) { // the thread for the first primref in every leaf is the one that will ascend // remaining threads merge their AABB/mask into the first one and terminate uint first_ref = LocalBVH2_GetLeafPrimrefStart( bvh2, parent_node ); if ( first_ref != get_local_id( 0 ) ) { AABB3f_atomic_merge_local_lu( &slm_boxes[first_ref], ref.lower.xyz, ref.upper.xyz ); if( args.do_mask_processing ) { uint dword_index = first_ref/4; uint shift = (first_ref%4)*8; uint mask = PRIMREF_instanceMask(&ref) << shift; atomic_or_local( &slm->mask_info.dwords[dword_index], mask ); } active_thread = false; // switch off all primref threads except the first one } box_index = first_ref; } barrier( CLK_LOCAL_MEM_FENCE ); if ( active_thread ) { uint current_node = parent_node; parent_node = LocalBVH2_GetParent( bvh2, current_node ); // write out the leaf node's AABB uint num_prims = LocalBVH2_GetLeafPrimCount( bvh2, current_node ); uint prim_offs = args.primref_base + LocalBVH2_GetLeafPrimrefStart( bvh2, current_node ); uint mask = 0xff; if( args.do_mask_processing ) mask = slm->mask_info.bytes[box_index]; BVH2_WriteLeafNode( global_bvh2, args.global_bvh2_base + current_node, &slm_boxes[box_index], prim_offs, num_prims, mask ); // we no longer need 
// the BVH2 bits for this node, so re-purpose the memory to store the AABB index
bvh2->nodes[current_node] = box_index;

// toggle flag bit in parent node.  The second thread to flip the bit is the one that gets to proceed
uint thread_mask = (1 << (parent_node % 32));
if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], thread_mask ) & thread_mask) == 0 )
    active_thread = false;
}

// count how many active threads in sub_group we have and increment wg's number of active threads
uint sg_active = sub_group_reduce_add(active_thread ? 1 : 0);
if (get_sub_group_local_id() == 0)
{
    atomic_add_local(num_active_threads, sg_active);
}

// refit internal nodes:
//  walk up the tree and refit AABBs
do
{
    barrier( CLK_LOCAL_MEM_FENCE ); // we need this barrier because we need to make sure all threads read num_active_threads before modifying it

    if ( active_thread )
    {
        uint current_node = parent_node;
        parent_node = LocalBVH2_GetParent( bvh2, current_node );

        // pull left/right box indices from current node
        ushort2 kids = LocalBVH2_GetChildIndices( bvh2, current_node );
        uint left_box  = bvh2->nodes[kids.x];
        uint right_box = bvh2->nodes[kids.y];

        struct AABB3f left  = slm_boxes[left_box];
        struct AABB3f right = slm_boxes[right_box];
        AABB3f_extend( &left, &right );

        uint2 child_offsets = (uint2)( args.global_bvh2_base + kids.x, args.global_bvh2_base + kids.y );

        uint mask = 0xff;
        if ( args.do_mask_processing )
        {
            mask = slm->mask_info.bytes[left_box] | slm->mask_info.bytes[right_box];
            slm->mask_info.bytes[left_box] = mask;
        }

        BVH2_WriteInnerNode( args.global_bvh2, args.global_bvh2_base + current_node, &left, child_offsets, mask );

        slm_boxes[left_box] = left;
        bvh2->nodes[current_node] = left_box;

        // stop at the root
        if ( LocalBVH2_IsRoot( bvh2, current_node ) )
        {
            active_thread = false;
            atomic_dec_local(num_active_threads);
        }
        else
        {
            // toggle flag bit in parent node.  The second thread to flip the bit is the one that gets to proceed
            uint mask = (1 << (parent_node % 32));
            if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], mask ) & mask) == 0 )
            {
                active_thread = false;
                atomic_dec_local(num_active_threads);
            }
        }
    }

    barrier( CLK_LOCAL_MEM_FENCE );

} while ( *num_active_threads > 0 );
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size(DFS_WG_SIZE,1,1) ))
__attribute__( (intel_reqd_sub_group_size(16)) )
kernel void DFS( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer )
{
    local struct DFS_SLM slm;
    local struct DFSDispatchRecord record;
    local uint num_active_threads;

    if ( get_local_id( 0 ) == 0 )
    {
        // pop an entry off the DFS dispatch queue
        //uint wg_index = atomic_dec_global( &scheduler->num_dfs_wgs ) - 1;
        //record = scheduler->dfs_queue.records[wg_index];
        // TODO: The version above races, but is considerably faster...
investigate uint wg_index = get_group_id(0); record = scheduler->dfs_queue.records[wg_index]; write_mem_fence( CLK_LOCAL_MEM_FENCE ); atomic_dec_global( &scheduler->num_dfs_wgs ); } barrier( CLK_LOCAL_MEM_FENCE ); bool odd_pass = record.tree_depth & 1; global struct SAHBuildGlobals* sah_globals = globals_buffer + record.batch_index; struct DFSArgs args; args.num_primrefs = record.num_primrefs; args.primref_indices_in = SAHBuildGlobals_GetPrimrefIndices_In( sah_globals, odd_pass ); args.primref_indices_out = SAHBuildGlobals_GetPrimrefIndices_Out( sah_globals, odd_pass ); args.primref_buffer = SAHBuildGlobals_GetPrimrefs( sah_globals ); args.global_bvh2 = SAHBuildGlobals_GetBVH2( sah_globals ); args.primref_indices_in += record.primref_base; args.primref_indices_out += record.primref_base; args.primref_base = record.primref_base; args.global_bvh2_base = record.bvh2_base; args.do_mask_processing = SAHBuildGlobals_NeedMasks( sah_globals ); Do_DFS( args, &slm, &num_active_threads ); } ///////////////////////////////////////////////////////////////////////////////////////////////// /// /// BVH2 to BVH6 /// ///////////////////////////////////////////////////////////////////////////////////////////////// struct BuildFlatTreeArgs { ushort leaf_size_in_bytes; ushort leaf_type; ushort inner_node_type; bool do_mask_processing; global uint* primref_indices; global PrimRef* primref_buffer; global struct Globals* globals; global struct BVHBase* bvh_base; global struct BVH2* bvh2; }; // lane i in the return value is the index of the ith largest primref in the input // the return value can be used with shuffle() to move data into its sorted position // the elements of 'key' must be unique.. only the first 6 elements are sorted varying ushort SUBGROUP_get_sort_indices_N6( varying uint key ) { // each lane computes the number of items larger than it // this is its position in the descending order // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it // if compiler is not generating optimal code, consider moving to Cm varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; varying ushort a = cmp0 + cmp2 + cmp4; varying ushort b = cmp1 + cmp3 + cmp5; varying ushort num_larger = a + b; // each lane determines which of the input elements it should pull varying ushort lane = get_sub_group_local_id(); a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; return a + b; } uint SUBGROUP_area_to_sort_key( varying float area, uniform ushort num_children ) { varying ushort lane = get_sub_group_local_id(); area = (lane < num_children) ? area : 0; // put inactive nodes last // drop LSBs and break ties by lane number to ensure unique keys // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. 
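    // Example for SUBGROUP_get_sort_indices_N6 above: keys {5,9,1,7,3,8} in lanes 0..5 give
    // descending ranks {3,0,5,2,4,1}; lane i then returns the lane whose rank equals i,
    // i.e. {1,5,3,0,4,2}, and shuffling the keys by that index yields {9,8,7,5,3,1}.
    // Unique keys are required for the ranks to form a permutation, hence the lane bits folded
    // into the key here, with the descending lane IDs keeping equal-area children in input order.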
// If we do not do this it can lead to non-deterministic tree structure return (as_uint(area) & 0xffffff80) + (lane^(get_sub_group_size()-1)); } // lane i in the return value is the index of the ith largest primref in the input // the return value can be used with shuffle() to move data into its sorted position // the elements of 'key' must be unique.. only the first 6 elements are sorted varying ushort SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16( varying uint key ) { // each lane computes the number of items larger than it // this is its position in the descending order // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it // if compiler is not generating optimal code, consider moving to Cm varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; varying ushort a = cmp0 + cmp2 + cmp4; varying ushort b = cmp1 + cmp3 + cmp5; varying ushort num_larger = a + b; varying ushort cmp0_1 = (sub_group_broadcast(key, 8) > key) ? 1 : 0; varying ushort cmp1_1 = (sub_group_broadcast(key, 9) > key) ? 1 : 0; varying ushort cmp2_1 = (sub_group_broadcast(key, 10) > key) ? 1 : 0; varying ushort cmp3_1 = (sub_group_broadcast(key, 11) > key) ? 1 : 0; varying ushort cmp4_1 = (sub_group_broadcast(key, 12) > key) ? 1 : 0; varying ushort cmp5_1 = (sub_group_broadcast(key, 13) > key) ? 1 : 0; varying ushort a_1 = cmp0_1 + cmp2_1 + cmp4_1; varying ushort b_1 = cmp1_1 + cmp3_1 + cmp5_1; varying ushort num_larger_1 = a_1 + b_1; // each lane determines which of the input elements it should pull varying ushort lane = get_sub_group_local_id(); if(lane < 8) { a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; } else { a = (sub_group_broadcast(num_larger_1, 8) == lane-8) ? 8 : 8; b = (sub_group_broadcast(num_larger_1, 9) == lane-8) ? 1 : 0; a += (sub_group_broadcast(num_larger_1, 10) == lane-8) ? 2 : 0; b += (sub_group_broadcast(num_larger_1, 11) == lane-8) ? 3 : 0; a += (sub_group_broadcast(num_larger_1, 12) == lane-8) ? 4 : 0; b += (sub_group_broadcast(num_larger_1, 13) == lane-8) ? 5 : 0; } return a + b; } uint SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16( varying float area, uniform ushort num_children ) { varying ushort lane = get_sub_group_local_id() % 8; area = (lane < num_children) ? area : 0; // put inactive nodes last // drop LSBs and break ties by lane number to ensure unique keys // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. 
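    // In the 2xSIMD8 variants a SIMD16 subgroup carries two independent 6-wide sorts: lanes 0..7
    // rank against broadcasts 0..5 and lanes 8..15 against broadcasts 8..13, with the upper half
    // biased by +8 so returned indices stay inside their own half.  Here lane%8 and (lane^7) play
    // the roles that lane and lane^(subgroup_size-1) play in the plain SIMD16 version above.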
// If we do not do this it can lead to non-deterministic tree structure return (as_uint(area) & 0xffffff80) + (lane^7); } ushort SUBGROUP_BuildFlatTreeNode( uniform struct BuildFlatTreeArgs args, uniform uint bvh2_root, uniform struct InternalNode* qnode, uniform uint qnode_index, varying uint3* sg_children_out // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) // if a leaf is created, receives number of primrefs (z) ) // return value is the number of child nodes or 0 for a leaf { global struct BVH2* bvh2 = args.bvh2; varying ushort lane = get_sub_group_local_id(); global struct BVHBase* base = args.bvh_base; if ( !BVH2_IsInnerNode( bvh2, bvh2_root ) ) { uniform ushort num_prims = BVH2_GetLeafPrimCount( bvh2, bvh2_root ); uniform uint primref_start = BVH2_GetLeafPrimStart( bvh2, bvh2_root ); varying uint primref_index = primref_start + ((lane < num_prims) ? lane : 0); varying uint ref_id = args.primref_indices[primref_index]; varying PrimRef ref = args.primref_buffer[ref_id]; uniform char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); uniform char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; uniform int offset = (int)(leaf_mem - (char*)qnode); offset = offset >> 6; varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&ref), num_prims ); varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); ref = PrimRef_sub_group_shuffle(&ref, sort_index); ref_id = intel_sub_group_shuffle(ref_id, sort_index); if (lane < num_prims) args.primref_indices[primref_index] = ref_id; uint global_num_prims = args.globals->numPrimitives; char* bvh_mem = (char*) args.bvh_base; if(lane < num_prims) args.primref_indices[primref_index + global_num_prims] = qnode - (struct InternalNode*)bvh_mem; if (args.leaf_type == NODE_TYPE_INSTANCE) subgroup_setInstanceQBVHNodeN( offset, &ref, num_prims, (struct QBVHNodeN*)qnode, lane < num_prims ? PRIMREF_instanceMask(&ref) : 0 ); else subgroup_setQBVHNodeN( offset, args.leaf_type, &ref, num_prims, (struct QBVHNodeN*)qnode, BVH_NODE_DEFAULT_MASK ); sg_children_out->z = num_prims; return 0; } else { // collapse BVH2 into BVH6. // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough uniform ushort num_children = 2; uniform uint2 kids = BVH2_GetChildIndices( bvh2, bvh2_root ); varying uint sg_bvh2_node = kids.x; if ( lane == 1 ) sg_bvh2_node = kids.y; do { // choose the inner node with maximum area to replace. // Its left child goes in its old location. Its right child goes in a new lane // TODO_OPT: We re-read the AABBs again and again to compute area // ... store per-lane boxes instead and pre-compute areas varying float sg_area = BVH2_GetNodeArea( bvh2, sg_bvh2_node ); varying bool sg_is_inner = BVH2_IsInnerNode( bvh2, sg_bvh2_node ); sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf uniform float max_area = sub_group_reduce_max_N6( sg_area ); varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; uniform uint mask = intel_sub_group_ballot( sg_reducable ); // TODO_OPT: Some of these ops seem redundant.. 
look at trimming further if ( mask == 0 ) break; // choose the inner node with maximum area to replace uniform ushort victim_child = ctz( mask ); uniform uint victim_node = sub_group_broadcast( sg_bvh2_node, victim_child ); kids = BVH2_GetChildIndices( bvh2, victim_node ); if ( lane == victim_child ) sg_bvh2_node = kids.x; else if ( lane == num_children ) sg_bvh2_node = kids.y; num_children++; } while ( num_children < TREE_ARITY ); // allocate inner node space uniform uint kids_offset; if (get_sub_group_local_id() == 0) kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); kids_offset = sub_group_broadcast(kids_offset, 0); uniform struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; #if 0 uniform uint kids_offset; if ( get_sub_group_local_id() == 0 ) kids_offset = alloc_node_mem( args.globals, sizeof( struct QBVHNodeN ) * num_children ); kids_offset = sub_group_broadcast( kids_offset, 0 ); // create inner node uniform struct QBVHNodeN* kid = (struct QBVHNodeN*) ((char*)(args.bvh_base) + kids_offset); uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; #endif uniform uint child_type = args.inner_node_type; // sort child nodes in descending order by AABB area varying struct AABB box = BVH2_GetAABB( bvh2, sg_bvh2_node ); varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&box), num_children ); varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); box = AABB_sub_group_shuffle(&box, sort_index); sg_bvh2_node = intel_sub_group_shuffle(sg_bvh2_node, sort_index); uniform uint node_mask = (args.do_mask_processing) ? BVH2_GetMask( bvh2, bvh2_root ) : 0xff; subgroup_setQBVHNodeN( offset, child_type, &box, num_children, (struct QBVHNodeN*)qnode, node_mask ); // return child information *sg_children_out = (uint3)(sg_bvh2_node, qnode_index + offset + get_sub_group_local_id(), num_children ); return num_children; } } ushort SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16( uniform struct BuildFlatTreeArgs args, varying uint bvh2_root, varying struct InternalNode* qnode_base, varying uint qnode_index, varying uint3* sg_children_out, // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) // if a leaf is created, receives number of primrefs (z) bool active_lane ) // return value is the number of child nodes or 0 for a leaf { global struct BVH2* bvh2 = args.bvh2; varying ushort SIMD16_lane = get_sub_group_local_id(); varying ushort SIMD8_lane = get_sub_group_local_id() % 8; varying ushort SIMD8_id = get_sub_group_local_id() / 8; varying ushort lane = get_sub_group_local_id(); global struct BVHBase* base = args.bvh_base; struct BVH2NodeMetaData nodeMetaData = BVH2_GetNodeMetaData( bvh2, bvh2_root ); bool is_leaf = active_lane && !BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); bool is_inner = active_lane && BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); uchar mask = BVH_NODE_DEFAULT_MASK; if(is_inner) mask = (args.do_mask_processing) ? 
BVH2NodeMetaData_GetMask( &nodeMetaData ) : 0xff; int offset; varying struct InternalNode* qnode = qnode_base + qnode_index; // TODO: we don't need unions, I left them only for readability union { uint num_prims; uint num_children; } lane_num_data; union { PrimRef ref; // this is in fact an AABB struct AABB box; } lane_box_data; union { uint ref_id; uint sg_bvh2_node; } lane_id_data; // for leaves varying uint primref_index; if(is_leaf) { lane_num_data.num_prims = BVH2NodeMetaData_GetLeafPrimCount( &nodeMetaData ); uint primref_start = BVH2NodeMetaData_GetLeafPrimStart( &nodeMetaData ); primref_index = primref_start + ((SIMD8_lane < lane_num_data.num_prims) ? SIMD8_lane : 0); lane_id_data.ref_id = args.primref_indices[primref_index]; lane_box_data.ref = args.primref_buffer[lane_id_data.ref_id]; char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; offset = (int)(leaf_mem - (char*)qnode); offset = offset >> 6; } if(intel_sub_group_ballot(is_inner)) { // collapse BVH2 into BVH6. // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough uint2 kids; if(is_inner) { lane_num_data.num_children = 2; kids = BVH2_GetChildIndices( bvh2, bvh2_root ); lane_id_data.sg_bvh2_node = kids.x; if ( SIMD8_lane == 1 ) lane_id_data.sg_bvh2_node = kids.y; } bool active = is_inner; do { // choose the inner node with maximum area to replace. // Its left child goes in its old location. Its right child goes in a new lane // TODO_OPT: We re-read the AABBs again and again to compute area // ... store per-lane boxes instead and pre-compute areas varying float sg_area = 0; varying bool sg_is_inner = false; if(active) { sg_area = BVH2_GetNodeArea( bvh2, lane_id_data.sg_bvh2_node ); sg_is_inner = BVH2_IsInnerNode( bvh2, lane_id_data.sg_bvh2_node ); sg_area = (sg_is_inner && SIMD8_lane < lane_num_data.num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf } float max_area = sub_group_reduce_max_N6_2xSIMD8_in_SIMD16( sg_area ); varying bool sg_reducable = max_area == sg_area && sg_is_inner && (SIMD8_lane < lane_num_data.num_children); uint mask = intel_sub_group_ballot( sg_reducable ) & (0xFF << SIMD8_id * 8); // we'll end up with two different masks, one per SIMD8 half of the SIMD16, due to the bit masking // TODO_OPT: Some of these ops seem redundant.. look at trimming further if ( mask == 0 ) active = false; // choose the inner node with maximum area to replace ushort victim_child = ctz( mask ); uint victim_node = intel_sub_group_shuffle( lane_id_data.sg_bvh2_node, victim_child ); if(active) { kids = BVH2_GetChildIndices( bvh2, victim_node ); if ( SIMD16_lane == victim_child ) // we use SIMD16_lane, because victim_child was calculated based on SIMD16, i.e. the second node's victim will be in lanes 8..13 lane_id_data.sg_bvh2_node = kids.x; else if ( SIMD8_lane == lane_num_data.num_children ) lane_id_data.sg_bvh2_node = kids.y; lane_num_data.num_children++; if(lane_num_data.num_children >= TREE_ARITY) active = false; } } while ( intel_sub_group_ballot(active) ); // if any active, then continue // sum children from both halves of SIMD16 to allocate nodes only once per sub_group uniform ushort num_children = is_inner ?
lane_num_data.num_children : 0; uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); num_children = first_SIMD8_num_children + second_SIMD8_num_children; uint kids_offset; // allocate inner node space if(num_children && SIMD16_lane == 0) kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); kids_offset = sub_group_broadcast(kids_offset, 0); if((is_inner)) { kids_offset += SIMD8_id * first_SIMD8_num_children; struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); offset = (int)((char*)kid - (char*)qnode) >> 6; lane_box_data.box = BVH2_GetAABB( bvh2, lane_id_data.sg_bvh2_node ); } } // sort child nodes in descending order by AABB area varying uint key = SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16(AABB_halfArea(&lane_box_data.box), lane_num_data.num_children ); varying ushort sort_index = SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16(key); lane_box_data.box = PrimRef_sub_group_shuffle(&lane_box_data.box, sort_index); lane_id_data.sg_bvh2_node = intel_sub_group_shuffle(lane_id_data.sg_bvh2_node, sort_index); char* bvh_mem = (char*) args.bvh_base; if (is_leaf && SIMD8_lane < lane_num_data.num_prims) { args.primref_indices[primref_index] = lane_id_data.ref_id; args.primref_indices[primref_index + args.globals->numPrimitives] = qnode - (struct InternalNode*)bvh_mem; } bool degenerated = false; uint node_type = is_leaf ? args.leaf_type : args.inner_node_type; if(args.leaf_type == NODE_TYPE_INSTANCE) degenerated = subgroup_setInstanceBox_2xSIMD8_in_SIMD16(&lane_box_data.box, lane_num_data.num_children, &mask, SIMD8_lane < lane_num_data.num_prims ? PRIMREF_instanceMask(&lane_box_data.ref) : 0, is_leaf); subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, node_type, &lane_box_data.box, lane_num_data.num_children, mask, (struct QBVHNodeN*)(qnode), degenerated, active_lane); // return child information if(is_inner) { sg_children_out->x = lane_id_data.sg_bvh2_node; sg_children_out->y = qnode_index + offset + SIMD8_lane; } sg_children_out->z = lane_num_data.num_children; return is_inner ? lane_num_data.num_children : 0; } void check_primref_integrity( global struct SAHBuildGlobals* globals ) { global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, 0 ); dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); if ( get_local_id( 0 ) == 0 ) { for ( uint i = 0; i < num_primrefs; i++ ) { primref_out[i] = 0; } for ( uint i = 0; i < num_primrefs; i++ ) primref_out[primref_in[i]]++; for ( uint i = 0; i < num_primrefs; i++ ) if ( primref_out[i] != 1 ) printf( "Foo: %u %u\n", i, primref_out[i] ); } } void check_bvh2(global struct SAHBuildGlobals* globals ) { global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(globals); global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out(globals, 0); dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs(globals); if (get_local_id(0) == 0) { for (uint i = 0; i < num_primrefs; i++) primref_out[i] = 0; uint stack[256]; uint sp=0; uint r = BVH2_GetRoot(bvh2); stack[sp++] = r; while (sp) { r = stack[--sp]; if (BVH2_IsInnerNode(bvh2,r)) { uint2 kids = BVH2_GetChildIndices( bvh2, r); if (kids.x >= bvh2->num_nodes || kids.y >= bvh2->num_nodes) { printf("BVH2!! 
Bad node index found!\n"); return; } stack[sp++] = kids.x; stack[sp++] = kids.y; } else { uint ref = BVH2_GetLeafPrimStart(bvh2,r); uint count = BVH2_GetLeafPrimCount(bvh2,r); if( count == 0 ) { printf("BVH2!! Empty leaf found!\n"); return; } for (uint i = 0; i < count; i++) { if (ref + i > num_primrefs) { printf("BVH2!! Bad leaf range!\n"); return; } uint c = primref_out[ref+i]; if (c != 0) { printf("BVH2!! overlapped prim ranges\n"); return; } primref_out[ref+i] = 1; if (primref_in[ref + i] >= num_primrefs) { printf("BAD PRIMREF ID FOUND!\n"); return; } } } } } printf("bvh2 is ok!\n"); } #if 0 // TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size(256,1,1)) ) __attribute__( (intel_reqd_sub_group_size(8) ) ) kernel void build_qnodes( global struct SAHBuildGlobals* globals, global struct VContextScheduler* scheduler ) { globals = globals + (scheduler->num_trivial_builds + scheduler->num_single_builds); globals = globals + get_group_id(0); struct BuildFlatTreeArgs args; args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); args.globals = (global struct Globals*) globals->p_globals; args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); dword alloc_backpointers = SAHBuildGlobals_NeedBackPointers( globals ); global uint2* root_buffer = (global uint2*) globals->p_qnode_root_buffer; global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); local uint nodes_produced; if ( get_sub_group_id() == 0 ) { // allocate first node if (get_sub_group_local_id() == 0) allocate_inner_nodes( args.bvh_base, 1 ); // first subgroup does first node varying uint3 children_info; uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, BVH2_GetRoot(args.bvh2), qnodes, 0, &children_info ); if ( get_sub_group_local_id() < num_children ) root_buffer[get_sub_group_local_id()] = children_info.xy; if ( alloc_backpointers ) { // set root's backpointer if( get_sub_group_local_id() == 0 ) back_pointers[0] = (0xffffffc0) | (children_info.z << 3); // point child backpointers at the parent if( get_sub_group_local_id() < num_children ) back_pointers[children_info.y] = 0; } if ( get_sub_group_local_id() == 0 ) nodes_produced = num_children; } barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE ); uniform uint buffer_index = get_sub_group_id(); uniform bool sg_active = buffer_index < nodes_produced; while ( work_group_any( sg_active ) ) { if( sg_active ) { uniform uint bvh2_node = root_buffer[buffer_index].x; uniform uint qnode_index = root_buffer[buffer_index].y; // build a node varying uint3 children_info; uniform ushort num_children = SUBGROUP_BuildFlatTreeNode( args, bvh2_node, qnodes + qnode_index, qnode_index, &children_info ); // handle backpointers if ( alloc_backpointers ) { // update this node's backpointer with child count if ( get_sub_group_local_id() == 0 ) back_pointers[qnode_index] |= (children_info.z << 3); // point child backpointers at parent if ( get_sub_group_local_id() < 
num_children ) back_pointers[children_info.y] = (qnode_index << 6); } if ( num_children ) { // allocate space in the child buffer uint root_buffer_position = 0; if ( get_sub_group_local_id() == 0 ) root_buffer_position = atomic_add_local( &nodes_produced, num_children ); root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); // store child indices in root buffer if ( get_sub_group_local_id() < num_children ) root_buffer[root_buffer_position + get_sub_group_local_id()] = children_info.xy; } } // sync everyone work_group_barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_work_group ); if( sg_active ) buffer_index += get_num_sub_groups(); sg_active = (buffer_index < nodes_produced); } } #endif inline bool buffer_may_overflow( uint capacity, uint current_size, uint elements_processed_per_sub_group ) { uint num_consumed = min( get_num_sub_groups() * elements_processed_per_sub_group, current_size ); uint space_available = (capacity - current_size) + num_consumed; uint space_needed = TREE_ARITY * num_consumed; return space_available < space_needed; } inline uint build_qnodes_pc( global struct SAHBuildGlobals* globals, bool alloc_backpointers, bool process_masks, uint first_qnode, uint first_bvh2_node, local uint2* SLM_local_root_buffer, local uint* SLM_ring_tail, const uint RING_SIZE ) { struct BuildFlatTreeArgs args; args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); args.globals = (global struct Globals*) globals->p_globals; args.do_mask_processing = process_masks; global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); // first subgroup adds first node if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0) { SLM_local_root_buffer[0].x = first_bvh2_node; SLM_local_root_buffer[0].y = first_qnode; *SLM_ring_tail = 1; } uint ring_head = 0; uint ring_tail = 1; uint ring_size = 1; barrier( CLK_LOCAL_MEM_FENCE ); const uniform uint elements_processed_in_sg = 2; while ( ring_size > 0 && !buffer_may_overflow( RING_SIZE, ring_size, elements_processed_in_sg ) ) { ushort SIMD16_lane = get_sub_group_local_id(); // SIMD16 as 2xSIMD8 ushort SIMD8_lane = get_sub_group_local_id() % 8; ushort SIMD8_id = get_sub_group_local_id() / 8; bool active_lane; uniform uint nodes_consumed = min( get_num_sub_groups() * elements_processed_in_sg, ring_size ); // times two because we process two nodes in subgroup uniform bool sg_active = get_sub_group_id() * elements_processed_in_sg < nodes_consumed; ushort num_children = 0; varying uint3 children_info = 0; uint bvh2_node = 0; uint qnode_index = 0; if (sg_active) { ushort consumed_pos = get_sub_group_id() * elements_processed_in_sg + SIMD8_id; active_lane = consumed_pos < nodes_consumed ? true : false; consumed_pos = consumed_pos < nodes_consumed ? 
consumed_pos : consumed_pos-1; uint buffer_index = (ring_head + consumed_pos) % RING_SIZE; bvh2_node = SLM_local_root_buffer[buffer_index].x; qnode_index = SLM_local_root_buffer[buffer_index].y; } barrier( CLK_LOCAL_MEM_FENCE ); if (sg_active) { // build a node num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, bvh2_node, qnodes, qnode_index, &children_info, active_lane); // handle backpointers // TODO_OPT: This should be separate shaders not a runtime branch // doing it this way for now because GRLTLK does not make dynamic shader selection on host very easy. // this needs to change... GRLTLK should if (alloc_backpointers && active_lane) { // update this node's backpointer with child count if (SIMD8_lane == 0) back_pointers[qnode_index] |= (children_info.z << 3); // point child backpointers at parent if (SIMD8_lane < num_children) back_pointers[children_info.y] = (qnode_index << 6); } // save data uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; uint root_buffer_position = 0; // allocate space in the child buffer if (SIMD16_lane == 0 && SIMD16_num_children) root_buffer_position = atomic_add_local(SLM_ring_tail, SIMD16_num_children); root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); root_buffer_position += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 // store child indices in root buffer if (SIMD8_lane < num_children) { uint store_pos = (root_buffer_position + SIMD8_lane) % RING_SIZE; SLM_local_root_buffer[store_pos] = children_info.xy; } } // sync everyone barrier( CLK_LOCAL_MEM_FENCE ); ring_head += nodes_consumed; ring_tail = *SLM_ring_tail; ring_size = ring_tail - ring_head; } return ring_head; } inline void amplify_and_spill( global struct SAHBuildGlobals* globals, dword alloc_backpointers, uint first_qnode, uint first_bvh2_node, global uint2* global_root_buffer, local uint* root_buffer_counter, const uint RING_SIZE ) { struct BuildFlatTreeArgs args; args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); args.leaf_type = SAHBuildGlobals_GetLeafType(globals); args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); args.bvh2 = SAHBuildGlobals_GetBVH2(globals); args.globals = (global struct Globals*) globals->p_globals; global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); varying uint3 children_info; uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, first_bvh2_node, qnodes + first_qnode, first_qnode, &children_info); if (alloc_backpointers) { // set first node's backpointer if (get_sub_group_local_id() == 0) { // if first node is root, use root sentinel in backpointer // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) uint bp = 0xffffffc0; if (first_qnode != 0) bp = back_pointers[first_qnode]; bp |= (children_info.z << 3); back_pointers[first_qnode] = bp; } // point child backpointers at the parent if (get_sub_group_local_id() < num_children) back_pointers[children_info.y] = (first_qnode 
<< 6); } if (num_children) { uint spill_pos = 0; if (get_sub_group_local_id() == 0) spill_pos = atomic_add_local(root_buffer_counter,num_children); spill_pos = sub_group_broadcast(spill_pos, 0); if (get_sub_group_local_id() < num_children) global_root_buffer[spill_pos+get_sub_group_local_id()] = children_info.xy; } } inline void build_qnodes_pc_kickoff_func( global struct SAHBuildGlobals* globals, global uint2* root_buffer, bool alloc_backpointers, bool process_masks, local uint2* SLM_local_root_buffer, local uint* SLM_spill_pos, local uint* SLM_ring_tail, int RING_SIZE ) { // allocate first node if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0 ) allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(globals), 1 ); *SLM_spill_pos=0; uint ring_head = build_qnodes_pc( globals, alloc_backpointers, process_masks, 0, BVH2_GetRoot(SAHBuildGlobals_GetBVH2(globals)), SLM_local_root_buffer, SLM_ring_tail, RING_SIZE ); uint n = *SLM_ring_tail - ring_head; if (n > 0) { #if 0 // do an additional round of amplification so we can get more nodes into the root buffer and go wider in the next phase /// JDB TODO: this is causing hangs on DG2 for metro, so disabling for now... for (uint i = get_sub_group_id(); i < n; i+= get_num_sub_groups() ) { uint consume_pos = (ring_head + i) % RING_SIZE; uniform uint bvh2_root = SLM_local_root_buffer[consume_pos].x; uniform uint qnode_root = SLM_local_root_buffer[consume_pos].y; amplify_and_spill( globals, alloc_backpointers, qnode_root, bvh2_root, root_buffer, SLM_spill_pos, RING_SIZE ); } barrier( CLK_LOCAL_MEM_FENCE ); #else for (uint i = get_local_id(0); i < n; i += get_local_size(0)) root_buffer[i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; #endif if (get_local_id(0) == 0) { globals->root_buffer_num_produced = n; globals->root_buffer_num_produced_hi = 0; globals->root_buffer_num_consumed = 0; globals->root_buffer_num_consumed_hi = 0; } } } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( 256, 1, 1 )) ) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void build_qnodes_pc_kickoff( global struct SAHBuildGlobals* globals, global uint2* root_buffer, dword sah_flags ) { bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; bool process_masks = sah_flags & SAH_FLAG_NEED_MASKS; const int RING_SIZE = 64; local uint2 SLM_local_root_buffer[RING_SIZE]; local uint SLM_spill_pos; local uint SLM_ring_tail; build_qnodes_pc_kickoff_func(globals, root_buffer, alloc_backpointers, process_masks, SLM_local_root_buffer, &SLM_spill_pos, &SLM_ring_tail, RING_SIZE ); } inline void build_qnodes_pc_amplify_func( global struct SAHBuildGlobals* globals, global uint2* root_buffer, bool alloc_backpointers, bool process_masks, local uint2* SLM_local_root_buffer, local uint* SLM_broadcast, local uint* SLM_ring_tail, int RING_SIZE ) { // TODO_OPT: Probably don't need this atomic.. could clear 'num_consumed' every time // and just use get_group_id() // if (get_local_id(0) == 0) *SLM_broadcast = atomic_inc_global(&globals->root_buffer_num_consumed); barrier( CLK_LOCAL_MEM_FENCE ); uniform uint consume_pos = *SLM_broadcast; uniform uint bvh2_root = root_buffer[consume_pos].x; uniform uint qnode_root = root_buffer[consume_pos].y; uint ring_head = build_qnodes_pc(globals, alloc_backpointers,process_masks, qnode_root, bvh2_root, SLM_local_root_buffer, SLM_ring_tail, RING_SIZE); // TODO_OPT: Instead of spilling the nodes, do one more round of amplification and write // generated children directly into the root buffer. 
This should allow faster amplification // spill root buffer contents uint n = *SLM_ring_tail - ring_head; if (n > 0) { if (get_local_id(0) == 0) *SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); barrier( CLK_LOCAL_MEM_FENCE ); uint produce_pos = *SLM_broadcast; for (uint i = get_local_id(0); i < n; i += get_local_size(0)) root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; } } // Process two nodes per workgroup during the amplification phase. // Doing it this way ensures maximum parallelism GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(16, 1, 1))) __attribute__((intel_reqd_sub_group_size(16))) kernel void build_qnodes_pc_amplify( global struct SAHBuildGlobals* globals, global uint2* root_buffer, dword sah_flags ) { bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; struct BuildFlatTreeArgs args; args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); args.leaf_type = SAHBuildGlobals_GetLeafType(globals); args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); args.bvh2 = SAHBuildGlobals_GetBVH2(globals); args.globals = (global struct Globals*) globals->p_globals; args.do_mask_processing = sah_flags & SAH_FLAG_NEED_MASKS; global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); ushort SIMD16_lane = get_sub_group_local_id(); // SIMD16 as 2xSIMD8 ushort SIMD8_lane = get_sub_group_local_id() % 8; ushort SIMD8_id = get_sub_group_local_id() / 8; bool active_lane = false; uint consume_pos; consume_pos = globals->root_buffer_num_consumed + get_group_id(0) * 2; // times 2 because we process two nodes per workgroup consume_pos += SIMD8_id; active_lane = consume_pos < globals->root_buffer_num_to_consume ? true : false; consume_pos = consume_pos < globals->root_buffer_num_to_consume ?
consume_pos : consume_pos-1; uint first_bvh2_node = root_buffer[consume_pos].x; uint first_qnode = root_buffer[consume_pos].y; varying uint3 children_info; ushort num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, first_bvh2_node, qnodes, first_qnode, &children_info, active_lane); if (alloc_backpointers && active_lane) { // set first node's backpointer if (SIMD8_lane == 0) { // if first node is root, use root sentinel in backpointer // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) uint bp = 0xffffffc0; if (first_qnode != 0) bp = back_pointers[first_qnode]; bp |= (children_info.z << 3); back_pointers[first_qnode] = bp; } // point child backpointers at the parent if (SIMD8_lane < num_children) back_pointers[children_info.y] = (first_qnode << 6); } // save data { // sum children from both halfs of SIMD16 to do only one atomic per sub_group uint produce_pos; uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; if (SIMD16_lane == 0 && SIMD16_num_children) produce_pos = atomic_add_global(&globals->root_buffer_num_produced, SIMD16_num_children); produce_pos = sub_group_broadcast(produce_pos, 0); produce_pos += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 if (SIMD8_lane < num_children) { root_buffer[produce_pos + SIMD8_lane] = children_info.xy; } } } ////////// // // Batched version of qnode creation // ////////// GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(1, 1, 1))) kernel void build_qnodes_init_scheduler_batched(global struct QnodeScheduler* scheduler, dword num_builds, dword num_max_qnode_global_root_buffer_entries) { scheduler->batched_build_offset = scheduler->num_trivial_builds + scheduler->num_single_builds; scheduler->batched_build_count = num_builds - scheduler->batched_build_offset; scheduler->num_max_qnode_global_root_buffer_entries = num_max_qnode_global_root_buffer_entries; const uint num_builds_to_process = scheduler->batched_build_count; const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; scheduler->batched_builds_to_process = num_builds_to_process; scheduler->num_qnode_grb_curr_entries = (num_builds_to_process + 15) / 16; // here we store number of workgroups for "build_qnodes_begin_batchable" kernel scheduler->num_qnode_grb_new_entries = num_builds_to_process; scheduler->qnode_global_root_buffer.curr_entries_offset = max_qnode_grb_entries; } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((reqd_work_group_size(16, 1, 1))) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void build_qnodes_begin_batchable(global struct QnodeScheduler* scheduler, global struct SAHBuildGlobals* builds_globals) { const uint tid = get_group_id(0) * get_local_size(0) + get_local_id(0); const uint num_builds_to_process = scheduler->batched_builds_to_process; if(tid < num_builds_to_process) { const uint build_idx = scheduler->batched_build_offset + tid; uint bvh2_node = BVH2_GetRoot(SAHBuildGlobals_GetBVH2(&builds_globals[build_idx])); uint qnode = 0; struct QNodeGlobalRootBufferEntry entry = { bvh2_node, qnode, build_idx, 1}; scheduler->qnode_global_root_buffer.entries[tid] = entry; builds_globals[build_idx].root_buffer_num_produced = 0; builds_globals[build_idx].root_buffer_num_produced_hi = 0; 
builds_globals[build_idx].root_buffer_num_consumed = 0; builds_globals[build_idx].root_buffer_num_consumed_hi = 0; // allocate first node for this build //allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx]), 1 ); SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx])->nodeDataCur++; } } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( 1, 1, 1 )) ) kernel void build_qnodes_scheduler(global struct QnodeScheduler* scheduler) { const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; uint new_entries = min(scheduler->num_qnode_grb_new_entries, max_qnode_grb_entries); scheduler->num_qnode_grb_curr_entries = new_entries; scheduler->num_qnode_grb_new_entries = 0; scheduler->qnode_global_root_buffer.curr_entries_offset = scheduler->qnode_global_root_buffer.curr_entries_offset ? 0 : max_qnode_grb_entries; } // TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( 32, 1, 1 )) ) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void build_qnodes_pc_amplify_batched( global struct SAHBuildGlobals* builds_globals, global struct QnodeScheduler* scheduler ) { const uint group_id = get_group_id(0); global struct QNodeGlobalRootBuffer* global_root_buffer = &scheduler->qnode_global_root_buffer; const uint curr_entries_offset = global_root_buffer->curr_entries_offset; struct QNodeGlobalRootBufferEntry entry = global_root_buffer->entries[curr_entries_offset + group_id]; const uint build_id = entry.build_idx; global struct SAHBuildGlobals* globals = &builds_globals[build_id]; global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; bool alloc_backpointers = SAHBuildGlobals_NeedBackPointers(globals); bool process_masks = SAHBuildGlobals_NeedMasks(globals); const int RING_SIZE = 32; // for 2 SGs, 16 should result in 2 rounds: one SG produces 6, then 2 SGs consume 2 and produce 12 // for 4 SGs, 32 results in 2 rounds: one SG produces 6, 4 SGs consume 4 and produce 24, resulting in 26 local uint2 SLM_local_root_buffer[RING_SIZE]; local uint SLM_broadcast; local uint SLM_ring_tail; local uint SLM_grb_broadcast; //// This below can be moved to separate function if needed for TLAS //// uniform uint bvh2_root = entry.bvh2_node; uniform uint qnode_root = entry.qnode; uint ring_head = build_qnodes_pc(globals, alloc_backpointers, process_masks, qnode_root, bvh2_root, SLM_local_root_buffer, &SLM_ring_tail, RING_SIZE); // spill root buffer contents uint n = SLM_ring_tail - ring_head; if (n > 0) { const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; if (get_local_id(0) == 0) { SLM_grb_broadcast = atomic_add_global(&scheduler->num_qnode_grb_new_entries, n); if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, then make space in build's root_buffer SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then make space in build's root_buffer SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n - (max_qnode_grb_entries - SLM_grb_broadcast)); } barrier( CLK_LOCAL_MEM_FENCE ); uint produce_pos = SLM_broadcast; uint grb_produce_num = n; // grb stands for global_root_buffer uint lrb_produce_num = 0; // lrb stands for local root buffer, meaning this build's root_buffer if(SLM_grb_broadcast >= max_qnode_grb_entries) // if 
global_root_buffer is full, don't write to it { grb_produce_num = 0; lrb_produce_num = n; } else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then decrease amount of entries and store rest in build's root buffer { grb_produce_num = max_qnode_grb_entries - SLM_grb_broadcast; lrb_produce_num = n - grb_produce_num; } // save data to global_root_buffer for(uint i = get_local_id(0); i < grb_produce_num; i += get_local_size(0)) { const uint2 slm_record = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; struct QNodeGlobalRootBufferEntry new_entry; new_entry.bvh2_node = slm_record.x; new_entry.qnode = slm_record.y; new_entry.build_idx = entry.build_idx; const uint new_entries_offset = curr_entries_offset ? 0 : max_qnode_grb_entries; global_root_buffer->entries[new_entries_offset + SLM_grb_broadcast + i] = new_entry; } // if anything left, write to build's root buffer for (uint i = get_local_id(0); i < lrb_produce_num; i += get_local_size(0)) root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i + grb_produce_num) % RING_SIZE]; } } GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__( (reqd_work_group_size( 16, 1, 1 )) ) __attribute__( (intel_reqd_sub_group_size( 16 )) ) kernel void build_qnodes_try_to_fill_grb_batched( global struct SAHBuildGlobals* builds_globals, global struct QnodeScheduler* scheduler ) { const uint build_id = scheduler->batched_build_offset + get_group_id(0); global struct SAHBuildGlobals* globals = &builds_globals[build_id]; global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; global struct QNodeGlobalRootBuffer* qnode_root_buffer = (global struct QNodeGlobalRootBuffer*)&scheduler->qnode_global_root_buffer; const uint num_produced = globals->root_buffer_num_produced; const uint num_consumed = globals->root_buffer_num_consumed; const uint entries = num_produced - num_consumed; // entries to build's root buffer if(!entries) return; uint global_root_buffer_offset; if(get_local_id(0) == 0) global_root_buffer_offset = atomic_add_global(&scheduler->num_qnode_grb_new_entries, entries); global_root_buffer_offset = sub_group_broadcast(global_root_buffer_offset, 0); const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; if(global_root_buffer_offset >= max_qnode_grb_entries) // if global_root_buffer is full, then return return; uint global_root_buffer_produce_num = entries; if(global_root_buffer_offset + entries >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then reduce number of entries to push global_root_buffer_produce_num = max_qnode_grb_entries - global_root_buffer_offset; for(uint i = get_local_id(0); i < global_root_buffer_produce_num; i += get_local_size(0)) { const uint2 entry = root_buffer[num_consumed + i]; struct QNodeGlobalRootBufferEntry new_entry; new_entry.bvh2_node = entry.x; new_entry.qnode = entry.y; new_entry.build_idx = build_id; const uint new_entries_offset = qnode_root_buffer->curr_entries_offset ? 0 : max_qnode_grb_entries; qnode_root_buffer->entries[new_entries_offset + global_root_buffer_offset + i] = new_entry; } if(get_local_id(0) == 0) globals->root_buffer_num_consumed += global_root_buffer_produce_num; }
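/*
   Illustrative sketch only (not referenced by any kernel above): the backpointer words written
   by these kernels appear to pack the parent InternalNode's index in bits [31:6] and the node's
   child count in bits [5:3]; the root uses the all-ones parent sentinel (0xffffffc0), and bits
   [2:0] are left untouched by this pass. The Example_* helper names below are hypothetical and
   exist purely to document the packing implied by expressions such as
   back_pointers[children_info.y] = (qnode_index << 6) and back_pointers[qnode_index] |= (children_info.z << 3).
*/
inline uint Example_BackPointer_ParentIndex( uint bp ) { return bp >> 6; }            // index of the parent InternalNode
inline uint Example_BackPointer_ChildCount( uint bp )  { return (bp >> 3) & 0x7; }    // number of children recorded for this node
inline bool Example_BackPointer_IsRoot( uint bp )      { return (bp >> 6) == 0x03ffffff; } // parent field holds the root sentinel (0xffffffc0 >> 6)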
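/*
   Worked example (hypothetical numbers, assuming two SIMD16 subgroups) of the
   buffer_may_overflow() check that gates each round of build_qnodes_pc(): with
   RING_SIZE = 32 and elements_processed_in_sg = 2, a round may consume up to 4 ring
   entries, and each consumed entry can emit up to TREE_ARITY = 6 children, so the
   check conservatively requires (capacity - ring_size + num_consumed) >= 6 * num_consumed:

     round 1: ring_size = 1   -> num_consumed = min(4,1)  = 1, available = (32-1)+1  = 32, needed = 6*1 = 6  -> proceed
     round 2: ring_size <= 6  -> num_consumed = min(4,6)  = 4, available = (32-6)+4  = 30, needed = 6*4 = 24 -> proceed
     round 3: ring_size <= 26 -> num_consumed = min(4,26) = 4, available = (32-26)+4 = 10, needed = 6*4 = 24 -> may overflow,
              so the loop exits before consuming and the remaining ring entries are spilled to the caller's root buffer.
*/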