//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "bvh_build_refit.h"
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"

/*
  POSTSORT PHASE2: Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very big value.

  1. parallel_build_phase2_refit - performs refit using global synchronization and mem_fence_gpu_invalidate.
     This kernel should be used only for a very big bvh; it is faster than the non-SLM fallback in
     parallel_build_phase2_refit_local.
  2. parallel_build_phase2_refit_local - should be used for most cases. We usually fit into SLM with the
     number of nodes allocated in phase0, but there is also a non-SLM fallback there, as the decision on
     which kernel to run is based on the node estimates on the host side.
*/

GRL_INLINE void refit_bottom_up_global_sync(
    global char* bvh_mem,
    global uint* global_refit_startpoints,
    uniform uint nodeId,
    uniform ushort lane)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
    global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );

    // Get the node idx that was put here in phase1
    const uint innerNodeIdx = global_refit_startpoints[nodeId];

    // Get the qnode and backpointer
    uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
    uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);

    varying struct AABB childrenAABB; // one child AABB per lane
    AABB_init(&childrenAABB);

    uniform uint numChildren = (backPointer >> 3) & 0x7;
    if(numChildren == 0) return;

    global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
    varying ushort child_idx = (lane < numChildren) ? lane : 0;
    childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

#if MORTON_VERBOSE_LOG
    if(lane == 0)
        printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
#endif

    struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
    reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
    subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);

    uint children_mask = qnode_child[child_idx].instMask;
    qnode->instMask = sub_group_reduce_or_N6(children_mask);

    SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
}

__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
void kernel parallel_build_phase2_refit( global char* bvh_mem, global uint* global_refit_startpoints )
{
    refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
}

GRL_INLINE void SUBGROUP_refit_bottom_up_global(
    uniform global struct QBVHNodeN* globalNodeData,
    uniform struct BackPointers* backPointers,
    varying ushort lane,
    varying uint curNodeIndex)
{
    uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
    const uint head_lane = 0;
    uniform struct AABB child_aabb; // one child AABB per lane, re-gathered on every loop turn

    while (curNodeIndex != 0)
    {
        global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );

        uint numChildren = BackPointer_GetNumChildren(backpointer);

        varying ushort child_idx = (lane < numChildren) ? lane : 0;
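        // Each lane gathers the AABB of one child; lanes beyond numChildren simply
        // re-read child 0 so the sub-group reduction below stays well defined.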
        child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );

        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);

        /* get bounds of all children from child nodes directly */
        subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);

        uchar childrenMask = qnode_child[child_idx].instMask;
        qnode->instMask = sub_group_reduce_or_N6(childrenMask);

        uint parentIndex = BackPointer_GetParentIndex(backpointer);

        mem_fence_gpu_invalidate();

        if (lane == 0)
        {
            backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));

            uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);

            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;

#if MORTON_VERBOSE_LOG
            printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
                   curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
#endif
        }

        backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
        curNodeIndex = parentIndex;

        /* only the sub-group that refits the parent's last outstanding child continues upwards;
           all other sub-groups stop here */
        uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
        uniform uint numChildrenTotal    = (backpointer >> 3) & 0x7;

        if (numChildrenRefitted != numChildrenTotal)
            return;
    }

    // process root of the treelet
    {
#if MORTON_DEBUG_CHECKS
        if (curNodeIndex != 0)
            printf("SUBGROUP_refit_bottom_up_global: this should be node index 0\n");
#endif

        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        varying ushort child_idx = (lane < numChildren) ? lane : 0;
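        // Same per-lane gather as in the loop above, but for the root node (index 0):
        // the reduced bounds and instance mask are written directly to globalNodeData.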
        child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );

        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);

        /* get bounds of all children from child nodes directly */
        subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);

        uchar childrenMask = qnode_child[child_idx].instMask;
        globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);

        /* reset refit counter for the next refit */
        if (lane == 0)
        {
            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);

#if MORTON_VERBOSE_LOG
            printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
                   curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren);
#endif
        }
    }
}

// TODO: Check why 512 wg size has worse performance than 256
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16)))
void kernel
parallel_build_phase2_refit_local(
    global struct Globals* globals,
    global char* bvh_mem,
    global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
    // Number of nodes created in P0, to be refitted in this stage
    uint p0_created_num = globals->p0_created_num;

    // Return immediately if the host executed this kernel but there is nothing to do
    if(p0_created_num == 0)
        return;

    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
    global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    varying ushort lane = get_sub_group_local_id();

    // Hardcode SLM to max here as we do not know upfront how much mem will be needed
    local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */

    // Number of allocated nodes in phase0 (p0_created_num + children)
    uint p0_allocated_num = globals->p0_allocated_num;

    // array that keeps 2x8 short indices of bottom-up start points
    varying uint sg_fatleaf_array = 0x0;
    uniform uint8_t sg_bu_startpoints_cnt = 0;

    // Determine if all the nodes allocated in phase0 fit into SLM.
    // There are two paths here:
    // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse the bottom-up local path,
    //    which does the refit and creates qnodes in the bvh.
    // 2. If we do not fit into SLM, first create qnodes in the bvh, then perform the bottom-up refit with
    //    global-atomics synchronization. This is not performant; keep it as a guardrail here. On the host
    //    side we fall back to the old separated refit path, with wg_size 8 and better EU reuse.
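    // The branch below keys off whether every node allocated in phase0 can get its own UPerNodeData
    // slot in the perNodeData SLM array; MORTON_BUILDER_P2_ELEMENTS_IN_SLM is the element budget for
    // that array (presumably sized in morton_common to the available SLM).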
    if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
    {
        for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
        {
            MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
            uint current_id = boxless_node.binary_hierarchy_index >> 6;

            // Phase1 stored the mask of children that are subtree roots in the otherwise unused
            // low bits of binary_hierarchy_index
            uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);

            if(lane == 0)
                perNodeData[current_id].boxlessNode = boxless_node;

            // When no children are subtree roots, we are done and skip to the next iteration
            if(children_root_mask == 0x0)
            {
                continue;
            }
            // When all children are subtree roots, put this node into sg_fatleaf_array
            else if(children_root_mask == 0x3F)
            {
                set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
            }

            uniform global struct QBVHNodeN* qnode = nodeData + current_id;
            uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
            uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
            varying ushort child_idx = (lane < numChildren) ? lane : 0;

            varying struct AABB childrenAABB; // one child AABB per lane
            AABB_init(&childrenAABB);

            uint lead_child_global_id = current_id + lead_child_offset;
            uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
            childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

            // Get only AABBs of children that are p1 subtree roots
            bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
            if(lane_active)
            {
                uint child_global_id = lead_child_global_id + child_idx;
                perNodeData[child_global_id].box = childrenAABB;
                perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask);
            }

#if MORTON_VERBOSE_LOG
            if(lane == 0)
                printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n",
                       ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
#endif
        }

        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
    }
    else
    {
        for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
        {
            MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
            uint current_id = boxless_node.binary_hierarchy_index >> 6;

            // Phase1 stored the mask of children that are subtree roots in the otherwise unused
            // low bits of binary_hierarchy_index
            uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
            uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
            uniform global struct QBVHNodeN* qnode = nodeData + current_id;
            uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
            uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);

            SUBGROUP_QBVHNodeN_setChildIncr1( qnode );

            if(lane == 0)
            {
                QBVH6Node_set_type( qnode, nodeType );
                qnode->offset = lead_child_offset;
            }

            // When no children are subtree roots, we are done and skip to the next iteration
            if(children_root_mask == 0x0)
            {
                continue;
            }
            // When all children are subtree roots, put this node into sg_fatleaf_array
            else if(children_root_mask == 0x3F)
            {
                set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
            }

#if MORTON_VERBOSE_LOG
            if(lane == 0)
                printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, backPointer >> 6: %d, qnode->offset: %d\n",
                       ID, lead_child_offset, boxless_node.backPointer >> 6, qnode->offset);
#endif
        }
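        // Drain the start points collected above: each recorded fat-leaf node seeds one
        // bottom-up walk towards the root over global memory (global-atomics path).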
        while (sg_bu_startpoints_cnt > 0)
        {
            uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);

            SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
        }
    }
}
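
/*
 * Illustrative host-side selection (a sketch under assumptions, not part of this file):
 * the builder presumably compares its phase0 node estimate against
 * MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD and dispatches accordingly, e.g.
 *
 *     if (estimated_p2_nodes < MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD)
 *         // one 512-wide work group, SLM or non-SLM path above
 *         enqueue(parallel_build_phase2_refit_local, 1 group);
 *     else
 *         // one 16-wide work group per entry of global_refit_startpoints
 *         enqueue(parallel_build_phase2_refit, num_startpoints groups);
 *
 * "estimated_p2_nodes", "num_startpoints" and "enqueue" are hypothetical names used only for
 * illustration; the real dispatch logic lives on the host side.
 */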