/*
 * Copyright © Microsoft Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "dxil_nir.h"
#include "dxil_module.h"
#include "nir_builder.h"
#include "nir_deref.h"
#include "nir_worklist.h"
#include "nir_to_dxil.h"
#include "util/u_math.h"
#include "vulkan/vulkan_core.h"

static void
cl_type_size_align(const struct glsl_type *type, unsigned *size,
                   unsigned *align)
{
   *size = glsl_get_cl_size(type);
   *align = glsl_get_cl_alignment(type);
}

static nir_ssa_def *
load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
                  nir_ssa_def **src_comps, unsigned num_src_comps,
                  unsigned dst_bit_size)
{
   if (src_bit_size == dst_bit_size)
      return nir_vec(b, src_comps, num_src_comps);
   else if (src_bit_size > dst_bit_size)
      return nir_extract_bits(b, src_comps, num_src_comps, 0,
                              src_bit_size * num_src_comps / dst_bit_size,
                              dst_bit_size);

   unsigned num_dst_comps = DIV_ROUND_UP(num_src_comps * src_bit_size, dst_bit_size);
   unsigned comps_per_dst = dst_bit_size / src_bit_size;
   nir_ssa_def *dst_comps[4];

   for (unsigned i = 0; i < num_dst_comps; i++) {
      unsigned src_offs = i * comps_per_dst;

      dst_comps[i] = nir_u2uN(b, src_comps[src_offs], dst_bit_size);
      for (unsigned j = 1; j < comps_per_dst && src_offs + j < num_src_comps; j++) {
         nir_ssa_def *tmp = nir_ishl(b, nir_u2uN(b, src_comps[src_offs + j], dst_bit_size),
                                     nir_imm_int(b, j * src_bit_size));
         dst_comps[i] = nir_ior(b, dst_comps[i], tmp);
      }
   }

   return nir_vec(b, dst_comps, num_dst_comps);
}
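/* For example, load_comps_to_vec(b, 8, comps, 4, 32) packs four 8-bit
 * components into a single u32:
 *
 *    comps[0] | (comps[1] << 8) | (comps[2] << 16) | (comps[3] << 24)
 *
 * i.e. the first source component lands in the least significant byte,
 * matching the little-endian layout assumed by the lowerings below.
 */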
static nir_ssa_def *
build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx)
{
   return nir_load_ptr_dxil(b, 1, 32, &deref->dest.ssa, idx);
}

static bool
lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
   if (!nir_deref_mode_is(deref, nir_var_shader_temp))
      return false;

   nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align));
   nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3)));

   assert(intr->dest.is_ssa);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned load_size = MAX2(32, bit_size);
   unsigned num_bits = num_components * bit_size;
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   nir_deref_path path;
   nir_deref_path_init(&path, deref, NULL);
   nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */));

   /* Split loads into 32-bit chunks */
   for (unsigned i = 0; i < num_bits; i += load_size) {
      unsigned subload_num_bits = MIN2(num_bits - i, load_size);
      nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32));
      nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx);

      if (load_size == 64) {
         idx = nir_iadd(b, idx, nir_imm_int(b, 1));
         vec32 = nir_vec2(b, vec32,
                          build_load_ptr_dxil(b, path.path[0], idx));
      }

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (subload_num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      nir_ssa_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0,
                                               subload_num_bits / bit_size, bit_size);
      for (unsigned comp = 0; comp < subload_num_bits / bit_size; ++comp, ++comp_idx)
         comps[comp_idx] = nir_channel(b, temp_vec, comp);
   }

   nir_deref_path_finish(&path);

   assert(comp_idx == num_components);
   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

static nir_ssa_def *
ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
                          nir_ssa_def *offset, unsigned alignment)
{
   assert(alignment >= 16 || alignment == 8 ||
          alignment == 4 || alignment == 2 ||
          alignment == 1);
   assert(vec32->num_components == 4);

   if (alignment > 8)
      return vec32;

   nir_ssa_def *comps[4];
   nir_ssa_def *cond;

   for (unsigned i = 0; i < 4; i++)
      comps[i] = nir_channel(b, vec32, i);

   /* If we have 8-byte alignment or less, select which half of the vec4
    * should be used.
    */
   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)),
                  nir_imm_int(b, 0));

   comps[0] = nir_bcsel(b, cond, comps[2], comps[0]);
   comps[1] = nir_bcsel(b, cond, comps[3], comps[1]);

   if (alignment == 8)
      return nir_vec(b, comps, 2);

   /* If 4-byte alignment or less is needed, select which of the 32-bit
    * components should be used and return it. The sub-32-bit split is
    * handled in nir_extract_bits().
    */
   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)),
                  nir_imm_int(b, 0));
   return nir_bcsel(b, cond, comps[1], comps[0]);
}
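/* For example, a 4-byte-aligned access at offset 20 (0b10100): bit 3 is
 * clear, so the lower half (components 0/1) of the fetched vec4 row is
 * kept, and bit 2 is set, so component 1 is returned: bytes 4..7 of the
 * 16-byte row, i.e. bytes 20..23 of the buffer.
 */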
nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
                    nir_ssa_def *offset, unsigned num_components,
                    unsigned bit_size, unsigned alignment)
{
   nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4));
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned num_bits = num_components * bit_size;
   unsigned comp_idx = 0;

   /* We need to split loads into 16-byte chunks because that's the
    * granularity of cBufferLoadLegacy().
    */
   for (unsigned i = 0; i < num_bits; i += (16 * 8)) {
      /* For each 16-byte chunk (or smaller) we generate a 32-bit ubo vec
       * load.
       */
      unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8);
      nir_ssa_def *vec32 =
         nir_load_ubo_dxil(b, 4, 32, buffer,
                           nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8))));

      /* First re-arrange the vec32 to account for the intra 16-byte offset. */
      assert(subload_num_bits / 8 <= alignment);
      vec32 = ubo_load_select_32b_comps(b, vec32, offset, alignment);

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (alignment <= 2) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      nir_ssa_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0,
                                               subload_num_bits / bit_size, bit_size);
      for (unsigned comp = 0; comp < subload_num_bits / bit_size; ++comp, ++comp_idx)
         comps[comp_idx] = nir_channel(b, temp_vec, comp);
   }

   assert(comp_idx == num_components);
   return nir_vec(b, comps, num_components);
}
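/* For example, a vec4 of 32-bit values at offset 32 with 16-byte alignment
 * becomes a single cBufferLoadLegacy of row 32 >> 4 == 2; since the
 * alignment is above 8, the row is used as-is and nir_extract_bits() only
 * splits it back into scalars. The assert above bounds each chunk's byte
 * size by the declared alignment, presumably so that an aligned access
 * never straddles a 16-byte row.
 */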
static bool
lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size)
{
   assert(intr->dest.is_ssa);
   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   unsigned src_bit_size = nir_dest_bit_size(intr->dest);
   unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32);
   unsigned offset_mask = store_bit_size / 8 - 1;

   nir_ssa_def *buffer = intr->src[0].ssa;
   nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~offset_mask));
   enum gl_access_qualifier access = nir_intrinsic_access(intr);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned num_bits = num_components * src_bit_size;

   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   /* We need to split loads into 4-component chunks because that's the
    * optimal granularity of bufferLoad(). Minimum alignment is 2 bytes.
    */
   for (unsigned i = 0; i < num_bits; i += 4 * store_bit_size) {
      /* For each 4-component chunk (or smaller) we generate an N-bit ssbo
       * vec load.
       */
      unsigned subload_num_bits = MIN2(num_bits - i, 4 * store_bit_size);

      /* The number of components to store depends on the number of bytes. */
      nir_ssa_def *result =
         nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, store_bit_size),
                       store_bit_size, buffer,
                       nir_iadd(b, offset, nir_imm_int(b, i / 8)),
                       .align_mul = store_bit_size / 8,
                       .align_offset = 0,
                       .access = access);

      /* If we have an unaligned load we need to adjust the result value so
       * we can always extract the LSB.
       */
      if (nir_intrinsic_align(intr) < store_bit_size / 8) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, offset_mask)),
                                       nir_imm_int(b, 8));
         result = nir_ushr(b, result, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      nir_ssa_def *temp_vec = nir_extract_bits(b, &result, 1, 0,
                                               subload_num_bits / src_bit_size, src_bit_size);
      for (unsigned comp = 0; comp < subload_num_bits / src_bit_size; ++comp, ++comp_idx)
         comps[comp_idx] = nir_channel(b, temp_vec, comp);
   }

   assert(comp_idx == num_components);
   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}
static bool
lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size)
{
   b->cursor = nir_before_instr(&intr->instr);

   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);
   assert(intr->src[2].is_ssa);

   nir_ssa_def *val = intr->src[0].ssa;
   nir_ssa_def *buffer = intr->src[1].ssa;

   unsigned src_bit_size = val->bit_size;
   unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32);
   unsigned masked_store_bit_size = 32;
   unsigned num_components = val->num_components;
   unsigned num_bits = num_components * src_bit_size;

   unsigned offset_mask = store_bit_size / 8 - 1;
   unsigned masked_store_offset_mask = masked_store_bit_size / 8 - 1;
   nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~offset_mask));
   nir_ssa_def *masked_offset = nir_iand(b, intr->src[2].ssa,
                                         nir_imm_int(b, ~masked_store_offset_mask));

   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned comp_idx = 0;

   unsigned write_mask = nir_intrinsic_write_mask(intr);
   for (unsigned i = 0; i < num_components; i++)
      if (write_mask & (1 << i))
         comps[i] = nir_channel(b, val, i);

   /* We split stores into 4-component chunks because that's the optimal
    * granularity of bufferStore(). Minimum alignment is 2 bytes.
    */
   unsigned bit_offset = 0;
   while (true) {
      /* Skip over holes in the write mask */
      while (comp_idx < num_components && comps[comp_idx] == NULL) {
         comp_idx++;
         bit_offset += src_bit_size;
      }
      if (comp_idx >= num_components)
         break;

      /* For each 4-component chunk (or smaller) we generate an ssbo vec
       * store. If a component is skipped by the write mask, do a smaller
       * sub-store.
       */
      unsigned num_src_comps_stored = 0, substore_num_bits = 0;
      while (num_src_comps_stored + comp_idx < num_components &&
             substore_num_bits + bit_offset < num_bits &&
             substore_num_bits < 4 * store_bit_size &&
             comps[comp_idx + num_src_comps_stored]) {
         ++num_src_comps_stored;
         substore_num_bits += src_bit_size;
      }

      bool force_masked = false;
      if (substore_num_bits > store_bit_size &&
          substore_num_bits % store_bit_size != 0) {
         /* Split this into two, one unmasked store of the first bits,
          * and then the second loop iteration will handle a masked store
          * for the rest.
          */
         assert(num_src_comps_stored == 3);
         if (store_bit_size == 16) {
            assert(substore_num_bits < 32);
            /* If we're already doing atomics to store, just do one
             * 32-bit masked store instead of a 16-bit store and a masked
             * store for the other 8 bits.
             */
            force_masked = true;
         } else {
            --num_src_comps_stored;
            substore_num_bits = store_bit_size;
         }
      }

      nir_intrinsic_instr *store;

      if (substore_num_bits < store_bit_size || force_masked) {
         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
                                                    num_src_comps_stored,
                                                    masked_store_bit_size);
         nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1,
                                            masked_store_bit_size);

         /* If we have small alignments we need to place them correctly in
          * the component.
          */
         if (nir_intrinsic_align(intr) <= masked_store_bit_size / 8) {
            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa,
                                        nir_imm_int(b, masked_store_offset_mask));
            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);

            store_vec = nir_ishl(b, store_vec, shift);
            mask = nir_ishl(b, mask, shift);
         }

         nir_ssa_def *local_offset = nir_iadd(b, masked_offset,
                                              nir_imm_int(b, bit_offset / 8));
         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo_masked_dxil);
         store->src[0] = nir_src_for_ssa(store_vec);
         store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
         store->src[2] = nir_src_for_ssa(buffer);
         store->src[3] = nir_src_for_ssa(local_offset);
      } else {
         nir_ssa_def *local_offset = nir_iadd(b, offset,
                                              nir_imm_int(b, bit_offset / 8));
         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
                                                    num_src_comps_stored,
                                                    store_bit_size);

         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo);
         store->src[0] = nir_src_for_ssa(store_vec);
         store->src[1] = nir_src_for_ssa(buffer);
         store->src[2] = nir_src_for_ssa(local_offset);

         nir_intrinsic_set_align(store, store_bit_size / 8, 0);
      }

      /* The number of components to store depends on the number of bits. */
      store->num_components = DIV_ROUND_UP(substore_num_bits, store_bit_size);
      nir_builder_instr_insert(b, &store->instr);
      comp_idx += num_src_comps_stored;
      bit_offset += substore_num_bits;

      if (nir_intrinsic_has_write_mask(store))
         nir_intrinsic_set_write_mask(store, (1 << store->num_components) - 1);
   }

   nir_instr_remove(&intr->instr);
   return true;
}
static void
lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps,
                 nir_ssa_def **comps, nir_intrinsic_op op)
{
   for (unsigned i = 0; i < num_comps; i++) {
      nir_intrinsic_instr *load =
         nir_intrinsic_instr_create(b->shader, op);

      load->num_components = 1;
      load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32);
      nir_builder_instr_insert(b, &load->instr);
      comps[i] = &load->dest.ssa;
   }
}

static bool
lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);
   nir_intrinsic_op op = intr->intrinsic;

   assert(intr->src[0].is_ssa);
   nir_ssa_def *offset = intr->src[0].ssa;
   if (op == nir_intrinsic_load_shared) {
      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
      op = nir_intrinsic_load_shared_dxil;
   } else {
      offset = nir_u2u32(b, offset);
      op = nir_intrinsic_load_scratch_dxil;
   }
   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];

   /* We need to split loads into 32-bit accesses because the buffer
    * is an i32 array and DXIL does not support type casts.
    */
   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
   lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op);

   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);

   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
      nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);

      /* If we have 16 bits or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      unsigned dest_index = i * 32 / bit_size;
      nir_ssa_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, num_dest_comps, bit_size);
      for (unsigned comp = 0; comp < num_dest_comps; ++comp, ++dest_index)
         comps[dest_index] = nir_channel(b, temp_vec, comp);
   }

   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);

   return true;
}

static void
lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32,
                  nir_intrinsic_op op)
{
   for (unsigned i = 0; i < vec32->num_components; i++) {
      nir_intrinsic_instr *store =
         nir_intrinsic_instr_create(b->shader, op);

      store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i));
      store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
      store->num_components = 1;
      nir_builder_instr_insert(b, &store->instr);
   }
}

static void
lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index,
                         nir_ssa_def *vec32, unsigned num_bits,
                         nir_intrinsic_op op, unsigned alignment)
{
   nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1);

   /* If we have small alignments, we need to place them correctly in the
    * u32 component.
    */
   if (alignment <= 2) {
      nir_ssa_def *shift = nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8);

      vec32 = nir_ishl(b, vec32, shift);
      mask = nir_ishl(b, mask, shift);
   }

   if (op == nir_intrinsic_store_shared_dxil) {
      /* Use the dedicated masked intrinsic */
      nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index);
   } else {
      /* For scratch, since we don't need atomics, just generate the
       * read-modify-write in NIR.
       */
      nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index);

      nir_ssa_def *new_val = nir_ior(b, vec32,
                                     nir_iand(b,
                                              nir_inot(b, mask),
                                              load));

      lower_store_vec32(b, index, new_val, op);
   }
}
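/* For example, a 16-bit store to byte offset 6 with 2-byte alignment:
 * num_bits == 16 gives mask == 0xffff, (offset & 3) * 8 shifts both the
 * value and the mask left by 16 bits, and the masked store (or the
 * read-modify-write path for scratch) then preserves the low half of the
 * i32 slot via load & ~mask while OR-ing in the shifted value.
 */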
static bool
lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->src[0].is_ssa);
   unsigned num_components = nir_src_num_components(intr->src[0]);
   unsigned bit_size = nir_src_bit_size(intr->src[0]);
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);
   nir_intrinsic_op op = intr->intrinsic;

   nir_ssa_def *offset = intr->src[1].ssa;
   if (op == nir_intrinsic_store_shared) {
      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
      op = nir_intrinsic_store_shared_dxil;
   } else {
      offset = nir_u2u32(b, offset);
      op = nir_intrinsic_store_scratch_dxil;
   }
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];

   unsigned comp_idx = 0;
   for (unsigned i = 0; i < num_components; i++)
      comps[i] = nir_channel(b, intr->src[0].ssa, i);

   unsigned step = MAX2(bit_size, 32);
   for (unsigned i = 0; i < num_bits; i += step) {
      /* For each 4-byte chunk (or smaller) we generate a 32-bit scalar
       * store.
       */
      unsigned substore_num_bits = MIN2(num_bits - i, step);
      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
      nir_ssa_def *vec32 = load_comps_to_vec(b, bit_size, &comps[comp_idx],
                                             substore_num_bits / bit_size, 32);
      nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2));

      /* For anything less than 32 bits we need to use the masked version of
       * the intrinsic to preserve data living in the same 32-bit slot.
       */
      if (substore_num_bits < 32) {
         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op,
                                  nir_intrinsic_align(intr));
      } else {
         lower_store_vec32(b, index, vec32, op);
      }

      comp_idx += substore_num_bits / bit_size;
   }

   nir_instr_remove(&intr->instr);

   return true;
}

static void
ubo_to_temp_patch_deref_mode(nir_deref_instr *deref)
{
   deref->modes = nir_var_shader_temp;
   nir_foreach_use(use_src, &deref->dest.ssa) {
      if (use_src->parent_instr->type != nir_instr_type_deref)
         continue;

      nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr);
      ubo_to_temp_patch_deref_mode(parent);
   }
}

static void
ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he)
{
   assert(nir_deref_mode_is(deref, nir_var_mem_constant));
   assert(deref->dest.is_ssa);
   assert(he->data);

   nir_foreach_use(use_src, &deref->dest.ssa) {
      if (use_src->parent_instr->type == nir_instr_type_deref) {
         ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he);
      } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr);
         if (intr->intrinsic != nir_intrinsic_load_deref)
            he->data = NULL;
      } else {
         he->data = NULL;
      }

      if (!he->data)
         break;
   }
}
bool
dxil_nir_lower_ubo_to_temp(nir_shader *nir)
{
   struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL);
   bool progress = false;

   /* First pass: collect all UBO accesses that could be turned into
    * shader temp accesses.
    */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (!nir_deref_mode_is(deref, nir_var_mem_constant) ||
                deref->deref_type != nir_deref_type_var)
               continue;

            struct hash_entry *he =
               _mesa_hash_table_search(ubo_to_temp, deref->var);

            if (!he)
               he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var);

            if (!he->data)
               continue;

            ubo_to_temp_update_entry(deref, he);
         }
      }
   }

   hash_table_foreach(ubo_to_temp, he) {
      nir_variable *var = he->data;

      if (!var)
         continue;

      /* Change the variable mode. */
      var->data.mode = nir_var_shader_temp;

      /* Make sure the variable has a name.
       * DXIL variables must have names.
       */
      if (!var->name)
         var->name = ralloc_asprintf(nir, "global_%d",
                                     exec_list_length(&nir->variables));

      progress = true;
   }
   _mesa_hash_table_destroy(ubo_to_temp, NULL);

   /* Second pass: patch all derefs that were accessing the converted UBO
    * variables.
    */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (nir_deref_mode_is(deref, nir_var_mem_constant) &&
                deref->deref_type == nir_deref_type_var &&
                deref->var->data.mode == nir_var_shader_temp)
               ubo_to_temp_patch_deref_mode(deref);
         }
      }
   }

   return progress;
}

static bool
lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_ssa_def *result =
      build_load_ubo_dxil(b, intr->src[0].ssa,
                          intr->src[1].ssa,
                          nir_dest_num_components(intr->dest),
                          nir_dest_bit_size(intr->dest),
                          nir_intrinsic_align(intr));

   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);

   return true;
}

bool
dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir,
                                    const struct dxil_nir_lower_loads_stores_options *options)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {
            case nir_intrinsic_load_deref:
               progress |= lower_load_deref(&b, intr);
               break;
            case nir_intrinsic_load_shared:
            case nir_intrinsic_load_scratch:
               progress |= lower_32b_offset_load(&b, intr);
               break;
            case nir_intrinsic_load_ssbo:
               progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32);
               break;
            case nir_intrinsic_load_ubo:
               progress |= lower_load_ubo(&b, intr);
               break;
            case nir_intrinsic_store_shared:
            case nir_intrinsic_store_scratch:
               progress |= lower_32b_offset_store(&b, intr);
               break;
            case nir_intrinsic_store_ssbo:
               progress |= lower_store_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32);
               break;
            default:
               break;
            }
         }
      }
   }

   return progress;
}
static bool
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr)
{
   b->cursor = nir_before_instr(&intr->instr);

   assert(intr->src[0].is_ssa);
   nir_ssa_def *offset =
      nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr)));
   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));

   nir_intrinsic_op dxil_op =
      intr->intrinsic == nir_intrinsic_shared_atomic_swap ?
      nir_intrinsic_shared_atomic_swap_dxil : nir_intrinsic_shared_atomic_dxil;
   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op);
   atomic->src[0] = nir_src_for_ssa(index);
   assert(intr->src[1].is_ssa);
   atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa);
   if (dxil_op == nir_intrinsic_shared_atomic_swap_dxil) {
      assert(intr->src[2].is_ssa);
      atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa);
   }
   atomic->num_components = 0;
   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32);
   nir_intrinsic_set_atomic_op(atomic, nir_intrinsic_atomic_op(intr));

   nir_builder_instr_insert(b, &atomic->instr);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa);
   nir_instr_remove(&intr->instr);
   return true;
}

bool
dxil_nir_lower_atomics_to_dxil(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {
            case nir_intrinsic_shared_atomic:
            case nir_intrinsic_shared_atomic_swap:
               progress |= lower_shared_atomic(&b, intr);
               break;
            default:
               break;
            }
         }
      }
   }

   return progress;
}
static bool
lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref)
{
   assert(nir_deref_mode_is(deref, nir_var_mem_ssbo));
   assert(deref->deref_type == nir_deref_type_var ||
          deref->deref_type == nir_deref_type_cast);
   nir_variable *var = deref->var;

   b->cursor = nir_before_instr(&deref->instr);

   if (deref->deref_type == nir_deref_type_var) {
      /* We turn all deref_var into deref_cast and build a pointer value
       * based on the var binding which encodes the UAV id.
       */
      nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32);
      nir_deref_instr *deref_cast =
         nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type,
                              glsl_get_explicit_stride(var->type));
      nir_ssa_def_rewrite_uses(&deref->dest.ssa,
                               &deref_cast->dest.ssa);
      nir_instr_remove(&deref->instr);

      deref = deref_cast;
      return true;
   }
   return false;
}
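/* For example, an SSBO variable with binding 3 becomes the 64-bit immediate
 * 0x0000000300000000: the UAV id lives in the upper 32 bits, and
 * lower_alu_deref_srcs below then folds byte offsets into the lower 32 bits
 * of this pointer value via nir_iadd.
 */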
bool
dxil_nir_lower_deref_ssbo(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);

            if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) ||
                (deref->deref_type != nir_deref_type_var &&
                 deref->deref_type != nir_deref_type_cast))
               continue;

            progress |= lower_deref_ssbo(&b, deref);
         }
      }
   }

   return progress;
}

static bool
lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   bool progress = false;

   b->cursor = nir_before_instr(&alu->instr);

   for (unsigned i = 0; i < info->num_inputs; i++) {
      nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src);

      if (!deref)
         continue;

      nir_deref_path path;
      nir_deref_path_init(&path, deref, NULL);
      nir_deref_instr *root_deref = path.path[0];
      nir_deref_path_finish(&path);

      if (root_deref->deref_type != nir_deref_type_cast)
         continue;

      nir_ssa_def *ptr =
         nir_iadd(b, root_deref->parent.ssa,
                     nir_build_deref_offset(b, deref, cl_type_size_align));
      nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr));
      progress = true;
   }

   return progress;
}

bool
dxil_nir_opt_alu_deref_srcs(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            progress |= lower_alu_deref_srcs(&b, alu);
         }
      }
   }

   return progress;
}

static void
cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size)
{
   nir_phi_instr *lowered = nir_phi_instr_create(b->shader);
   int num_components = 0;
   int old_bit_size = phi->dest.ssa.bit_size;

   nir_foreach_phi_src(src, phi) {
      assert(num_components == 0 || num_components == src->src.ssa->num_components);
      num_components = src->src.ssa->num_components;

      b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr);

      nir_ssa_def *cast = nir_u2uN(b, src->src.ssa, new_bit_size);

      nir_phi_instr_add_src(lowered, src->pred, nir_src_for_ssa(cast));
   }

   nir_ssa_dest_init(&lowered->instr, &lowered->dest,
                     num_components, new_bit_size);

   b->cursor = nir_before_instr(&phi->instr);
   nir_builder_instr_insert(b, &lowered->instr);

   b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor));
   nir_ssa_def *result = nir_u2uN(b, &lowered->dest.ssa, old_bit_size);

   nir_ssa_def_rewrite_uses(&phi->dest.ssa, result);
   nir_instr_remove(&phi->instr);
}

static bool
upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size)
{
   nir_builder b;
   nir_builder_init(&b, impl);
   bool progress = false;

   nir_foreach_block_reverse(block, impl) {
      nir_foreach_phi_safe(phi, block) {
         assert(phi->dest.is_ssa);

         if (phi->dest.ssa.bit_size == 1 ||
             phi->dest.ssa.bit_size >= min_bit_size)
            continue;

         cast_phi(&b, phi, min_bit_size);
         progress = true;
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

bool
dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= upcast_phi_impl(function->impl, min_bit_size);
   }

   return progress;
}
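/* For example, with min_bit_size == 16, an 8-bit phi is rewritten as a
 * 16-bit phi whose sources are nir_u2uN-extended in their predecessor
 * blocks, followed by a truncating nir_u2uN back to 8 bits after the phis,
 * so the backend never sees a phi narrower than the minimum it supports.
 * 1-bit (boolean) phis are deliberately left alone.
 */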
struct dxil_nir_split_clip_cull_distance_params {
   nir_variable *new_var[2];
   nir_shader *shader;
};

/* In GLSL and SPIR-V, clip and cull distance are arrays of floats (with a
 * limit of 8). In DXIL, clip and cull distances are up to 2 float4s
 * combined. Coming from GLSL, we can request this 2 float4 format, but
 * coming from SPIR-V, we can't, and have to accept a "compact" array of
 * scalar floats.
 *
 * To help emit a valid input signature for this case, split the variables
 * so that they match what we need to put in the signature
 * (e.g. { float clip[4]; float clip1; float cull[3]; })
 */
static bool
dxil_nir_split_clip_cull_distance_instr(nir_builder *b,
                                        nir_instr *instr,
                                        void *cb_data)
{
   struct dxil_nir_split_clip_cull_distance_params *params = cb_data;

   if (instr->type != nir_instr_type_deref)
      return false;

   nir_deref_instr *deref = nir_instr_as_deref(instr);
   nir_variable *var = nir_deref_instr_get_variable(deref);
   if (!var ||
       var->data.location < VARYING_SLOT_CLIP_DIST0 ||
       var->data.location > VARYING_SLOT_CULL_DIST1 ||
       !var->data.compact)
      return false;

   unsigned new_var_idx = var->data.mode == nir_var_shader_in ? 0 : 1;
   nir_variable *new_var = params->new_var[new_var_idx];

   /* The location should only be inside clip distance, because clip
    * and cull should've been merged by nir_lower_clip_cull_distance_arrays()
    */
   assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
          var->data.location == VARYING_SLOT_CLIP_DIST1);

   /* The deref chain to the clip/cull variables should be simple, just the
    * var and an array with a constant index, otherwise more lowering/optimization
    * might be needed before this pass, e.g. copy prop, lower_io_to_temporaries,
    * split_var_copies, and/or lower_var_copies. In the case of arrayed I/O like
    * inputs to the tessellation or geometry stages, there might be a second
    * level of array index.
    */
   assert(deref->deref_type == nir_deref_type_var ||
          deref->deref_type == nir_deref_type_array);

   b->cursor = nir_before_instr(instr);
   unsigned arrayed_io_length = 0;
   const struct glsl_type *old_type = var->type;
   if (nir_is_arrayed_io(var, b->shader->info.stage)) {
      arrayed_io_length = glsl_array_size(old_type);
      old_type = glsl_get_array_element(old_type);
   }
   if (!new_var) {
      /* Update lengths for new and old vars */
      int old_length = glsl_array_size(old_type);
      int new_length = (old_length + var->data.location_frac) - 4;
      old_length -= new_length;

      /* The existing variable fits in the float4 */
      if (new_length <= 0)
         return false;

      new_var = nir_variable_clone(var, params->shader);
      nir_shader_add_variable(params->shader, new_var);
      assert(glsl_get_base_type(glsl_get_array_element(old_type)) == GLSL_TYPE_FLOAT);
      var->type = glsl_array_type(glsl_float_type(), old_length, 0);
      new_var->type = glsl_array_type(glsl_float_type(), new_length, 0);
      if (arrayed_io_length) {
         var->type = glsl_array_type(var->type, arrayed_io_length, 0);
         new_var->type = glsl_array_type(new_var->type, arrayed_io_length, 0);
      }
      new_var->data.location++;
      new_var->data.location_frac = 0;
      params->new_var[new_var_idx] = new_var;
   }

   /* Update the type for derefs of the old var */
   if (deref->deref_type == nir_deref_type_var) {
      deref->type = var->type;
      return false;
   }

   if (glsl_type_is_array(deref->type)) {
      assert(arrayed_io_length > 0);
      deref->type = glsl_get_array_element(var->type);
      return false;
   }

   assert(glsl_get_base_type(deref->type) == GLSL_TYPE_FLOAT);

   nir_const_value *index = nir_src_as_const_value(deref->arr.index);
   assert(index);

   /* Treat this array as a vector starting at the component index in
    * location_frac, so if location_frac is 1 and index is 0, then it's
    * accessing the 'y' component of the vector. If index + location_frac
    * is >= 4, there's no component there, so we need to add a new variable
    * and adjust the index.
    */
   unsigned total_index = index->u32 + var->data.location_frac;
   if (total_index < 4)
      return false;

   nir_deref_instr *new_var_deref = nir_build_deref_var(b, new_var);
   nir_deref_instr *new_intermediate_deref = new_var_deref;
   if (arrayed_io_length) {
      nir_deref_instr *parent = nir_src_as_deref(deref->parent);
      assert(parent->deref_type == nir_deref_type_array);
      new_intermediate_deref =
         nir_build_deref_array(b, new_intermediate_deref, parent->arr.index.ssa);
   }
   nir_deref_instr *new_array_deref =
      nir_build_deref_array(b, new_intermediate_deref,
                            nir_imm_int(b, total_index % 4));
   nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_array_deref->dest.ssa);
   return true;
}

bool
dxil_nir_split_clip_cull_distance(nir_shader *shader)
{
   struct dxil_nir_split_clip_cull_distance_params params = {
      .new_var = { NULL, NULL },
      .shader = shader,
   };
   nir_shader_instructions_pass(shader,
                                dxil_nir_split_clip_cull_distance_instr,
                                nir_metadata_block_index |
                                nir_metadata_dominance |
                                nir_metadata_loop_analysis,
                                &params);
   return params.new_var[0] != NULL || params.new_var[1] != NULL;
}
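/* For example, a compact float[5] clip-distance array at
 * VARYING_SLOT_CLIP_DIST0 with location_frac == 0 is split into a float[4]
 * on CLIP_DIST0 plus a new float[1] variable on CLIP_DIST1, and an access
 * to index 4 is redirected to index 0 (4 % 4) of the new variable.
 */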
static bool
dxil_nir_lower_double_math_instr(nir_builder *b,
                                 nir_instr *instr,
                                 UNUSED void *cb_data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   /* TODO: See if we can apply this explicitly to packs/unpacks that are then
    * used as a double. As-is, if we had an app explicitly do a 64bit integer op,
    * then try to bitcast to double (not expressible in HLSL, but it is in other
    * source languages), this would unpack the integer and repack as a double, when
    * we probably want to just send the bitcast through to the backend.
    */

   b->cursor = nir_before_instr(&alu->instr);

   bool progress = false;
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; ++i) {
      if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float &&
          alu->src[i].src.ssa->bit_size == 64) {
         unsigned num_components = nir_op_infos[alu->op].input_sizes[i];
         if (!num_components)
            num_components = alu->dest.dest.ssa.num_components;
         nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS];
         for (unsigned c = 0; c < num_components; ++c) {
            nir_ssa_def *packed_double = nir_channel(b, alu->src[i].src.ssa, alu->src[i].swizzle[c]);
            nir_ssa_def *unpacked_double = nir_unpack_64_2x32(b, packed_double);
            components[c] = nir_pack_double_2x32_dxil(b, unpacked_double);
            alu->src[i].swizzle[c] = c;
         }
         nir_instr_rewrite_src_ssa(instr, &alu->src[i].src,
                                   nir_vec(b, components, num_components));
         progress = true;
      }
   }

   if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float &&
       alu->dest.dest.ssa.bit_size == 64) {
      b->cursor = nir_after_instr(&alu->instr);
      nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS];
      for (unsigned c = 0; c < alu->dest.dest.ssa.num_components; ++c) {
         nir_ssa_def *packed_double = nir_channel(b, &alu->dest.dest.ssa, c);
         nir_ssa_def *unpacked_double = nir_unpack_double_2x32_dxil(b, packed_double);
         components[c] = nir_pack_64_2x32(b, unpacked_double);
      }
      nir_ssa_def *repacked_dvec = nir_vec(b, components, alu->dest.dest.ssa.num_components);
      nir_ssa_def_rewrite_uses_after(&alu->dest.dest.ssa, repacked_dvec, repacked_dvec->parent_instr);
      progress = true;
   }

   return progress;
}

bool
dxil_nir_lower_double_math(nir_shader *shader)
{
   return nir_shader_instructions_pass(shader,
                                       dxil_nir_lower_double_math_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance |
                                       nir_metadata_loop_analysis,
                                       NULL);
}
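/* For example, each 64-bit float source of an fadd is rewritten per
 * component as pack_double_2x32_dxil(unpack_64_2x32(src)), and the 64-bit
 * result as pack_64_2x32(unpack_double_2x32_dxil(dest)), bracketing every
 * double ALU op with the DXIL-specific pack/unpack pair that the backend
 * can emit directly.
 */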
typedef struct {
   gl_system_value *values;
   uint32_t count;
} zero_system_values_state;

static bool
lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state)
{
   if (instr->type != nir_instr_type_intrinsic) {
      return false;
   }

   nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return false;

   assert(intrin->dest.is_ssa);

   zero_system_values_state* state = (zero_system_values_state*)cb_state;
   for (uint32_t i = 0; i < state->count; ++i) {
      gl_system_value value = state->values[i];
      nir_intrinsic_op value_op = nir_intrinsic_from_system_value(value);

      if (intrin->intrinsic == value_op) {
         return true;
      } else if (intrin->intrinsic == nir_intrinsic_load_deref) {
         nir_deref_instr* deref = nir_src_as_deref(intrin->src[0]);
         if (!nir_deref_mode_is(deref, nir_var_system_value))
            return false;

         nir_variable* var = deref->var;
         if (var->data.location == value) {
            return true;
         }
      }
   }

   return false;
}

static nir_ssa_def*
lower_system_value_to_zero_instr(nir_builder* b, nir_instr* instr, void* _state)
{
   return nir_imm_int(b, 0);
}

bool
dxil_nir_lower_system_values_to_zero(nir_shader* shader,
                                     gl_system_value* system_values,
                                     uint32_t count)
{
   zero_system_values_state state = { system_values, count };
   return nir_shader_lower_instructions(shader,
                                        lower_system_value_to_zero_filter,
                                        lower_system_value_to_zero_instr,
                                        &state);
}

static void
lower_load_local_group_size(nir_builder *b, nir_intrinsic_instr *intr)
{
   b->cursor = nir_after_instr(&intr->instr);

   nir_const_value v[3] = {
      nir_const_value_for_int(b->shader->info.workgroup_size[0], 32),
      nir_const_value_for_int(b->shader->info.workgroup_size[1], 32),
      nir_const_value_for_int(b->shader->info.workgroup_size[2], 32)
   };
   nir_ssa_def *size = nir_build_imm(b, 3, 32, v);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, size);
   nir_instr_remove(&intr->instr);
}

static bool
lower_system_values_impl(nir_builder *b, nir_instr *instr, void *_state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   switch (intr->intrinsic) {
   case nir_intrinsic_load_workgroup_size:
      lower_load_local_group_size(b, intr);
      return true;
   default:
      return false;
   }
}

bool
dxil_nir_lower_system_values(nir_shader *shader)
{
   return nir_shader_instructions_pass(shader, lower_system_values_impl,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance |
                                       nir_metadata_loop_analysis,
                                       NULL);
}
static const struct glsl_type *
get_bare_samplers_for_type(const struct glsl_type *type, bool is_shadow)
{
   const struct glsl_type *base_sampler_type =
      is_shadow ?
      glsl_bare_shadow_sampler_type() : glsl_bare_sampler_type();
   return glsl_type_wrap_in_arrays(base_sampler_type, type);
}

static const struct glsl_type *
get_textures_for_sampler_type(const struct glsl_type *type)
{
   return glsl_type_wrap_in_arrays(
      glsl_sampler_type_to_texture(
         glsl_without_array(type)), type);
}

static bool
redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);

   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
   if (sampler_idx == -1) {
      /* No sampler deref - does this instruction even need a sampler? If not,
       * sampler_index doesn't necessarily point to a sampler, so early-out.
       */
      if (!nir_tex_instr_need_sampler(tex))
         return false;

      /* No derefs but needs a sampler, must be using indices */
      nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->sampler_index);

      /* Already have a bare sampler here */
      if (bare_sampler)
         return false;

      nir_variable *old_sampler = NULL;
      nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) {
         if (var->data.binding <= tex->sampler_index &&
             var->data.binding + glsl_type_get_sampler_count(var->type) >
                tex->sampler_index) {
            /* Already have a bare sampler for this binding and it is of the
             * correct type, add it to the table
             */
            if (glsl_type_is_bare_sampler(glsl_without_array(var->type)) &&
                glsl_sampler_type_is_shadow(glsl_without_array(var->type)) ==
                   tex->is_shadow) {
               _mesa_hash_table_u64_insert(data, tex->sampler_index, var);
               return false;
            }

            old_sampler = var;
         }
      }

      assert(old_sampler);

      /* Clone the original sampler to a bare sampler of the correct type */
      bare_sampler = nir_variable_clone(old_sampler, b->shader);
      nir_shader_add_variable(b->shader, bare_sampler);

      bare_sampler->type =
         get_bare_samplers_for_type(old_sampler->type, tex->is_shadow);
      _mesa_hash_table_u64_insert(data, tex->sampler_index, bare_sampler);
      return true;
   }

   /* Using derefs means we have to rewrite the deref chain in addition to
    * cloning.
    */
   nir_deref_instr *final_deref = nir_src_as_deref(tex->src[sampler_idx].src);
   nir_deref_path path;
   nir_deref_path_init(&path, final_deref, NULL);

   nir_deref_instr *old_tail = path.path[0];
   assert(old_tail->deref_type == nir_deref_type_var);
   nir_variable *old_var = old_tail->var;
   if (glsl_type_is_bare_sampler(glsl_without_array(old_var->type)) &&
       glsl_sampler_type_is_shadow(glsl_without_array(old_var->type)) ==
          tex->is_shadow) {
      nir_deref_path_finish(&path);
      return false;
   }

   uint64_t var_key = ((uint64_t)old_var->data.descriptor_set << 32) |
                      old_var->data.binding;
   nir_variable *new_var = _mesa_hash_table_u64_search(data, var_key);
   if (!new_var) {
      new_var = nir_variable_clone(old_var, b->shader);
      nir_shader_add_variable(b->shader, new_var);
      new_var->type =
         get_bare_samplers_for_type(old_var->type, tex->is_shadow);
      _mesa_hash_table_u64_insert(data, var_key, new_var);
   }

   b->cursor = nir_after_instr(&old_tail->instr);
   nir_deref_instr *new_tail = nir_build_deref_var(b, new_var);

   for (unsigned i = 1; path.path[i]; ++i) {
      b->cursor = nir_after_instr(&path.path[i]->instr);
      new_tail = nir_build_deref_follower(b, new_tail, path.path[i]);
   }

   nir_deref_path_finish(&path);
   nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[sampler_idx].src,
                             &new_tail->dest.ssa);

   return true;
}
static bool
redirect_texture_derefs(struct nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
   if (texture_idx == -1) {
      /* No derefs, must be using indices */
      nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->texture_index);

      /* Already have a texture here */
      if (bare_sampler)
         return false;

      nir_variable *typed_sampler = NULL;
      nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) {
         if (var->data.binding <= tex->texture_index &&
             var->data.binding + glsl_type_get_texture_count(var->type) > tex->texture_index) {
            /* Already have a texture for this binding, add it to the table */
            _mesa_hash_table_u64_insert(data, tex->texture_index, var);
            return false;
         }

         if (var->data.binding <= tex->texture_index &&
             var->data.binding + glsl_type_get_sampler_count(var->type) > tex->texture_index &&
             !glsl_type_is_bare_sampler(glsl_without_array(var->type))) {
            typed_sampler = var;
         }
      }

      /* Clone the typed sampler to a texture and we're done */
      assert(typed_sampler);
      bare_sampler = nir_variable_clone(typed_sampler, b->shader);
      bare_sampler->type = get_textures_for_sampler_type(typed_sampler->type);
      nir_shader_add_variable(b->shader, bare_sampler);
      _mesa_hash_table_u64_insert(data, tex->texture_index, bare_sampler);
      return true;
   }

   /* Using derefs means we have to rewrite the deref chain in addition to
    * cloning.
    */
   nir_deref_instr *final_deref = nir_src_as_deref(tex->src[texture_idx].src);
   nir_deref_path path;
   nir_deref_path_init(&path, final_deref, NULL);

   nir_deref_instr *old_tail = path.path[0];
   assert(old_tail->deref_type == nir_deref_type_var);
   nir_variable *old_var = old_tail->var;
   if (glsl_type_is_texture(glsl_without_array(old_var->type)) ||
       glsl_type_is_image(glsl_without_array(old_var->type))) {
      nir_deref_path_finish(&path);
      return false;
   }

   uint64_t var_key = ((uint64_t)old_var->data.descriptor_set << 32) |
                      old_var->data.binding;
   nir_variable *new_var = _mesa_hash_table_u64_search(data, var_key);
   if (!new_var) {
      new_var = nir_variable_clone(old_var, b->shader);
      new_var->type = get_textures_for_sampler_type(old_var->type);
      nir_shader_add_variable(b->shader, new_var);
      _mesa_hash_table_u64_insert(data, var_key, new_var);
   }

   b->cursor = nir_after_instr(&old_tail->instr);
   nir_deref_instr *new_tail = nir_build_deref_var(b, new_var);

   for (unsigned i = 1; path.path[i]; ++i) {
      b->cursor = nir_after_instr(&path.path[i]->instr);
      new_tail = nir_build_deref_follower(b, new_tail, path.path[i]);
   }

   nir_deref_path_finish(&path);
   nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[texture_idx].src,
                             &new_tail->dest.ssa);

   return true;
}

bool
dxil_nir_split_typed_samplers(nir_shader *nir)
{
   struct hash_table_u64 *hash_table = _mesa_hash_table_u64_create(NULL);

   bool progress = nir_shader_instructions_pass(nir, redirect_sampler_derefs,
      nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, hash_table);

   _mesa_hash_table_u64_clear(hash_table);

   progress |= nir_shader_instructions_pass(nir, redirect_texture_derefs,
      nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, hash_table);

   _mesa_hash_table_u64_destroy(hash_table);
   return progress;
}
static bool
lower_sysval_to_load_input_impl(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   gl_system_value sysval = SYSTEM_VALUE_MAX;
   switch (intr->intrinsic) {
   case nir_intrinsic_load_front_face:
      sysval = SYSTEM_VALUE_FRONT_FACE;
      break;
   case nir_intrinsic_load_instance_id:
      sysval = SYSTEM_VALUE_INSTANCE_ID;
      break;
   case nir_intrinsic_load_vertex_id_zero_base:
      sysval = SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
      break;
   default:
      return false;
   }

   nir_variable **sysval_vars = (nir_variable **)data;
   nir_variable *var = sysval_vars[sysval];
   assert(var);

   const nir_alu_type dest_type = (sysval == SYSTEM_VALUE_FRONT_FACE)
      ? nir_type_uint32 : nir_get_nir_type_for_glsl_type(var->type);
   const unsigned bit_size = (sysval == SYSTEM_VALUE_FRONT_FACE)
      ? 32 : intr->dest.ssa.bit_size;

   b->cursor = nir_before_instr(instr);
   nir_ssa_def *result = nir_build_load_input(b, intr->dest.ssa.num_components, bit_size, nir_imm_int(b, 0),
      .base = var->data.driver_location, .dest_type = dest_type);

   /* The nir_type_uint32 is really a nir_type_bool32, but that type is very
    * inconvenient at this point during compilation. Convert to
    * nir_type_bool1 by comparing with zero.
    */
   if (sysval == SYSTEM_VALUE_FRONT_FACE)
      result = nir_ine_imm(b, result, 0);

   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   return true;
}

bool
dxil_nir_lower_sysval_to_load_input(nir_shader *s, nir_variable **sysval_vars)
{
   return nir_shader_instructions_pass(s, lower_sysval_to_load_input_impl,
      nir_metadata_block_index | nir_metadata_dominance, sysval_vars);
}

/* Comparison function to sort io values so that first come normal varyings,
 * then system values, and then system generated values.
 */
static int
variable_location_cmp(const nir_variable* a, const nir_variable* b)
{
   // Sort by stream, driver_location, location, location_frac, then index
   unsigned a_location = a->data.location;
   if (a_location >= VARYING_SLOT_PATCH0)
      a_location -= VARYING_SLOT_PATCH0;
   unsigned b_location = b->data.location;
   if (b_location >= VARYING_SLOT_PATCH0)
      b_location -= VARYING_SLOT_PATCH0;
   unsigned a_stream = a->data.stream & ~NIR_STREAM_PACKED;
   unsigned b_stream = b->data.stream & ~NIR_STREAM_PACKED;
   return a_stream != b_stream ?
            a_stream - b_stream :
          a->data.driver_location != b->data.driver_location ?
            a->data.driver_location - b->data.driver_location :
          a_location != b_location ?
            a_location - b_location :
          a->data.location_frac != b->data.location_frac ?
            a->data.location_frac - b->data.location_frac :
          a->data.index - b->data.index;
}

/* Order varyings according to driver location */
uint64_t
dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes)
{
   nir_sort_variables_with_modes(s, variable_location_cmp, modes);

   uint64_t result = 0;
   nir_foreach_variable_with_modes(var, s, modes) {
      result |= 1ull << var->data.location;
   }
   return result;
}
/* Sort PS outputs so that color outputs come first */
void
dxil_sort_ps_outputs(nir_shader* s)
{
   nir_foreach_variable_with_modes_safe(var, s, nir_var_shader_out) {
      /* We use the driver_location here to avoid introducing a new
       * struct or member variable here. The true, updated driver location
       * will be written below, after sorting
       */
      switch (var->data.location) {
      case FRAG_RESULT_DEPTH:
         var->data.driver_location = 1;
         break;
      case FRAG_RESULT_STENCIL:
         var->data.driver_location = 2;
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         var->data.driver_location = 3;
         break;
      default:
         var->data.driver_location = 0;
      }
   }

   nir_sort_variables_with_modes(s, variable_location_cmp,
                                 nir_var_shader_out);

   unsigned driver_loc = 0;
   nir_foreach_variable_with_modes(var, s, nir_var_shader_out) {
      var->data.driver_location = driver_loc++;
   }
}

enum dxil_sysvalue_type {
   DXIL_NO_SYSVALUE = 0,
   DXIL_USED_SYSVALUE,
   DXIL_SYSVALUE,
   DXIL_GENERATED_SYSVALUE
};

static enum dxil_sysvalue_type
nir_var_to_dxil_sysvalue_type(nir_variable *var, uint64_t other_stage_mask)
{
   switch (var->data.location) {
   case VARYING_SLOT_FACE:
      return DXIL_GENERATED_SYSVALUE;
   case VARYING_SLOT_POS:
   case VARYING_SLOT_PRIMITIVE_ID:
   case VARYING_SLOT_CLIP_DIST0:
   case VARYING_SLOT_CLIP_DIST1:
   case VARYING_SLOT_PSIZ:
   case VARYING_SLOT_TESS_LEVEL_INNER:
   case VARYING_SLOT_TESS_LEVEL_OUTER:
   case VARYING_SLOT_VIEWPORT:
   case VARYING_SLOT_LAYER:
   case VARYING_SLOT_VIEW_INDEX:
      if (!((1ull << var->data.location) & other_stage_mask))
         return DXIL_SYSVALUE;
      return DXIL_USED_SYSVALUE;
   default:
      return DXIL_NO_SYSVALUE;
   }
}

/* Order between stage values so that normal varyings come first,
 * then sysvalues and then system generated values.
 */
uint64_t
dxil_reassign_driver_locations(nir_shader* s, nir_variable_mode modes,
   uint64_t other_stage_mask)
{
   nir_foreach_variable_with_modes_safe(var, s, modes) {
      /* We use the driver_location here to avoid introducing a new
       * struct or member variable here. The true, updated driver location
       * will be written below, after sorting
       */
      var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask);
   }

   nir_sort_variables_with_modes(s, variable_location_cmp, modes);

   uint64_t result = 0;
   unsigned driver_loc = 0, driver_patch_loc = 0;
   nir_foreach_variable_with_modes(var, s, modes) {
      if (var->data.location < 64)
         result |= 1ull << var->data.location;
      /* Overlap patches with non-patch */
      var->data.driver_location = var->data.patch ?
         driver_patch_loc++ : driver_loc++;
   }
   return result;
}
static bool
lower_ubo_array_one_to_static(struct nir_builder *b, nir_instr *inst,
                              void *cb_data)
{
   if (inst->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(inst);

   if (intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor)
      return false;

   nir_variable *var =
      nir_get_binding_variable(b->shader, nir_chase_binding(intrin->src[0]));

   if (!var)
      return false;

   if (!glsl_type_is_array(var->type) || glsl_array_size(var->type) != 1)
      return false;

   nir_intrinsic_instr *index = nir_src_as_intrinsic(intrin->src[0]);
   /* We currently do not support reindex */
   assert(index && index->intrinsic == nir_intrinsic_vulkan_resource_index);

   if (nir_src_is_const(index->src[0]) && nir_src_as_uint(index->src[0]) == 0)
      return false;

   if (nir_intrinsic_desc_type(index) != VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
      return false;

   b->cursor = nir_instr_remove(&index->instr);

   // Indexing out of bounds on array of UBOs is considered undefined
   // behavior. Therefore, we just hardcode all the indices to 0.
   uint8_t bit_size = index->dest.ssa.bit_size;
   nir_ssa_def *zero = nir_imm_intN_t(b, 0, bit_size);
   nir_ssa_def *dest =
      nir_vulkan_resource_index(b, index->num_components, bit_size, zero,
                                .desc_set = nir_intrinsic_desc_set(index),
                                .binding = nir_intrinsic_binding(index),
                                .desc_type = nir_intrinsic_desc_type(index));

   nir_ssa_def_rewrite_uses(&index->dest.ssa, dest);

   return true;
}

bool
dxil_nir_lower_ubo_array_one_to_static(nir_shader *s)
{
   bool progress = nir_shader_instructions_pass(
      s, lower_ubo_array_one_to_static, nir_metadata_none, NULL);

   return progress;
}

static bool
is_fquantize2f16(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   return alu->op == nir_op_fquantize2f16;
}

static nir_ssa_def *
lower_fquantize2f16(struct nir_builder *b, nir_instr *instr, void *data)
{
   /*
    * SpvOpQuantizeToF16 documentation says:
    *
    * "
    * If Value is an infinity, the result is the same infinity.
    * If Value is a NaN, the result is a NaN, but not necessarily the same NaN.
    * If Value is positive with a magnitude too large to represent as a 16-bit
    * floating-point value, the result is positive infinity. If Value is negative
    * with a magnitude too large to represent as a 16-bit floating-point value,
    * the result is negative infinity. If the magnitude of Value is too small to
    * represent as a normalized 16-bit floating-point value, the result may be
    * either +0 or -0.
    * "
    *
    * which we turn into:
    *
    *   if (val < MIN_FLOAT16)
    *      return -INFINITY;
    *   else if (val > MAX_FLOAT16)
    *      return +INFINITY;
    *   else if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16 && sign(val) != 0)
    *      return -0.0f;
    *   else if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16 && sign(val) == 0)
    *      return +0.0f;
    *   else
    *      return round(val);
    */
   nir_alu_instr *alu = nir_instr_as_alu(instr);
   nir_ssa_def *src =
      nir_ssa_for_src(b, alu->src[0].src, nir_src_num_components(alu->src[0].src));

   nir_ssa_def *neg_inf_cond =
      nir_flt(b, src, nir_imm_float(b, -65504.0f));
   nir_ssa_def *pos_inf_cond =
      nir_flt(b, nir_imm_float(b, 65504.0f), src);
   nir_ssa_def *zero_cond =
      nir_flt(b, nir_fabs(b, src), nir_imm_float(b, ldexpf(1.0, -14)));
   nir_ssa_def *zero = nir_iand_imm(b, src, 1 << 31);
   nir_ssa_def *round = nir_iand_imm(b, src, ~BITFIELD_MASK(13));

   nir_ssa_def *res =
      nir_bcsel(b, neg_inf_cond, nir_imm_float(b, -INFINITY), round);
   res = nir_bcsel(b, pos_inf_cond, nir_imm_float(b, INFINITY), res);
   res = nir_bcsel(b, zero_cond, zero, res);
   return res;
}

bool
dxil_nir_lower_fquantize2f16(nir_shader *s)
{
   return nir_shader_lower_instructions(s, is_fquantize2f16, lower_fquantize2f16, NULL);
}
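/* Note on the masks above: ~BITFIELD_MASK(13) drops the 13 low bits of the
 * f32 mantissa that do not fit in half's 10-bit mantissa (23 - 10 = 13), so
 * "round" actually truncates toward zero, and 1 << 31 keeps only the sign
 * bit for the flush-to-zero case, yielding +0 or -0 as the spec allows.
 */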
static bool
fix_io_uint_deref_types(struct nir_builder *builder, nir_instr *instr,
                        void *data)
{
   if (instr->type != nir_instr_type_deref)
      return false;

   nir_deref_instr *deref = nir_instr_as_deref(instr);
   nir_variable *var =
      deref->deref_type == nir_deref_type_var ? deref->var : NULL;

   if (var == data) {
      deref->type = var->type;
      return true;
   }

   return false;
}

static bool
fix_io_uint_type(nir_shader *s, nir_variable_mode modes, int slot)
{
   nir_variable *fixed_var = NULL;
   nir_foreach_variable_with_modes(var, s, modes) {
      if (var->data.location == slot) {
         if (var->type == glsl_uint_type())
            return false;

         assert(var->type == glsl_int_type());
         var->type = glsl_uint_type();
         fixed_var = var;
         break;
      }
   }

   assert(fixed_var);

   return nir_shader_instructions_pass(s, fix_io_uint_deref_types,
                                       nir_metadata_all, fixed_var);
}

bool
dxil_nir_fix_io_uint_type(nir_shader *s, uint64_t in_mask, uint64_t out_mask)
{
   if (!(s->info.outputs_written & out_mask) &&
       !(s->info.inputs_read & in_mask))
      return false;

   bool progress = false;

   while (in_mask) {
      int slot = u_bit_scan64(&in_mask);
      progress |= (s->info.inputs_read & (1ull << slot)) &&
                  fix_io_uint_type(s, nir_var_shader_in, slot);
   }

   while (out_mask) {
      int slot = u_bit_scan64(&out_mask);
      progress |= (s->info.outputs_written & (1ull << slot)) &&
                  fix_io_uint_type(s, nir_var_shader_out, slot);
   }

   return progress;
}

struct remove_after_discard_state {
   struct nir_block *active_block;
};

static bool
remove_after_discard(struct nir_builder *builder, nir_instr *instr,
                     void *cb_data)
{
   struct remove_after_discard_state *state = cb_data;
   if (instr->block == state->active_block) {
      nir_instr_remove_v(instr);
      return true;
   }

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_discard &&
       intr->intrinsic != nir_intrinsic_terminate &&
       intr->intrinsic != nir_intrinsic_discard_if &&
       intr->intrinsic != nir_intrinsic_terminate_if)
      return false;

   state->active_block = instr->block;

   return false;
}

static bool
lower_kill(struct nir_builder *builder, nir_instr *instr, void *_cb_data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_discard &&
       intr->intrinsic != nir_intrinsic_terminate &&
       intr->intrinsic != nir_intrinsic_discard_if &&
       intr->intrinsic != nir_intrinsic_terminate_if)
      return false;

   builder->cursor = nir_instr_remove(instr);
   if (intr->intrinsic == nir_intrinsic_discard ||
       intr->intrinsic == nir_intrinsic_terminate) {
      nir_demote(builder);
   } else {
      assert(intr->src[0].is_ssa);
      nir_demote_if(builder, intr->src[0].ssa);
   }

   nir_jump(builder, nir_jump_return);

   return true;
}

bool
dxil_nir_lower_discard_and_terminate(nir_shader *s)
{
   if (s->info.stage != MESA_SHADER_FRAGMENT)
      return false;

   // This pass only works if all functions have been inlined
   assert(exec_list_length(&s->functions) == 1);
   struct remove_after_discard_state state;
   state.active_block = NULL;
   nir_shader_instructions_pass(s, remove_after_discard, nir_metadata_none,
                                &state);
   return nir_shader_instructions_pass(s, lower_kill, nir_metadata_none,
                                       NULL);
}

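/* Sketch of the lowering above (illustrative, NIR-like pseudocode):
 *
 *    discard_if ssa_1      ->   demote_if ssa_1
 *    <rest of block>            return
 *
 * demote keeps the invocation alive as a helper so derivatives computed by
 * neighboring lanes stay valid, while the return reproduces discard's
 * control-flow effect; remove_after_discard has already dropped the
 * now-dead remainder of the block. */
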
static bool
update_writes(struct nir_builder *b, nir_instr *instr, void *_state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   nir_io_semantics io = nir_intrinsic_io_semantics(intr);
   if (io.location != VARYING_SLOT_POS)
      return false;

   nir_ssa_def *src = intr->src[0].ssa;
   unsigned write_mask = nir_intrinsic_write_mask(intr);
   if (src->num_components == 4 && write_mask == 0xf)
      return false;

   b->cursor = nir_before_instr(instr);
   unsigned first_comp = nir_intrinsic_component(intr);
   nir_ssa_def *channels[4] = { NULL, NULL, NULL, NULL };
   assert(first_comp + src->num_components <= ARRAY_SIZE(channels));
   for (unsigned i = 0; i < src->num_components; ++i)
      if (write_mask & (1 << i))
         channels[i + first_comp] = nir_channel(b, src, i);
   for (unsigned i = 0; i < 4; ++i)
      if (!channels[i])
         channels[i] = nir_imm_intN_t(b, 0, src->bit_size);

   intr->num_components = 4;
   nir_instr_rewrite_src_ssa(instr, &intr->src[0], nir_vec(b, channels, 4));
   nir_intrinsic_set_component(intr, 0);
   nir_intrinsic_set_write_mask(intr, 0xf);
   return true;
}

bool
dxil_nir_ensure_position_writes(nir_shader *s)
{
   if (s->info.stage != MESA_SHADER_VERTEX &&
       s->info.stage != MESA_SHADER_GEOMETRY &&
       s->info.stage != MESA_SHADER_TESS_EVAL)
      return false;
   if ((s->info.outputs_written & VARYING_BIT_POS) == 0)
      return false;

   return nir_shader_instructions_pass(s, update_writes,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}

static bool
is_sample_pos(const nir_instr *instr, const void *_data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   return intr->intrinsic == nir_intrinsic_load_sample_pos;
}

static nir_ssa_def *
lower_sample_pos(nir_builder *b, nir_instr *instr, void *_data)
{
   return nir_load_sample_pos_from_id(b, 32, nir_load_sample_id(b));
}

bool
dxil_nir_lower_sample_pos(nir_shader *s)
{
   return nir_shader_lower_instructions(s, is_sample_pos, lower_sample_pos,
                                        NULL);
}

static bool
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_subgroup_id)
      return false;

   b->cursor = nir_before_block(nir_start_block(b->impl));
   if (b->shader->info.workgroup_size[1] == 1 &&
       b->shader->info.workgroup_size[2] == 1) {
      /* When using Nx1x1 groups, use a simple stable algorithm
       * which is almost guaranteed to be correct. */
      nir_ssa_def *subgroup_id =
         nir_udiv(b, nir_load_local_invocation_index(b),
                  nir_load_subgroup_size(b));
      nir_ssa_def_rewrite_uses(&intr->dest.ssa, subgroup_id);
      return true;
   }

   nir_ssa_def **subgroup_id = (nir_ssa_def **)data;
   if (*subgroup_id == NULL) {
      nir_variable *subgroup_id_counter =
         nir_variable_create(b->shader, nir_var_mem_shared, glsl_uint_type(),
                             "dxil_SubgroupID_counter");
      nir_variable *subgroup_id_local =
         nir_local_variable_create(b->impl, glsl_uint_type(),
                                   "dxil_SubgroupID_local");
      nir_store_var(b, subgroup_id_local, nir_imm_int(b, 0), 1);

      nir_deref_instr *counter_deref =
         nir_build_deref_var(b, subgroup_id_counter);
      nir_ssa_def *tid = nir_load_local_invocation_index(b);
      nir_if *nif = nir_push_if(b, nir_ieq_imm(b, tid, 0));
      nir_store_deref(b, counter_deref, nir_imm_int(b, 0), 1);
      nir_pop_if(b, nif);

      nir_scoped_barrier(b,
                         .execution_scope = NIR_SCOPE_WORKGROUP,
                         .memory_scope = NIR_SCOPE_WORKGROUP,
                         .memory_semantics = NIR_MEMORY_ACQ_REL,
                         .memory_modes = nir_var_mem_shared);

      nif = nir_push_if(b, nir_elect(b, 1));
      nir_ssa_def *subgroup_id_first_thread =
         nir_deref_atomic(b, 32, &counter_deref->dest.ssa, nir_imm_int(b, 1),
                          .atomic_op = nir_atomic_op_iadd);
      nir_store_var(b, subgroup_id_local, subgroup_id_first_thread, 1);
      nir_pop_if(b, nif);

      nir_ssa_def *subgroup_id_loaded = nir_load_var(b, subgroup_id_local);
      *subgroup_id = nir_read_first_invocation(b, subgroup_id_loaded);
   }
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, *subgroup_id);
   return true;
}

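/* Sketch of the general (non-Nx1x1) algorithm built above, in pseudo-GLSL
 * (illustrative only):
 *
 *    shared uint counter;
 *    if (local_invocation_index == 0) counter = 0;
 *    barrier();
 *    uint local = 0;
 *    if (subgroupElect()) local = atomicAdd(counter, 1);
 *    subgroup_id = subgroupBroadcastFirst(local);
 *
 * Each wave's first lane grabs the next counter value, so IDs are dense
 * within the workgroup but assigned in no particular order. In the Nx1x1
 * case, waves are assumed to be laid out linearly, e.g. with 64 threads
 * and a wave size of 32, invocations 0-31 map to subgroup 0 and 32-63 to
 * subgroup 1, which is exactly local_invocation_index / subgroup_size. */
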
bool
dxil_nir_lower_subgroup_id(nir_shader *s)
{
   nir_ssa_def *subgroup_id = NULL;
   return nir_shader_instructions_pass(s, lower_subgroup_id,
                                       nir_metadata_none, &subgroup_id);
}

static bool
lower_num_subgroups(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_num_subgroups)
      return false;

   b->cursor = nir_before_instr(instr);
   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);
   nir_ssa_def *size_minus_one = nir_iadd_imm(b, subgroup_size, -1);
   nir_ssa_def *workgroup_size_vec = nir_load_workgroup_size(b);
   nir_ssa_def *workgroup_size =
      nir_imul(b, nir_channel(b, workgroup_size_vec, 0),
               nir_imul(b, nir_channel(b, workgroup_size_vec, 1),
                        nir_channel(b, workgroup_size_vec, 2)));
   nir_ssa_def *ret = nir_idiv(b, nir_iadd(b, workgroup_size, size_minus_one),
                               subgroup_size);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret);
   return true;
}

bool
dxil_nir_lower_num_subgroups(nir_shader *s)
{
   return nir_shader_instructions_pass(s, lower_num_subgroups,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance |
                                       nir_metadata_loop_analysis,
                                       NULL);
}

static const struct glsl_type *
get_cast_type(unsigned bit_size)
{
   switch (bit_size) {
   case 64:
      return glsl_int64_t_type();
   case 32:
      return glsl_int_type();
   case 16:
      return glsl_int16_t_type();
   case 8:
      return glsl_int8_t_type();
   }
   unreachable("Invalid bit_size");
}

static void
split_unaligned_load(nir_builder *b, nir_intrinsic_instr *intrin,
                     unsigned alignment)
{
   enum gl_access_qualifier access = nir_intrinsic_access(intrin);
   nir_ssa_def *srcs[NIR_MAX_VEC_COMPONENTS * NIR_MAX_VEC_COMPONENTS *
                     sizeof(int64_t) / 8];
   unsigned comp_size = intrin->dest.ssa.bit_size / 8;
   unsigned num_comps = intrin->dest.ssa.num_components;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]);

   const struct glsl_type *cast_type = get_cast_type(alignment * 8);
   nir_deref_instr *cast =
      nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type,
                           alignment);

   unsigned num_loads = DIV_ROUND_UP(comp_size * num_comps, alignment);
   for (unsigned i = 0; i < num_loads; ++i) {
      nir_deref_instr *elem =
         nir_build_deref_ptr_as_array(b, cast,
                                      nir_imm_intN_t(b, i,
                                                     cast->dest.ssa.bit_size));
      srcs[i] = nir_load_deref_with_access(b, elem, access);
   }

   nir_ssa_def *new_dest =
      nir_extract_bits(b, srcs, num_loads, 0, num_comps,
                       intrin->dest.ssa.bit_size);
   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, new_dest);
   nir_instr_remove(&intrin->instr);
}

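/* Example for the helper above: an unaligned vec4 32-bit load at 2-byte
 * alignment becomes DIV_ROUND_UP(4 * 4, 2) = 8 consecutive i16 loads
 * through the casted deref, which nir_extract_bits() then reassembles into
 * a single 4x32-bit value. */
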
static void
split_unaligned_store(nir_builder *b, nir_intrinsic_instr *intrin,
                      unsigned alignment)
{
   enum gl_access_qualifier access = nir_intrinsic_access(intrin);

   assert(intrin->src[1].is_ssa);
   nir_ssa_def *value = intrin->src[1].ssa;
   unsigned comp_size = value->bit_size / 8;
   unsigned num_comps = value->num_components;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]);

   const struct glsl_type *cast_type = get_cast_type(alignment * 8);
   nir_deref_instr *cast =
      nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type,
                           alignment);

   unsigned num_stores = DIV_ROUND_UP(comp_size * num_comps, alignment);
   for (unsigned i = 0; i < num_stores; ++i) {
      nir_ssa_def *substore_val =
         nir_extract_bits(b, &value, 1, i * alignment * 8, 1, alignment * 8);
      nir_deref_instr *elem =
         nir_build_deref_ptr_as_array(b, cast,
                                      nir_imm_intN_t(b, i,
                                                     cast->dest.ssa.bit_size));
      nir_store_deref_with_access(b, elem, substore_val, ~0, access);
   }

   nir_instr_remove(&intrin->instr);
}

bool
dxil_nir_split_unaligned_loads_stores(nir_shader *shader,
                                      nir_variable_mode modes)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_builder b;
      nir_builder_init(&b, function->impl);

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_deref &&
                intrin->intrinsic != nir_intrinsic_store_deref)
               continue;
            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
            if (!nir_deref_mode_may_be(deref, modes))
               continue;

            unsigned align_mul = 0, align_offset = 0;
            nir_get_explicit_deref_align(deref, true, &align_mul,
                                         &align_offset);

            unsigned alignment =
               align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;

            /* We can load anything at 4-byte alignment, except for UBOs
             * (AKA CBs), where the granularity is 16 bytes.
             */
            unsigned req_align =
               (nir_deref_mode_is_one_of(deref, nir_var_mem_ubo |
                                                nir_var_mem_push_const) ?
                   16 : 4);
            if (alignment >= req_align)
               continue;

            nir_ssa_def *val;
            if (intrin->intrinsic == nir_intrinsic_load_deref) {
               assert(intrin->dest.is_ssa);
               val = &intrin->dest.ssa;
            } else {
               assert(intrin->src[1].is_ssa);
               val = intrin->src[1].ssa;
            }

            unsigned scalar_byte_size =
               glsl_type_is_boolean(deref->type) ?
                  4 : glsl_get_bit_size(deref->type) / 8;
            unsigned num_components =
               /* If the vector stride is larger than the scalar size,
                * lower_explicit_io will turn this into multiple scalar loads
                * anyway, so we don't have to split it here.
                */
               glsl_get_explicit_stride(deref->type) > scalar_byte_size ?
                  1 : (val->num_components == 3 ? 4 : val->num_components);
            unsigned natural_alignment = scalar_byte_size * num_components;

            if (alignment >= natural_alignment)
               continue;

            if (intrin->intrinsic == nir_intrinsic_load_deref)
               split_unaligned_load(&b, intrin, alignment);
            else
               split_unaligned_store(&b, intrin, alignment);
            progress = true;
         }
      }
   }

   return progress;
}

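/* Example of the alignment computation above: a deref with align_mul = 4
 * and align_offset = 2 is only guaranteed 2-byte aligned, since
 * 1 << (ffs(2) - 1) == 2. A scalar 32-bit load through it has a natural
 * alignment of 4, so it gets split into two 16-bit loads. vec3 values are
 * rounded up to vec4 for the natural-alignment check. */
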
static void
lower_inclusive_to_exclusive(nir_builder *b, nir_intrinsic_instr *intr)
{
   b->cursor = nir_after_instr(&intr->instr);

   nir_op op = nir_intrinsic_reduction_op(intr);
   intr->intrinsic = nir_intrinsic_exclusive_scan;
   nir_intrinsic_set_reduction_op(intr, op);

   nir_ssa_def *final_val = nir_build_alu2(b, nir_intrinsic_reduction_op(intr),
                                           &intr->dest.ssa, intr->src[0].ssa);
   nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, final_val,
                                  final_val->parent_instr);
}

static bool
lower_subgroup_scan(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   switch (intr->intrinsic) {
   case nir_intrinsic_exclusive_scan:
   case nir_intrinsic_inclusive_scan:
      switch ((nir_op)nir_intrinsic_reduction_op(intr)) {
      case nir_op_iadd:
      case nir_op_fadd:
      case nir_op_imul:
      case nir_op_fmul:
         if (intr->intrinsic == nir_intrinsic_exclusive_scan)
            return false;
         lower_inclusive_to_exclusive(b, intr);
         return true;
      default:
         break;
      }
      break;
   default:
      return false;
   }

   b->cursor = nir_before_instr(instr);
   nir_op op = nir_intrinsic_reduction_op(intr);
   nir_ssa_def *subgroup_id = nir_build_load_subgroup_invocation(b);
   nir_ssa_def *active_threads = nir_build_ballot(b, 4, 32,
                                                  nir_imm_bool(b, true));
   nir_ssa_def *base_value;
   uint32_t bit_size = intr->dest.ssa.bit_size;
   if (op == nir_op_iand || op == nir_op_umin)
      base_value = nir_imm_intN_t(b, ~0ull, bit_size);
   else if (op == nir_op_imin)
      base_value = nir_imm_intN_t(b, (1ull << (bit_size - 1)) - 1, bit_size);
   else if (op == nir_op_imax)
      base_value = nir_imm_intN_t(b, 1ull << (bit_size - 1), bit_size);
   else if (op == nir_op_fmax)
      base_value = nir_imm_floatN_t(b, -INFINITY, bit_size);
   else if (op == nir_op_fmin)
      base_value = nir_imm_floatN_t(b, INFINITY, bit_size);
   else
      base_value = nir_imm_intN_t(b, 0, bit_size);

   nir_variable *loop_counter_var =
      nir_local_variable_create(b->impl, glsl_uint_type(),
                                "subgroup_loop_counter");
   nir_variable *result_var =
      nir_local_variable_create(b->impl,
                                glsl_vector_type(
                                   nir_get_glsl_base_type_for_nir_type(
                                      nir_op_infos[op].input_types[0] |
                                      bit_size),
                                   1),
                                "subgroup_loop_result");
   nir_store_var(b, loop_counter_var, nir_imm_int(b, 0), 1);
   nir_store_var(b, result_var, base_value, 1);
   nir_loop *loop = nir_push_loop(b);
   nir_ssa_def *loop_counter = nir_load_var(b, loop_counter_var);

   nir_if *nif =
      nir_push_if(b, intr->intrinsic == nir_intrinsic_inclusive_scan ?
                        nir_ige(b, subgroup_id, loop_counter) :
                        nir_ilt(b, loop_counter, subgroup_id));
   nir_if *if_active_thread =
      nir_push_if(b, nir_build_ballot_bitfield_extract(b, 32, active_threads,
                                                       loop_counter));
   nir_ssa_def *result =
      nir_build_alu2(b, op, nir_load_var(b, result_var),
                     nir_build_read_invocation(b, intr->src[0].ssa,
                                               loop_counter));
   nir_store_var(b, result_var, result, 1);
   nir_pop_if(b, if_active_thread);

   nir_store_var(b, loop_counter_var, nir_iadd_imm(b, loop_counter, 1), 1);
   nir_jump(b, nir_jump_continue);
   nir_pop_if(b, nif);

   nir_jump(b, nir_jump_break);
   nir_pop_loop(b, loop);

   result = nir_load_var(b, result_var);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   return true;
}

bool
dxil_nir_lower_unsupported_subgroup_scan(nir_shader *s)
{
   bool ret = nir_shader_instructions_pass(s, lower_subgroup_scan,
                                           nir_metadata_none, NULL);
   if (ret) {
      /* Lower the ballot bitfield tests */
      nir_lower_subgroups_options options = { .ballot_bit_size = 32,
                                              .ballot_components = 4 };
      nir_lower_subgroups(s, &options);
   }
   return ret;
}

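/* Worked example for the scan loop above (illustrative): an inclusive imax
 * scan over a wave whose lanes hold [3, 1, 4, 2] yields [3, 3, 4, 4]. Each
 * lane i starts from the op's identity (INT_MIN for imax) and loops over
 * all lane indices, accumulating values read from active lanes with index
 * <= i (strictly < i for the exclusive variant). iadd/fadd/imul/fmul scans
 * never reach this loop; they are either supported natively or rewritten
 * by lower_inclusive_to_exclusive() above. */
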
static bool
lower_load_face(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_front_face)
      return false;

   b->cursor = nir_before_instr(&intr->instr);

   nir_variable *var = data;
   nir_ssa_def *load = nir_ine_imm(b, nir_load_var(b, var), 0);

   nir_ssa_def_rewrite_uses(&intr->dest.ssa, load);
   nir_instr_remove(instr);
   return true;
}

bool
dxil_nir_forward_front_face(nir_shader *nir)
{
   assert(nir->info.stage == MESA_SHADER_FRAGMENT);

   nir_variable *var = nir_variable_create(nir, nir_var_shader_in,
                                           glsl_uint_type(),
                                           "gl_FrontFacing");
   var->data.location = VARYING_SLOT_VAR12;
   var->data.interpolation = INTERP_MODE_FLAT;

   return nir_shader_instructions_pass(nir, lower_load_face,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       var);
}

static bool
split_phi_and_const_srcs(nir_builder *b, nir_instr *instr, void *data)
{
   bool progress = false;
   switch (instr->type) {
   case nir_instr_type_phi: {
      /* Ensure each phi src is used only as a phi src and is not also a
       * phi dest. */
      nir_phi_instr *phi = nir_instr_as_phi(instr);
      nir_foreach_phi_src(src, phi) {
         assert(src->src.is_ssa);
         if (!list_is_singular(&src->src.use_link) ||
             src->src.ssa->parent_instr->type == nir_instr_type_phi) {
            b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr);
            nir_ssa_def *new_phi_src = nir_mov(b, src->src.ssa);
            nir_src_rewrite_ssa(&src->src, new_phi_src);
            progress = true;
         }
      }
      return progress;
   }
   case nir_instr_type_load_const: {
      /* Sink load_consts to their uses if there's multiple */
      nir_load_const_instr *load_const = nir_instr_as_load_const(instr);
      if (!list_is_singular(&load_const->def.uses)) {
         nir_foreach_use_safe(src, &load_const->def) {
            b->cursor = nir_before_src(src);
            nir_load_const_instr *new_load =
               nir_load_const_instr_create(b->shader,
                                           load_const->def.num_components,
                                           load_const->def.bit_size);
            memcpy(new_load->value, load_const->value,
                   sizeof(load_const->value[0]) *
                      load_const->def.num_components);
            nir_builder_instr_insert(b, &new_load->instr);
            nir_src_rewrite_ssa(src, &new_load->def);
            progress = true;
         }
      }
      return progress;
   }
   default:
      return false;
   }
}

/* If a value is used by a phi and another instruction (e.g. another phi),
 * copy the value with a mov and use that as the phi source. If the types
 * of the uses are compatible, then the two phi sources will use the same
 * DXIL SSA value, but if the types are not, then the mov provides an
 * opportunity to insert a bitcast. Similarly, sink all consts so that they
 * each have only a single use. The DXIL backend will already de-dupe the
 * constants to the same dxil_value if they have the same type, but this
 * allows a single constant to have different types without bitcasts. */
bool
dxil_nir_split_phis_and_const_srcs(nir_shader *s)
{
   return nir_shader_instructions_pass(s, split_phi_and_const_srcs,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}

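/* Illustration of the phi-splitting case above (hypothetical NIR-like
 * listing, for exposition only):
 *
 *    ssa_5 = phi b1: ssa_3, b2: ssa_4
 *    ssa_6 = phi b3: ssa_5, b4: ssa_2   <- ssa_5 is a phi dest used as a phi src
 *
 * becomes
 *
 *    ssa_5 = phi b1: ssa_3, b2: ssa_4
 *    ssa_7 = mov ssa_5
 *    ssa_6 = phi b3: ssa_7, b4: ssa_2
 */
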
&mod->patch_consts[nir_intrinsic_base(intr)] : &mod->outputs[nir_intrinsic_base(intr)]; for (uint32_t component = 0; component < intr->num_components; ++component) { uint32_t base_element = 0; uint32_t num_elements = sig_rec->num_elements; if (nir_src_is_const(*row_src)) { base_element = (uint32_t)nir_src_as_uint(*row_src); num_elements = 1; } for (uint32_t element = 0; element < num_elements; ++element) { uint32_t row = sig_rec->elements[element + base_element].reg; if (row == 0xffffffff) continue; uint32_t stream = sig_rec->elements[element + base_element].stream; uint32_t table_idx = is_patch_constant ? 1 : stream; uint32_t *table = tables[table_idx]; uint32_t output_component = component + nir_intrinsic_component(intr); uint32_t input_component; BITSET_FOREACH_SET(input_component, input_bits, 32 * 4) { uint32_t *table_for_input_component = table + table_sizes[table_idx] * input_component; BITSET_SET(table_for_input_component, row * 4 + output_component); any_bits_set = true; } } } return any_bits_set; } static bool propagate_input_to_output_dependencies(struct dxil_module *mod, nir_intrinsic_instr *load_intr, uint32_t **tables, const uint32_t *table_sizes) { /* Which input components are being loaded by this instruction */ BITSET_DECLARE(input_bits, 32 * 4) = { 0 }; if (!set_input_bits(mod, load_intr, input_bits, &tables, &table_sizes)) return false; nir_instr_worklist *worklist = nir_instr_worklist_create(); nir_instr_worklist_push_tail(worklist, &load_intr->instr); bool any_bits_set = false; nir_foreach_instr_in_worklist(instr, worklist) { if (instr->pass_flags) continue; instr->pass_flags = 1; nir_foreach_dest(instr, add_dest_to_worklist, worklist); switch (instr->type) { case nir_instr_type_jump: { nir_jump_instr *jump = nir_instr_as_jump(instr); switch (jump->type) { case nir_jump_break: case nir_jump_continue: { nir_cf_node *parent = &instr->block->cf_node; while (parent->type != nir_cf_node_loop) parent = parent->parent; nir_foreach_block_in_cf_node(block, parent) nir_foreach_instr(i, block) nir_instr_worklist_push_tail(worklist, i); } break; default: unreachable("Don't expect any other jumps"); } break; } case nir_instr_type_intrinsic: { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); switch (intr->intrinsic) { case nir_intrinsic_store_output: case nir_intrinsic_store_per_vertex_output: any_bits_set |= set_output_bits(mod, intr, input_bits, tables, table_sizes); break; /* TODO: Memory writes */ default: break; } break; } default: break; } } nir_instr_worklist_destroy(worklist); return any_bits_set; } /* For every input load, compute the set of output stores that it can contribute to. * If it contributes to a store to memory, If it's used for control flow, then any * instruction in the CFG that it impacts is considered to contribute. * Ideally, we should also handle stores to outputs/memory and then loads from that * output/memory, but this is non-trivial and unclear how much impact that would have. 
/* For every input load, compute the set of output stores that it can
 * contribute to. If the loaded value is used for control flow, then any
 * instruction in the CFG that the control flow impacts is considered to
 * contribute. Contributions through stores to memory are not tracked yet
 * (see the TODO above).
 * Ideally, we should also handle stores to outputs/memory and then loads
 * from that output/memory, but this is non-trivial and it's unclear how
 * much impact that would have. */
bool
dxil_nir_analyze_io_dependencies(struct dxil_module *mod, nir_shader *s)
{
   bool any_outputs = false;
   for (uint32_t i = 0; i < 4; ++i)
      any_outputs |= mod->num_psv_outputs[i] > 0;
   if (mod->shader_kind == DXIL_HULL_SHADER)
      any_outputs |= mod->num_psv_patch_consts > 0;
   if (!any_outputs)
      return false;

   bool any_bits_set = false;
   nir_foreach_function(func, s) {
      assert(func->impl);
      /* Hull shaders have a patch constant function */
      assert(func->is_entrypoint || s->info.stage == MESA_SHADER_TESS_CTRL);

      /* Pass 1: input/view ID -> output dependencies */
      nir_foreach_block(block, func->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            uint32_t **tables = mod->io_dependency_table;
            const uint32_t *table_sizes =
               mod->dependency_table_dwords_per_input;
            switch (intr->intrinsic) {
            case nir_intrinsic_load_view_index:
               tables = mod->viewid_dependency_table;
               FALLTHROUGH;
            case nir_intrinsic_load_input:
            case nir_intrinsic_load_per_vertex_input:
            case nir_intrinsic_load_interpolated_input:
               break;
            default:
               continue;
            }

            clear_pass_flags(func->impl);
            any_bits_set |= propagate_input_to_output_dependencies(mod, intr,
                                                                   tables,
                                                                   table_sizes);
         }
      }

      /* Pass 2: output -> output dependencies */
      /* TODO */
   }
   return any_bits_set;
}