diff options
author | Vasily Khoruzhick <anarsoul@gmail.com> | 2021-11-16 22:43:52 -0800 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2021-11-24 02:26:08 +0000 |
commit | 3b15fb35753763a0611d1209f7f55742228a2bca (patch) | |
tree | 0f980370b98a1581cf4398a28e1f7280813c8dbd | |
parent | 98a7c4c6f8e0dd8aca665ff1ae475ab3cdd53b12 (diff) | |
download | mesa-3b15fb35753763a0611d1209f7f55742228a2bca.tar.gz |
lima/ppir: implement gl_FragDepth support
Mali4x0 supports writing depth and stencil from the fragment shader,
and we've been using it for quite a while for depth/stencil buffer reload.
The missing part was specifying output register for depth/stencil.
To figure it out, I changed the reload shader to use register $4 as output
and poked RSW bits (or rather consecutive 4-bit groups) until the tests
that rely on reload started to pass again.
It turns out that the register number for gl_FragDepth/gl_FragStencil is in
rsw->depth_test and the register number for gl_FragColor is in
rsw->multi_sample, where it's repeated 4 times for some reason (likely for
MSAA?)
With this knowledge we can now modify the ppir compiler to support multiple
store_output intrinsics.
To do that, just add the destination SSA for store_output to the register
list for regalloc and mark it explicitly as an output. Since it's never
read in the shader, we have to take care of it in liveness analysis -
basically just mark it alive from the time it's written to the end
of the block. If it's live only in the last instruction, mark it as
live_internal, so regalloc doesn't clobber it.
Then just let regalloc do its job, copy the register number to the
shader state, and program it in RSW.
The tricky part is gl_FragStencil, since it resides in the same register
as gl_FragDepth and with the current design of the compiler it's hard to
merge them. However, gl_FragStencil doesn't seem to be part of GL2
or GLES2, so we can just leave it unimplemented.
Also we need to take care of the stop bit for instructions - now we can't
just set it in every instruction that stores an output, since there may be
several outputs. So if there are any store_output instructions in the
block, just mark the block as having a stop, and set the stop bit in the last
instruction in the block. The only exception is discard - we always need
to set the stop bit in the discard instruction.
Reviewed-by: Andreas Baierl <ichgeh@imkreisrum.de>
Reviewed-by: Erico Nunes <nunes.erico@gmail.com>
Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13830>
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/codegen.c | 7 | ||||
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/instr.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/liveness.c | 17 | ||||
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/nir.c | 42 | ||||
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/node.c | 6 | ||||
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/node_to_instr.c | 11 | ||||
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/ppir.h | 40 | ||||
-rw-r--r-- | src/gallium/drivers/lima/ir/pp/regalloc.c | 28 | ||||
-rw-r--r-- | src/gallium/drivers/lima/lima_context.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/lima/lima_draw.c | 15 | ||||
-rw-r--r-- | src/gallium/drivers/lima/lima_parser.c | 11 |
11 files changed, 144 insertions, 37 deletions
diff --git a/src/gallium/drivers/lima/ir/pp/codegen.c b/src/gallium/drivers/lima/ir/pp/codegen.c index 5a5ee14c78e..0760e19c1b9 100644 --- a/src/gallium/drivers/lima/ir/pp/codegen.c +++ b/src/gallium/drivers/lima/ir/pp/codegen.c @@ -773,7 +773,7 @@ static int encode_instr(ppir_instr *instr, void *code, void *last_code) size = align_to_word(size) + 1; ctrl->count = size; - if (instr->is_end) + if (instr->stop) ctrl->stop = true; if (last_code) { @@ -818,6 +818,11 @@ bool ppir_codegen_prog(ppir_compiler *comp) instr->encode_size = get_instr_encode_size(instr); size += instr->encode_size; } + /* Set stop flag for the last instruction if block has stop flag */ + if (block->stop) { + ppir_instr *instr = list_last_entry(&block->instr_list, ppir_instr, list); + instr->stop = true; + } } uint32_t *prog = rzalloc_size(comp->prog, size * sizeof(uint32_t)); diff --git a/src/gallium/drivers/lima/ir/pp/instr.c b/src/gallium/drivers/lima/ir/pp/instr.c index 8e1bc95158d..fc64de8812f 100644 --- a/src/gallium/drivers/lima/ir/pp/instr.c +++ b/src/gallium/drivers/lima/ir/pp/instr.c @@ -284,7 +284,7 @@ void ppir_instr_print_list(ppir_compiler *comp) list_for_each_entry(ppir_block, block, &comp->block_list, list) { printf("-------block %3d-------\n", block->index); list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { - printf("%c%03d: ", instr->is_end ? '*' : ' ', instr->index); + printf("%c%03d: ", instr->stop ? '*' : ' ', instr->index); for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) { ppir_node *node = instr->slots[i]; if (node) diff --git a/src/gallium/drivers/lima/ir/pp/liveness.c b/src/gallium/drivers/lima/ir/pp/liveness.c index 1799a53b165..8f642e2d05c 100644 --- a/src/gallium/drivers/lima/ir/pp/liveness.c +++ b/src/gallium/drivers/lima/ir/pp/liveness.c @@ -121,7 +121,7 @@ ppir_liveness_instr_srcs(ppir_compiler *comp, ppir_instr *instr) /* Update the liveness information of the instruction by removing its * dests from the live_in set. 
*/ static void -ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr) +ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr, ppir_instr *last) { for (int i = PPIR_INSTR_SLOT_NUM-1; i >= 0; i--) { ppir_node *node = instr->slots[i]; @@ -146,9 +146,18 @@ ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr) unsigned int index = reg->regalloc_index; bool live = BITSET_TEST(instr->live_set, index); + /* If it's an out reg, it's alive till the end of the block, so add it + * to live_set of the last instruction */ + if (!live && reg->out_reg && (instr != last)) { + BITSET_SET(last->live_set, index); + BITSET_CLEAR(instr->live_set, index); + continue; + } + /* If a register is written but wasn't read in a later instruction, it is - * either dead code or a bug. For now, assign an interference to it to - * ensure it doesn't get assigned a live register and overwrites it. */ + * either an output register in last instruction, dead code or a bug. + * For now, assign an interference to it to ensure it doesn't get assigned + * a live register and overwrites it. */ if (!live) { BITSET_SET(instr->live_internal, index); continue; @@ -230,7 +239,7 @@ ppir_liveness_compute_live_sets(ppir_compiler *comp) instr->live_mask, next_instr->live_mask); } - ppir_liveness_instr_dest(comp, instr); + ppir_liveness_instr_dest(comp, instr, last); ppir_liveness_instr_srcs(comp, instr); cont |= !ppir_liveness_set_equal(comp, diff --git a/src/gallium/drivers/lima/ir/pp/nir.c b/src/gallium/drivers/lima/ir/pp/nir.c index 6f61a0986fb..fc8030fc745 100644 --- a/src/gallium/drivers/lima/ir/pp/nir.c +++ b/src/gallium/drivers/lima/ir/pp/nir.c @@ -345,6 +345,18 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni) * back to inserting a mov at the end. * If the source node will only be able to output to pipeline * registers, fall back to the mov as well. 
*/ + assert(nir_src_is_const(instr->src[1]) && + "lima doesn't support indirect outputs"); + + nir_io_semantics io = nir_intrinsic_io_semantics(instr); + unsigned offset = nir_src_as_uint(instr->src[1]); + unsigned slot = io.location + offset; + ppir_output_type out_type = ppir_nir_output_to_ppir(slot); + if (out_type == ppir_output_invalid) { + ppir_debug("Unsupported output type: %d\n", slot); + return false; + } + if (!block->comp->uses_discard && instr->src->is_ssa) { node = block->comp->var_nodes[instr->src->ssa->index]; switch (node->op) { @@ -352,9 +364,12 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni) case ppir_op_load_texture: case ppir_op_const: break; - default: - node->is_end = 1; + default: { + ppir_dest *dest = ppir_node_get_dest(node); + dest->ssa.out_type = out_type; + node->is_out = 1; return true; + } } } @@ -367,6 +382,7 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni) dest->ssa.num_components = instr->num_components; dest->ssa.index = 0; dest->write_mask = u_bit_consecutive(0, instr->num_components); + dest->ssa.out_type = out_type; alu_node->num_src = 1; @@ -376,7 +392,7 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni) ppir_node_add_src(block->comp, &alu_node->node, alu_node->src, instr->src, u_bit_consecutive(0, instr->num_components)); - alu_node->node.is_end = 1; + alu_node->node.is_out = 1; list_addtail(&alu_node->node.list, &block->node_list); return true; @@ -798,6 +814,7 @@ static ppir_compiler *ppir_compiler_create(void *prog, unsigned num_reg, unsigne comp->var_nodes = (ppir_node **)(comp + 1); comp->reg_base = num_ssa; comp->prog = prog; + return comp; } @@ -833,7 +850,7 @@ static void ppir_add_ordering_deps(ppir_compiler *comp) if (prev_node && ppir_node_is_root(node) && node->op != ppir_op_const) { ppir_node_add_dep(prev_node, node, ppir_dep_sequence); } - if (node->is_end || + if (node->is_out || node->op == ppir_op_discard || node->op == ppir_op_store_temp || node->op 
== ppir_op_branch) { @@ -930,18 +947,11 @@ bool ppir_compile_nir(struct lima_fs_compiled_shader *prog, struct nir_shader *n } } - /* Validate outputs, we support only gl_FragColor */ - nir_foreach_shader_out_variable(var, nir) { - switch (var->data.location) { - case FRAG_RESULT_COLOR: - case FRAG_RESULT_DATA0: - break; - default: - ppir_error("unsupported output type\n"); - goto err_out0; - break; - } - } + comp->out_type_to_reg = rzalloc_size(comp, sizeof(int) * ppir_output_num); + + /* -1 means reg is not written by the shader */ + for (int i = 0; i < ppir_output_num; i++) + comp->out_type_to_reg[i] = -1; foreach_list_typed(nir_register, reg, node, &func->registers) { ppir_reg *r = rzalloc(comp, ppir_reg); diff --git a/src/gallium/drivers/lima/ir/pp/node.c b/src/gallium/drivers/lima/ir/pp/node.c index 99d025e2c05..cc51448bfb3 100644 --- a/src/gallium/drivers/lima/ir/pp/node.c +++ b/src/gallium/drivers/lima/ir/pp/node.c @@ -618,9 +618,9 @@ static ppir_node *ppir_node_insert_mov_local(ppir_node *node) ppir_node_add_dep(move, node, ppir_dep_src); list_addtail(&move->list, &node->list); - if (node->is_end) { - node->is_end = false; - move->is_end = true; + if (node->is_out) { + node->is_out = false; + move->is_out = true; } return move; diff --git a/src/gallium/drivers/lima/ir/pp/node_to_instr.c b/src/gallium/drivers/lima/ir/pp/node_to_instr.c index 0d5d37f4927..ff8d735d9a7 100644 --- a/src/gallium/drivers/lima/ir/pp/node_to_instr.c +++ b/src/gallium/drivers/lima/ir/pp/node_to_instr.c @@ -203,7 +203,7 @@ static bool ppir_do_one_node_to_instr(ppir_block *block, ppir_node *node) case ppir_node_type_discard: if (!create_new_instr(block, node)) return false; - node->instr->is_end = true; + block->stop = true; break; case ppir_node_type_branch: if (!create_new_instr(block, node)) @@ -276,8 +276,13 @@ static bool ppir_do_node_to_instr(ppir_block *block, ppir_node *root) if (!ppir_do_one_node_to_instr(block, node)) return false; - if (node->is_end) - node->instr->is_end = 
true; + /* The node writes output register. We can't stop at this exact + * instruction because there may be another node that writes another + * output, so set stop flag for the block. We will set stop flag on + * the last instruction of the block during codegen + */ + if (node->is_out) + block->stop = true; ppir_node_foreach_pred(node, dep) { ppir_node *pred = dep->pred; diff --git a/src/gallium/drivers/lima/ir/pp/ppir.h b/src/gallium/drivers/lima/ir/pp/ppir.h index 74a508e504e..f434786f4cf 100644 --- a/src/gallium/drivers/lima/ir/pp/ppir.h +++ b/src/gallium/drivers/lima/ir/pp/ppir.h @@ -161,7 +161,7 @@ typedef struct ppir_node { struct ppir_instr *instr; int instr_pos; struct ppir_block *block; - bool is_end; + bool is_out; bool succ_different_block; /* for scheduler */ @@ -179,9 +179,42 @@ typedef enum { ppir_pipeline_reg_discard, /* varying load */ } ppir_pipeline; +typedef enum { + ppir_output_color, + ppir_output_depth, + ppir_output_num, + ppir_output_invalid = -1, +} ppir_output_type; + +static inline const char *ppir_output_type_to_str(ppir_output_type type) +{ + switch (type) { + case ppir_output_color: + return "OUTPUT_COLOR"; + case ppir_output_depth: + return "OUTPUT_DEPTH"; + default: + return "INVALID"; + } +} + +static inline ppir_output_type ppir_nir_output_to_ppir(gl_frag_result res) +{ + switch (res) { + case FRAG_RESULT_COLOR: + case FRAG_RESULT_DATA0: + return ppir_output_color; + case FRAG_RESULT_DEPTH: + return ppir_output_depth; + default: + return ppir_output_invalid; + } +} + typedef struct ppir_reg { struct list_head list; int index; + ppir_output_type out_type; int regalloc_index; int num_components; @@ -191,6 +224,7 @@ typedef struct ppir_reg { bool is_head; bool spilled; bool undef; + bool out_reg; } ppir_reg; typedef enum { @@ -316,7 +350,7 @@ typedef struct ppir_instr { ppir_node *slots[PPIR_INSTR_SLOT_NUM]; ppir_const constant[2]; - bool is_end; + bool stop; /* for scheduler */ struct list_head succ_list; @@ -340,6 +374,7 @@ 
typedef struct ppir_block { struct list_head list; struct list_head node_list; struct list_head instr_list; + bool stop; struct ppir_block *successors[2]; @@ -370,6 +405,7 @@ typedef struct ppir_compiler { struct hash_table_u64 *blocks; int cur_index; int cur_instr_index; + int *out_type_to_reg; struct list_head reg_list; int reg_num; diff --git a/src/gallium/drivers/lima/ir/pp/regalloc.c b/src/gallium/drivers/lima/ir/pp/regalloc.c index 3ea136b5660..37b54b5a4bf 100644 --- a/src/gallium/drivers/lima/ir/pp/regalloc.c +++ b/src/gallium/drivers/lima/ir/pp/regalloc.c @@ -82,9 +82,6 @@ static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp) { list_for_each_entry(ppir_block, block, &comp->block_list, list) { list_for_each_entry(ppir_node, node, &block->node_list, list) { - if (node->is_end) - continue; - if (!node->instr || node->op == ppir_op_const) continue; @@ -94,6 +91,8 @@ static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp) if (dest->type == ppir_target_ssa) { reg = &dest->ssa; + if (node->is_out) + reg->out_reg = true; list_addtail(®->list, &comp->reg_list); comp->reg_num++; } @@ -133,6 +132,14 @@ static void ppir_regalloc_print_result(ppir_compiler *comp) } } printf("--------------------------\n"); + + printf("======ppir output regs======\n"); + for (int i = 0; i < ppir_output_num; i++) { + if (comp->out_type_to_reg[i] != -1) + printf("%s: $%d\n", ppir_output_type_to_str(i), + (int)comp->out_type_to_reg[i]); + } + printf("--------------------------\n"); } static bool create_new_instr_after(ppir_block *block, ppir_instr *ref, @@ -578,6 +585,11 @@ static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled) n = 0; list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { reg->index = ra_get_node_reg(g, n++); + if (reg->out_reg) { + /* We need actual reg number, we don't have swizzle for output regs */ + assert(!(reg->index & 0x3) && "ppir: output regs don't have swizzle"); + comp->out_type_to_reg[reg->out_type] = reg->index / 4; + 
} } ralloc_free(g); @@ -604,8 +616,11 @@ bool ppir_regalloc_prog(ppir_compiler *comp) ppir_regalloc_update_reglist_ssa(comp); /* No registers? Probably shader consists of discard instruction */ - if (list_is_empty(&comp->reg_list)) + if (list_is_empty(&comp->reg_list)) { + comp->prog->state.frag_color_reg = 0; + comp->prog->state.frag_depth_reg = -1; return true; + } /* this will most likely succeed in the first * try, except for very complicated shaders */ @@ -613,5 +628,10 @@ bool ppir_regalloc_prog(ppir_compiler *comp) if (!spilled) return false; + comp->prog->state.frag_color_reg = + comp->out_type_to_reg[ppir_output_color]; + comp->prog->state.frag_depth_reg = + comp->out_type_to_reg[ppir_output_depth]; + return true; } diff --git a/src/gallium/drivers/lima/lima_context.h b/src/gallium/drivers/lima/lima_context.h index e871d0b952d..bea1ddeb477 100644 --- a/src/gallium/drivers/lima/lima_context.h +++ b/src/gallium/drivers/lima/lima_context.h @@ -49,6 +49,8 @@ struct lima_fs_compiled_shader { struct { int shader_size; int stack_size; + int frag_color_reg; + int frag_depth_reg; bool uses_discard; } state; }; diff --git a/src/gallium/drivers/lima/lima_draw.c b/src/gallium/drivers/lima/lima_draw.c index 5c9f03b1d1c..889f0192179 100644 --- a/src/gallium/drivers/lima/lima_draw.c +++ b/src/gallium/drivers/lima/lima_draw.c @@ -677,6 +677,12 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in if (!rst->depth_clip_far || ctx->viewport.far == 1.0f) render->depth_test |= 0x20; /* don't clip depth far */ + if (fs->state.frag_depth_reg != -1) { + render->depth_test |= (fs->state.frag_depth_reg << 6); + /* Shader writes depth */ + render->depth_test |= 0x801; + } + ushort far, near; near = float_to_ushort(ctx->viewport.near); @@ -729,6 +735,12 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in if (ctx->framebuffer.base.samples) render->multi_sample |= 0x68; + /* Set gl_FragColor register, need to specify it 4 
times */ + render->multi_sample |= (fs->state.frag_color_reg << 28) | + (fs->state.frag_color_reg << 24) | + (fs->state.frag_color_reg << 20) | + (fs->state.frag_color_reg << 16); + /* alpha test */ if (ctx->zsa->base.alpha_enabled) { render->multi_sample |= ctx->zsa->base.alpha_func; @@ -755,7 +767,8 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in render->aux1 |= 0x00002000; if (fs->state.uses_discard || - ctx->zsa->base.alpha_enabled) { + ctx->zsa->base.alpha_enabled || + fs->state.frag_depth_reg != -1) { early_z = false; pixel_kill = false; } diff --git a/src/gallium/drivers/lima/lima_parser.c b/src/gallium/drivers/lima/lima_parser.c index 3e69daee1b2..bcacd290aff 100644 --- a/src/gallium/drivers/lima/lima_parser.c +++ b/src/gallium/drivers/lima/lima_parser.c @@ -525,7 +525,7 @@ parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper) fprintf(fp, ": ignore depth clip near"); if ((*value & 0x00000020) == 0x00000020) fprintf(fp, ", ignore depth clip far"); - fprintf(fp, ", unknown bits 6-9: 0x%08x", *value & 0x000003c0); + fprintf(fp, ", register for gl_FragDepth: $%d", (*value & 0x000003c0) >> 6); fprintf(fp, ", unknown bits 13-15: 0x%08x */\n", *value & 0x00000e000); break; case 4: /* DEPTH RANGE */ @@ -594,7 +594,14 @@ parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper) fprintf(fp, " */\n"); else fprintf(fp, ", UNKNOWN\n"); - fprintf(fp, "\t\t\t\t\t\t/* %s(2)", render_state_infos[i].info); + + fprintf(fp, "\t\t\t\t\t\t/* %s(3)", render_state_infos[i].info); + fprintf(fp, ", register for gl_FragColor: $%d $%d $%d $%d */\n", + (*value & 0xf0000000) >> 28, + (*value & 0x0f000000) >> 24, + (*value & 0x00f00000) >> 20, + (*value & 0x000f0000) >> 16); + fprintf(fp, "\t\t\t\t\t\t/* %s(3)", render_state_infos[i].info); fprintf(fp, ": alpha_test_func: %d (%s) */\n", (*value & 0x00000007), lima_get_compare_func_string((*value & 0x00000007))); /* alpha_test_func */ |