diff options
author | Marek Olšák <marek.olsak@amd.com> | 2020-12-09 19:18:37 -0500 |
---|---|---|
committer | Dylan Baker <dylan.c.baker@intel.com> | 2020-12-17 13:21:37 -0800 |
commit | 840a9a6c67bdeb110a2c1c6412c4a27670d2c3da (patch) | |
tree | c86b7e75f36142ef681b13be555c3ab5c502eb54 | |
parent | 4a87e4441348cc45faea1f442294c8cac3b5a288 (diff) | |
download | mesa-840a9a6c67bdeb110a2c1c6412c4a27670d2c3da.tar.gz |
radeonsi: fix small primitive culling with MSAA force-disabled and smoothing
The problem was that the shader constants were based on the framebuffer
sample count and ignored the multisample enable state and the line/polygon
smoothing state, which uses MSAA rasterization that only sets SampleMaskIn
to get the coverage for alpha-blended smoothing (the PS epilog computes
the alpha channel from SampleMaskIn and blending generates the AA results).
- This is a complete rework that adds a new state for NGG cull constants.
- It fixes the same thing for the prim discard compute shader.
- It documents how VS_STATE.SMALL_PRIM_PRECISION is encoded.
It fixes blue corruption in Unigine Heaven with MSAA and Medium details
or better.
Fixes: 7648060dc03 - radeonsi: enable NGG culling by default on gfx10.3 dGPUs
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8134>
-rw-r--r-- | .pick_status.json | 2 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute_prim_discard.c | 15 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_gfx_cs.c | 13 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 17 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.c | 16 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 4 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_viewport.c | 112 |
8 files changed, 99 insertions, 81 deletions
diff --git a/.pick_status.json b/.pick_status.json index 3a3e0f1ff28..c04b56572dd 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -274,7 +274,7 @@ "description": "radeonsi: fix small primitive culling with MSAA force-disabled and smoothing", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 3, "master_sha": null, "because_sha": "7648060dc03775979e3fa8904c4948c084e82b6a" }, diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index d540f260c9b..b4053cae9ec 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -1314,19 +1314,6 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, desc[10] = fui(cull_info.translate[0]); desc[11] = fui(cull_info.translate[1]); - /* Better subpixel precision increases the efficiency of small - * primitive culling. */ - unsigned num_samples = sctx->framebuffer.nr_samples; - unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; - float small_prim_cull_precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - small_prim_cull_precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - small_prim_cull_precision = num_samples / 1024.0; - else - small_prim_cull_precision = num_samples / 256.0; - /* Set user data SGPRs. */ /* This can't be greater than 14 if we want the fastest launch rate. */ unsigned user_sgprs = 13; @@ -1490,7 +1477,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5)); radeon_emit(cs, info->restart_index); /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(small_prim_cull_precision)); + radeon_emit(cs, fui(cull_info.small_prim_precision)); } else { assert(VERTEX_COUNTER_GDS_MODE == 2); /* Only update the SGPRs that changed. */ diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 436a1116360..e7c169e7408 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -494,15 +494,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8); ctx->framebuffer.dirty_zsbuf = true; } - /* This should always be marked as dirty to set the framebuffer scissor - * at least. - * - * Even with shadowed registers, we have to add buffers to the buffer list. - * All of these do that. + + /* Even with shadowed registers, we have to add buffers to the buffer list. + * These atoms are the only ones that add buffers. */ si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer); si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond); - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + if (ctx->screen->use_ngg_culling) + si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state); if (first_cs || !ctx->shadowed_regs) { /* These don't add any buffers, so skip them with shadowing. */ @@ -532,6 +531,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles); si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); /* Invalidate various draw states so that they are emitted before * the first draw call. */ @@ -576,7 +576,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) assert(!ctx->gfx_cs->prev_dw); ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; - ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL; ctx->prim_discard_compute_ib_initialized = false; /* Compute-based primitive discard: diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index a5c7daeacdb..b36d695727d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -891,6 +891,7 @@ struct si_sdma_upload { struct si_small_prim_cull_info { float scale[2], translate[2]; + float small_prim_precision; }; struct si_context { @@ -1133,7 +1134,6 @@ struct si_context { struct si_small_prim_cull_info last_small_prim_cull_info; struct si_resource *small_prim_cull_info_buf; uint64_t small_prim_cull_info_address; - bool small_prim_cull_info_dirty; /* Scratch buffer */ struct si_resource *scratch_buffer; @@ -1518,7 +1518,6 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, const struct pipe_video_buffer *tmpl); /* si_viewport.c */ -void si_update_ngg_small_prim_precision(struct si_context *ctx); void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out); void si_update_vs_viewport_state(struct si_context *ctx); void si_init_viewport_functions(struct si_context *ctx); @@ -1933,6 +1932,20 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader) shader->key.opt.vs_as_prim_discard_cs); } +/* Return the number of samples that the rasterizer uses. */ +static inline unsigned si_get_num_coverage_samples(struct si_context *sctx) +{ + if (sctx->framebuffer.nr_samples > 1 && + sctx->queued.named.rasterizer->multisample_enable) + return sctx->framebuffer.nr_samples; + + /* Note that smoothing_enabled is set by si_update_shaders. */ + if (sctx->smoothing_enabled) + return SI_NUM_SMOOTH_AA_SAMPLES; + + return 1; +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 0352345bbc9..6f018007ea7 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -988,6 +988,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) /* Update the small primitive filter workaround if necessary. */ if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + + /* NGG cull state uses multisample_enable. */ + if (sctx->screen->use_ngg_culling) + si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); } sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; @@ -2812,10 +2816,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_update_ps_colorbuf0_slot(sctx); si_update_poly_offset_state(sctx); - si_update_ngg_small_prim_precision(sctx); si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + /* NGG cull state uses the sample count. */ + if (sctx->screen->use_ngg_culling) + si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); + if (sctx->screen->dpbb_allowed) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); @@ -3417,8 +3424,9 @@ static void si_emit_msaa_config(struct si_context *sctx) * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry * EQAA 2s 2z 2f = 2x MSAA */ + coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx); + if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { - coverage_samples = sctx->framebuffer.nr_samples; color_samples = sctx->framebuffer.nr_color_samples; if (sctx->framebuffer.state.zsbuf) { @@ -3427,10 +3435,6 @@ static void si_emit_msaa_config(struct si_context *sctx) } else { z_samples = coverage_samples; } - } else if (sctx->smoothing_enabled) { - coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES; - } else { - coverage_samples = color_samples = z_samples = 1; } /* Required by OpenGL line rasterization. diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 9ac4d51889c..6f585afd7db 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -228,6 +228,7 @@ union si_state_atoms { struct si_atom scratch_state; struct si_atom window_rectangles; struct si_atom shader_query; + struct si_atom ngg_cull_state; } s; struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)]; }; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 8df7b9b5fca..8370ed7db10 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -4061,6 +4061,10 @@ bool si_update_shaders(struct si_context *sctx) sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing; si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + /* NGG cull state uses smoothing_enabled. */ + if (sctx->screen->use_ngg_culling) + si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); + if (sctx->chip_class == GFX6) si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 9d62b2c85f2..c431c6a72cf 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -28,34 +28,13 @@ #define SI_MAX_SCISSOR 16384 -void si_update_ngg_small_prim_precision(struct si_context *ctx) -{ - if (!ctx->screen->use_ngg_culling) - return; - - /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */ - unsigned num_samples = ctx->framebuffer.nr_samples; - unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode; - float precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - precision = num_samples / 1024.0; - else - precision = num_samples / 256.0; - - ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION; - ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23); -} - void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out) { /* This is needed by the small primitive culling, because it's done * in screen space. */ struct si_small_prim_cull_info info; - unsigned num_samples = sctx->framebuffer.nr_samples; + unsigned num_samples = si_get_num_coverage_samples(sctx); assert(num_samples >= 1); info.scale[0] = sctx->viewports.states[0].scale[0]; @@ -85,9 +64,64 @@ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_c info.scale[i] *= num_samples; info.translate[i] *= num_samples; } + + /* Better subpixel precision increases the efficiency of small + * primitive culling. (more precision means a tighter bounding box + * around primitives and more accurate elimination) + */ + unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + info.small_prim_precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + info.small_prim_precision = num_samples / 1024.0; + else + info.small_prim_precision = num_samples / 256.0; + *out = info; } +static void si_emit_cull_state(struct si_context *sctx) +{ + assert(sctx->screen->use_ngg_culling); + + struct si_small_prim_cull_info info; + si_get_small_prim_cull_info(sctx, &info); + + if (!sctx->small_prim_cull_info_buf || + memcmp(&info, &sctx->last_small_prim_cull_info, sizeof(info))) { + unsigned offset = 0; + + /* Align to 256, because the address is shifted by 8 bits. */ + u_upload_data(sctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset, + (struct pipe_resource **)&sctx->small_prim_cull_info_buf); + + sctx->small_prim_cull_info_address = sctx->small_prim_cull_info_buf->gpu_address + offset; + sctx->last_small_prim_cull_info = info; + } + + /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->small_prim_cull_info_buf, + RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); + radeon_set_sh_reg(sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, + sctx->small_prim_cull_info_address >> 8); + + /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. + * + * small_prim_precision is 1 / 2^n. We only need n between 5 (1/32) and 12 (1/4096). + * Such a floating point value can be packed into 4 bits as follows: + * If we pass the first 4 bits of the exponent to the shader and set the next 3 bits + * to 1, we'll get the number exactly because all other bits are always 0. See: + * 1 + * value = (0x70 | value.exponent[0:3]) << 23 = ------------------------------ + * 2 ^ (15 - value.exponent[0:3]) + * + * So pass only the first 4 bits of the float exponent to the shader. + */ + sctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION; + sctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(info.small_prim_precision) >> 23); +} + static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot, unsigned num_scissors, const struct pipe_scissor_state *state) { @@ -330,8 +364,6 @@ static void si_emit_guardband(struct si_context *ctx) S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); if (initial_cdw != ctx->gfx_cs->current.cdw) ctx->context_roll = true; - - si_update_ngg_small_prim_precision(ctx); } static void si_emit_scissors(struct si_context *ctx) @@ -430,6 +462,10 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo if (start_slot == 0) { ctx->viewports.y_inverted = -state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1]; + + /* NGG cull state uses the viewport and quant mode. */ + if (ctx->screen->use_ngg_culling) + si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state); } si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); @@ -454,33 +490,6 @@ static void si_emit_viewports(struct si_context *ctx) struct radeon_cmdbuf *cs = ctx->gfx_cs; struct pipe_viewport_state *states = ctx->viewports.states; - if (ctx->screen->use_ngg_culling) { - /* Set the viewport info for small primitive culling. */ - struct si_small_prim_cull_info info; - si_get_small_prim_cull_info(ctx, &info); - - if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) { - unsigned offset = 0; - - /* Align to 256, because the address is shifted by 8 bits. */ - u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset, - (struct pipe_resource **)&ctx->small_prim_cull_info_buf); - - ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset; - ctx->last_small_prim_cull_info = info; - ctx->small_prim_cull_info_dirty = true; - } - - if (ctx->small_prim_cull_info_dirty) { - /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); - radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, - ctx->small_prim_cull_info_address >> 8); - ctx->small_prim_cull_info_dirty = false; - } - } - /* The simple case: Only 1 viewport is active. */ if (!ctx->vs_writes_viewport_index) { radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); @@ -655,6 +664,7 @@ void si_init_viewport_functions(struct si_context *ctx) ctx->atoms.s.scissors.emit = si_emit_scissors; ctx->atoms.s.viewports.emit = si_emit_viewport_states; ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles; + ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state; ctx->b.set_scissor_states = si_set_scissor_states; ctx->b.set_viewport_states = si_set_viewport_states; |