summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarek Olšák <marek.olsak@amd.com>2020-12-09 19:18:37 -0500
committerDylan Baker <dylan.c.baker@intel.com>2020-12-17 13:21:37 -0800
commit840a9a6c67bdeb110a2c1c6412c4a27670d2c3da (patch)
treec86b7e75f36142ef681b13be555c3ab5c502eb54
parent4a87e4441348cc45faea1f442294c8cac3b5a288 (diff)
downloadmesa-840a9a6c67bdeb110a2c1c6412c4a27670d2c3da.tar.gz
radeonsi: fix small primitive culling with MSAA force-disabled and smoothing
The problem was that the shader constants were based on the framebuffer sample count and ignored the multisample enable state and the line/polygon smoothing state, which uses MSAA rasterization that only sets SampleMaskIn to get the coverage for alpha-blended smoothing (the PS epilog computes the alpha channel from SampleMaskIn and blending generates the AA results). - This is a complete rework that adds a new state for NGG cull constants. - It fixes the same thing for the prim discard compute shader. - It documents how VS_STATE.SMALL_PRIM_PRECISION is encoded. It fixes blue corruption in Unigine Heaven with MSAA and Medium details or better. Fixes: 7648060dc03 - radeonsi: enable NGG culling by default on gfx10.3 dGPUs Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8134>
-rw-r--r--.pick_status.json2
-rw-r--r--src/gallium/drivers/radeonsi/si_compute_prim_discard.c15
-rw-r--r--src/gallium/drivers/radeonsi/si_gfx_cs.c13
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h17
-rw-r--r--src/gallium/drivers/radeonsi/si_state.c16
-rw-r--r--src/gallium/drivers/radeonsi/si_state.h1
-rw-r--r--src/gallium/drivers/radeonsi/si_state_shaders.c4
-rw-r--r--src/gallium/drivers/radeonsi/si_state_viewport.c112
8 files changed, 99 insertions, 81 deletions
diff --git a/.pick_status.json b/.pick_status.json
index 3a3e0f1ff28..c04b56572dd 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -274,7 +274,7 @@
"description": "radeonsi: fix small primitive culling with MSAA force-disabled and smoothing",
"nominated": true,
"nomination_type": 1,
- "resolution": 0,
+ "resolution": 3,
"master_sha": null,
"because_sha": "7648060dc03775979e3fa8904c4948c084e82b6a"
},
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index d540f260c9b..b4053cae9ec 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -1314,19 +1314,6 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
desc[10] = fui(cull_info.translate[0]);
desc[11] = fui(cull_info.translate[1]);
- /* Better subpixel precision increases the efficiency of small
- * primitive culling. */
- unsigned num_samples = sctx->framebuffer.nr_samples;
- unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
- float small_prim_cull_precision;
-
- if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
- small_prim_cull_precision = num_samples / 4096.0;
- else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
- small_prim_cull_precision = num_samples / 1024.0;
- else
- small_prim_cull_precision = num_samples / 256.0;
-
/* Set user data SGPRs. */
/* This can't be greater than 14 if we want the fastest launch rate. */
unsigned user_sgprs = 13;
@@ -1490,7 +1477,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
radeon_emit(cs, info->restart_index);
/* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
- radeon_emit(cs, fui(small_prim_cull_precision));
+ radeon_emit(cs, fui(cull_info.small_prim_precision));
} else {
assert(VERTEX_COUNTER_GDS_MODE == 2);
/* Only update the SGPRs that changed. */
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 436a1116360..e7c169e7408 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -494,15 +494,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
ctx->framebuffer.dirty_zsbuf = true;
}
- /* This should always be marked as dirty to set the framebuffer scissor
- * at least.
- *
- * Even with shadowed registers, we have to add buffers to the buffer list.
- * All of these do that.
+
+ /* Even with shadowed registers, we have to add buffers to the buffer list.
+ * These atoms are the only ones that add buffers.
*/
si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+ if (ctx->screen->use_ngg_culling)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
if (first_cs || !ctx->shadowed_regs) {
/* These don't add any buffers, so skip them with shadowing. */
@@ -532,6 +531,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
/* Invalidate various draw states so that they are emitted before
* the first draw call. */
@@ -576,7 +576,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
assert(!ctx->gfx_cs->prev_dw);
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
- ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
ctx->prim_discard_compute_ib_initialized = false;
/* Compute-based primitive discard:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a5c7daeacdb..b36d695727d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -891,6 +891,7 @@ struct si_sdma_upload {
struct si_small_prim_cull_info {
float scale[2], translate[2];
+ float small_prim_precision;
};
struct si_context {
@@ -1133,7 +1134,6 @@ struct si_context {
struct si_small_prim_cull_info last_small_prim_cull_info;
struct si_resource *small_prim_cull_info_buf;
uint64_t small_prim_cull_info_address;
- bool small_prim_cull_info_dirty;
/* Scratch buffer */
struct si_resource *scratch_buffer;
@@ -1518,7 +1518,6 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
const struct pipe_video_buffer *tmpl);
/* si_viewport.c */
-void si_update_ngg_small_prim_precision(struct si_context *ctx);
void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
void si_update_vs_viewport_state(struct si_context *ctx);
void si_init_viewport_functions(struct si_context *ctx);
@@ -1933,6 +1932,20 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
shader->key.opt.vs_as_prim_discard_cs);
}
+/* Return the number of samples that the rasterizer uses. */
+static inline unsigned si_get_num_coverage_samples(struct si_context *sctx)
+{
+ if (sctx->framebuffer.nr_samples > 1 &&
+ sctx->queued.named.rasterizer->multisample_enable)
+ return sctx->framebuffer.nr_samples;
+
+ /* Note that smoothing_enabled is set by si_update_shaders. */
+ if (sctx->smoothing_enabled)
+ return SI_NUM_SMOOTH_AA_SAMPLES;
+
+ return 1;
+}
+
#define PRINT_ERR(fmt, args...) \
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 0352345bbc9..6f018007ea7 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -988,6 +988,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
/* Update the small primitive filter workaround if necessary. */
if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+
+ /* NGG cull state uses multisample_enable. */
+ if (sctx->screen->use_ngg_culling)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
}
sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
@@ -2812,10 +2816,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
si_update_ps_colorbuf0_slot(sctx);
si_update_poly_offset_state(sctx);
- si_update_ngg_small_prim_precision(sctx);
si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ /* NGG cull state uses the sample count. */
+ if (sctx->screen->use_ngg_culling)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
+
if (sctx->screen->dpbb_allowed)
si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
@@ -3417,8 +3424,9 @@ static void si_emit_msaa_config(struct si_context *sctx)
* EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry
* EQAA 2s 2z 2f = 2x MSAA
*/
+ coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx);
+
if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
- coverage_samples = sctx->framebuffer.nr_samples;
color_samples = sctx->framebuffer.nr_color_samples;
if (sctx->framebuffer.state.zsbuf) {
@@ -3427,10 +3435,6 @@ static void si_emit_msaa_config(struct si_context *sctx)
} else {
z_samples = coverage_samples;
}
- } else if (sctx->smoothing_enabled) {
- coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
- } else {
- coverage_samples = color_samples = z_samples = 1;
}
/* Required by OpenGL line rasterization.
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 9ac4d51889c..6f585afd7db 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -228,6 +228,7 @@ union si_state_atoms {
struct si_atom scratch_state;
struct si_atom window_rectangles;
struct si_atom shader_query;
+ struct si_atom ngg_cull_state;
} s;
struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
};
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 8df7b9b5fca..8370ed7db10 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -4061,6 +4061,10 @@ bool si_update_shaders(struct si_context *sctx)
sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+ /* NGG cull state uses smoothing_enabled. */
+ if (sctx->screen->use_ngg_culling)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
+
if (sctx->chip_class == GFX6)
si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 9d62b2c85f2..c431c6a72cf 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -28,34 +28,13 @@
#define SI_MAX_SCISSOR 16384
-void si_update_ngg_small_prim_precision(struct si_context *ctx)
-{
- if (!ctx->screen->use_ngg_culling)
- return;
-
- /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
- unsigned num_samples = ctx->framebuffer.nr_samples;
- unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
- float precision;
-
- if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
- precision = num_samples / 4096.0;
- else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
- precision = num_samples / 1024.0;
- else
- precision = num_samples / 256.0;
-
- ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
- ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
-}
-
void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
{
/* This is needed by the small primitive culling, because it's done
* in screen space.
*/
struct si_small_prim_cull_info info;
- unsigned num_samples = sctx->framebuffer.nr_samples;
+ unsigned num_samples = si_get_num_coverage_samples(sctx);
assert(num_samples >= 1);
info.scale[0] = sctx->viewports.states[0].scale[0];
@@ -85,9 +64,64 @@ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_c
info.scale[i] *= num_samples;
info.translate[i] *= num_samples;
}
+
+ /* Better subpixel precision increases the efficiency of small
+ * primitive culling. (more precision means a tighter bounding box
+ * around primitives and more accurate elimination)
+ */
+ unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
+
+ if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+ info.small_prim_precision = num_samples / 4096.0;
+ else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+ info.small_prim_precision = num_samples / 1024.0;
+ else
+ info.small_prim_precision = num_samples / 256.0;
+
*out = info;
}
+static void si_emit_cull_state(struct si_context *sctx)
+{
+ assert(sctx->screen->use_ngg_culling);
+
+ struct si_small_prim_cull_info info;
+ si_get_small_prim_cull_info(sctx, &info);
+
+ if (!sctx->small_prim_cull_info_buf ||
+ memcmp(&info, &sctx->last_small_prim_cull_info, sizeof(info))) {
+ unsigned offset = 0;
+
+ /* Align to 256, because the address is shifted by 8 bits. */
+ u_upload_data(sctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
+ (struct pipe_resource **)&sctx->small_prim_cull_info_buf);
+
+ sctx->small_prim_cull_info_address = sctx->small_prim_cull_info_buf->gpu_address + offset;
+ sctx->last_small_prim_cull_info = info;
+ }
+
+ /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->small_prim_cull_info_buf,
+ RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+ radeon_set_sh_reg(sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
+ sctx->small_prim_cull_info_address >> 8);
+
+ /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling.
+ *
+ * small_prim_precision is 1 / 2^n. We only need n between 5 (1/32) and 12 (1/4096).
+ * Such a floating point value can be packed into 4 bits as follows:
+ * If we pass the first 4 bits of the exponent to the shader and set the next 3 bits
+ * to 1, we'll get the number exactly because all other bits are always 0. See:
+ * 1
+ * value = (0x70 | value.exponent[0:3]) << 23 = ------------------------------
+ * 2 ^ (15 - value.exponent[0:3])
+ *
+ * So pass only the first 4 bits of the float exponent to the shader.
+ */
+ sctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
+ sctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(info.small_prim_precision) >> 23);
+}
+
static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
unsigned num_scissors, const struct pipe_scissor_state *state)
{
@@ -330,8 +364,6 @@ static void si_emit_guardband(struct si_context *ctx)
S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
if (initial_cdw != ctx->gfx_cs->current.cdw)
ctx->context_roll = true;
-
- si_update_ngg_small_prim_precision(ctx);
}
static void si_emit_scissors(struct si_context *ctx)
@@ -430,6 +462,10 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo
if (start_slot == 0) {
ctx->viewports.y_inverted =
-state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1];
+
+ /* NGG cull state uses the viewport and quant mode. */
+ if (ctx->screen->use_ngg_culling)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
}
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
@@ -454,33 +490,6 @@ static void si_emit_viewports(struct si_context *ctx)
struct radeon_cmdbuf *cs = ctx->gfx_cs;
struct pipe_viewport_state *states = ctx->viewports.states;
- if (ctx->screen->use_ngg_culling) {
- /* Set the viewport info for small primitive culling. */
- struct si_small_prim_cull_info info;
- si_get_small_prim_cull_info(ctx, &info);
-
- if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
- unsigned offset = 0;
-
- /* Align to 256, because the address is shifted by 8 bits. */
- u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
- (struct pipe_resource **)&ctx->small_prim_cull_info_buf);
-
- ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset;
- ctx->last_small_prim_cull_info = info;
- ctx->small_prim_cull_info_dirty = true;
- }
-
- if (ctx->small_prim_cull_info_dirty) {
- /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf,
- RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
- radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
- ctx->small_prim_cull_info_address >> 8);
- ctx->small_prim_cull_info_dirty = false;
- }
- }
-
/* The simple case: Only 1 viewport is active. */
if (!ctx->vs_writes_viewport_index) {
radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
@@ -655,6 +664,7 @@ void si_init_viewport_functions(struct si_context *ctx)
ctx->atoms.s.scissors.emit = si_emit_scissors;
ctx->atoms.s.viewports.emit = si_emit_viewport_states;
ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
+ ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state;
ctx->b.set_scissor_states = si_set_scissor_states;
ctx->b.set_viewport_states = si_set_viewport_states;