summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/lima/ir/pp/nir.c
diff options
context:
space:
mode:
authorErico Nunes <nunes.erico@gmail.com>2020-01-12 15:11:55 +0100
committerMarge Bot <eric+marge@anholt.net>2020-01-15 22:55:31 +0000
commit9bf210ba982ba4e0a1cd125285eb65bc2213242f (patch)
treee7318271722df422841a50350151491c098d5c3c /src/gallium/drivers/lima/ir/pp/nir.c
parent7e2765fded33ed13693939b0e4ef94943fedf2cb (diff)
downloadmesa-9bf210ba982ba4e0a1cd125285eb65bc2213242f.tar.gz
lima/ppir: implement full liveness analysis for regalloc
The existing liveness analysis in ppir still ultimately relies on a single continuous live_in and live_out range per register and was observed to be the bottleneck for register allocation on complicated examples with several control flow blocks. The use of live_in and live_out ranges was fine before ppir got control flow, but now it ends up creating unnecessary interferences as live_in and live_out ranges may span across entire blocks after blocks get placed sequentially. This new liveness analysis implementation generates a set of live variables at each program point; before and after each instruction and beginning and end of each block. This is a global analysis and propagates the sets of live registers across blocks independently of their sequence. The resulting sets optimally represent all variables that cannot share a register at each program point, so can be directly translated as interferences to the register allocator. Special care has to be taken with non-ssa registers. In order to properly define their live range, their alive components also need to be tracked. Therefore ppir can't use simple bitsets to keep track of live registers. The algorithm uses an auxiliary set data structure to keep track of the live registers. The initial implementation used only trivial arrays, however regalloc execution time was then prohibitive (>1minute on Cortex-A53) on extreme benchmarks with hundreds of instructions, hundreds of registers and several spilling iterations, mostly due to the n^2 complexity to generate the interferences from the live sets. Since the live registers set are only a very sparse subset of all registers at each instruction, iterating only over this subset allows it to run very fast again (a couple of seconds for the same benchmark). Signed-off-by: Erico Nunes <nunes.erico@gmail.com> Reviewed-by: Vasily Khoruzhick <anarsoul@gmail.com> Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3358> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3358>
Diffstat (limited to 'src/gallium/drivers/lima/ir/pp/nir.c')
-rw-r--r--src/gallium/drivers/lima/ir/pp/nir.c6
1 files changed, 0 insertions, 6 deletions
diff --git a/src/gallium/drivers/lima/ir/pp/nir.c b/src/gallium/drivers/lima/ir/pp/nir.c
index a1d10a0be37..0c91c09831b 100644
--- a/src/gallium/drivers/lima/ir/pp/nir.c
+++ b/src/gallium/drivers/lima/ir/pp/nir.c
@@ -42,8 +42,6 @@ static void *ppir_node_create_ssa(ppir_block *block, ppir_op op, nir_ssa_def *ss
ppir_dest *dest = ppir_node_get_dest(node);
dest->type = ppir_target_ssa;
dest->ssa.num_components = ssa->num_components;
- dest->ssa.live_in = INT_MAX;
- dest->ssa.live_out = 0;
dest->write_mask = u_bit_consecutive(0, ssa->num_components);
if (node->type == ppir_node_type_load ||
@@ -389,8 +387,6 @@ static ppir_node *ppir_emit_intrinsic(ppir_block *block, nir_instr *ni)
ppir_dest *dest = ppir_node_get_dest(&alu_node->node);
dest->type = ppir_target_ssa;
dest->ssa.num_components = instr->num_components;
- dest->ssa.live_in = INT_MAX;
- dest->ssa.live_out = 0;
dest->ssa.index = 0;
dest->write_mask = u_bit_consecutive(0, instr->num_components);
@@ -898,8 +894,6 @@ bool ppir_compile_nir(struct lima_fs_shader_state *prog, struct nir_shader *nir,
r->index = reg->index;
r->num_components = reg->num_components;
- r->live_in = INT_MAX;
- r->live_out = 0;
r->is_head = false;
list_addtail(&r->list, &comp->reg_list);
}