diff options
author | Guillaume Emont <guijemont@igalia.com> | 2012-12-18 20:47:37 +0100 |
---|---|---|
committer | Guillaume Emont <guijemont@igalia.com> | 2012-12-28 15:23:39 +0100 |
commit | 9d39b049b9faaf302ff063115ffcafe7051512ee (patch) | |
tree | 8d754ab9d1772d699b84c4435e5173400431d28a | |
parent | e0485969bb9d317556913a0d32aff193fb1a8741 (diff) | |
download | orc-9d39b049b9faaf302ff063115ffcafe7051512ee.tar.gz |
mips: introduced loop unrolling
For now, this only happens in the main loop, and is decided by the
ORC_UNROLL_SHIFT environment variable.
-rw-r--r-- | orc/orcprogram-mips.c | 89 | ||||
-rw-r--r-- | orc/orcrules-mips.c | 46 |
2 files changed, 79 insertions, 56 deletions
diff --git a/orc/orcprogram-mips.c b/orc/orcprogram-mips.c index 2253890..a93cb48 100644 --- a/orc/orcprogram-mips.c +++ b/orc/orcprogram-mips.c @@ -151,6 +151,11 @@ orc_compiler_orc_mips_init (OrcCompiler *compiler) ORC_ERROR("unhandled variable size %d", compiler->max_var_size); } + /* Empirical evidence in a colorspace conversion benchmark shows that 3 is + * the best unroll shift. */ + compiler->unroll_shift = 3; + compiler->unroll_index = 0; + for(i=0;i<compiler->n_insns;i++){ OrcInstruction *insn = compiler->insns + i; OrcStaticOpcode *opcode = insn->opcode; @@ -307,40 +312,52 @@ orc_mips_load_constants_inner (OrcCompiler *compiler) } void -orc_mips_emit_loop (OrcCompiler *compiler) +orc_mips_emit_loop (OrcCompiler *compiler, int unroll) { int i, j; + int iteration_per_loop = 1; OrcInstruction *insn; OrcStaticOpcode *opcode; OrcRule *rule; ORC_DEBUG ("loop_shift=%d", compiler->loop_shift); - for (i=0; i<compiler->n_insns; i++) { - insn = compiler->insns + i; - opcode = insn->opcode; - if (insn->flags & ORC_INSN_FLAG_INVARIANT) continue; - - orc_compiler_append_code(compiler,"/* %d: %s */\n", i, insn->opcode->name); - - compiler->min_temp_reg = ORC_MIPS_T3; - - rule = insn->rule; - if (rule && rule->emit) { - compiler->insn_shift = compiler->loop_shift; - if (insn->flags & ORC_INSTRUCTION_FLAG_X2) { - compiler->insn_shift += 1; - } - if (insn->flags & ORC_INSTRUCTION_FLAG_X4) { - compiler->insn_shift += 2; + if (unroll) + iteration_per_loop = 1 << compiler->unroll_shift; + + for (j=0; j<iteration_per_loop; j++) { + compiler->unroll_index = j; + for (i=0; i<compiler->n_insns; i++) { + insn = compiler->insns + i; + opcode = insn->opcode; + if (insn->flags & ORC_INSN_FLAG_INVARIANT) continue; + + orc_compiler_append_code(compiler,"/* %d: %s */\n", i, insn->opcode->name); + + compiler->min_temp_reg = ORC_MIPS_T3; + + rule = insn->rule; + if (rule && rule->emit) { + compiler->insn_shift = compiler->loop_shift; + if (insn->flags & ORC_INSTRUCTION_FLAG_X2) { + compiler->insn_shift += 1; + } + if (insn->flags & ORC_INSTRUCTION_FLAG_X4) { + compiler->insn_shift += 2; + } + rule->emit (compiler, rule->emit_user, insn); + } else { + orc_compiler_append_code (compiler, "No rule for %s\n", opcode->name); } - rule->emit (compiler, rule->emit_user, insn); - } else { - orc_compiler_append_code (compiler, "No rule for %s\n", opcode->name); } } + compiler->unroll_index = 0; + for (j=0; j<ORC_N_COMPILER_VARIABLES; j++) { OrcVariable *var = compiler->vars + j; + int total_shift = compiler->loop_shift; + if (unroll) + total_shift += compiler->unroll_shift; if (var->name == NULL) continue; if (var->vartype == ORC_VAR_TYPE_SRC || @@ -349,9 +366,9 @@ orc_mips_emit_loop (OrcCompiler *compiler) if (var->update_type == 0) { offset = 0; } else if (var->update_type == 1) { - offset = (var->size << compiler->loop_shift) >> 1; + offset = (var->size << total_shift) >> 1; } else { - offset = var->size << compiler->loop_shift; + offset = var->size << total_shift; } if (offset !=0 && var->ptr_register) { orc_mips_emit_addiu (compiler, @@ -417,7 +434,7 @@ orc_mips_get_alignment (OrcCompiler *compiler) void orc_mips_emit_full_loop (OrcCompiler *compiler, OrcMipsRegister counter, - int loop_shift, int loop_label, int alignment) + int loop_shift, int loop_label, int alignment, int unroll) { int saved_loop_shift; int saved_alignment; @@ -426,7 +443,7 @@ orc_mips_emit_full_loop (OrcCompiler *compiler, OrcMipsRegister counter, compiler->loop_shift = loop_shift; saved_alignment = orc_mips_get_alignment (compiler); orc_mips_set_alignment (compiler, alignment); - orc_mips_emit_loop (compiler); + orc_mips_emit_loop (compiler, unroll); orc_mips_set_alignment (compiler, saved_alignment); compiler->loop_shift = saved_loop_shift; orc_mips_emit_addi (compiler, counter, counter, -1); @@ -555,8 +572,8 @@ orc_compiler_orc_mips_assemble (OrcCompiler *compiler) orc_mips_emit_srl (compiler, ORC_MIPS_T0, ORC_MIPS_T0, var_size_shift); /* $t1 = number of iterations in region1 (aligned) - = (n - $t0) / loop_size - = (n - $t0) >> loop_shift + = (n - $t0) / (loop_size * unroll) + = (n - $t0) >> (loop_shift + unroll_shift) */ orc_mips_emit_sub (compiler, ORC_MIPS_T2, ORC_MIPS_T2, ORC_MIPS_T0); @@ -583,9 +600,9 @@ usual_case: orc_mips_emit_nop (compiler); - if (compiler->loop_shift> 0) + if (compiler->loop_shift + compiler->unroll_shift > 0) orc_mips_emit_srl (compiler, ORC_MIPS_T1, ORC_MIPS_T2, - compiler->loop_shift); + compiler->loop_shift + compiler->unroll_shift); else orc_mips_emit_move (compiler, ORC_MIPS_T1, ORC_MIPS_T2); @@ -596,19 +613,19 @@ usual_case: /* $t2 = number of iterations in region2 (after aligned region) = (n - $t0) % loop_size = (previous $t2) % loop_size - = $t2 & ((1 << loop_shift) - 1) + = $t2 & ((1 << loop_shift + unroll_shift) - 1) */ /* note that this instruction is in the branch delay slot */ - if (compiler->loop_shift > 0) + if (compiler->loop_shift + compiler->unroll_shift > 0) orc_mips_emit_andi (compiler, ORC_MIPS_T2, ORC_MIPS_T2, - (1 << compiler->loop_shift) - 1); + (1 << (compiler->loop_shift + compiler->unroll_shift)) - 1); else /* loop_shift==0: $t2 should be 0 because we can handle all our data in region 1*/ orc_mips_emit_move (compiler, ORC_MIPS_T2, ORC_MIPS_ZERO); /* FIXME: when loop_shift == 0, we only need to emit region1 */ - orc_mips_emit_full_loop (compiler, ORC_MIPS_T0, 0, LABEL_REGION0_LOOP, 0); + orc_mips_emit_full_loop (compiler, ORC_MIPS_T0, 0, LABEL_REGION0_LOOP, 0, FALSE); orc_mips_emit_label (compiler, LABEL_REGION1); orc_mips_emit_beqz (compiler, ORC_MIPS_T1, LABEL_REGION2); @@ -669,7 +686,7 @@ usual_case: break; /* is strictly monotonic and increasing */ orc_mips_emit_full_loop (compiler, ORC_MIPS_T1, compiler->loop_shift, - label, i | (1 << align_var)); + label, i | (1 << align_var), TRUE); /* Jump the other loop versions and go to REGION2 */ orc_mips_emit_beqz (compiler, ORC_MIPS_ZERO, LABEL_REGION2); @@ -679,7 +696,7 @@ usual_case: /* Fallback loop that works for any alignment combination */ orc_mips_emit_full_loop (compiler, ORC_MIPS_T1, compiler->loop_shift, - LABEL_REGION1_LOOP, 1 << align_var); + LABEL_REGION1_LOOP, 1 << align_var, TRUE); compiler->vars[align_var].is_aligned = FALSE; @@ -688,7 +705,7 @@ usual_case: orc_mips_emit_beqz (compiler, ORC_MIPS_T2, LABEL_REGION2_LOOP_END); orc_mips_emit_nop (compiler); - orc_mips_emit_full_loop (compiler, ORC_MIPS_T2, 0, LABEL_REGION2_LOOP, 0); + orc_mips_emit_full_loop (compiler, ORC_MIPS_T2, 0, LABEL_REGION2_LOOP, 0, FALSE); orc_mips_emit_label (compiler, LABEL_REGION2_LOOP_END); if (compiler->program->is_2d) { diff --git a/orc/orcrules-mips.c b/orc/orcrules-mips.c index 8bb5aba..a61a79c 100644 --- a/orc/orcrules-mips.c +++ b/orc/orcrules-mips.c @@ -46,6 +46,7 @@ mips_rule_load (OrcCompiler *compiler, void *user, OrcInstruction *insn) /* such that 2^total_shift is the amount to load at a time */ int total_shift = compiler->insn_shift + ORC_PTR_TO_INT (user); int is_aligned = compiler->vars[insn->src_args[0]].is_aligned; + int offset; if (compiler->vars[insn->src_args[0]].vartype == ORC_VAR_TYPE_CONST) { ORC_PROGRAM_ERROR (compiler, "not implemented"); @@ -53,23 +54,23 @@ mips_rule_load (OrcCompiler *compiler, void *user, OrcInstruction *insn) } ORC_DEBUG ("insn_shift=%d", compiler->insn_shift); - /* FIXME: Check alignment. We are assuming data is aligned here */ + offset = compiler->unroll_index << total_shift; switch (total_shift) { case 0: - orc_mips_emit_lbu (compiler, dest, src, 0); + orc_mips_emit_lbu (compiler, dest, src, offset); break; case 1: - orc_mips_emit_lbu (compiler, ORC_MIPS_T3, src, 0); - orc_mips_emit_lbu (compiler, dest, src, 1); + orc_mips_emit_lbu (compiler, ORC_MIPS_T3, src, offset); + orc_mips_emit_lbu (compiler, dest, src, offset+1); orc_mips_emit_append (compiler, dest, ORC_MIPS_T3, 8); break; case 2: if (is_aligned) { - orc_mips_emit_lw (compiler, dest, src, 0); + orc_mips_emit_lw (compiler, dest, src, offset); } else { /* note: the code below is little endian specific */ - orc_mips_emit_lwr (compiler, dest, src, 0); - orc_mips_emit_lwl (compiler, dest, src, 3); + orc_mips_emit_lwr (compiler, dest, src, offset); + orc_mips_emit_lwl (compiler, dest, src, offset+3); } break; default: @@ -85,30 +86,31 @@ mips_rule_store (OrcCompiler *compiler, void *user, OrcInstruction *insn) int dest = compiler->vars[insn->dest_args[0]].ptr_register; int total_shift = compiler->insn_shift + ORC_PTR_TO_INT (user); int is_aligned = compiler->vars[insn->dest_args[0]].is_aligned; + int offset; ORC_DEBUG ("insn_shift=%d", compiler->insn_shift); - /* FIXME: Check alignment. We are assuming data is aligned here */ + offset = compiler->unroll_index << total_shift; switch (total_shift) { case 0: - orc_mips_emit_sb (compiler, src, dest, 0); + orc_mips_emit_sb (compiler, src, dest, offset); break; case 1: if (is_aligned) { - orc_mips_emit_sh (compiler, src, dest, 0); + orc_mips_emit_sh (compiler, src, dest, offset); } else { /* Note: the code below is little endian specific */ - orc_mips_emit_sb (compiler, src, dest, 0); + orc_mips_emit_sb (compiler, src, dest, offset); orc_mips_emit_srl (compiler, ORC_MIPS_T3, src, 8); - orc_mips_emit_sb (compiler, ORC_MIPS_T3, dest, 1); + orc_mips_emit_sb (compiler, ORC_MIPS_T3, dest, offset+1); } break; case 2: if (is_aligned) { - orc_mips_emit_sw (compiler, src, dest, 0); + orc_mips_emit_sw (compiler, src, dest, offset); } else { - orc_mips_emit_swr (compiler, src, dest, 0); - orc_mips_emit_swl (compiler, src, dest, 3); + orc_mips_emit_swr (compiler, src, dest, offset); + orc_mips_emit_swl (compiler, src, dest, offset+3); } break; default: @@ -391,6 +393,7 @@ mips_rule_loadupib (OrcCompiler *compiler, void *user, OrcInstruction *insn) OrcMipsRegister tmp0 = ORC_MIPS_T3; OrcMipsRegister tmp1 = ORC_MIPS_T4; OrcMipsRegister tmp2 = ORC_MIPS_T5; + int offset; if (src->vartype != ORC_VAR_TYPE_SRC) { ORC_PROGRAM_ERROR (compiler, "not implemented"); @@ -436,9 +439,10 @@ mips_rule_loadupib (OrcCompiler *compiler, void *user, OrcInstruction *insn) adduh_r.qb dest, tmp0, tmp1 # (b,c)b(a,b)a | c(b,c)b(a,b) */ - orc_mips_emit_lb (compiler, tmp0, src->ptr_register, 0); - orc_mips_emit_lb (compiler, tmp1, src->ptr_register, 1); - orc_mips_emit_lb (compiler, dest->alloc, src->ptr_register, 2); + offset = compiler->unroll_index << (compiler->insn_shift - 1); + orc_mips_emit_lb (compiler, tmp0, src->ptr_register, offset); + orc_mips_emit_lb (compiler, tmp1, src->ptr_register, offset + 1); + orc_mips_emit_lb (compiler, dest->alloc, src->ptr_register, offset + 2); orc_mips_emit_andi (compiler, tmp2, src->ptr_offset, 1); orc_mips_emit_replv_qb (compiler, tmp0, tmp0); orc_mips_emit_replv_qb (compiler, tmp1, tmp1); @@ -464,6 +468,7 @@ mips_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; OrcMipsRegister tmp = ORC_MIPS_T3; + int offset; if (src->vartype != ORC_VAR_TYPE_SRC) { ORC_PROGRAM_ERROR (compiler, "not implemented"); @@ -489,8 +494,9 @@ mips_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) orc_mips_emit_addiu (compiler, src->ptr_offset, src->ptr_offset, 1); break; case 2: - orc_mips_emit_lb (compiler, tmp, src->ptr_register, 0); - orc_mips_emit_lb (compiler, dest->alloc, src->ptr_register, 1); + offset = compiler->unroll_index << (compiler->insn_shift - 1); + orc_mips_emit_lb (compiler, tmp, src->ptr_register, offset + 0); + orc_mips_emit_lb (compiler, dest->alloc, src->ptr_register, offset + 1); orc_mips_emit_replv_qb (compiler, tmp, tmp); orc_mips_emit_replv_qb (compiler, dest->alloc, dest->alloc); orc_mips_emit_packrl_ph (compiler, dest->alloc, dest->alloc, tmp); |