diff options
author | tsmigiel <tsmigiel@138bc75d-0d04-0410-961f-82ee72b054a4> | 2008-09-05 22:12:24 +0000 |
---|---|---|
committer | tsmigiel <tsmigiel@138bc75d-0d04-0410-961f-82ee72b054a4> | 2008-09-05 22:12:24 +0000 |
commit | 5a976006131fae932edbe2f9732367a512373218 (patch) | |
tree | c37a03103b3833c1eaba6a9dccfbcb1a8045587e /gcc | |
parent | 5422d457a256845828efe38f13789e8dd618d333 (diff) | |
download | gcc-5a976006131fae932edbe2f9732367a512373218.tar.gz |
Improved branch hints, safe hints, and scheduling.
* haifa-sched.c (sched_emit_insn) : Define.
* sched-int.h (sched_emit_insn) : Add prototype.
* doc/invoke.texi (-mdual-nops, -mhint-max-nops,
-mhint-max-distance -msafe-hints) : Document.
* config/spu/spu.c (spu_flag_var_tracking): New.
(TARGET_SCHED_INIT_GLOBAL, TARGET_SCHED_INIT,
TARGET_SCHED_REORDER, TARGET_SCHED_REORDER2,
TARGET_ASM_FILE_START): Define.
(TARGET_SCHED_ADJUST_PRIORITY): Remove.
(STOP_HINT_P, HINTED_P, SCHED_ON_EVEN_P): Define.
(spu_emit_branch_hint): Add blocks argument.
(insert_branch_hints, insert_nops): Remove.
(pad_bb, insert_hbrp_for_ilb_runout, insert_hbrp, in_spu_reorg,
uses_ls_unit, spu_sched_init_global, spu_sched_init,
spu_sched_reorder, asm_file_start): New functions.
(clock_var, spu_sched_length, pipe0_clock,
pipe1_clock, prev_clock_var, prev_priority,
spu_ls_first, prev_ls_clock): New static variables.
* config/spu/spu.h (TARGET_DEFAULT): Add MASK_SAFE_HINTS.
* config/spu.md (iprefetch): Add operand, make it clobber MEM.
(nopn_nv): Add a non-volatile version of nop.
* config/spu/spu.opt (-mdual-nops, -mhint-max-nops,
-mhint-max-distance, -msafe-hints): New options.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@140047 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 28 | ||||
-rw-r--r-- | gcc/config/spu/spu.c | 1165 | ||||
-rw-r--r-- | gcc/config/spu/spu.h | 3 | ||||
-rw-r--r-- | gcc/config/spu/spu.md | 15 | ||||
-rw-r--r-- | gcc/config/spu/spu.opt | 20 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 27 | ||||
-rw-r--r-- | gcc/haifa-sched.c | 11 | ||||
-rw-r--r-- | gcc/sched-int.h | 1 |
8 files changed, 989 insertions, 281 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 11743b4bc44..e1e7df6801b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,31 @@ +2008-09-05 Trevor Smigiel <Trevor_Smigiel@playstation.sony.com> + + Improved branch hints, safe hints, and scheduling. + + * haifa-sched.c (sched_emit_insn) : Define. + * sched-int.h (sched_emit_insn) : Add prototype. + * doc/invoke.texi (-mdual-nops, -mhint-max-nops, + -mhint-max-distance -msafe-hints) : Document. + * config/spu/spu.c (spu_flag_var_tracking): New. + (TARGET_SCHED_INIT_GLOBAL, TARGET_SCHED_INIT, + TARGET_SCHED_REORDER, TARGET_SCHED_REORDER2, + TARGET_ASM_FILE_START): Define. + (TARGET_SCHED_ADJUST_PRIORITY): Remove. + (STOP_HINT_P, HINTED_P, SCHED_ON_EVEN_P): Define. + (spu_emit_branch_hint): Add blocks argument. + (insert_branch_hints, insert_nops): Remove. + (pad_bb, insert_hbrp_for_ilb_runout, insert_hbrp, in_spu_reorg, + uses_ls_unit, spu_sched_init_global, spu_sched_init, + spu_sched_reorder, asm_file_start): New functions. + (clock_var, spu_sched_length, pipe0_clock, + pipe1_clock, prev_clock_var, prev_priority, + spu_ls_first, prev_ls_clock): New static variables. + * config/spu/spu.h (TARGET_DEFAULT): Add MASK_SAFE_HINTS. + * config/spu.md (iprefetch): Add operand, make it clobber MEM. + (nopn_nv): Add a non-volatile version of nop. + * config/spu/spu.opt (-mdual-nops, -mhint-max-nops, + -mhint-max-distance, -msafe-hints): New options. + 2008-09-05 Janis Johnson <janis187@us.ibm.com> Samuel Tardieu <sam@rfc1149.net> diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c index 0b74a9c18e4..35a04a7a820 100644 --- a/gcc/config/spu/spu.c +++ b/gcc/config/spu/spu.c @@ -54,6 +54,9 @@ #include "tm-constrs.h" #include "spu-builtins.h" #include "ddg.h" +#include "sbitmap.h" +#include "timevar.h" +#include "df.h" /* Builtin types, data and prototypes. */ struct spu_builtin_range @@ -94,19 +97,19 @@ static rtx frame_emit_add_imm (rtx dst, rtx src, HOST_WIDE_INT imm, static void emit_nop_for_insn (rtx insn); static bool insn_clobbers_hbr (rtx insn); static void spu_emit_branch_hint (rtx before, rtx branch, rtx target, - int distance); + int distance, sbitmap blocks); static rtx spu_emit_vector_compare (enum rtx_code rcode, rtx op0, rtx op1, enum machine_mode dmode); static rtx get_branch_target (rtx branch); -static void insert_branch_hints (void); -static void insert_nops (void); static void spu_machine_dependent_reorg (void); static int spu_sched_issue_rate (void); static int spu_sched_variable_issue (FILE * dump, int verbose, rtx insn, int can_issue_more); static int get_pipe (rtx insn); -static int spu_sched_adjust_priority (rtx insn, int pri); static int spu_sched_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost); +static void spu_sched_init_global (FILE *, int, int); +static void spu_sched_init (FILE *, int, int); +static int spu_sched_reorder (FILE *, int, rtx *, int *, int); static tree spu_handle_fndecl_attribute (tree * node, tree name, tree args, int flags, unsigned char *no_add_attrs); @@ -139,6 +142,7 @@ static int spu_builtin_vectorization_cost (bool); static bool spu_vector_alignment_reachable (const_tree, bool); static tree spu_builtin_vec_perm (tree, tree *); static int spu_sms_res_mii (struct ddg *g); +static void asm_file_start (void); extern const char *reg_names[]; rtx spu_compare_op0, spu_compare_op1; @@ -148,6 +152,18 @@ int spu_arch; /* Which cpu are we tuning for. */ int spu_tune; +/* The hardware requires 8 insns between a hint and the branch it + effects. This variable describes how many rtl instructions the + compiler needs to see before inserting a hint, and then the compiler + will insert enough nops to make it at least 8 insns. The default is + for the compiler to allow up to 2 nops be emitted. The nops are + inserted in pairs, so we round down. */ +int spu_hint_dist = (8*4) - (2*4); + +/* Determines whether we run variable tracking in machine dependent + reorganization. */ +static int spu_flag_var_tracking; + enum spu_immediate { SPU_NONE, SPU_IL, @@ -213,11 +229,20 @@ tree spu_builtin_types[SPU_BTI_MAX]; #undef TARGET_SCHED_ISSUE_RATE #define TARGET_SCHED_ISSUE_RATE spu_sched_issue_rate +#undef TARGET_SCHED_INIT_GLOBAL +#define TARGET_SCHED_INIT_GLOBAL spu_sched_init_global + +#undef TARGET_SCHED_INIT +#define TARGET_SCHED_INIT spu_sched_init + #undef TARGET_SCHED_VARIABLE_ISSUE #define TARGET_SCHED_VARIABLE_ISSUE spu_sched_variable_issue -#undef TARGET_SCHED_ADJUST_PRIORITY -#define TARGET_SCHED_ADJUST_PRIORITY spu_sched_adjust_priority +#undef TARGET_SCHED_REORDER +#define TARGET_SCHED_REORDER spu_sched_reorder + +#undef TARGET_SCHED_REORDER2 +#define TARGET_SCHED_REORDER2 spu_sched_reorder #undef TARGET_SCHED_ADJUST_COST #define TARGET_SCHED_ADJUST_COST spu_sched_adjust_cost @@ -301,6 +326,9 @@ const struct attribute_spec spu_attribute_table[]; #undef TARGET_SCHED_SMS_RES_MII #define TARGET_SCHED_SMS_RES_MII spu_sms_res_mii +#undef TARGET_ASM_FILE_START +#define TARGET_ASM_FILE_START asm_file_start + struct gcc_target targetm = TARGET_INITIALIZER; void @@ -329,9 +357,14 @@ spu_override_options (void) flag_omit_frame_pointer = 1; + /* Functions must be 8 byte aligned so we correctly handle dual issue */ if (align_functions < 8) align_functions = 8; + spu_hint_dist = 8*4 - spu_max_nops*4; + if (spu_hint_dist < 0) + spu_hint_dist = 0; + if (spu_fixed_range_string) fix_range (spu_fixed_range_string); @@ -1983,16 +2016,6 @@ spu_const (enum machine_mode mode, HOST_WIDE_INT val) return gen_rtx_CONST_VECTOR (mode, v); } - -/* branch hint stuff */ - -/* The hardware requires 8 insns between a hint and the branch it - effects. This variable describes how many rtl instructions the - compiler needs to see before inserting a hint. (FIXME: We should - accept less and insert nops to enforce it because hinting is always - profitable for performance, but we do need to be careful of code - size.) */ -int spu_hint_dist = (8 * 4); /* Create a MODE vector constant from 4 ints. */ rtx @@ -2017,75 +2040,200 @@ spu_const_from_ints(enum machine_mode mode, int a, int b, int c, int d) arr[15] = (d >> 0) & 0xff; return array_to_constant(mode, arr); } + +/* branch hint stuff */ /* An array of these is used to propagate hints to predecessor blocks. */ struct spu_bb_info { - rtx prop_jump; /* propagated from another block */ - basic_block bb; /* the original block. */ + rtx prop_jump; /* propagated from another block */ + int bb_index; /* the original block. */ }; +static struct spu_bb_info *spu_bb_info; -/* The special $hbr register is used to prevent the insn scheduler from - moving hbr insns across instructions which invalidate them. It - should only be used in a clobber, and this function searches for - insns which clobber it. */ -static bool -insn_clobbers_hbr (rtx insn) +#define STOP_HINT_P(INSN) \ + (GET_CODE(INSN) == CALL_INSN \ + || INSN_CODE(INSN) == CODE_FOR_divmodsi4 \ + || INSN_CODE(INSN) == CODE_FOR_udivmodsi4) + +/* 1 when RTX is a hinted branch or its target. We keep track of + what has been hinted so the safe-hint code can test it easily. */ +#define HINTED_P(RTX) \ + (RTL_FLAG_CHECK3("HINTED_P", (RTX), CODE_LABEL, JUMP_INSN, CALL_INSN)->unchanging) + +/* 1 when RTX is an insn that must be scheduled on an even boundary. */ +#define SCHED_ON_EVEN_P(RTX) \ + (RTL_FLAG_CHECK2("SCHED_ON_EVEN_P", (RTX), JUMP_INSN, CALL_INSN)->in_struct) + +/* Emit a nop for INSN such that the two will dual issue. This assumes + INSN is 8-byte aligned. When INSN is inline asm we emit an lnop. + We check for TImode to handle a MULTI1 insn which has dual issued its + first instruction. get_pipe returns -1 for MULTI0, inline asm, or + ADDR_VEC insns. */ +static void +emit_nop_for_insn (rtx insn) { - if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == PARALLEL) + int p; + rtx new_insn; + p = get_pipe (insn); + if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn)) + new_insn = emit_insn_after (gen_lnop (), insn); + else if (p == 1 && GET_MODE (insn) == TImode) { - rtx parallel = PATTERN (insn); - rtx clobber; - int j; - for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--) + new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn); + PUT_MODE (new_insn, TImode); + PUT_MODE (insn, VOIDmode); + } + else + new_insn = emit_insn_after (gen_lnop (), insn); + recog_memoized (new_insn); +} + +/* Insert nops in basic blocks to meet dual issue alignment + requirements. Also make sure hbrp and hint instructions are at least + one cycle apart, possibly inserting a nop. */ +static void +pad_bb(void) +{ + rtx insn, next_insn, prev_insn, hbr_insn = 0; + int length; + int addr; + + /* This sets up INSN_ADDRESSES. */ + shorten_branches (get_insns ()); + + /* Keep track of length added by nops. */ + length = 0; + + prev_insn = 0; + insn = get_insns (); + if (!active_insn_p (insn)) + insn = next_active_insn (insn); + for (; insn; insn = next_insn) + { + next_insn = next_active_insn (insn); + if (INSN_CODE (insn) == CODE_FOR_iprefetch + || INSN_CODE (insn) == CODE_FOR_hbr) { - clobber = XVECEXP (parallel, 0, j); - if (GET_CODE (clobber) == CLOBBER - && GET_CODE (XEXP (clobber, 0)) == REG - && REGNO (XEXP (clobber, 0)) == HBR_REGNUM) - return 1; + if (hbr_insn) + { + int a0 = INSN_ADDRESSES (INSN_UID (hbr_insn)); + int a1 = INSN_ADDRESSES (INSN_UID (insn)); + if ((a1 - a0 == 8 && GET_MODE (insn) != TImode) + || (a1 - a0 == 4)) + { + prev_insn = emit_insn_before (gen_lnop (), insn); + PUT_MODE (prev_insn, GET_MODE (insn)); + PUT_MODE (insn, TImode); + length += 4; + } + } + hbr_insn = insn; + } + if (INSN_CODE (insn) == CODE_FOR_blockage) + { + if (GET_MODE (insn) == TImode) + PUT_MODE (next_insn, TImode); + insn = next_insn; + next_insn = next_active_insn (insn); + } + addr = INSN_ADDRESSES (INSN_UID (insn)); + if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn)) + { + if (((addr + length) & 7) != 0) + { + emit_nop_for_insn (prev_insn); + length += 4; + } } + else if (GET_MODE (insn) == TImode + && ((next_insn && GET_MODE (next_insn) != TImode) + || get_attr_type (insn) == TYPE_MULTI0) + && ((addr + length) & 7) != 0) + { + /* prev_insn will always be set because the first insn is + always 8-byte aligned. */ + emit_nop_for_insn (prev_insn); + length += 4; + } + prev_insn = insn; } - return 0; } + +/* Routines for branch hints. */ + static void -spu_emit_branch_hint (rtx before, rtx branch, rtx target, int distance) +spu_emit_branch_hint (rtx before, rtx branch, rtx target, + int distance, sbitmap blocks) { - rtx branch_label; - rtx hint, insn, prev, next; + rtx branch_label = 0; + rtx hint; + rtx insn; + rtx table; if (before == 0 || branch == 0 || target == 0) return; + /* While scheduling we require hints to be no further than 600, so + we need to enforce that here too */ if (distance > 600) return; + /* If we have a Basic block note, emit it after the basic block note. */ + if (NOTE_KIND (before) == NOTE_INSN_BASIC_BLOCK) + before = NEXT_INSN (before); branch_label = gen_label_rtx (); LABEL_NUSES (branch_label)++; LABEL_PRESERVE_P (branch_label) = 1; insn = emit_label_before (branch_label, branch); branch_label = gen_rtx_LABEL_REF (VOIDmode, branch_label); + SET_BIT (blocks, BLOCK_FOR_INSN (branch)->index); + + hint = emit_insn_before (gen_hbr (branch_label, target), before); + recog_memoized (hint); + HINTED_P (branch) = 1; - /* If the previous insn is pipe0, make the hbr dual issue with it. If - the current insn is pipe0, dual issue with it. */ - prev = prev_active_insn (before); - if (prev && get_pipe (prev) == 0) - hint = emit_insn_before (gen_hbr (branch_label, target), before); - else if (get_pipe (before) == 0 && distance > spu_hint_dist) + if (GET_CODE (target) == LABEL_REF) + HINTED_P (XEXP (target, 0)) = 1; + else if (tablejump_p (branch, 0, &table)) { - next = next_active_insn (before); - hint = emit_insn_after (gen_hbr (branch_label, target), before); - if (next) - PUT_MODE (next, TImode); + rtvec vec; + int j; + if (GET_CODE (PATTERN (table)) == ADDR_VEC) + vec = XVEC (PATTERN (table), 0); + else + vec = XVEC (PATTERN (table), 1); + for (j = GET_NUM_ELEM (vec) - 1; j >= 0; --j) + HINTED_P (XEXP (RTVEC_ELT (vec, j), 0)) = 1; } - else + + if (distance >= 588) { - hint = emit_insn_before (gen_hbr (branch_label, target), before); - PUT_MODE (hint, TImode); + /* Make sure the hint isn't scheduled any earlier than this point, + which could make it too far for the branch offest to fit */ + recog_memoized (emit_insn_before (gen_blockage (), hint)); + } + else if (distance <= 8 * 4) + { + /* To guarantee at least 8 insns between the hint and branch we + insert nops. */ + int d; + for (d = distance; d < 8 * 4; d += 4) + { + insn = + emit_insn_after (gen_nopn_nv (gen_rtx_REG (SImode, 127)), hint); + recog_memoized (insn); + } + + /* Make sure any nops inserted aren't scheduled before the hint. */ + recog_memoized (emit_insn_after (gen_blockage (), hint)); + + /* Make sure any nops inserted aren't scheduled after the call. */ + if (CALL_P (branch) && distance < 8 * 4) + recog_memoized (emit_insn_before (gen_blockage (), branch)); } - recog_memoized (hint); } /* Returns 0 if we don't want a hint for this branch. Otherwise return @@ -2155,245 +2303,403 @@ get_branch_target (rtx branch) return 0; } +/* The special $hbr register is used to prevent the insn scheduler from + moving hbr insns across instructions which invalidate them. It + should only be used in a clobber, and this function searches for + insns which clobber it. */ +static bool +insn_clobbers_hbr (rtx insn) +{ + if (INSN_P (insn) + && GET_CODE (PATTERN (insn)) == PARALLEL) + { + rtx parallel = PATTERN (insn); + rtx clobber; + int j; + for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--) + { + clobber = XVECEXP (parallel, 0, j); + if (GET_CODE (clobber) == CLOBBER + && GET_CODE (XEXP (clobber, 0)) == REG + && REGNO (XEXP (clobber, 0)) == HBR_REGNUM) + return 1; + } + } + return 0; +} + +/* Search up to 32 insns starting at FIRST: + - at any kind of hinted branch, just return + - at any unconditional branch in the first 15 insns, just return + - at a call or indirect branch, after the first 15 insns, force it to + an even address and return + - at any unconditional branch, after the first 15 insns, force it to + an even address. + At then end of the search, insert an hbrp within 4 insns of FIRST, + and an hbrp within 16 instructions of FIRST. + */ static void -insert_branch_hints (void) +insert_hbrp_for_ilb_runout (rtx first) { - struct spu_bb_info *spu_bb_info; - rtx branch, insn, next; - rtx branch_target = 0; - int branch_addr = 0, insn_addr, head_addr; - basic_block bb; - unsigned int j; + rtx insn, before_4 = 0, before_16 = 0; + int addr = 0, length, first_addr = -1; + int hbrp_addr0 = 128 * 4, hbrp_addr1 = 128 * 4; + int insert_lnop_after = 0; + for (insn = first; insn; insn = NEXT_INSN (insn)) + if (INSN_P (insn)) + { + if (first_addr == -1) + first_addr = INSN_ADDRESSES (INSN_UID (insn)); + addr = INSN_ADDRESSES (INSN_UID (insn)) - first_addr; + length = get_attr_length (insn); + + if (before_4 == 0 && addr + length >= 4 * 4) + before_4 = insn; + /* We test for 14 instructions because the first hbrp will add + up to 2 instructions. */ + if (before_16 == 0 && addr + length >= 14 * 4) + before_16 = insn; + + if (INSN_CODE (insn) == CODE_FOR_hbr) + { + /* Make sure an hbrp is at least 2 cycles away from a hint. + Insert an lnop after the hbrp when necessary. */ + if (before_4 == 0 && addr > 0) + { + before_4 = insn; + insert_lnop_after |= 1; + } + else if (before_4 && addr <= 4 * 4) + insert_lnop_after |= 1; + if (before_16 == 0 && addr > 10 * 4) + { + before_16 = insn; + insert_lnop_after |= 2; + } + else if (before_16 && addr <= 14 * 4) + insert_lnop_after |= 2; + } - spu_bb_info = - (struct spu_bb_info *) xcalloc (last_basic_block + 1, - sizeof (struct spu_bb_info)); + if (INSN_CODE (insn) == CODE_FOR_iprefetch) + { + if (addr < hbrp_addr0) + hbrp_addr0 = addr; + else if (addr < hbrp_addr1) + hbrp_addr1 = addr; + } - /* We need exact insn addresses and lengths. */ - shorten_branches (get_insns ()); + if (CALL_P (insn) || JUMP_P (insn)) + { + if (HINTED_P (insn)) + return; + + /* Any branch after the first 15 insns should be on an even + address to avoid a special case branch. There might be + some nops and/or hbrps inserted, so we test after 10 + insns. */ + if (addr > 10 * 4) + SCHED_ON_EVEN_P (insn) = 1; + } - FOR_EACH_BB_REVERSE (bb) - { - head_addr = INSN_ADDRESSES (INSN_UID (BB_HEAD (bb))); - branch = 0; - if (spu_bb_info[bb->index].prop_jump) - { - branch = spu_bb_info[bb->index].prop_jump; - branch_target = get_branch_target (branch); - branch_addr = INSN_ADDRESSES (INSN_UID (branch)); - } - /* Search from end of a block to beginning. In this loop, find - jumps which need a branch and emit them only when: - - it's an indirect branch and we're at the insn which sets - the register - - we're at an insn that will invalidate the hint. e.g., a - call, another hint insn, inline asm that clobbers $hbr, and - some inlined operations (divmodsi4). Don't consider jumps - because they are only at the end of a block and are - considered when we are deciding whether to propagate - - we're getting too far away from the branch. The hbr insns - only have a signed 10-bit offset - We go back as far as possible so the branch will be considered - for propagation when we get to the beginning of the block. */ - next = 0; - for (insn = BB_END (bb); insn; insn = PREV_INSN (insn)) - { - if (INSN_P (insn)) + if (CALL_P (insn) || tablejump_p (insn, 0, 0)) + return; + + + if (addr + length >= 32 * 4) { - insn_addr = INSN_ADDRESSES (INSN_UID (insn)); - if (branch && next - && ((GET_CODE (branch_target) == REG - && set_of (branch_target, insn) != NULL_RTX) - || insn_clobbers_hbr (insn) - || branch_addr - insn_addr > 600)) + gcc_assert (before_4 && before_16); + if (hbrp_addr0 > 4 * 4) { - int next_addr = INSN_ADDRESSES (INSN_UID (next)); - if (insn != BB_END (bb) - && branch_addr - next_addr >= spu_hint_dist) + insn = + emit_insn_before (gen_iprefetch (GEN_INT (1)), before_4); + recog_memoized (insn); + INSN_ADDRESSES_NEW (insn, + INSN_ADDRESSES (INSN_UID (before_4))); + PUT_MODE (insn, GET_MODE (before_4)); + PUT_MODE (before_4, TImode); + if (insert_lnop_after & 1) { - if (dump_file) - fprintf (dump_file, - "hint for %i in block %i before %i\n", - INSN_UID (branch), bb->index, INSN_UID (next)); - spu_emit_branch_hint (next, branch, branch_target, - branch_addr - next_addr); + insn = emit_insn_before (gen_lnop (), before_4); + recog_memoized (insn); + INSN_ADDRESSES_NEW (insn, + INSN_ADDRESSES (INSN_UID (before_4))); + PUT_MODE (insn, TImode); } - branch = 0; } - - /* JUMP_P will only be true at the end of a block. When - branch is already set it means we've previously decided - to propagate a hint for that branch into this block. */ - if (CALL_P (insn) || (JUMP_P (insn) && !branch)) + if ((hbrp_addr0 <= 4 * 4 || hbrp_addr0 > 16 * 4) + && hbrp_addr1 > 16 * 4) { - branch = 0; - if ((branch_target = get_branch_target (insn))) + insn = + emit_insn_before (gen_iprefetch (GEN_INT (2)), before_16); + recog_memoized (insn); + INSN_ADDRESSES_NEW (insn, + INSN_ADDRESSES (INSN_UID (before_16))); + PUT_MODE (insn, GET_MODE (before_16)); + PUT_MODE (before_16, TImode); + if (insert_lnop_after & 2) { - branch = insn; - branch_addr = insn_addr; + insn = emit_insn_before (gen_lnop (), before_16); + recog_memoized (insn); + INSN_ADDRESSES_NEW (insn, + INSN_ADDRESSES (INSN_UID + (before_16))); + PUT_MODE (insn, TImode); } } - - /* When a branch hint is emitted it will be inserted - before "next". Make sure next is the beginning of a - cycle to minimize impact on the scheduled insns. */ - if (GET_MODE (insn) == TImode) - next = insn; + return; } - if (insn == BB_HEAD (bb)) - break; } + else if (BARRIER_P (insn)) + return; - if (branch) - { - /* If we haven't emitted a hint for this branch yet, it might - be profitable to emit it in one of the predecessor blocks, - especially for loops. */ - rtx bbend; - basic_block prev = 0, prop = 0, prev2 = 0; - int loop_exit = 0, simple_loop = 0; - int next_addr = 0; - if (next) - next_addr = INSN_ADDRESSES (INSN_UID (next)); - - for (j = 0; j < EDGE_COUNT (bb->preds); j++) - if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU) - prev = EDGE_PRED (bb, j)->src; - else - prev2 = EDGE_PRED (bb, j)->src; - - for (j = 0; j < EDGE_COUNT (bb->succs); j++) - if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT) - loop_exit = 1; - else if (EDGE_SUCC (bb, j)->dest == bb) - simple_loop = 1; - - /* If this branch is a loop exit then propagate to previous - fallthru block. This catches the cases when it is a simple - loop or when there is an initial branch into the loop. */ - if (prev && loop_exit && prev->loop_depth <= bb->loop_depth) - prop = prev; - - /* If there is only one adjacent predecessor. Don't propagate - outside this loop. This loop_depth test isn't perfect, but - I'm not sure the loop_father member is valid at this point. */ - else if (prev && single_pred_p (bb) - && prev->loop_depth == bb->loop_depth) - prop = prev; - - /* If this is the JOIN block of a simple IF-THEN then - propagate the hint to the HEADER block. */ - else if (prev && prev2 - && EDGE_COUNT (bb->preds) == 2 - && EDGE_COUNT (prev->preds) == 1 - && EDGE_PRED (prev, 0)->src == prev2 - && prev2->loop_depth == bb->loop_depth - && GET_CODE (branch_target) != REG) - prop = prev; - - /* Don't propagate when: - - this is a simple loop and the hint would be too far - - this is not a simple loop and there are 16 insns in - this block already - - the predecessor block ends in a branch that will be - hinted - - the predecessor block ends in an insn that invalidates - the hint */ - if (prop - && prop->index >= 0 - && (bbend = BB_END (prop)) - && branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) < - (simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0 - && (JUMP_P (bbend) || !insn_clobbers_hbr (bbend))) - { - if (dump_file) - fprintf (dump_file, "propagate from %i to %i (loop depth %i) " - "for %i (loop_exit %i simple_loop %i dist %i)\n", - bb->index, prop->index, bb->loop_depth, - INSN_UID (branch), loop_exit, simple_loop, - branch_addr - INSN_ADDRESSES (INSN_UID (bbend))); - - spu_bb_info[prop->index].prop_jump = branch; - spu_bb_info[prop->index].bb = bb; - } - else if (next && branch_addr - next_addr >= spu_hint_dist) - { - if (dump_file) - fprintf (dump_file, "hint for %i in block %i before %i\n", - INSN_UID (branch), bb->index, INSN_UID (next)); - spu_emit_branch_hint (next, branch, branch_target, - branch_addr - next_addr); - } - branch = 0; - } - } - free (spu_bb_info); } - -/* Emit a nop for INSN such that the two will dual issue. This assumes - INSN is 8-byte aligned. When INSN is inline asm we emit an lnop. - We check for TImode to handle a MULTI1 insn which has dual issued its - first instruction. get_pipe returns -1 for MULTI0, inline asm, or - ADDR_VEC insns. */ + +/* The SPU might hang when it executes 48 inline instructions after a + hinted branch jumps to its hinted target. The beginning of a + function and the return from a call might have been hinted, and must + be handled as well. To prevent a hang we insert 2 hbrps. The first + should be within 6 insns of the branch target. The second should be + within 22 insns of the branch target. When determining if hbrps are + necessary, we look for only 32 inline instructions, because up to to + 12 nops and 4 hbrps could be inserted. Similarily, when inserting + new hbrps, we insert them within 4 and 16 insns of the target. */ static void -emit_nop_for_insn (rtx insn) +insert_hbrp (void) { - int p; - rtx new_insn; - p = get_pipe (insn); - if (p == 1 && GET_MODE (insn) == TImode) + rtx insn; + if (TARGET_SAFE_HINTS) { - new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn); - PUT_MODE (new_insn, TImode); - PUT_MODE (insn, VOIDmode); + shorten_branches (get_insns ()); + /* Insert hbrp at beginning of function */ + insn = next_active_insn (get_insns ()); + if (insn) + insert_hbrp_for_ilb_runout (insn); + /* Insert hbrp after hinted targets. */ + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) + if ((LABEL_P (insn) && HINTED_P (insn)) || CALL_P (insn)) + insert_hbrp_for_ilb_runout (next_active_insn (insn)); } - else - new_insn = emit_insn_after (gen_lnop (), insn); } -/* Insert nops in basic blocks to meet dual issue alignment - requirements. */ +static int in_spu_reorg; + +/* Insert branch hints. There are no branch optimizations after this + pass, so it's safe to set our branch hints now. */ static void -insert_nops (void) +spu_machine_dependent_reorg (void) { - rtx insn, next_insn, prev_insn; - int length; - int addr; + sbitmap blocks; + basic_block bb; + rtx branch, insn; + rtx branch_target = 0; + int branch_addr = 0, insn_addr, required_dist = 0; + int i; + unsigned int j; - /* This sets up INSN_ADDRESSES. */ - shorten_branches (get_insns ()); + if (!TARGET_BRANCH_HINTS || optimize == 0) + { + /* We still do it for unoptimized code because an external + function might have hinted a call or return. */ + insert_hbrp (); + pad_bb (); + return; + } - /* Keep track of length added by nops. */ - length = 0; + blocks = sbitmap_alloc (last_basic_block); + sbitmap_zero (blocks); - prev_insn = 0; - for (insn = get_insns (); insn; insn = next_insn) + in_spu_reorg = 1; + compute_bb_for_insn (); + + compact_blocks (); + + spu_bb_info = + (struct spu_bb_info *) xcalloc (n_basic_blocks, + sizeof (struct spu_bb_info)); + + /* We need exact insn addresses and lengths. */ + shorten_branches (get_insns ()); + + for (i = n_basic_blocks - 1; i >= 0; i--) { - next_insn = next_active_insn (insn); - addr = INSN_ADDRESSES (INSN_UID (insn)); - if (GET_MODE (insn) == TImode - && next_insn - && GET_MODE (next_insn) != TImode - && ((addr + length) & 7) != 0) + bb = BASIC_BLOCK (i); + branch = 0; + if (spu_bb_info[i].prop_jump) { - /* prev_insn will always be set because the first insn is - always 8-byte aligned. */ - emit_nop_for_insn (prev_insn); - length += 4; + branch = spu_bb_info[i].prop_jump; + branch_target = get_branch_target (branch); + branch_addr = INSN_ADDRESSES (INSN_UID (branch)); + required_dist = spu_hint_dist; + } + /* Search from end of a block to beginning. In this loop, find + jumps which need a branch and emit them only when: + - it's an indirect branch and we're at the insn which sets + the register + - we're at an insn that will invalidate the hint. e.g., a + call, another hint insn, inline asm that clobbers $hbr, and + some inlined operations (divmodsi4). Don't consider jumps + because they are only at the end of a block and are + considered when we are deciding whether to propagate + - we're getting too far away from the branch. The hbr insns + only have a signed 10 bit offset + We go back as far as possible so the branch will be considered + for propagation when we get to the beginning of the block. */ + for (insn = BB_END (bb); insn; insn = PREV_INSN (insn)) + { + if (INSN_P (insn)) + { + insn_addr = INSN_ADDRESSES (INSN_UID (insn)); + if (branch + && ((GET_CODE (branch_target) == REG + && set_of (branch_target, insn) != NULL_RTX) + || insn_clobbers_hbr (insn) + || branch_addr - insn_addr > 600)) + { + rtx next = NEXT_INSN (insn); + int next_addr = INSN_ADDRESSES (INSN_UID (next)); + if (insn != BB_END (bb) + && branch_addr - next_addr >= required_dist) + { + if (dump_file) + fprintf (dump_file, + "hint for %i in block %i before %i\n", + INSN_UID (branch), bb->index, + INSN_UID (next)); + spu_emit_branch_hint (next, branch, branch_target, + branch_addr - next_addr, blocks); + } + branch = 0; + } + + /* JUMP_P will only be true at the end of a block. When + branch is already set it means we've previously decided + to propagate a hint for that branch into this block. */ + if (CALL_P (insn) || (JUMP_P (insn) && !branch)) + { + branch = 0; + if ((branch_target = get_branch_target (insn))) + { + branch = insn; + branch_addr = insn_addr; + required_dist = spu_hint_dist; + } + } + } + if (insn == BB_HEAD (bb)) + break; + } + + if (branch) + { + /* If we haven't emitted a hint for this branch yet, it might + be profitable to emit it in one of the predecessor blocks, + especially for loops. */ + rtx bbend; + basic_block prev = 0, prop = 0, prev2 = 0; + int loop_exit = 0, simple_loop = 0; + int next_addr = INSN_ADDRESSES (INSN_UID (NEXT_INSN (insn))); + + for (j = 0; j < EDGE_COUNT (bb->preds); j++) + if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU) + prev = EDGE_PRED (bb, j)->src; + else + prev2 = EDGE_PRED (bb, j)->src; + + for (j = 0; j < EDGE_COUNT (bb->succs); j++) + if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT) + loop_exit = 1; + else if (EDGE_SUCC (bb, j)->dest == bb) + simple_loop = 1; + + /* If this branch is a loop exit then propagate to previous + fallthru block. This catches the cases when it is a simple + loop or when there is an initial branch into the loop. */ + if (prev && (loop_exit || simple_loop) + && prev->loop_depth <= bb->loop_depth) + prop = prev; + + /* If there is only one adjacent predecessor. Don't propagate + outside this loop. This loop_depth test isn't perfect, but + I'm not sure the loop_father member is valid at this point. */ + else if (prev && single_pred_p (bb) + && prev->loop_depth == bb->loop_depth) + prop = prev; + + /* If this is the JOIN block of a simple IF-THEN then + propogate the hint to the HEADER block. */ + else if (prev && prev2 + && EDGE_COUNT (bb->preds) == 2 + && EDGE_COUNT (prev->preds) == 1 + && EDGE_PRED (prev, 0)->src == prev2 + && prev2->loop_depth == bb->loop_depth + && GET_CODE (branch_target) != REG) + prop = prev; + + /* Don't propagate when: + - this is a simple loop and the hint would be too far + - this is not a simple loop and there are 16 insns in + this block already + - the predecessor block ends in a branch that will be + hinted + - the predecessor block ends in an insn that invalidates + the hint */ + if (prop + && prop->index >= 0 + && (bbend = BB_END (prop)) + && branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) < + (simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0 + && (JUMP_P (bbend) || !insn_clobbers_hbr (bbend))) + { + if (dump_file) + fprintf (dump_file, "propagate from %i to %i (loop depth %i) " + "for %i (loop_exit %i simple_loop %i dist %i)\n", + bb->index, prop->index, bb->loop_depth, + INSN_UID (branch), loop_exit, simple_loop, + branch_addr - INSN_ADDRESSES (INSN_UID (bbend))); + + spu_bb_info[prop->index].prop_jump = branch; + spu_bb_info[prop->index].bb_index = i; + } + else if (branch_addr - next_addr >= required_dist) + { + if (dump_file) + fprintf (dump_file, "hint for %i in block %i before %i\n", + INSN_UID (branch), bb->index, + INSN_UID (NEXT_INSN (insn))); + spu_emit_branch_hint (NEXT_INSN (insn), branch, branch_target, + branch_addr - next_addr, blocks); + } + branch = 0; } - prev_insn = insn; } -} + free (spu_bb_info); -static void -spu_machine_dependent_reorg (void) -{ - if (optimize > 0) + if (!sbitmap_empty_p (blocks)) + find_many_sub_basic_blocks (blocks); + + /* We have to schedule to make sure alignment is ok. */ + FOR_EACH_BB (bb) bb->flags &= ~BB_DISABLE_SCHEDULE; + + /* The hints need to be scheduled, so call it again. */ + schedule_insns (); + + insert_hbrp (); + + pad_bb (); + + + if (spu_flag_var_tracking) { - if (TARGET_BRANCH_HINTS) - insert_branch_hints (); - insert_nops (); + df_analyze (); + timevar_push (TV_VAR_TRACKING); + variable_tracking_main (); + timevar_pop (TV_VAR_TRACKING); + df_finish_pass (false); } + + free_bb_for_insn (); + + in_spu_reorg = 0; } @@ -2405,15 +2711,14 @@ spu_sched_issue_rate (void) } static int -spu_sched_variable_issue (FILE * dump ATTRIBUTE_UNUSED, - int verbose ATTRIBUTE_UNUSED, rtx insn, - int can_issue_more) +uses_ls_unit(rtx insn) { - if (GET_CODE (PATTERN (insn)) != USE - && GET_CODE (PATTERN (insn)) != CLOBBER - && get_pipe (insn) != -2) - can_issue_more--; - return can_issue_more; + rtx set = single_set (insn); + if (set != 0 + && (GET_CODE (SET_DEST (set)) == MEM + || GET_CODE (SET_SRC (set)) == MEM)) + return 1; + return 0; } static int @@ -2439,7 +2744,6 @@ get_pipe (rtx insn) case TYPE_FPD: case TYPE_FP6: case TYPE_FP7: - case TYPE_IPREFETCH: return 0; case TYPE_LNOP: @@ -2449,41 +2753,332 @@ get_pipe (rtx insn) case TYPE_BR: case TYPE_MULTI1: case TYPE_HBR: + case TYPE_IPREFETCH: return 1; default: abort (); } } + +/* haifa-sched.c has a static variable that keeps track of the current + cycle. It is passed to spu_sched_reorder, and we record it here for + use by spu_sched_variable_issue. It won't be accurate if the + scheduler updates it's clock_var between the two calls. */ +static int clock_var; + +/* This is used to keep track of insn alignment. Set to 0 at the + beginning of each block and increased by the "length" attr of each + insn scheduled. */ +static int spu_sched_length; + +/* Record when we've issued pipe0 and pipe1 insns so we can reorder the + ready list appropriately in spu_sched_reorder(). */ +static int pipe0_clock; +static int pipe1_clock; + +static int prev_clock_var; + +static int prev_priority; + +/* The SPU needs to load the next ilb sometime during the execution of + the previous ilb. There is a potential conflict if every cycle has a + load or store. To avoid the conflict we make sure the load/store + unit is free for at least one cycle during the execution of insns in + the previous ilb. */ +static int spu_ls_first; +static int prev_ls_clock; + +static void +spu_sched_init_global (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, + int max_ready ATTRIBUTE_UNUSED) +{ + spu_sched_length = 0; +} + +static void +spu_sched_init (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, + int max_ready ATTRIBUTE_UNUSED) +{ + if (align_labels > 4 || align_loops > 4 || align_jumps > 4) + { + /* When any block might be at least 8-byte aligned, assume they + will all be at least 8-byte aligned to make sure dual issue + works out correctly. */ + spu_sched_length = 0; + } + spu_ls_first = INT_MAX; + clock_var = -1; + prev_ls_clock = -1; + pipe0_clock = -1; + pipe1_clock = -1; + prev_clock_var = -1; + prev_priority = -1; +} + static int -spu_sched_adjust_priority (rtx insn, int pri) +spu_sched_variable_issue (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, rtx insn, int more) { - int p = get_pipe (insn); - /* Schedule UNSPEC_CONVERT's early so they have less effect on - * scheduling. */ + int len; + int p; if (GET_CODE (PATTERN (insn)) == USE || GET_CODE (PATTERN (insn)) == CLOBBER - || p == -2) - return pri + 100; - /* Schedule pipe0 insns early for greedier dual issue. */ - if (p != 1) - return pri + 50; - return pri; + || (len = get_attr_length (insn)) == 0) + return more; + + spu_sched_length += len; + + /* Reset on inline asm */ + if (INSN_CODE (insn) == -1) + { + spu_ls_first = INT_MAX; + pipe0_clock = -1; + pipe1_clock = -1; + return 0; + } + p = get_pipe (insn); + if (p == 0) + pipe0_clock = clock_var; + else + pipe1_clock = clock_var; + + if (in_spu_reorg) + { + if (clock_var - prev_ls_clock > 1 + || INSN_CODE (insn) == CODE_FOR_iprefetch) + spu_ls_first = INT_MAX; + if (uses_ls_unit (insn)) + { + if (spu_ls_first == INT_MAX) + spu_ls_first = spu_sched_length; + prev_ls_clock = clock_var; + } + + /* The scheduler hasn't inserted the nop, but we will later on. + Include those nops in spu_sched_length. */ + if (prev_clock_var == clock_var && (spu_sched_length & 7)) + spu_sched_length += 4; + prev_clock_var = clock_var; + + /* more is -1 when called from spu_sched_reorder for new insns + that don't have INSN_PRIORITY */ + if (more >= 0) + prev_priority = INSN_PRIORITY (insn); + } + + /* Always try issueing more insns. spu_sched_reorder will decide + when the cycle should be advanced. */ + return 1; +} + +/* This function is called for both TARGET_SCHED_REORDER and + TARGET_SCHED_REORDER2. */ +static int +spu_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, + rtx *ready, int *nreadyp, int clock) +{ + int i, nready = *nreadyp; + int pipe_0, pipe_1, pipe_hbrp, pipe_ls, schedule_i; + rtx insn; + + clock_var = clock; + + if (nready <= 0 || pipe1_clock >= clock) + return 0; + + /* Find any rtl insns that don't generate assembly insns and schedule + them first. */ + for (i = nready - 1; i >= 0; i--) + { + insn = ready[i]; + if (INSN_CODE (insn) == -1 + || INSN_CODE (insn) == CODE_FOR_blockage + || INSN_CODE (insn) == CODE_FOR__spu_convert) + { + ready[i] = ready[nready - 1]; + ready[nready - 1] = insn; + return 1; + } + } + + pipe_0 = pipe_1 = pipe_hbrp = pipe_ls = schedule_i = -1; + for (i = 0; i < nready; i++) + if (INSN_CODE (ready[i]) != -1) + { + insn = ready[i]; + switch (get_attr_type (insn)) + { + default: + case TYPE_MULTI0: + case TYPE_CONVERT: + case TYPE_FX2: + case TYPE_FX3: + case TYPE_SPR: + case TYPE_NOP: + case TYPE_FXB: + case TYPE_FPD: + case TYPE_FP6: + case TYPE_FP7: + pipe_0 = i; + break; + case TYPE_LOAD: + case TYPE_STORE: + pipe_ls = i; + case TYPE_LNOP: + case TYPE_SHUF: + case TYPE_BR: + case TYPE_MULTI1: + case TYPE_HBR: + pipe_1 = i; + break; + case TYPE_IPREFETCH: + pipe_hbrp = i; + break; + } + } + + /* In the first scheduling phase, schedule loads and stores together + to increase the chance they will get merged during postreload CSE. */ + if (!reload_completed && pipe_ls >= 0) + { + insn = ready[pipe_ls]; + ready[pipe_ls] = ready[nready - 1]; + ready[nready - 1] = insn; + return 1; + } + + /* If there is an hbrp ready, prefer it over other pipe 1 insns. */ + if (pipe_hbrp >= 0) + pipe_1 = pipe_hbrp; + + /* When we have loads/stores in every cycle of the last 15 insns and + we are about to schedule another load/store, emit an hbrp insn + instead. */ + if (in_spu_reorg + && spu_sched_length - spu_ls_first >= 4 * 15 + && !(pipe0_clock < clock && pipe_0 >= 0) && pipe_1 == pipe_ls) + { + insn = sched_emit_insn (gen_iprefetch (GEN_INT (3))); + recog_memoized (insn); + if (pipe0_clock < clock) + PUT_MODE (insn, TImode); + spu_sched_variable_issue (file, verbose, insn, -1); + return 0; + } + + /* In general, we want to emit nops to increase dual issue, but dual + issue isn't faster when one of the insns could be scheduled later + without effecting the critical path. We look at INSN_PRIORITY to + make a good guess, but it isn't perfect so -mdual-nops=n can be + used to effect it. */ + if (in_spu_reorg && spu_dual_nops < 10) + { + /* When we are at an even address and we are not issueing nops to + improve scheduling then we need to advance the cycle. */ + if ((spu_sched_length & 7) == 0 && prev_clock_var == clock + && (spu_dual_nops == 0 + || (pipe_1 != -1 + && prev_priority > + INSN_PRIORITY (ready[pipe_1]) + spu_dual_nops))) + return 0; + + /* When at an odd address, schedule the highest priority insn + without considering pipeline. */ + if ((spu_sched_length & 7) == 4 && prev_clock_var != clock + && (spu_dual_nops == 0 + || (prev_priority > + INSN_PRIORITY (ready[nready - 1]) + spu_dual_nops))) + return 1; + } + + + /* We haven't issued a pipe0 insn yet this cycle, if there is a + pipe0 insn in the ready list, schedule it. */ + if (pipe0_clock < clock && pipe_0 >= 0) + schedule_i = pipe_0; + + /* Either we've scheduled a pipe0 insn already or there is no pipe0 + insn to schedule. Put a pipe1 insn at the front of the ready list. */ + else + schedule_i = pipe_1; + + if (schedule_i > -1) + { + insn = ready[schedule_i]; + ready[schedule_i] = ready[nready - 1]; + ready[nready - 1] = insn; + return 1; + } + return 0; } /* INSN is dependent on DEP_INSN. */ static int -spu_sched_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED, - rtx dep_insn ATTRIBUTE_UNUSED, int cost) +spu_sched_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) { - if (GET_CODE (insn) == CALL_INSN) + rtx set; + + /* The blockage pattern is used to prevent instructions from being + moved across it and has no cost. */ + if (INSN_CODE (insn) == CODE_FOR_blockage + || INSN_CODE (dep_insn) == CODE_FOR_blockage) + return 0; + + if (INSN_CODE (insn) == CODE_FOR__spu_convert + || INSN_CODE (dep_insn) == CODE_FOR__spu_convert) + return 0; + + /* Make sure hbrps are spread out. */ + if (INSN_CODE (insn) == CODE_FOR_iprefetch + && INSN_CODE (dep_insn) == CODE_FOR_iprefetch) + return 8; + + /* Make sure hints and hbrps are 2 cycles apart. */ + if ((INSN_CODE (insn) == CODE_FOR_iprefetch + || INSN_CODE (insn) == CODE_FOR_hbr) + && (INSN_CODE (dep_insn) == CODE_FOR_iprefetch + || INSN_CODE (dep_insn) == CODE_FOR_hbr)) + return 2; + + /* An hbrp has no real dependency on other insns. */ + if (INSN_CODE (insn) == CODE_FOR_iprefetch + || INSN_CODE (dep_insn) == CODE_FOR_iprefetch) + return 0; + + /* Assuming that it is unlikely an argument register will be used in + the first cycle of the called function, we reduce the cost for + slightly better scheduling of dep_insn. When not hinted, the + mispredicted branch would hide the cost as well. */ + if (CALL_P (insn)) + { + rtx target = get_branch_target (insn); + if (GET_CODE (target) != REG || !set_of (target, insn)) + return cost - 2; + return cost; + } + + /* And when returning from a function, let's assume the return values + are completed sooner too. */ + if (CALL_P (dep_insn)) return cost - 2; + + /* Make sure an instruction that loads from the back chain is schedule + away from the return instruction so a hint is more likely to get + issued. */ + if (INSN_CODE (insn) == CODE_FOR__return + && (set = single_set (dep_insn)) + && GET_CODE (SET_DEST (set)) == REG + && REGNO (SET_DEST (set)) == LINK_REGISTER_REGNUM) + return 20; + /* The dfa scheduler sets cost to 0 for all anti-dependencies and the scheduler makes every insn in a block anti-dependent on the final jump_insn. We adjust here so higher cost insns will get scheduled earlier. */ - if (GET_CODE (insn) == JUMP_INSN && REG_NOTE_KIND (link) == REG_DEP_ANTI) + if (JUMP_P (insn) && REG_NOTE_KIND (link) == REG_DEP_ANTI) return insn_cost (dep_insn) - 3; + return cost; } @@ -5660,3 +6255,17 @@ spu_libgcc_shift_count_mode (void) for shift counts. */ return SImode; } + +/* An early place to adjust some flags after GCC has finished processing + * them. */ +static void +asm_file_start (void) +{ + /* Variable tracking should be run after all optimizations which + change order of insns. It also needs a valid CFG. */ + spu_flag_var_tracking = flag_var_tracking; + flag_var_tracking = 0; + + default_file_start (); +} + diff --git a/gcc/config/spu/spu.h b/gcc/config/spu/spu.h index 9839822885e..578c7427cf4 100644 --- a/gcc/config/spu/spu.h +++ b/gcc/config/spu/spu.h @@ -50,7 +50,8 @@ extern GTY(()) int spu_tune; /* Default target_flags if no switches specified. */ #ifndef TARGET_DEFAULT -#define TARGET_DEFAULT (MASK_ERROR_RELOC | MASK_SAFE_DMA | MASK_BRANCH_HINTS) +#define TARGET_DEFAULT (MASK_ERROR_RELOC | MASK_SAFE_DMA | MASK_BRANCH_HINTS \ + | MASK_SAFE_HINTS) #endif diff --git a/gcc/config/spu/spu.md b/gcc/config/spu/spu.md index 7b4b743fc8d..d1fa6f03508 100644 --- a/gcc/config/spu/spu.md +++ b/gcc/config/spu/spu.md @@ -4200,12 +4200,23 @@ selb\t%0,%4,%0,%3" "lnop" [(set_attr "type" "lnop")]) +;; The operand is so we know why we generated this hbrp. +;; We clobber mem to make sure it isn't moved over any +;; loads, stores or calls while scheduling. (define_insn "iprefetch" - [(unspec [(const_int 0)] UNSPEC_IPREFETCH)] + [(unspec [(match_operand:SI 0 "const_int_operand" "n")] UNSPEC_IPREFETCH) + (clobber (mem:BLK (scratch)))] "" - "hbrp" + "hbrp\t# %0" [(set_attr "type" "iprefetch")]) +;; A non-volatile version so it gets scheduled +(define_insn "nopn_nv" + [(unspec [(match_operand:SI 0 "register_operand" "r")] UNSPEC_NOP)] + "" + "nop\t%0" + [(set_attr "type" "nop")]) + (define_insn "hbr" [(set (reg:SI 130) (unspec:SI [(match_operand:SI 0 "immediate_operand" "i,i,i") diff --git a/gcc/config/spu/spu.opt b/gcc/config/spu/spu.opt index 9957e1ec7fb..9cd63a14aea 100644 --- a/gcc/config/spu/spu.opt +++ b/gcc/config/spu/spu.opt @@ -35,6 +35,14 @@ munsafe-dma Target Report RejectNegative InverseMask(SAFE_DMA) volatile must be specified on any memory that is effected by DMA +mdual-nops +Target Report Var(spu_dual_nops,10) Init(10) +Insert nops when it might improve performance by allowing dual issue (default) + +mdual-nops= +Target RejectNegative Joined UInteger Var(spu_dual_nops) +Insert nops when it might improve performance by allowing dual issue (default) + mstdmain Target Report Mask(STD_MAIN) Use standard main function as entry for startup @@ -43,6 +51,14 @@ mbranch-hints Target Report Mask(BRANCH_HINTS) Generate branch hints for branches +mhint-max-nops= +Target RejectNegative Joined UInteger Var(spu_max_nops) Init(2) +Maximum number of nops to insert for a hint (Default 2) + +mhint-max-distance= +Target RejectNegative Joined Var(spu_max_distance_str) +Approximate maximum number of instructions to allow between a hint and its branch [125] + msmall-mem Target Report RejectNegative InverseMask(LARGE_MEM) Generate code for 18 bit addressing @@ -55,6 +71,10 @@ mfixed-range= Target RejectNegative Joined Var(spu_fixed_range_string) Specify range of registers to make fixed +msafe-hints +Target Report Mask(SAFE_HINTS) +Insert hbrp instructions after hinted branch targets to avoid the SPU hang issue + march= Target RejectNegative Joined Var(spu_arch_string) Generate code for given CPU diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 2b2ebc85aef..b5bd719d659 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -14797,6 +14797,33 @@ useful when compiling kernel code. A register range is specified as two registers separated by a dash. Multiple register ranges can be specified separated by a comma. +@item -mdual-nops +@itemx -mdual-nops=@var{n} +@opindex mdual-nops +By default, GCC will insert nops to increase dual issue when it expects +it to increase performance. @var{n} can be a value from 0 to 10. A +smaller @var{n} will insert fewer nops. 10 is the default, 0 is the +same as @option{-mno-dual-nops}. Disabled with @option{-Os}. + +@item -mhint-max-nops=@var{n} +@opindex mhint-max-nops +Maximum number of nops to insert for a branch hint. A branch hint must +be at least 8 instructions away from the branch it is effecting. GCC +will insert up to @var{n} nops to enforce this, otherwise it will not +generate the branch hint. + +@item -mhint-max-distance=@var{n} +@opindex mhint-max-distance +The encoding of the branch hint instruction limits the hint to be within +256 instructions of the branch it is effecting. By default, GCC makes +sure it is within 125. + +@item -msafe-hints +@opindex msafe-hints +Work around a hardware bug which causes the SPU to stall indefinitely. +By default, GCC will insert the @code{hbrp} instruction to make sure +this stall won't happen. + @end table @node System V Options diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c index 2b5130c66b9..833e1552b31 100644 --- a/gcc/haifa-sched.c +++ b/gcc/haifa-sched.c @@ -4979,4 +4979,15 @@ sched_create_empty_bb_1 (basic_block after) return create_empty_bb (after); } +/* Insert PAT as an INSN into the schedule and update the necessary data + structures to account for it. */ +rtx +sched_emit_insn (rtx pat) +{ + rtx insn = emit_insn_after (pat, last_scheduled_insn); + last_scheduled_insn = insn; + haifa_init_insn (insn); + return insn; +} + #endif /* INSN_SCHEDULING */ diff --git a/gcc/sched-int.h b/gcc/sched-int.h index d358650e8fe..7fd3b5526a2 100644 --- a/gcc/sched-int.h +++ b/gcc/sched-int.h @@ -1147,6 +1147,7 @@ extern void unlink_bb_notes (basic_block, basic_block); extern void add_block (basic_block, basic_block); extern rtx bb_note (basic_block); extern void concat_note_lists (rtx, rtx *); +extern rtx sched_emit_insn (rtx); /* Types and functions in sched-rgn.c. */ |