summaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authortsmigiel <tsmigiel@138bc75d-0d04-0410-961f-82ee72b054a4>2008-09-05 22:12:24 +0000
committertsmigiel <tsmigiel@138bc75d-0d04-0410-961f-82ee72b054a4>2008-09-05 22:12:24 +0000
commit5a976006131fae932edbe2f9732367a512373218 (patch)
treec37a03103b3833c1eaba6a9dccfbcb1a8045587e /gcc
parent5422d457a256845828efe38f13789e8dd618d333 (diff)
downloadgcc-5a976006131fae932edbe2f9732367a512373218.tar.gz
Improved branch hints, safe hints, and scheduling.
* haifa-sched.c (sched_emit_insn) : Define. * sched-int.h (sched_emit_insn) : Add prototype. * doc/invoke.texi (-mdual-nops, -mhint-max-nops, -mhint-max-distance -msafe-hints) : Document. * config/spu/spu.c (spu_flag_var_tracking): New. (TARGET_SCHED_INIT_GLOBAL, TARGET_SCHED_INIT, TARGET_SCHED_REORDER, TARGET_SCHED_REORDER2, TARGET_ASM_FILE_START): Define. (TARGET_SCHED_ADJUST_PRIORITY): Remove. (STOP_HINT_P, HINTED_P, SCHED_ON_EVEN_P): Define. (spu_emit_branch_hint): Add blocks argument. (insert_branch_hints, insert_nops): Remove. (pad_bb, insert_hbrp_for_ilb_runout, insert_hbrp, in_spu_reorg, uses_ls_unit, spu_sched_init_global, spu_sched_init, spu_sched_reorder, asm_file_start): New functions. (clock_var, spu_sched_length, pipe0_clock, pipe1_clock, prev_clock_var, prev_priority, spu_ls_first, prev_ls_clock): New static variables. * config/spu/spu.h (TARGET_DEFAULT): Add MASK_SAFE_HINTS. * config/spu.md (iprefetch): Add operand, make it clobber MEM. (nopn_nv): Add a non-volatile version of nop. * config/spu/spu.opt (-mdual-nops, -mhint-max-nops, -mhint-max-distance, -msafe-hints): New options. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@140047 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog28
-rw-r--r--gcc/config/spu/spu.c1165
-rw-r--r--gcc/config/spu/spu.h3
-rw-r--r--gcc/config/spu/spu.md15
-rw-r--r--gcc/config/spu/spu.opt20
-rw-r--r--gcc/doc/invoke.texi27
-rw-r--r--gcc/haifa-sched.c11
-rw-r--r--gcc/sched-int.h1
8 files changed, 989 insertions, 281 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 11743b4bc44..e1e7df6801b 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,31 @@
+2008-09-05 Trevor Smigiel <Trevor_Smigiel@playstation.sony.com>
+
+ Improved branch hints, safe hints, and scheduling.
+
+ * haifa-sched.c (sched_emit_insn) : Define.
+ * sched-int.h (sched_emit_insn) : Add prototype.
+ * doc/invoke.texi (-mdual-nops, -mhint-max-nops,
+ -mhint-max-distance -msafe-hints) : Document.
+ * config/spu/spu.c (spu_flag_var_tracking): New.
+ (TARGET_SCHED_INIT_GLOBAL, TARGET_SCHED_INIT,
+ TARGET_SCHED_REORDER, TARGET_SCHED_REORDER2,
+ TARGET_ASM_FILE_START): Define.
+ (TARGET_SCHED_ADJUST_PRIORITY): Remove.
+ (STOP_HINT_P, HINTED_P, SCHED_ON_EVEN_P): Define.
+ (spu_emit_branch_hint): Add blocks argument.
+ (insert_branch_hints, insert_nops): Remove.
+ (pad_bb, insert_hbrp_for_ilb_runout, insert_hbrp, in_spu_reorg,
+ uses_ls_unit, spu_sched_init_global, spu_sched_init,
+ spu_sched_reorder, asm_file_start): New functions.
+ (clock_var, spu_sched_length, pipe0_clock,
+ pipe1_clock, prev_clock_var, prev_priority,
+ spu_ls_first, prev_ls_clock): New static variables.
+ * config/spu/spu.h (TARGET_DEFAULT): Add MASK_SAFE_HINTS.
+ * config/spu.md (iprefetch): Add operand, make it clobber MEM.
+ (nopn_nv): Add a non-volatile version of nop.
+ * config/spu/spu.opt (-mdual-nops, -mhint-max-nops,
+ -mhint-max-distance, -msafe-hints): New options.
+
2008-09-05 Janis Johnson <janis187@us.ibm.com>
Samuel Tardieu <sam@rfc1149.net>
diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c
index 0b74a9c18e4..35a04a7a820 100644
--- a/gcc/config/spu/spu.c
+++ b/gcc/config/spu/spu.c
@@ -54,6 +54,9 @@
#include "tm-constrs.h"
#include "spu-builtins.h"
#include "ddg.h"
+#include "sbitmap.h"
+#include "timevar.h"
+#include "df.h"
/* Builtin types, data and prototypes. */
struct spu_builtin_range
@@ -94,19 +97,19 @@ static rtx frame_emit_add_imm (rtx dst, rtx src, HOST_WIDE_INT imm,
static void emit_nop_for_insn (rtx insn);
static bool insn_clobbers_hbr (rtx insn);
static void spu_emit_branch_hint (rtx before, rtx branch, rtx target,
- int distance);
+ int distance, sbitmap blocks);
static rtx spu_emit_vector_compare (enum rtx_code rcode, rtx op0, rtx op1,
enum machine_mode dmode);
static rtx get_branch_target (rtx branch);
-static void insert_branch_hints (void);
-static void insert_nops (void);
static void spu_machine_dependent_reorg (void);
static int spu_sched_issue_rate (void);
static int spu_sched_variable_issue (FILE * dump, int verbose, rtx insn,
int can_issue_more);
static int get_pipe (rtx insn);
-static int spu_sched_adjust_priority (rtx insn, int pri);
static int spu_sched_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost);
+static void spu_sched_init_global (FILE *, int, int);
+static void spu_sched_init (FILE *, int, int);
+static int spu_sched_reorder (FILE *, int, rtx *, int *, int);
static tree spu_handle_fndecl_attribute (tree * node, tree name, tree args,
int flags,
unsigned char *no_add_attrs);
@@ -139,6 +142,7 @@ static int spu_builtin_vectorization_cost (bool);
static bool spu_vector_alignment_reachable (const_tree, bool);
static tree spu_builtin_vec_perm (tree, tree *);
static int spu_sms_res_mii (struct ddg *g);
+static void asm_file_start (void);
extern const char *reg_names[];
rtx spu_compare_op0, spu_compare_op1;
@@ -148,6 +152,18 @@ int spu_arch;
/* Which cpu are we tuning for. */
int spu_tune;
+/* The hardware requires 8 insns between a hint and the branch it
+ effects. This variable describes how many rtl instructions the
+ compiler needs to see before inserting a hint, and then the compiler
+ will insert enough nops to make it at least 8 insns. The default is
+ for the compiler to allow up to 2 nops be emitted. The nops are
+ inserted in pairs, so we round down. */
+int spu_hint_dist = (8*4) - (2*4);
+
+/* Determines whether we run variable tracking in machine dependent
+ reorganization. */
+static int spu_flag_var_tracking;
+
enum spu_immediate {
SPU_NONE,
SPU_IL,
@@ -213,11 +229,20 @@ tree spu_builtin_types[SPU_BTI_MAX];
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE spu_sched_issue_rate
+#undef TARGET_SCHED_INIT_GLOBAL
+#define TARGET_SCHED_INIT_GLOBAL spu_sched_init_global
+
+#undef TARGET_SCHED_INIT
+#define TARGET_SCHED_INIT spu_sched_init
+
#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE spu_sched_variable_issue
-#undef TARGET_SCHED_ADJUST_PRIORITY
-#define TARGET_SCHED_ADJUST_PRIORITY spu_sched_adjust_priority
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER spu_sched_reorder
+
+#undef TARGET_SCHED_REORDER2
+#define TARGET_SCHED_REORDER2 spu_sched_reorder
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST spu_sched_adjust_cost
@@ -301,6 +326,9 @@ const struct attribute_spec spu_attribute_table[];
#undef TARGET_SCHED_SMS_RES_MII
#define TARGET_SCHED_SMS_RES_MII spu_sms_res_mii
+#undef TARGET_ASM_FILE_START
+#define TARGET_ASM_FILE_START asm_file_start
+
struct gcc_target targetm = TARGET_INITIALIZER;
void
@@ -329,9 +357,14 @@ spu_override_options (void)
flag_omit_frame_pointer = 1;
+ /* Functions must be 8 byte aligned so we correctly handle dual issue */
if (align_functions < 8)
align_functions = 8;
+ spu_hint_dist = 8*4 - spu_max_nops*4;
+ if (spu_hint_dist < 0)
+ spu_hint_dist = 0;
+
if (spu_fixed_range_string)
fix_range (spu_fixed_range_string);
@@ -1983,16 +2016,6 @@ spu_const (enum machine_mode mode, HOST_WIDE_INT val)
return gen_rtx_CONST_VECTOR (mode, v);
}
-
-/* branch hint stuff */
-
-/* The hardware requires 8 insns between a hint and the branch it
- effects. This variable describes how many rtl instructions the
- compiler needs to see before inserting a hint. (FIXME: We should
- accept less and insert nops to enforce it because hinting is always
- profitable for performance, but we do need to be careful of code
- size.) */
-int spu_hint_dist = (8 * 4);
/* Create a MODE vector constant from 4 ints. */
rtx
@@ -2017,75 +2040,200 @@ spu_const_from_ints(enum machine_mode mode, int a, int b, int c, int d)
arr[15] = (d >> 0) & 0xff;
return array_to_constant(mode, arr);
}
+
+/* branch hint stuff */
/* An array of these is used to propagate hints to predecessor blocks. */
struct spu_bb_info
{
- rtx prop_jump; /* propagated from another block */
- basic_block bb; /* the original block. */
+ rtx prop_jump; /* propagated from another block */
+ int bb_index; /* the original block. */
};
+static struct spu_bb_info *spu_bb_info;
-/* The special $hbr register is used to prevent the insn scheduler from
- moving hbr insns across instructions which invalidate them. It
- should only be used in a clobber, and this function searches for
- insns which clobber it. */
-static bool
-insn_clobbers_hbr (rtx insn)
+#define STOP_HINT_P(INSN) \
+ (GET_CODE(INSN) == CALL_INSN \
+ || INSN_CODE(INSN) == CODE_FOR_divmodsi4 \
+ || INSN_CODE(INSN) == CODE_FOR_udivmodsi4)
+
+/* 1 when RTX is a hinted branch or its target. We keep track of
+ what has been hinted so the safe-hint code can test it easily. */
+#define HINTED_P(RTX) \
+ (RTL_FLAG_CHECK3("HINTED_P", (RTX), CODE_LABEL, JUMP_INSN, CALL_INSN)->unchanging)
+
+/* 1 when RTX is an insn that must be scheduled on an even boundary. */
+#define SCHED_ON_EVEN_P(RTX) \
+ (RTL_FLAG_CHECK2("SCHED_ON_EVEN_P", (RTX), JUMP_INSN, CALL_INSN)->in_struct)
+
+/* Emit a nop for INSN such that the two will dual issue. This assumes
+ INSN is 8-byte aligned. When INSN is inline asm we emit an lnop.
+ We check for TImode to handle a MULTI1 insn which has dual issued its
+ first instruction. get_pipe returns -1 for MULTI0, inline asm, or
+ ADDR_VEC insns. */
+static void
+emit_nop_for_insn (rtx insn)
{
- if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == PARALLEL)
+ int p;
+ rtx new_insn;
+ p = get_pipe (insn);
+ if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
+ new_insn = emit_insn_after (gen_lnop (), insn);
+ else if (p == 1 && GET_MODE (insn) == TImode)
{
- rtx parallel = PATTERN (insn);
- rtx clobber;
- int j;
- for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--)
+ new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn);
+ PUT_MODE (new_insn, TImode);
+ PUT_MODE (insn, VOIDmode);
+ }
+ else
+ new_insn = emit_insn_after (gen_lnop (), insn);
+ recog_memoized (new_insn);
+}
+
+/* Insert nops in basic blocks to meet dual issue alignment
+ requirements. Also make sure hbrp and hint instructions are at least
+ one cycle apart, possibly inserting a nop. */
+static void
+pad_bb(void)
+{
+ rtx insn, next_insn, prev_insn, hbr_insn = 0;
+ int length;
+ int addr;
+
+ /* This sets up INSN_ADDRESSES. */
+ shorten_branches (get_insns ());
+
+ /* Keep track of length added by nops. */
+ length = 0;
+
+ prev_insn = 0;
+ insn = get_insns ();
+ if (!active_insn_p (insn))
+ insn = next_active_insn (insn);
+ for (; insn; insn = next_insn)
+ {
+ next_insn = next_active_insn (insn);
+ if (INSN_CODE (insn) == CODE_FOR_iprefetch
+ || INSN_CODE (insn) == CODE_FOR_hbr)
{
- clobber = XVECEXP (parallel, 0, j);
- if (GET_CODE (clobber) == CLOBBER
- && GET_CODE (XEXP (clobber, 0)) == REG
- && REGNO (XEXP (clobber, 0)) == HBR_REGNUM)
- return 1;
+ if (hbr_insn)
+ {
+ int a0 = INSN_ADDRESSES (INSN_UID (hbr_insn));
+ int a1 = INSN_ADDRESSES (INSN_UID (insn));
+ if ((a1 - a0 == 8 && GET_MODE (insn) != TImode)
+ || (a1 - a0 == 4))
+ {
+ prev_insn = emit_insn_before (gen_lnop (), insn);
+ PUT_MODE (prev_insn, GET_MODE (insn));
+ PUT_MODE (insn, TImode);
+ length += 4;
+ }
+ }
+ hbr_insn = insn;
+ }
+ if (INSN_CODE (insn) == CODE_FOR_blockage)
+ {
+ if (GET_MODE (insn) == TImode)
+ PUT_MODE (next_insn, TImode);
+ insn = next_insn;
+ next_insn = next_active_insn (insn);
+ }
+ addr = INSN_ADDRESSES (INSN_UID (insn));
+ if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
+ {
+ if (((addr + length) & 7) != 0)
+ {
+ emit_nop_for_insn (prev_insn);
+ length += 4;
+ }
}
+ else if (GET_MODE (insn) == TImode
+ && ((next_insn && GET_MODE (next_insn) != TImode)
+ || get_attr_type (insn) == TYPE_MULTI0)
+ && ((addr + length) & 7) != 0)
+ {
+ /* prev_insn will always be set because the first insn is
+ always 8-byte aligned. */
+ emit_nop_for_insn (prev_insn);
+ length += 4;
+ }
+ prev_insn = insn;
}
- return 0;
}
+
+/* Routines for branch hints. */
+
static void
-spu_emit_branch_hint (rtx before, rtx branch, rtx target, int distance)
+spu_emit_branch_hint (rtx before, rtx branch, rtx target,
+ int distance, sbitmap blocks)
{
- rtx branch_label;
- rtx hint, insn, prev, next;
+ rtx branch_label = 0;
+ rtx hint;
+ rtx insn;
+ rtx table;
if (before == 0 || branch == 0 || target == 0)
return;
+ /* While scheduling we require hints to be no further than 600, so
+ we need to enforce that here too */
if (distance > 600)
return;
+ /* If we have a Basic block note, emit it after the basic block note. */
+ if (NOTE_KIND (before) == NOTE_INSN_BASIC_BLOCK)
+ before = NEXT_INSN (before);
branch_label = gen_label_rtx ();
LABEL_NUSES (branch_label)++;
LABEL_PRESERVE_P (branch_label) = 1;
insn = emit_label_before (branch_label, branch);
branch_label = gen_rtx_LABEL_REF (VOIDmode, branch_label);
+ SET_BIT (blocks, BLOCK_FOR_INSN (branch)->index);
+
+ hint = emit_insn_before (gen_hbr (branch_label, target), before);
+ recog_memoized (hint);
+ HINTED_P (branch) = 1;
- /* If the previous insn is pipe0, make the hbr dual issue with it. If
- the current insn is pipe0, dual issue with it. */
- prev = prev_active_insn (before);
- if (prev && get_pipe (prev) == 0)
- hint = emit_insn_before (gen_hbr (branch_label, target), before);
- else if (get_pipe (before) == 0 && distance > spu_hint_dist)
+ if (GET_CODE (target) == LABEL_REF)
+ HINTED_P (XEXP (target, 0)) = 1;
+ else if (tablejump_p (branch, 0, &table))
{
- next = next_active_insn (before);
- hint = emit_insn_after (gen_hbr (branch_label, target), before);
- if (next)
- PUT_MODE (next, TImode);
+ rtvec vec;
+ int j;
+ if (GET_CODE (PATTERN (table)) == ADDR_VEC)
+ vec = XVEC (PATTERN (table), 0);
+ else
+ vec = XVEC (PATTERN (table), 1);
+ for (j = GET_NUM_ELEM (vec) - 1; j >= 0; --j)
+ HINTED_P (XEXP (RTVEC_ELT (vec, j), 0)) = 1;
}
- else
+
+ if (distance >= 588)
{
- hint = emit_insn_before (gen_hbr (branch_label, target), before);
- PUT_MODE (hint, TImode);
+ /* Make sure the hint isn't scheduled any earlier than this point,
+ which could make it too far for the branch offest to fit */
+ recog_memoized (emit_insn_before (gen_blockage (), hint));
+ }
+ else if (distance <= 8 * 4)
+ {
+ /* To guarantee at least 8 insns between the hint and branch we
+ insert nops. */
+ int d;
+ for (d = distance; d < 8 * 4; d += 4)
+ {
+ insn =
+ emit_insn_after (gen_nopn_nv (gen_rtx_REG (SImode, 127)), hint);
+ recog_memoized (insn);
+ }
+
+ /* Make sure any nops inserted aren't scheduled before the hint. */
+ recog_memoized (emit_insn_after (gen_blockage (), hint));
+
+ /* Make sure any nops inserted aren't scheduled after the call. */
+ if (CALL_P (branch) && distance < 8 * 4)
+ recog_memoized (emit_insn_before (gen_blockage (), branch));
}
- recog_memoized (hint);
}
/* Returns 0 if we don't want a hint for this branch. Otherwise return
@@ -2155,245 +2303,403 @@ get_branch_target (rtx branch)
return 0;
}
+/* The special $hbr register is used to prevent the insn scheduler from
+ moving hbr insns across instructions which invalidate them. It
+ should only be used in a clobber, and this function searches for
+ insns which clobber it. */
+static bool
+insn_clobbers_hbr (rtx insn)
+{
+ if (INSN_P (insn)
+ && GET_CODE (PATTERN (insn)) == PARALLEL)
+ {
+ rtx parallel = PATTERN (insn);
+ rtx clobber;
+ int j;
+ for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--)
+ {
+ clobber = XVECEXP (parallel, 0, j);
+ if (GET_CODE (clobber) == CLOBBER
+ && GET_CODE (XEXP (clobber, 0)) == REG
+ && REGNO (XEXP (clobber, 0)) == HBR_REGNUM)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* Search up to 32 insns starting at FIRST:
+ - at any kind of hinted branch, just return
+ - at any unconditional branch in the first 15 insns, just return
+ - at a call or indirect branch, after the first 15 insns, force it to
+ an even address and return
+ - at any unconditional branch, after the first 15 insns, force it to
+ an even address.
+ At then end of the search, insert an hbrp within 4 insns of FIRST,
+ and an hbrp within 16 instructions of FIRST.
+ */
static void
-insert_branch_hints (void)
+insert_hbrp_for_ilb_runout (rtx first)
{
- struct spu_bb_info *spu_bb_info;
- rtx branch, insn, next;
- rtx branch_target = 0;
- int branch_addr = 0, insn_addr, head_addr;
- basic_block bb;
- unsigned int j;
+ rtx insn, before_4 = 0, before_16 = 0;
+ int addr = 0, length, first_addr = -1;
+ int hbrp_addr0 = 128 * 4, hbrp_addr1 = 128 * 4;
+ int insert_lnop_after = 0;
+ for (insn = first; insn; insn = NEXT_INSN (insn))
+ if (INSN_P (insn))
+ {
+ if (first_addr == -1)
+ first_addr = INSN_ADDRESSES (INSN_UID (insn));
+ addr = INSN_ADDRESSES (INSN_UID (insn)) - first_addr;
+ length = get_attr_length (insn);
+
+ if (before_4 == 0 && addr + length >= 4 * 4)
+ before_4 = insn;
+ /* We test for 14 instructions because the first hbrp will add
+ up to 2 instructions. */
+ if (before_16 == 0 && addr + length >= 14 * 4)
+ before_16 = insn;
+
+ if (INSN_CODE (insn) == CODE_FOR_hbr)
+ {
+ /* Make sure an hbrp is at least 2 cycles away from a hint.
+ Insert an lnop after the hbrp when necessary. */
+ if (before_4 == 0 && addr > 0)
+ {
+ before_4 = insn;
+ insert_lnop_after |= 1;
+ }
+ else if (before_4 && addr <= 4 * 4)
+ insert_lnop_after |= 1;
+ if (before_16 == 0 && addr > 10 * 4)
+ {
+ before_16 = insn;
+ insert_lnop_after |= 2;
+ }
+ else if (before_16 && addr <= 14 * 4)
+ insert_lnop_after |= 2;
+ }
- spu_bb_info =
- (struct spu_bb_info *) xcalloc (last_basic_block + 1,
- sizeof (struct spu_bb_info));
+ if (INSN_CODE (insn) == CODE_FOR_iprefetch)
+ {
+ if (addr < hbrp_addr0)
+ hbrp_addr0 = addr;
+ else if (addr < hbrp_addr1)
+ hbrp_addr1 = addr;
+ }
- /* We need exact insn addresses and lengths. */
- shorten_branches (get_insns ());
+ if (CALL_P (insn) || JUMP_P (insn))
+ {
+ if (HINTED_P (insn))
+ return;
+
+ /* Any branch after the first 15 insns should be on an even
+ address to avoid a special case branch. There might be
+ some nops and/or hbrps inserted, so we test after 10
+ insns. */
+ if (addr > 10 * 4)
+ SCHED_ON_EVEN_P (insn) = 1;
+ }
- FOR_EACH_BB_REVERSE (bb)
- {
- head_addr = INSN_ADDRESSES (INSN_UID (BB_HEAD (bb)));
- branch = 0;
- if (spu_bb_info[bb->index].prop_jump)
- {
- branch = spu_bb_info[bb->index].prop_jump;
- branch_target = get_branch_target (branch);
- branch_addr = INSN_ADDRESSES (INSN_UID (branch));
- }
- /* Search from end of a block to beginning. In this loop, find
- jumps which need a branch and emit them only when:
- - it's an indirect branch and we're at the insn which sets
- the register
- - we're at an insn that will invalidate the hint. e.g., a
- call, another hint insn, inline asm that clobbers $hbr, and
- some inlined operations (divmodsi4). Don't consider jumps
- because they are only at the end of a block and are
- considered when we are deciding whether to propagate
- - we're getting too far away from the branch. The hbr insns
- only have a signed 10-bit offset
- We go back as far as possible so the branch will be considered
- for propagation when we get to the beginning of the block. */
- next = 0;
- for (insn = BB_END (bb); insn; insn = PREV_INSN (insn))
- {
- if (INSN_P (insn))
+ if (CALL_P (insn) || tablejump_p (insn, 0, 0))
+ return;
+
+
+ if (addr + length >= 32 * 4)
{
- insn_addr = INSN_ADDRESSES (INSN_UID (insn));
- if (branch && next
- && ((GET_CODE (branch_target) == REG
- && set_of (branch_target, insn) != NULL_RTX)
- || insn_clobbers_hbr (insn)
- || branch_addr - insn_addr > 600))
+ gcc_assert (before_4 && before_16);
+ if (hbrp_addr0 > 4 * 4)
{
- int next_addr = INSN_ADDRESSES (INSN_UID (next));
- if (insn != BB_END (bb)
- && branch_addr - next_addr >= spu_hint_dist)
+ insn =
+ emit_insn_before (gen_iprefetch (GEN_INT (1)), before_4);
+ recog_memoized (insn);
+ INSN_ADDRESSES_NEW (insn,
+ INSN_ADDRESSES (INSN_UID (before_4)));
+ PUT_MODE (insn, GET_MODE (before_4));
+ PUT_MODE (before_4, TImode);
+ if (insert_lnop_after & 1)
{
- if (dump_file)
- fprintf (dump_file,
- "hint for %i in block %i before %i\n",
- INSN_UID (branch), bb->index, INSN_UID (next));
- spu_emit_branch_hint (next, branch, branch_target,
- branch_addr - next_addr);
+ insn = emit_insn_before (gen_lnop (), before_4);
+ recog_memoized (insn);
+ INSN_ADDRESSES_NEW (insn,
+ INSN_ADDRESSES (INSN_UID (before_4)));
+ PUT_MODE (insn, TImode);
}
- branch = 0;
}
-
- /* JUMP_P will only be true at the end of a block. When
- branch is already set it means we've previously decided
- to propagate a hint for that branch into this block. */
- if (CALL_P (insn) || (JUMP_P (insn) && !branch))
+ if ((hbrp_addr0 <= 4 * 4 || hbrp_addr0 > 16 * 4)
+ && hbrp_addr1 > 16 * 4)
{
- branch = 0;
- if ((branch_target = get_branch_target (insn)))
+ insn =
+ emit_insn_before (gen_iprefetch (GEN_INT (2)), before_16);
+ recog_memoized (insn);
+ INSN_ADDRESSES_NEW (insn,
+ INSN_ADDRESSES (INSN_UID (before_16)));
+ PUT_MODE (insn, GET_MODE (before_16));
+ PUT_MODE (before_16, TImode);
+ if (insert_lnop_after & 2)
{
- branch = insn;
- branch_addr = insn_addr;
+ insn = emit_insn_before (gen_lnop (), before_16);
+ recog_memoized (insn);
+ INSN_ADDRESSES_NEW (insn,
+ INSN_ADDRESSES (INSN_UID
+ (before_16)));
+ PUT_MODE (insn, TImode);
}
}
-
- /* When a branch hint is emitted it will be inserted
- before "next". Make sure next is the beginning of a
- cycle to minimize impact on the scheduled insns. */
- if (GET_MODE (insn) == TImode)
- next = insn;
+ return;
}
- if (insn == BB_HEAD (bb))
- break;
}
+ else if (BARRIER_P (insn))
+ return;
- if (branch)
- {
- /* If we haven't emitted a hint for this branch yet, it might
- be profitable to emit it in one of the predecessor blocks,
- especially for loops. */
- rtx bbend;
- basic_block prev = 0, prop = 0, prev2 = 0;
- int loop_exit = 0, simple_loop = 0;
- int next_addr = 0;
- if (next)
- next_addr = INSN_ADDRESSES (INSN_UID (next));
-
- for (j = 0; j < EDGE_COUNT (bb->preds); j++)
- if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU)
- prev = EDGE_PRED (bb, j)->src;
- else
- prev2 = EDGE_PRED (bb, j)->src;
-
- for (j = 0; j < EDGE_COUNT (bb->succs); j++)
- if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT)
- loop_exit = 1;
- else if (EDGE_SUCC (bb, j)->dest == bb)
- simple_loop = 1;
-
- /* If this branch is a loop exit then propagate to previous
- fallthru block. This catches the cases when it is a simple
- loop or when there is an initial branch into the loop. */
- if (prev && loop_exit && prev->loop_depth <= bb->loop_depth)
- prop = prev;
-
- /* If there is only one adjacent predecessor. Don't propagate
- outside this loop. This loop_depth test isn't perfect, but
- I'm not sure the loop_father member is valid at this point. */
- else if (prev && single_pred_p (bb)
- && prev->loop_depth == bb->loop_depth)
- prop = prev;
-
- /* If this is the JOIN block of a simple IF-THEN then
- propagate the hint to the HEADER block. */
- else if (prev && prev2
- && EDGE_COUNT (bb->preds) == 2
- && EDGE_COUNT (prev->preds) == 1
- && EDGE_PRED (prev, 0)->src == prev2
- && prev2->loop_depth == bb->loop_depth
- && GET_CODE (branch_target) != REG)
- prop = prev;
-
- /* Don't propagate when:
- - this is a simple loop and the hint would be too far
- - this is not a simple loop and there are 16 insns in
- this block already
- - the predecessor block ends in a branch that will be
- hinted
- - the predecessor block ends in an insn that invalidates
- the hint */
- if (prop
- && prop->index >= 0
- && (bbend = BB_END (prop))
- && branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) <
- (simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0
- && (JUMP_P (bbend) || !insn_clobbers_hbr (bbend)))
- {
- if (dump_file)
- fprintf (dump_file, "propagate from %i to %i (loop depth %i) "
- "for %i (loop_exit %i simple_loop %i dist %i)\n",
- bb->index, prop->index, bb->loop_depth,
- INSN_UID (branch), loop_exit, simple_loop,
- branch_addr - INSN_ADDRESSES (INSN_UID (bbend)));
-
- spu_bb_info[prop->index].prop_jump = branch;
- spu_bb_info[prop->index].bb = bb;
- }
- else if (next && branch_addr - next_addr >= spu_hint_dist)
- {
- if (dump_file)
- fprintf (dump_file, "hint for %i in block %i before %i\n",
- INSN_UID (branch), bb->index, INSN_UID (next));
- spu_emit_branch_hint (next, branch, branch_target,
- branch_addr - next_addr);
- }
- branch = 0;
- }
- }
- free (spu_bb_info);
}
-
-/* Emit a nop for INSN such that the two will dual issue. This assumes
- INSN is 8-byte aligned. When INSN is inline asm we emit an lnop.
- We check for TImode to handle a MULTI1 insn which has dual issued its
- first instruction. get_pipe returns -1 for MULTI0, inline asm, or
- ADDR_VEC insns. */
+
+/* The SPU might hang when it executes 48 inline instructions after a
+ hinted branch jumps to its hinted target. The beginning of a
+ function and the return from a call might have been hinted, and must
+ be handled as well. To prevent a hang we insert 2 hbrps. The first
+ should be within 6 insns of the branch target. The second should be
+ within 22 insns of the branch target. When determining if hbrps are
+ necessary, we look for only 32 inline instructions, because up to to
+ 12 nops and 4 hbrps could be inserted. Similarily, when inserting
+ new hbrps, we insert them within 4 and 16 insns of the target. */
static void
-emit_nop_for_insn (rtx insn)
+insert_hbrp (void)
{
- int p;
- rtx new_insn;
- p = get_pipe (insn);
- if (p == 1 && GET_MODE (insn) == TImode)
+ rtx insn;
+ if (TARGET_SAFE_HINTS)
{
- new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn);
- PUT_MODE (new_insn, TImode);
- PUT_MODE (insn, VOIDmode);
+ shorten_branches (get_insns ());
+ /* Insert hbrp at beginning of function */
+ insn = next_active_insn (get_insns ());
+ if (insn)
+ insert_hbrp_for_ilb_runout (insn);
+ /* Insert hbrp after hinted targets. */
+ for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+ if ((LABEL_P (insn) && HINTED_P (insn)) || CALL_P (insn))
+ insert_hbrp_for_ilb_runout (next_active_insn (insn));
}
- else
- new_insn = emit_insn_after (gen_lnop (), insn);
}
-/* Insert nops in basic blocks to meet dual issue alignment
- requirements. */
+static int in_spu_reorg;
+
+/* Insert branch hints. There are no branch optimizations after this
+ pass, so it's safe to set our branch hints now. */
static void
-insert_nops (void)
+spu_machine_dependent_reorg (void)
{
- rtx insn, next_insn, prev_insn;
- int length;
- int addr;
+ sbitmap blocks;
+ basic_block bb;
+ rtx branch, insn;
+ rtx branch_target = 0;
+ int branch_addr = 0, insn_addr, required_dist = 0;
+ int i;
+ unsigned int j;
- /* This sets up INSN_ADDRESSES. */
- shorten_branches (get_insns ());
+ if (!TARGET_BRANCH_HINTS || optimize == 0)
+ {
+ /* We still do it for unoptimized code because an external
+ function might have hinted a call or return. */
+ insert_hbrp ();
+ pad_bb ();
+ return;
+ }
- /* Keep track of length added by nops. */
- length = 0;
+ blocks = sbitmap_alloc (last_basic_block);
+ sbitmap_zero (blocks);
- prev_insn = 0;
- for (insn = get_insns (); insn; insn = next_insn)
+ in_spu_reorg = 1;
+ compute_bb_for_insn ();
+
+ compact_blocks ();
+
+ spu_bb_info =
+ (struct spu_bb_info *) xcalloc (n_basic_blocks,
+ sizeof (struct spu_bb_info));
+
+ /* We need exact insn addresses and lengths. */
+ shorten_branches (get_insns ());
+
+ for (i = n_basic_blocks - 1; i >= 0; i--)
{
- next_insn = next_active_insn (insn);
- addr = INSN_ADDRESSES (INSN_UID (insn));
- if (GET_MODE (insn) == TImode
- && next_insn
- && GET_MODE (next_insn) != TImode
- && ((addr + length) & 7) != 0)
+ bb = BASIC_BLOCK (i);
+ branch = 0;
+ if (spu_bb_info[i].prop_jump)
{
- /* prev_insn will always be set because the first insn is
- always 8-byte aligned. */
- emit_nop_for_insn (prev_insn);
- length += 4;
+ branch = spu_bb_info[i].prop_jump;
+ branch_target = get_branch_target (branch);
+ branch_addr = INSN_ADDRESSES (INSN_UID (branch));
+ required_dist = spu_hint_dist;
+ }
+ /* Search from end of a block to beginning. In this loop, find
+ jumps which need a branch and emit them only when:
+ - it's an indirect branch and we're at the insn which sets
+ the register
+ - we're at an insn that will invalidate the hint. e.g., a
+ call, another hint insn, inline asm that clobbers $hbr, and
+ some inlined operations (divmodsi4). Don't consider jumps
+ because they are only at the end of a block and are
+ considered when we are deciding whether to propagate
+ - we're getting too far away from the branch. The hbr insns
+ only have a signed 10 bit offset
+ We go back as far as possible so the branch will be considered
+ for propagation when we get to the beginning of the block. */
+ for (insn = BB_END (bb); insn; insn = PREV_INSN (insn))
+ {
+ if (INSN_P (insn))
+ {
+ insn_addr = INSN_ADDRESSES (INSN_UID (insn));
+ if (branch
+ && ((GET_CODE (branch_target) == REG
+ && set_of (branch_target, insn) != NULL_RTX)
+ || insn_clobbers_hbr (insn)
+ || branch_addr - insn_addr > 600))
+ {
+ rtx next = NEXT_INSN (insn);
+ int next_addr = INSN_ADDRESSES (INSN_UID (next));
+ if (insn != BB_END (bb)
+ && branch_addr - next_addr >= required_dist)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "hint for %i in block %i before %i\n",
+ INSN_UID (branch), bb->index,
+ INSN_UID (next));
+ spu_emit_branch_hint (next, branch, branch_target,
+ branch_addr - next_addr, blocks);
+ }
+ branch = 0;
+ }
+
+ /* JUMP_P will only be true at the end of a block. When
+ branch is already set it means we've previously decided
+ to propagate a hint for that branch into this block. */
+ if (CALL_P (insn) || (JUMP_P (insn) && !branch))
+ {
+ branch = 0;
+ if ((branch_target = get_branch_target (insn)))
+ {
+ branch = insn;
+ branch_addr = insn_addr;
+ required_dist = spu_hint_dist;
+ }
+ }
+ }
+ if (insn == BB_HEAD (bb))
+ break;
+ }
+
+ if (branch)
+ {
+ /* If we haven't emitted a hint for this branch yet, it might
+ be profitable to emit it in one of the predecessor blocks,
+ especially for loops. */
+ rtx bbend;
+ basic_block prev = 0, prop = 0, prev2 = 0;
+ int loop_exit = 0, simple_loop = 0;
+ int next_addr = INSN_ADDRESSES (INSN_UID (NEXT_INSN (insn)));
+
+ for (j = 0; j < EDGE_COUNT (bb->preds); j++)
+ if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU)
+ prev = EDGE_PRED (bb, j)->src;
+ else
+ prev2 = EDGE_PRED (bb, j)->src;
+
+ for (j = 0; j < EDGE_COUNT (bb->succs); j++)
+ if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT)
+ loop_exit = 1;
+ else if (EDGE_SUCC (bb, j)->dest == bb)
+ simple_loop = 1;
+
+ /* If this branch is a loop exit then propagate to previous
+ fallthru block. This catches the cases when it is a simple
+ loop or when there is an initial branch into the loop. */
+ if (prev && (loop_exit || simple_loop)
+ && prev->loop_depth <= bb->loop_depth)
+ prop = prev;
+
+ /* If there is only one adjacent predecessor. Don't propagate
+ outside this loop. This loop_depth test isn't perfect, but
+ I'm not sure the loop_father member is valid at this point. */
+ else if (prev && single_pred_p (bb)
+ && prev->loop_depth == bb->loop_depth)
+ prop = prev;
+
+ /* If this is the JOIN block of a simple IF-THEN then
+ propogate the hint to the HEADER block. */
+ else if (prev && prev2
+ && EDGE_COUNT (bb->preds) == 2
+ && EDGE_COUNT (prev->preds) == 1
+ && EDGE_PRED (prev, 0)->src == prev2
+ && prev2->loop_depth == bb->loop_depth
+ && GET_CODE (branch_target) != REG)
+ prop = prev;
+
+ /* Don't propagate when:
+ - this is a simple loop and the hint would be too far
+ - this is not a simple loop and there are 16 insns in
+ this block already
+ - the predecessor block ends in a branch that will be
+ hinted
+ - the predecessor block ends in an insn that invalidates
+ the hint */
+ if (prop
+ && prop->index >= 0
+ && (bbend = BB_END (prop))
+ && branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) <
+ (simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0
+ && (JUMP_P (bbend) || !insn_clobbers_hbr (bbend)))
+ {
+ if (dump_file)
+ fprintf (dump_file, "propagate from %i to %i (loop depth %i) "
+ "for %i (loop_exit %i simple_loop %i dist %i)\n",
+ bb->index, prop->index, bb->loop_depth,
+ INSN_UID (branch), loop_exit, simple_loop,
+ branch_addr - INSN_ADDRESSES (INSN_UID (bbend)));
+
+ spu_bb_info[prop->index].prop_jump = branch;
+ spu_bb_info[prop->index].bb_index = i;
+ }
+ else if (branch_addr - next_addr >= required_dist)
+ {
+ if (dump_file)
+ fprintf (dump_file, "hint for %i in block %i before %i\n",
+ INSN_UID (branch), bb->index,
+ INSN_UID (NEXT_INSN (insn)));
+ spu_emit_branch_hint (NEXT_INSN (insn), branch, branch_target,
+ branch_addr - next_addr, blocks);
+ }
+ branch = 0;
}
- prev_insn = insn;
}
-}
+ free (spu_bb_info);
-static void
-spu_machine_dependent_reorg (void)
-{
- if (optimize > 0)
+ if (!sbitmap_empty_p (blocks))
+ find_many_sub_basic_blocks (blocks);
+
+ /* We have to schedule to make sure alignment is ok. */
+ FOR_EACH_BB (bb) bb->flags &= ~BB_DISABLE_SCHEDULE;
+
+ /* The hints need to be scheduled, so call it again. */
+ schedule_insns ();
+
+ insert_hbrp ();
+
+ pad_bb ();
+
+
+ if (spu_flag_var_tracking)
{
- if (TARGET_BRANCH_HINTS)
- insert_branch_hints ();
- insert_nops ();
+ df_analyze ();
+ timevar_push (TV_VAR_TRACKING);
+ variable_tracking_main ();
+ timevar_pop (TV_VAR_TRACKING);
+ df_finish_pass (false);
}
+
+ free_bb_for_insn ();
+
+ in_spu_reorg = 0;
}
@@ -2405,15 +2711,14 @@ spu_sched_issue_rate (void)
}
static int
-spu_sched_variable_issue (FILE * dump ATTRIBUTE_UNUSED,
- int verbose ATTRIBUTE_UNUSED, rtx insn,
- int can_issue_more)
+uses_ls_unit(rtx insn)
{
- if (GET_CODE (PATTERN (insn)) != USE
- && GET_CODE (PATTERN (insn)) != CLOBBER
- && get_pipe (insn) != -2)
- can_issue_more--;
- return can_issue_more;
+ rtx set = single_set (insn);
+ if (set != 0
+ && (GET_CODE (SET_DEST (set)) == MEM
+ || GET_CODE (SET_SRC (set)) == MEM))
+ return 1;
+ return 0;
}
static int
@@ -2439,7 +2744,6 @@ get_pipe (rtx insn)
case TYPE_FPD:
case TYPE_FP6:
case TYPE_FP7:
- case TYPE_IPREFETCH:
return 0;
case TYPE_LNOP:
@@ -2449,41 +2753,332 @@ get_pipe (rtx insn)
case TYPE_BR:
case TYPE_MULTI1:
case TYPE_HBR:
+ case TYPE_IPREFETCH:
return 1;
default:
abort ();
}
}
+
+/* haifa-sched.c has a static variable that keeps track of the current
+ cycle. It is passed to spu_sched_reorder, and we record it here for
+ use by spu_sched_variable_issue. It won't be accurate if the
+ scheduler updates it's clock_var between the two calls. */
+static int clock_var;
+
+/* This is used to keep track of insn alignment. Set to 0 at the
+ beginning of each block and increased by the "length" attr of each
+ insn scheduled. */
+static int spu_sched_length;
+
+/* Record when we've issued pipe0 and pipe1 insns so we can reorder the
+ ready list appropriately in spu_sched_reorder(). */
+static int pipe0_clock;
+static int pipe1_clock;
+
+static int prev_clock_var;
+
+static int prev_priority;
+
+/* The SPU needs to load the next ilb sometime during the execution of
+ the previous ilb. There is a potential conflict if every cycle has a
+ load or store. To avoid the conflict we make sure the load/store
+ unit is free for at least one cycle during the execution of insns in
+ the previous ilb. */
+static int spu_ls_first;
+static int prev_ls_clock;
+
+static void
+spu_sched_init_global (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
+ int max_ready ATTRIBUTE_UNUSED)
+{
+ spu_sched_length = 0;
+}
+
+static void
+spu_sched_init (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
+ int max_ready ATTRIBUTE_UNUSED)
+{
+ if (align_labels > 4 || align_loops > 4 || align_jumps > 4)
+ {
+ /* When any block might be at least 8-byte aligned, assume they
+ will all be at least 8-byte aligned to make sure dual issue
+ works out correctly. */
+ spu_sched_length = 0;
+ }
+ spu_ls_first = INT_MAX;
+ clock_var = -1;
+ prev_ls_clock = -1;
+ pipe0_clock = -1;
+ pipe1_clock = -1;
+ prev_clock_var = -1;
+ prev_priority = -1;
+}
+
static int
-spu_sched_adjust_priority (rtx insn, int pri)
+spu_sched_variable_issue (FILE *file ATTRIBUTE_UNUSED,
+ int verbose ATTRIBUTE_UNUSED, rtx insn, int more)
{
- int p = get_pipe (insn);
- /* Schedule UNSPEC_CONVERT's early so they have less effect on
- * scheduling. */
+ int len;
+ int p;
if (GET_CODE (PATTERN (insn)) == USE
|| GET_CODE (PATTERN (insn)) == CLOBBER
- || p == -2)
- return pri + 100;
- /* Schedule pipe0 insns early for greedier dual issue. */
- if (p != 1)
- return pri + 50;
- return pri;
+ || (len = get_attr_length (insn)) == 0)
+ return more;
+
+ spu_sched_length += len;
+
+ /* Reset on inline asm */
+ if (INSN_CODE (insn) == -1)
+ {
+ spu_ls_first = INT_MAX;
+ pipe0_clock = -1;
+ pipe1_clock = -1;
+ return 0;
+ }
+ p = get_pipe (insn);
+ if (p == 0)
+ pipe0_clock = clock_var;
+ else
+ pipe1_clock = clock_var;
+
+ if (in_spu_reorg)
+ {
+ if (clock_var - prev_ls_clock > 1
+ || INSN_CODE (insn) == CODE_FOR_iprefetch)
+ spu_ls_first = INT_MAX;
+ if (uses_ls_unit (insn))
+ {
+ if (spu_ls_first == INT_MAX)
+ spu_ls_first = spu_sched_length;
+ prev_ls_clock = clock_var;
+ }
+
+ /* The scheduler hasn't inserted the nop, but we will later on.
+ Include those nops in spu_sched_length. */
+ if (prev_clock_var == clock_var && (spu_sched_length & 7))
+ spu_sched_length += 4;
+ prev_clock_var = clock_var;
+
+ /* more is -1 when called from spu_sched_reorder for new insns
+ that don't have INSN_PRIORITY */
+ if (more >= 0)
+ prev_priority = INSN_PRIORITY (insn);
+ }
+
+ /* Always try issueing more insns. spu_sched_reorder will decide
+ when the cycle should be advanced. */
+ return 1;
+}
+
+/* This function is called for both TARGET_SCHED_REORDER and
+ TARGET_SCHED_REORDER2. */
+static int
+spu_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
+ rtx *ready, int *nreadyp, int clock)
+{
+ int i, nready = *nreadyp;
+ int pipe_0, pipe_1, pipe_hbrp, pipe_ls, schedule_i;
+ rtx insn;
+
+ clock_var = clock;
+
+ if (nready <= 0 || pipe1_clock >= clock)
+ return 0;
+
+ /* Find any rtl insns that don't generate assembly insns and schedule
+ them first. */
+ for (i = nready - 1; i >= 0; i--)
+ {
+ insn = ready[i];
+ if (INSN_CODE (insn) == -1
+ || INSN_CODE (insn) == CODE_FOR_blockage
+ || INSN_CODE (insn) == CODE_FOR__spu_convert)
+ {
+ ready[i] = ready[nready - 1];
+ ready[nready - 1] = insn;
+ return 1;
+ }
+ }
+
+ pipe_0 = pipe_1 = pipe_hbrp = pipe_ls = schedule_i = -1;
+ for (i = 0; i < nready; i++)
+ if (INSN_CODE (ready[i]) != -1)
+ {
+ insn = ready[i];
+ switch (get_attr_type (insn))
+ {
+ default:
+ case TYPE_MULTI0:
+ case TYPE_CONVERT:
+ case TYPE_FX2:
+ case TYPE_FX3:
+ case TYPE_SPR:
+ case TYPE_NOP:
+ case TYPE_FXB:
+ case TYPE_FPD:
+ case TYPE_FP6:
+ case TYPE_FP7:
+ pipe_0 = i;
+ break;
+ case TYPE_LOAD:
+ case TYPE_STORE:
+ pipe_ls = i;
+ case TYPE_LNOP:
+ case TYPE_SHUF:
+ case TYPE_BR:
+ case TYPE_MULTI1:
+ case TYPE_HBR:
+ pipe_1 = i;
+ break;
+ case TYPE_IPREFETCH:
+ pipe_hbrp = i;
+ break;
+ }
+ }
+
+ /* In the first scheduling phase, schedule loads and stores together
+ to increase the chance they will get merged during postreload CSE. */
+ if (!reload_completed && pipe_ls >= 0)
+ {
+ insn = ready[pipe_ls];
+ ready[pipe_ls] = ready[nready - 1];
+ ready[nready - 1] = insn;
+ return 1;
+ }
+
+ /* If there is an hbrp ready, prefer it over other pipe 1 insns. */
+ if (pipe_hbrp >= 0)
+ pipe_1 = pipe_hbrp;
+
+ /* When we have loads/stores in every cycle of the last 15 insns and
+ we are about to schedule another load/store, emit an hbrp insn
+ instead. */
+ if (in_spu_reorg
+ && spu_sched_length - spu_ls_first >= 4 * 15
+ && !(pipe0_clock < clock && pipe_0 >= 0) && pipe_1 == pipe_ls)
+ {
+ insn = sched_emit_insn (gen_iprefetch (GEN_INT (3)));
+ recog_memoized (insn);
+ if (pipe0_clock < clock)
+ PUT_MODE (insn, TImode);
+ spu_sched_variable_issue (file, verbose, insn, -1);
+ return 0;
+ }
+
+ /* In general, we want to emit nops to increase dual issue, but dual
+ issue isn't faster when one of the insns could be scheduled later
+ without effecting the critical path. We look at INSN_PRIORITY to
+ make a good guess, but it isn't perfect so -mdual-nops=n can be
+ used to effect it. */
+ if (in_spu_reorg && spu_dual_nops < 10)
+ {
+ /* When we are at an even address and we are not issueing nops to
+ improve scheduling then we need to advance the cycle. */
+ if ((spu_sched_length & 7) == 0 && prev_clock_var == clock
+ && (spu_dual_nops == 0
+ || (pipe_1 != -1
+ && prev_priority >
+ INSN_PRIORITY (ready[pipe_1]) + spu_dual_nops)))
+ return 0;
+
+ /* When at an odd address, schedule the highest priority insn
+ without considering pipeline. */
+ if ((spu_sched_length & 7) == 4 && prev_clock_var != clock
+ && (spu_dual_nops == 0
+ || (prev_priority >
+ INSN_PRIORITY (ready[nready - 1]) + spu_dual_nops)))
+ return 1;
+ }
+
+
+ /* We haven't issued a pipe0 insn yet this cycle, if there is a
+ pipe0 insn in the ready list, schedule it. */
+ if (pipe0_clock < clock && pipe_0 >= 0)
+ schedule_i = pipe_0;
+
+ /* Either we've scheduled a pipe0 insn already or there is no pipe0
+ insn to schedule. Put a pipe1 insn at the front of the ready list. */
+ else
+ schedule_i = pipe_1;
+
+ if (schedule_i > -1)
+ {
+ insn = ready[schedule_i];
+ ready[schedule_i] = ready[nready - 1];
+ ready[nready - 1] = insn;
+ return 1;
+ }
+ return 0;
}
/* INSN is dependent on DEP_INSN. */
static int
-spu_sched_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED,
- rtx dep_insn ATTRIBUTE_UNUSED, int cost)
+spu_sched_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
{
- if (GET_CODE (insn) == CALL_INSN)
+ rtx set;
+
+ /* The blockage pattern is used to prevent instructions from being
+ moved across it and has no cost. */
+ if (INSN_CODE (insn) == CODE_FOR_blockage
+ || INSN_CODE (dep_insn) == CODE_FOR_blockage)
+ return 0;
+
+ if (INSN_CODE (insn) == CODE_FOR__spu_convert
+ || INSN_CODE (dep_insn) == CODE_FOR__spu_convert)
+ return 0;
+
+ /* Make sure hbrps are spread out. */
+ if (INSN_CODE (insn) == CODE_FOR_iprefetch
+ && INSN_CODE (dep_insn) == CODE_FOR_iprefetch)
+ return 8;
+
+ /* Make sure hints and hbrps are 2 cycles apart. */
+ if ((INSN_CODE (insn) == CODE_FOR_iprefetch
+ || INSN_CODE (insn) == CODE_FOR_hbr)
+ && (INSN_CODE (dep_insn) == CODE_FOR_iprefetch
+ || INSN_CODE (dep_insn) == CODE_FOR_hbr))
+ return 2;
+
+ /* An hbrp has no real dependency on other insns. */
+ if (INSN_CODE (insn) == CODE_FOR_iprefetch
+ || INSN_CODE (dep_insn) == CODE_FOR_iprefetch)
+ return 0;
+
+ /* Assuming that it is unlikely an argument register will be used in
+ the first cycle of the called function, we reduce the cost for
+ slightly better scheduling of dep_insn. When not hinted, the
+ mispredicted branch would hide the cost as well. */
+ if (CALL_P (insn))
+ {
+ rtx target = get_branch_target (insn);
+ if (GET_CODE (target) != REG || !set_of (target, insn))
+ return cost - 2;
+ return cost;
+ }
+
+ /* And when returning from a function, let's assume the return values
+ are completed sooner too. */
+ if (CALL_P (dep_insn))
return cost - 2;
+
+ /* Make sure an instruction that loads from the back chain is schedule
+ away from the return instruction so a hint is more likely to get
+ issued. */
+ if (INSN_CODE (insn) == CODE_FOR__return
+ && (set = single_set (dep_insn))
+ && GET_CODE (SET_DEST (set)) == REG
+ && REGNO (SET_DEST (set)) == LINK_REGISTER_REGNUM)
+ return 20;
+
/* The dfa scheduler sets cost to 0 for all anti-dependencies and the
scheduler makes every insn in a block anti-dependent on the final
jump_insn. We adjust here so higher cost insns will get scheduled
earlier. */
- if (GET_CODE (insn) == JUMP_INSN && REG_NOTE_KIND (link) == REG_DEP_ANTI)
+ if (JUMP_P (insn) && REG_NOTE_KIND (link) == REG_DEP_ANTI)
return insn_cost (dep_insn) - 3;
+
return cost;
}
@@ -5660,3 +6255,17 @@ spu_libgcc_shift_count_mode (void)
for shift counts. */
return SImode;
}
+
+/* An early place to adjust some flags after GCC has finished processing
+ * them. */
+static void
+asm_file_start (void)
+{
+ /* Variable tracking should be run after all optimizations which
+ change order of insns. It also needs a valid CFG. */
+ spu_flag_var_tracking = flag_var_tracking;
+ flag_var_tracking = 0;
+
+ default_file_start ();
+}
+
diff --git a/gcc/config/spu/spu.h b/gcc/config/spu/spu.h
index 9839822885e..578c7427cf4 100644
--- a/gcc/config/spu/spu.h
+++ b/gcc/config/spu/spu.h
@@ -50,7 +50,8 @@ extern GTY(()) int spu_tune;
/* Default target_flags if no switches specified. */
#ifndef TARGET_DEFAULT
-#define TARGET_DEFAULT (MASK_ERROR_RELOC | MASK_SAFE_DMA | MASK_BRANCH_HINTS)
+#define TARGET_DEFAULT (MASK_ERROR_RELOC | MASK_SAFE_DMA | MASK_BRANCH_HINTS \
+ | MASK_SAFE_HINTS)
#endif
diff --git a/gcc/config/spu/spu.md b/gcc/config/spu/spu.md
index 7b4b743fc8d..d1fa6f03508 100644
--- a/gcc/config/spu/spu.md
+++ b/gcc/config/spu/spu.md
@@ -4200,12 +4200,23 @@ selb\t%0,%4,%0,%3"
"lnop"
[(set_attr "type" "lnop")])
+;; The operand is so we know why we generated this hbrp.
+;; We clobber mem to make sure it isn't moved over any
+;; loads, stores or calls while scheduling.
(define_insn "iprefetch"
- [(unspec [(const_int 0)] UNSPEC_IPREFETCH)]
+ [(unspec [(match_operand:SI 0 "const_int_operand" "n")] UNSPEC_IPREFETCH)
+ (clobber (mem:BLK (scratch)))]
""
- "hbrp"
+ "hbrp\t# %0"
[(set_attr "type" "iprefetch")])
+;; A non-volatile version so it gets scheduled
+(define_insn "nopn_nv"
+ [(unspec [(match_operand:SI 0 "register_operand" "r")] UNSPEC_NOP)]
+ ""
+ "nop\t%0"
+ [(set_attr "type" "nop")])
+
(define_insn "hbr"
[(set (reg:SI 130)
(unspec:SI [(match_operand:SI 0 "immediate_operand" "i,i,i")
diff --git a/gcc/config/spu/spu.opt b/gcc/config/spu/spu.opt
index 9957e1ec7fb..9cd63a14aea 100644
--- a/gcc/config/spu/spu.opt
+++ b/gcc/config/spu/spu.opt
@@ -35,6 +35,14 @@ munsafe-dma
Target Report RejectNegative InverseMask(SAFE_DMA)
volatile must be specified on any memory that is effected by DMA
+mdual-nops
+Target Report Var(spu_dual_nops,10) Init(10)
+Insert nops when it might improve performance by allowing dual issue (default)
+
+mdual-nops=
+Target RejectNegative Joined UInteger Var(spu_dual_nops)
+Insert nops when it might improve performance by allowing dual issue (default)
+
mstdmain
Target Report Mask(STD_MAIN)
Use standard main function as entry for startup
@@ -43,6 +51,14 @@ mbranch-hints
Target Report Mask(BRANCH_HINTS)
Generate branch hints for branches
+mhint-max-nops=
+Target RejectNegative Joined UInteger Var(spu_max_nops) Init(2)
+Maximum number of nops to insert for a hint (Default 2)
+
+mhint-max-distance=
+Target RejectNegative Joined Var(spu_max_distance_str)
+Approximate maximum number of instructions to allow between a hint and its branch [125]
+
msmall-mem
Target Report RejectNegative InverseMask(LARGE_MEM)
Generate code for 18 bit addressing
@@ -55,6 +71,10 @@ mfixed-range=
Target RejectNegative Joined Var(spu_fixed_range_string)
Specify range of registers to make fixed
+msafe-hints
+Target Report Mask(SAFE_HINTS)
+Insert hbrp instructions after hinted branch targets to avoid the SPU hang issue
+
march=
Target RejectNegative Joined Var(spu_arch_string)
Generate code for given CPU
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2b2ebc85aef..b5bd719d659 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14797,6 +14797,33 @@ useful when compiling kernel code. A register range is specified as
two registers separated by a dash. Multiple register ranges can be
specified separated by a comma.
+@item -mdual-nops
+@itemx -mdual-nops=@var{n}
+@opindex mdual-nops
+By default, GCC will insert nops to increase dual issue when it expects
+it to increase performance. @var{n} can be a value from 0 to 10. A
+smaller @var{n} will insert fewer nops. 10 is the default, 0 is the
+same as @option{-mno-dual-nops}. Disabled with @option{-Os}.
+
+@item -mhint-max-nops=@var{n}
+@opindex mhint-max-nops
+Maximum number of nops to insert for a branch hint. A branch hint must
+be at least 8 instructions away from the branch it is effecting. GCC
+will insert up to @var{n} nops to enforce this, otherwise it will not
+generate the branch hint.
+
+@item -mhint-max-distance=@var{n}
+@opindex mhint-max-distance
+The encoding of the branch hint instruction limits the hint to be within
+256 instructions of the branch it is effecting. By default, GCC makes
+sure it is within 125.
+
+@item -msafe-hints
+@opindex msafe-hints
+Work around a hardware bug which causes the SPU to stall indefinitely.
+By default, GCC will insert the @code{hbrp} instruction to make sure
+this stall won't happen.
+
@end table
@node System V Options
diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c
index 2b5130c66b9..833e1552b31 100644
--- a/gcc/haifa-sched.c
+++ b/gcc/haifa-sched.c
@@ -4979,4 +4979,15 @@ sched_create_empty_bb_1 (basic_block after)
return create_empty_bb (after);
}
+/* Insert PAT as an INSN into the schedule and update the necessary data
+ structures to account for it. */
+rtx
+sched_emit_insn (rtx pat)
+{
+ rtx insn = emit_insn_after (pat, last_scheduled_insn);
+ last_scheduled_insn = insn;
+ haifa_init_insn (insn);
+ return insn;
+}
+
#endif /* INSN_SCHEDULING */
diff --git a/gcc/sched-int.h b/gcc/sched-int.h
index d358650e8fe..7fd3b5526a2 100644
--- a/gcc/sched-int.h
+++ b/gcc/sched-int.h
@@ -1147,6 +1147,7 @@ extern void unlink_bb_notes (basic_block, basic_block);
extern void add_block (basic_block, basic_block);
extern rtx bb_note (basic_block);
extern void concat_note_lists (rtx, rtx *);
+extern rtx sched_emit_insn (rtx);
/* Types and functions in sched-rgn.c. */