diff options
author | Mircea Namolaru <namolaru@il.ibm.com> | 2007-08-28 06:52:16 +0000 |
---|---|---|
committer | Revital Eres <revitale@gcc.gnu.org> | 2007-08-28 06:52:16 +0000 |
commit | 46dc0789fed233da1561e6c2ee1a4a7ab82e9fd4 (patch) | |
tree | bc3b2739b170fd0d12aa9aeb627ced2e462e8b9d /gcc | |
parent | 7368348cb7d8665e6eb213264d6ca056f2f05219 (diff) | |
download | gcc-46dc0789fed233da1561e6c2ee1a4a7ab82e9fd4.tar.gz |
Modulo-scheduling improvements. Patch 2 of 2
Co-Authored-By: Andrey Belevantsev <abel@ispras.ru>
Co-Authored-By: Revital Eres <eres@il.ibm.com>
Co-Authored-By: Vladimir Yanovsky <yanov@il.ibm.com>
From-SVN: r127848
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 28 | ||||
-rw-r--r-- | gcc/config/spu/spu.md | 42 | ||||
-rw-r--r-- | gcc/ddg.c | 10 | ||||
-rw-r--r-- | gcc/loop-doloop.c | 82 | ||||
-rw-r--r-- | gcc/modulo-sched.c | 148 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 7 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/sms-1.c | 38 |
7 files changed, 293 insertions, 62 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 82c559d4cb2..b440e936605 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,31 @@ +2007-08-28 Mircea Namolaru <namolaru@il.ibm.com> + Vladimir Yanovsky <yanov@il.ibm.com> + Revital Eres <eres@il.ibm.com> + Andrey Belevantsev <abel@ispras.ru> + + * config/spu/spu.md: Recognize doloop pattern when -fmodulo-sched + is set. + * modulo-sched.c: Add documentation regarding do-loop. + (doloop_register_get): Change number of arguments to support + the new do-loop pattern and check whether COUNT_REG has no other + occurences in the loop besides in the control part. + (duplicate_insns_of_cycles): Do not duplicate the insn which + changes count_reg as it is already adjusted. + (generate_prolog_epilog): New argument to support the new + do-loop pattern. Change the subtract instruction to use + expand_simple_binop. Call duplicate_insns_of_cycles with new + argument. + (sms_schedule): Call doloop_register_get and + generate_prolog_epilog with new argument. Do not handle loops + with single sets insns with subreg in their lhs. + * loop-doloop.c (doloop_optimize): Support for another do-loop + pattern. + (doloop_condition_get): Gets an instruction instead of a pattern + and change the return condition when the do-loop pattern is + not parallel. + * ddg.c (create_ddg_dep_from_intra_loop_link): Handle only reg + deps when considering to not create edges. + 2007-08-27 Alexandre Oliva <aoliva@redhat.com> * doc/extend.texi (gnu_inline funtion attribute): Document C++ diff --git a/gcc/config/spu/spu.md b/gcc/config/spu/spu.md index 5dcc45ed0c4..1afdd11f130 100644 --- a/gcc/config/spu/spu.md +++ b/gcc/config/spu/spu.md @@ -3887,6 +3887,48 @@ selb\t%0,%4,%0,%3" [(set_attr "type" "br")]) + + ;; Define the subtract-one-and-jump insns so loop.c + ;; knows what to generate. + (define_expand "doloop_end" + [(use (match_operand 0 "" "")) ; loop pseudo + (use (match_operand 1 "" "")) ; iterations; zero if unknown + (use (match_operand 2 "" "")) ; max iterations + (use (match_operand 3 "" "")) ; loop level + (use (match_operand 4 "" ""))] ; label + "" + " + { + /* Currently SMS relies on the do-loop pattern to recognize loops + where (1) the control part comprises of all insns defining and/or + using a certain 'count' register and (2) the loop count can be + adjusted by modifying this register prior to the loop. +. ??? The possible introduction of a new block to initialize the + new IV can potentially effects branch optimizations. */ + if (optimize > 0 && flag_modulo_sched) + { + rtx s0; + rtx bcomp; + rtx loc_ref; + + /* Only use this on innermost loops. */ + if (INTVAL (operands[3]) > 1) + FAIL; + if (GET_MODE (operands[0]) != SImode) + FAIL; + + s0 = operands [0]; + emit_move_insn (s0, gen_rtx_PLUS (SImode, s0, GEN_INT (-1))); + bcomp = gen_rtx_NE(SImode, s0, const0_rtx); + loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [4]); + emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, + loc_ref, pc_rtx))); + + DONE; + } + }") + ;; convert between any two modes, avoiding any GCC assumptions (define_expand "spu_convert" [(set (match_operand 0 "spu_reg_operand" "") diff --git a/gcc/ddg.c b/gcc/ddg.c index 295811db4c1..f1ec8fdb121 100644 --- a/gcc/ddg.c +++ b/gcc/ddg.c @@ -176,13 +176,17 @@ create_ddg_dep_from_intra_loop_link (ddg_ptr g, ddg_node_ptr src_node, rtx set; set = single_set (dest_node->insn); - if (set) + /* TODO: Handle registers that REG_P is not true for them, i.e. + subregs and special registers. */ + if (set && REG_P (SET_DEST (set))) { int regno = REGNO (SET_DEST (set)); - struct df_ref *first_def = - df_bb_regno_first_def_find (g->bb, regno); + struct df_ref *first_def; struct df_rd_bb_info *bb_info = DF_RD_BB_INFO (g->bb); + first_def = df_bb_regno_first_def_find (g->bb, regno); + gcc_assert (first_def); + if (bitmap_bit_p (bb_info->gen, first_def->id)) return; } diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c index 162ef4ad58a..6e33a4f9ba0 100644 --- a/gcc/loop-doloop.c +++ b/gcc/loop-doloop.c @@ -69,35 +69,59 @@ along with GCC; see the file COPYING3. If not see if it is not a decrement and branch jump insn. */ rtx -doloop_condition_get (rtx pattern) +doloop_condition_get (rtx doloop_pat) { rtx cmp; rtx inc; rtx reg; rtx inc_src; rtx condition; + rtx pattern; - /* The canonical doloop pattern we expect is: + /* The canonical doloop pattern we expect has one of the following + forms: - (parallel [(set (pc) (if_then_else (condition) - (label_ref (label)) - (pc))) - (set (reg) (plus (reg) (const_int -1))) - (additional clobbers and uses)]) + 1) (parallel [(set (pc) (if_then_else (condition) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) - Some targets (IA-64) wrap the set of the loop counter in - an if_then_else too. + The branch must be the first entry of the parallel (also required + by jump.c), and the second entry of the parallel must be a set of + the loop counter register. Some targets (IA-64) wrap the set of + the loop counter in an if_then_else too. - In summary, the branch must be the first entry of the - parallel (also required by jump.c), and the second - entry of the parallel must be a set of the loop counter - register. */ + 2) (set (reg) (plus (reg) (const_int -1)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). */ + + pattern = PATTERN (doloop_pat); if (GET_CODE (pattern) != PARALLEL) - return 0; + { + rtx cond; + + /* We expect the decrement to immediately precede the branch. */ - cmp = XVECEXP (pattern, 0, 0); - inc = XVECEXP (pattern, 0, 1); + if ((PREV_INSN (doloop_pat) == NULL_RTX) + || !INSN_P (PREV_INSN (doloop_pat))) + return 0; + + cmp = pattern; + inc = PATTERN (PREV_INSN (doloop_pat)); + /* We expect the condition to be of the form (reg != 0) */ + cond = XEXP (SET_SRC (cmp), 0); + if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx) + return 0; + + } + else + { + cmp = XVECEXP (pattern, 0, 0); + inc = XVECEXP (pattern, 0, 1); + } /* Check for (set (reg) (something)). */ if (GET_CODE (inc) != SET) @@ -139,7 +163,29 @@ doloop_condition_get (rtx pattern) if ((XEXP (condition, 0) == reg) || (GET_CODE (XEXP (condition, 0)) == PLUS && XEXP (XEXP (condition, 0), 0) == reg)) + { + if (GET_CODE (pattern) != PARALLEL) + /* The second form we expect: + + (set (reg) (plus (reg) (const_int -1)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). + + is equivalent to the following: + + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) + + So we return that form instead. + */ + condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + return condition; + } /* ??? If a machine uses a funny comparison, we could return a canonicalized form here. */ @@ -597,9 +643,7 @@ doloop_optimize (struct loop *loop) { while (NEXT_INSN (doloop_pat) != NULL_RTX) doloop_pat = NEXT_INSN (doloop_pat); - if (JUMP_P (doloop_pat)) - doloop_pat = PATTERN (doloop_pat); - else + if (!JUMP_P (doloop_pat)) doloop_pat = NULL_RTX; } diff --git a/gcc/modulo-sched.c b/gcc/modulo-sched.c index 73c4adc84b0..7e9f6aa1a69 100644 --- a/gcc/modulo-sched.c +++ b/gcc/modulo-sched.c @@ -82,8 +82,21 @@ along with GCC; see the file COPYING3. If not see perform modulo variable expansion for live ranges that span more than II cycles (i.e. use register copies to prevent a def from overwriting itself before reaching the use). -*/ + SMS works with countable loops (1) whose control part can be easily + decoupled from the rest of the loop and (2) whose loop count can + be easily adjusted. This is because we peel a constant number of + iterations into a prologue and epilogue for which we want to avoid + emitting the control part, and a kernel which is to iterate that + constant number of iterations less than the original loop. So the + control part should be a set of insns clearly identified and having + its own iv, not otherwise used in the loop (at-least for now), which + initializes a register before the loop to the number of iterations. + Currently SMS relies on the do-loop pattern to recognize such loops, + where (1) the control part comprises of all insns defining and/or + using a certain 'count' register and (2) the loop count can be + adjusted by modifying this register prior to the loop. + TODO: Rely on cfgloop analysis instead. */ /* This page defines partial-schedule structures and functions for modulo scheduling. */ @@ -182,10 +195,11 @@ static int sms_order_nodes (ddg_ptr, int, int * result); static void set_node_sched_params (ddg_ptr); static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *); static void permute_partial_schedule (partial_schedule_ptr ps, rtx last); -static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx); +static void generate_prolog_epilog (partial_schedule_ptr, struct loop *loop, + rtx, rtx); static void duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, int to_stage, - int is_prolog); + int is_prolog, rtx count_reg); #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap) #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time) @@ -261,20 +275,22 @@ static struct sched_info sms_sched_info = }; -/* Return the register decremented and tested in INSN, - or zero if it is not a decrement-and-branch insn. */ - +/* Given HEAD and TAIL which are the first and last insns in a loop; + return the register which controls the loop. Return zero if it has + more than one occurrence in the loop besides the control part or the + do-loop pattern is not of the form we expect. */ static rtx -doloop_register_get (rtx insn ATTRIBUTE_UNUSED) +doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED) { #ifdef HAVE_doloop_end - rtx pattern, reg, condition; + rtx reg, condition, insn; + bool found = false; - if (! JUMP_P (insn)) + if (!JUMP_P (tail)) return NULL_RTX; - pattern = PATTERN (insn); - condition = doloop_condition_get (pattern); + /* TODO: Free SMS's dependence on doloop_condition_get. */ + condition = doloop_condition_get (tail); if (! condition) return NULL_RTX; @@ -286,6 +302,29 @@ doloop_register_get (rtx insn ATTRIBUTE_UNUSED) else gcc_unreachable (); + /* Check that the COUNT_REG has no other occurrences in the loop + until the decrement. We assume the control part consists of + either a single (parallel) branch-on-count or a (non-parallel) + branch immediately preceded by a single (decrement) insn. */ + for (insn = head; insn != PREV_INSN (tail); insn = NEXT_INSN (insn)) + if ((found = reg_mentioned_p (reg, insn)) == true) + break; + if (found) + { + if (dump_file) + fprintf (dump_file, "SMS count_reg found outside control\n"); + + return NULL_RTX; + } + /* One last check in case the do-loop pattern is parallel. */ + if (GET_CODE (PATTERN (tail)) == PARALLEL) + if (reg_mentioned_p (reg, PREV_INSN (tail))) + { + if (dump_file) + fprintf (dump_file, "SMS count_reg found outside control\n"); + + return NULL_RTX; + } return reg; #else return NULL_RTX; @@ -583,7 +622,7 @@ permute_partial_schedule (partial_schedule_ptr ps, rtx last) static void duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, - int to_stage, int for_prolog) + int to_stage, int for_prolog, rtx count_reg) { int row; ps_insn_ptr ps_ij; @@ -595,6 +634,13 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, int j, i_reg_moves; rtx reg_move = NULL_RTX; + /* Do not duplicate any insn which refers to count_reg as it + belongs to the control part. + TODO: This should be done by analyzing the control part of + the loop. */ + if (reg_mentioned_p (count_reg, u_node->insn)) + continue; + if (for_prolog) { /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing @@ -643,23 +689,35 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, /* Generate the instructions (including reg_moves) for prolog & epilog. */ static void -generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg) +generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop, + rtx count_reg, rtx count_init) { int i; int last_stage = PS_STAGE_COUNT (ps) - 1; edge e; - + /* Generate the prolog, inserting its insns on the loop-entry edge. */ start_sequence (); - if (count_reg) - /* Generate a subtract instruction at the beginning of the prolog to - adjust the loop count by STAGE_COUNT. */ - emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage))); + if (!count_init) + { + /* Generate instructions at the beginning of the prolog to + adjust the loop count by STAGE_COUNT. If loop count is constant + (count_init), this constant is adjusted by STAGE_COUNT in + generate_prolog_epilog function. */ + rtx sub_reg = NULL_RTX; + + sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, + count_reg, GEN_INT (last_stage), + count_reg, 1, OPTAB_DIRECT); + gcc_assert (REG_P (sub_reg)); + if (REGNO (sub_reg) != REGNO (count_reg)) + emit_move_insn (count_reg, sub_reg); + } for (i = 0; i < last_stage; i++) - duplicate_insns_of_cycles (ps, 0, i, 1); - + duplicate_insns_of_cycles (ps, 0, i, 1, count_reg); + /* Put the prolog on the entry edge. */ e = loop_preheader_edge (loop); split_edge_and_insert (e, get_insns ()); @@ -670,8 +728,8 @@ generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_r start_sequence (); for (i = 0; i < last_stage; i++) - duplicate_insns_of_cycles (ps, i + 1, last_stage, 0); - + duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg); + /* Put the epilogue on the exit edge. */ gcc_assert (single_exit (loop)); e = single_exit (loop); @@ -907,20 +965,27 @@ sms_schedule (void) } /* Make sure this is a doloop. */ - if ( !(count_reg = doloop_register_get (tail))) + if ( !(count_reg = doloop_register_get (head, tail))) continue; /* Don't handle BBs with calls or barriers, or !single_set insns, or auto-increment insns (to avoid creating invalid reg-moves for the auto-increment insns). - ??? Should handle auto-increment insns. */ - for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn)) - if (CALL_P (insn) - || BARRIER_P (insn) - || (INSN_P (insn) && !JUMP_P (insn) - && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE) - || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)) - break; + ??? Should handle auto-increment insns. + ??? Should handle insns defining subregs. */ + for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn)) + { + rtx set; + + if (CALL_P (insn) + || BARRIER_P (insn) + || (INSN_P (insn) && !JUMP_P (insn) + && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE) + || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0) + || (INSN_P (insn) && (set = single_set (insn)) + && GET_CODE (SET_DEST (set)) == SUBREG)) + break; + } if (insn != NEXT_INSN (tail)) { @@ -932,8 +997,11 @@ sms_schedule (void) fprintf (dump_file, "SMS loop-with-barrier\n"); else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0) fprintf (dump_file, "SMS reg inc\n"); - else - fprintf (dump_file, "SMS loop-with-not-single-set\n"); + else if ((INSN_P (insn) && !JUMP_P (insn) + && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)) + fprintf (dump_file, "SMS loop-with-not-single-set\n"); + else + fprintf (dump_file, "SMS loop with subreg in lhs\n"); print_rtl_single (dump_file, insn); } @@ -998,7 +1066,7 @@ sms_schedule (void) /* In case of th loop have doloop register it gets special handling. */ count_init = NULL_RTX; - if ((count_reg = doloop_register_get (tail))) + if ((count_reg = doloop_register_get (head, tail))) { basic_block pre_header; @@ -1072,7 +1140,10 @@ sms_schedule (void) the closing_branch was scheduled and should appear in the last (ii-1) row. Otherwise, we are free to schedule the branch, and we let nodes that were scheduled at the first PS_MIN_CYCLE cycle appear in the first - row; this should reduce stage_count to minimum. */ + row; this should reduce stage_count to minimum. + TODO: Revisit the issue of scheduling the insns of the + control part relative to the branch when the control part + has more than one insn. */ normalize_sched_times (ps); rotate_partial_schedule (ps, PS_MIN_CYCLE (ps)); set_columns_for_ps (ps); @@ -1111,11 +1182,8 @@ sms_schedule (void) if (dump_file) print_node_sched_params (dump_file, g->num_nodes, g); /* Generate prolog and epilog. */ - if (count_reg && !count_init) - generate_prolog_epilog (ps, loop, count_reg); - else - generate_prolog_epilog (ps, loop, NULL_RTX); - + generate_prolog_epilog (ps, loop, count_reg, count_init); + free_undo_replace_buff (reg_move_replaces); } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7839eaa831a..690128fcca8 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2007-08-28 Mircea Namolaru <namolaru@il.ibm.com> + Vladimir Yanovsky <yanov@il.ibm.com> + Revital Eres <eres@il.ibm.com> + Andrey Belevantsev <abel@ispras.ru> + + * gcc.dg/sms-1.c: New test. + 2007-08-27 Alexandre Oliva <aoliva@redhat.com> * g++.dg/ext/gnu-inline-common.h: New. diff --git a/gcc/testsuite/gcc.dg/sms-1.c b/gcc/testsuite/gcc.dg/sms-1.c new file mode 100644 index 00000000000..d915ef54f2b --- /dev/null +++ b/gcc/testsuite/gcc.dg/sms-1.c @@ -0,0 +1,38 @@ +/* The same test as loop-3c.c. It failed on ia64 + due to not handling of subreg in the lhs that is fixed. */ +/* { dg-do run } */ +/* { dg-options "-O2 -fmodulo-sched -fmodulo-sched-allow-regmoves -w" } */ + + +#include <limits.h> + +void * a[255]; + +f (m) +{ + int i; + int sh = 0x100; + i = m; + do + { + a[sh >>= 1] = ((unsigned)i << 3) + (char*)a; + i += 4; + } + while (i < INT_MAX/2 + 1 + 4 * 4); +} + +main () +{ + a[0x10] = 0; + a[0x08] = 0; + f (INT_MAX/2 + INT_MAX/4 + 2); + if (a[0x10] || a[0x08]) + abort (); + a[0x10] = 0; + a[0x08] = 0; + f (INT_MAX/2 + 1); + if (! a[0x10] || a[0x08]) + abort (); + exit (0); +} + |