diff options
author | spop <spop@138bc75d-0d04-0410-961f-82ee72b054a4> | 2008-02-28 12:37:24 +0000 |
---|---|---|
committer | spop <spop@138bc75d-0d04-0410-961f-82ee72b054a4> | 2008-02-28 12:37:24 +0000 |
commit | 801c5610886591ce60ea92d2073a0f8cf2caea31 (patch) | |
tree | 9b84bf0f68098103bdd76fd0c6c2fa17afc3dd07 | |
parent | 9c467f1355d2f806a7ac06c22a18fd1d2e7e748f (diff) | |
download | gcc-801c5610886591ce60ea92d2073a0f8cf2caea31.tar.gz |
* doc/invoke.texi: Document -ftree-loop-distribution.
* tree-loop-distribution.c: New.
* tree-pass.h (pass_loop_distribution): New.
* graphds.h (struct graph): Add htab_t indices.
* timevar.def (TV_TREE_LOOP_DISTRIBUTION): New.
* tree-vectorizer.c (rename_variables_in_loop): Extern.
(slpeel_tree_duplicate_loop_to_edge_cfg): Init PENDING_STMT to NULL.
* tree-vectorizer.h (tree_duplicate_loop_on_edge): Declared.
* tree-data-ref.c (debug_data_dependence_relations): New.
(dump_data_dependence_relation): Also print data references.
(free_data_ref): Extern.
(same_access_functions): Moved...
(find_vertex_for_stmt): Renamed rdg_vertex_for_stmt.
(dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component,
debug_rdg_component, dump_rdg, debug_rdg, dot_rdg_1, dot_rdg,
struct rdg_vertex_info, rdg_vertex_for_stmt): New.
(create_rdg_edge_for_ddr, create_rdg_vertices): Cleaned up.
(stmts_from_loop): Skip LABEL_EXPR.
(hash_stmt_vertex_info, eq_stmt_vertex_info, hash_stmt_vertex_del): New.
(build_rdg): Initialize rdg->indices htab.
(free_rdg, stores_from_loop, ref_base_address,
rdg_defs_used_in_other_loops_p, have_similar_memory_accesses,
have_similar_memory_accesses_1, ref_base_address_1,
remove_similar_memory_refs): New.
* tree-data-ref.h: Depend on tree-chrec.h.
(debug_data_dependence_relations, free_data_ref): Declared.
(same_access_functions): ... here.
(ddr_is_anti_dependent, ddrs_have_anti_deps, ddr_dependence_level): New.
(struct rdg_vertex): Add has_mem_write and has_mem_reads.
(RDGV_HAS_MEM_WRITE, RDGV_HAS_MEM_READS, RDG_STMT,
RDG_MEM_WRITE_STMT, RDG_MEM_READS_STMT): New.
(dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component,
debug_rdg_component, dump_rdg, debug_rdg, dot_rdg,
rdg_vertex_for_stmt): Declared.
(struct rdg_edge): Add level.
(RDGE_LEVEL): New.
(free_rdg, stores_from_loop, remove_similar_memory_refs,
rdg_defs_used_in_other_loops_p, have_similar_memory_accesses): Declared.
(rdg_has_similar_memory_accesses): New.
* tree-vect-analyze.c: Remove unused static decls.
* lambda.h (dependence_level): New.
* common.opt (ftree-loop-distribution): New.
* tree-flow.h (mark_virtual_ops_in_bb,
slpeel_tree_duplicate_loop_to_edge_cfg,
rename_variables_in_loop): Declared.
* Makefile.in (TREE_DATA_REF_H): Depend on tree-chrec.h.
(OBJS-common): Add tree-loop-distribution.o.
(tree-loop-distribution.o): New rule.
* tree-cfg.c (mark_virtual_ops_in_bb): New.
(mark_virtual_ops_in_region): Use mark_virtual_ops_in_bb.
* passes.c (init_optimization_passes): Schedule pass_loop_distribution.
* testsuite/gcc.dg/tree-ssa/ldist-{1..12}.c: New.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@132745 138bc75d-0d04-0410-961f-82ee72b054a4
32 files changed, 2326 insertions, 105 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a0663ff0d3a..13854195354 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,57 @@ +2008-02-28 Sebastian Pop <sebastian.pop@amd.com> + + * doc/invoke.texi: Document -ftree-loop-distribution. + * tree-loop-distribution.c: New. + * tree-pass.h (pass_loop_distribution): New. + * graphds.h (struct graph): Add htab_t indices. + * timevar.def (TV_TREE_LOOP_DISTRIBUTION): New. + * tree-vectorizer.c (rename_variables_in_loop): Extern. + (slpeel_tree_duplicate_loop_to_edge_cfg): Init PENDING_STMT to NULL. + * tree-vectorizer.h (tree_duplicate_loop_on_edge): Declared. + * tree-data-ref.c (debug_data_dependence_relations): New. + (dump_data_dependence_relation): Also print data references. + (free_data_ref): Extern. + (same_access_functions): Moved... + (find_vertex_for_stmt): Renamed rdg_vertex_for_stmt. + (dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component, + debug_rdg_component, dump_rdg, debug_rdg, dot_rdg_1, dot_rdg, + struct rdg_vertex_info, rdg_vertex_for_stmt): New. + (create_rdg_edge_for_ddr, create_rdg_vertices): Cleaned up. + (stmts_from_loop): Skip LABEL_EXPR. + (hash_stmt_vertex_info, eq_stmt_vertex_info, hash_stmt_vertex_del): New. + (build_rdg): Initialize rdg->indices htab. + (free_rdg, stores_from_loop, ref_base_address, + rdg_defs_used_in_other_loops_p, have_similar_memory_accesses, + have_similar_memory_accesses_1, ref_base_address_1, + remove_similar_memory_refs): New. + * tree-data-ref.h: Depend on tree-chrec.h. + (debug_data_dependence_relations, free_data_ref): Declared. + (same_access_functions): ... here. + (ddr_is_anti_dependent, ddrs_have_anti_deps, ddr_dependence_level): New. + (struct rdg_vertex): Add has_mem_write and has_mem_reads. + (RDGV_HAS_MEM_WRITE, RDGV_HAS_MEM_READS, RDG_STMT, + RDG_MEM_WRITE_STMT, RDG_MEM_READS_STMT): New. + (dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component, + debug_rdg_component, dump_rdg, debug_rdg, dot_rdg, + rdg_vertex_for_stmt): Declared. 
+ (struct rdg_edge): Add level. + (RDGE_LEVEL): New. + (free_rdg, stores_from_loop, remove_similar_memory_refs, + rdg_defs_used_in_other_loops_p, have_similar_memory_accesses): Declared. + (rdg_has_similar_memory_accesses): New. + * tree-vect-analyze.c: Remove unused static decls. + * lambda.h (dependence_level): New. + * common.opt (ftree-loop-distribution): New. + * tree-flow.h (mark_virtual_ops_in_bb, + slpeel_tree_duplicate_loop_to_edge_cfg, + rename_variables_in_loop): Declared. + * Makefile.in (TREE_DATA_REF_H): Depend on tree-chrec.h. + (OBJS-common): Add tree-loop-distribution.o. + (tree-loop-distribution.o): New rule. + * tree-cfg.c (mark_virtual_ops_in_bb): New. + (mark_virtual_ops_in_region): Use mark_virtual_ops_in_bb. + * passes.c (init_optimization_passes): Schedule pass_loop_distribution. + 2008-02-28 Joseph Myers <joseph@codesourcery.com> PR target/33963 diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 49ba245b2ad..5878d1411cc 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -838,7 +838,7 @@ DIAGNOSTIC_H = diagnostic.h diagnostic.def $(PRETTY_PRINT_H) options.h C_PRETTY_PRINT_H = c-pretty-print.h $(PRETTY_PRINT_H) $(C_COMMON_H) $(TREE_H) SCEV_H = tree-scalar-evolution.h $(GGC_H) tree-chrec.h $(PARAMS_H) LAMBDA_H = lambda.h $(TREE_H) vec.h $(GGC_H) -TREE_DATA_REF_H = tree-data-ref.h $(LAMBDA_H) omega.h graphds.h +TREE_DATA_REF_H = tree-data-ref.h $(LAMBDA_H) omega.h graphds.h tree-chrec.h VARRAY_H = varray.h $(MACHMODE_H) $(SYSTEM_H) coretypes.h $(TM_H) TREE_INLINE_H = tree-inline.h $(VARRAY_H) pointer-set.h REAL_H = real.h $(MACHMODE_H) @@ -1156,6 +1156,7 @@ OBJS-common = \ tree-if-conv.o \ tree-into-ssa.o \ tree-iterator.o \ + tree-loop-distribution.o \ tree-loop-linear.o \ tree-nested.o \ tree-nrv.o \ @@ -2283,6 +2284,11 @@ tree-loop-linear.o: tree-loop-linear.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \ tree-pass.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) $(LAMBDA_H) \ 
$(TARGET_H) tree-chrec.h $(OBSTACK_H) +tree-loop-distribution.o: tree-loop-distribution.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(RTL_H) $(BASIC_BLOCK_H) \ + $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \ + tree-pass.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) \ + $(TARGET_H) tree-chrec.h tree-vectorizer.h tree-parloops.o: tree-parloops.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ $(TREE_FLOW_H) $(TREE_H) $(RTL_H) $(CFGLOOP_H) $(TREE_DATA_REF_H) $(GGC_H) \ $(DIAGNOSTIC_H) tree-pass.h $(SCEV_H) langhooks.h gt-tree-parloops.h \ diff --git a/gcc/common.opt b/gcc/common.opt index 48a1f80c3e1..74b3255afc7 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1098,6 +1098,10 @@ ftree-fre Common Report Var(flag_tree_fre) Optimization Enable Full Redundancy Elimination (FRE) on trees +ftree-loop-distribution +Common Report Var(flag_tree_loop_distribution) +Enable loop distribution on trees + ftree-loop-im Common Report Var(flag_tree_loop_im) Init(1) Optimization Enable loop invariant motion on trees diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index d30c0cf3ba8..dbd66eab30c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}. -fstrict-aliasing -fstrict-overflow -fthread-jumps -ftracer -ftree-ccp @gol -ftree-ch -ftree-copy-prop -ftree-copyrename -ftree-dce @gol -ftree-dominator-opts -ftree-dse -ftree-fre -ftree-loop-im @gol +-ftree-loop-distribution @gol -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-reassoc -ftree-salias @gol -ftree-sink -ftree-sra -ftree-store-ccp -ftree-ter @gol @@ -5928,6 +5929,11 @@ performance and allow further loop optimizations to take place. Compare the results of several data dependence analyzers. This option is used for debugging the data dependence analyzers. +@item -ftree-loop-distribution +Perform loop distribution. 
This flag can improve cache performance on +big loop bodies and allow further loop optimizations, like +parallelization or vectorization, to take place. + @item -ftree-loop-im @opindex ftree-loop-im Perform loop invariant motion on trees. This pass moves only invariants that diff --git a/gcc/graphds.h b/gcc/graphds.h index 49fe6cd9176..83cf90c2e31 100644 --- a/gcc/graphds.h +++ b/gcc/graphds.h @@ -47,6 +47,7 @@ struct graph int n_vertices; /* Number of vertices. */ struct vertex *vertices; /* The vertices. */ + htab_t indices; /* Fast lookup for indices. */ }; struct graph *new_graph (int); diff --git a/gcc/lambda.h b/gcc/lambda.h index fc28d463e3c..641b3bcaa05 100644 --- a/gcc/lambda.h +++ b/gcc/lambda.h @@ -469,5 +469,22 @@ build_linear_expr (tree type, lambda_vector coefs, VEC (tree, heap) *ivs) return expr; } +/* Returns the dependence level for a vector DIST of size LENGTH. + LEVEL = 0 means a lexicographic dependence, i.e. a dependence due + to the sequence of statements, not carried by any loop. */ + + +static inline unsigned +dependence_level (lambda_vector dist_vect, int length) +{ + int i; + + for (i = 0; i < length; i++) + if (dist_vect[i] != 0) + return i + 1; + + return 0; +} + #endif /* LAMBDA_H */ diff --git a/gcc/passes.c b/gcc/passes.c index 2614c90b3ca..bec473a0670 100644 --- a/gcc/passes.c +++ b/gcc/passes.c @@ -625,6 +625,7 @@ init_optimization_passes (void) NEXT_PASS (pass_empty_loop); NEXT_PASS (pass_record_bounds); NEXT_PASS (pass_check_data_deps); + NEXT_PASS (pass_loop_distribution); NEXT_PASS (pass_linear_transform); NEXT_PASS (pass_iv_canon); NEXT_PASS (pass_if_conversion); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e9034ed5201..882de989160 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,20 @@ +2008-02-28 Sebastian Pop <sebastian.pop@amd.com> + + * testsuite/gcc.dg/tree-ssa/ldist-1.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-1a.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-2.c: New. 
+ * testsuite/gcc.dg/tree-ssa/ldist-3.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-4.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-5.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-6.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-7.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-8.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-9.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-10.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-11.c: New. + * testsuite/gcc.dg/tree-ssa/ldist-12.c: New. + * testsuite/gfortran.dg/ldist-1.f90: New. + 2008-02-28 Uros Bizjak <ubizjak@gmail.com> * gcc.dg/pr34351.c: Compile for x86 targets only. Use %ebx register. diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-1.c new file mode 100644 index 00000000000..43c10466525 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-1.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +void foo (int * __restrict__ ia, + int * __restrict__ ib, + int * __restrict__ oxa, + int * __restrict__ oxb, + int * __restrict__ oya, + int * __restrict__ oyb) +{ + int i; + long int mya[52]; + long int myb[52]; + + for (i=0; i < 52; i++) + { + mya[i] = ia[i] * oxa[i] + ib[i] * oxb[i]; + myb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i]; + oya[i] = mya[i] >> 10; + oyb[i] = myb[i] >> 10; + } + + /* This loop was distributed, but it is not anymore due to the cost + model changes: the result of a distribution would look like this: + + | for (i=0; i < 52; i++) + | oya[i] = ia[i] * oxa[i] + ib[i] * oxb[i] >> 10; + | + | for (i=0; i < 52; i++) + | oyb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i] >> 10; + + and in this case the array IA is read in both tasks. For maximizing + the cache reuse, ldist does not distribute this loop anymore. 
+ */ +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-10.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-10.c new file mode 100644 index 00000000000..0790c18a9da --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-10.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i; + int a[1000], b[1000], c[1000]; + + for (i = 1; i < 1000; i ++) + { + a[i] = c[i]; /* S1 */ + b[i] = a[i-1]+1; /* S2 */ + } + /* Dependences: + S1->S2 (flow, level 1) + + One partition as A is used in both S1 and S2. + */ + + return a[1000-2] + b[1000-1] + c[1000-2]; +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c new file mode 100644 index 00000000000..88651e7b72d --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-11.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +void foo (int * __restrict__ ia, + int * __restrict__ ib, + int * __restrict__ oxa, + int * __restrict__ oxb, + int * __restrict__ oya, + int * __restrict__ oyb) +{ + int i; + long int mya[52]; + long int myb[52]; + + for (i=0; i < 52; i++) + { + mya[i] = ia[i] * oxa[i] + ib[i] * oxb[i]; + myb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i]; + oya[i] = 0; + oyb[i] = myb[i] >> 10; + } + + /* This loop should be distributed, and the result should look like + this: + | memset (oya, 0, 208); + | for (i=0; i < 52; i++) + | oyb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i] >> 10; + */ +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "generated memset zero" 1 "ldist" } } */ +/* 
{ dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-12.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-12.c new file mode 100644 index 00000000000..1e555fe26ad --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-12.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int foo (int * __restrict__ ia, + int * __restrict__ ib, + int * __restrict__ oxa, + int * __restrict__ oxb) +{ + int i; + int oya[52], oyb[52]; + + for (i=0; i < 52; i++) + { + oya[i] = (ia[i] * oxa[i]) >> 10; + oyb[i] = (ib[i] * oxb[i]) >> 10; + } + + return oya[22] + oyb[21]; +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-1a.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-1a.c new file mode 100644 index 00000000000..623aacfdbf5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-1a.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int foo (int * __restrict__ ia, + int * __restrict__ ib, + int * __restrict__ oxa, + int * __restrict__ oxb) +{ + int i; + int oya[52], oyb[52]; + + for (i=0; i < 52; i++) + { + oya[i] = (ia[i] * oxa[i] + ib[i] * oxb[i]) >> 10; + oyb[i] = (-ia[i] * oxb[i] + ib[i] * oxa[i]) >> 10; + } + + return oya[22] + oyb[21]; +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-2.c new file mode 100644 index 00000000000..de98ccc4c30 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-2.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +void foo (int * __restrict__ a, + int * __restrict__ b, + int * __restrict__ c) +{ + 
int i; + + for (i=1; i < 10; i++) + { + a[i] += c[i]; + b[i] = a[i - 1] + 1; + } + + /* This loop is not distributed because the cost of splitting it: + + | for (i=1; i < N; i++) + | a[i] += c[i]; + | + | for (i=1; i < N; i++) + | b[i] = a[i - 1] + 1; + + is higher due to data in array A that is written and then read in + another task. The cost model should forbid the transformation in + this case. + */ +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-3.c new file mode 100644 index 00000000000..524fb4542b8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-3.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i; + int a[10000], b[10000], c[10000], d[10000]; + + a[0] = k; a[3] = k*2; + c[1] = k+1; + for (i = 2; i < (10000-1); i ++) + { + a[i] = k * i; /* S1 */ + b[i] = a[i-2] + k; /* S2 */ + c[i] = b[i] + a[i+1]; /* S3 */ + d[i] = c[i-1] + k + i; /* S4 */ + } + /* + Dependences: + S1 -> S2 (flow, level 1) + S1 -> S3 (anti, level 1) + S2 -> S3 (flow, level 0) + S3 -> S4 (flow, level 1) + + There are three partitions: {S1, S3}, {S2} and {S4}. + + The cost model should fuse together all the partitions, as they + are reusing the same data, ending on a single partition. 
+ */ + return a[10000-2] + b[10000-1] + c[10000-2] + d[10000-2]; +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 3 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-4.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-4.c new file mode 100644 index 00000000000..c9b1293f0f2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-4.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i; + unsigned int j; + int a[100], b[100][100]; + + a[0] = k; + for (i = 1; i < 100; i ++) + { + for (j = 0; j < 100; j++) + { + a[j] = k * i; + b[i][j] = a[j-1] + k; + } + } + + return b[100-1][0]; +} + +/* We used to distribute also innermost loops, but these could produce + too much code in the outer loop, degrading performance of scalar + code. So this test is XFAILed because the cost model of the stand + alone distribution pass has evolved. 
*/ +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-5.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-5.c new file mode 100644 index 00000000000..af74557024e --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-5.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i; + unsigned int j; + int a[100][100], b[100][100], c[100][100], d[100][100]; + + a[0][0] = k; + for (i = 1; i < 100; i ++) + for (j = 1; j < (100-1); j++) + { + a[i][j] = k * i; /* S1 */ + b[i][j] = a[i][j-1] + k; /* S2 */ + c[i][j] = b[i][j] + a[i][j+1]; /* S3 */ + d[i][j] = c[i][j] + k + i; /* S4 */ + } + /* Dependences: + S1->S2 (flow, level 2) + S1->S3 (anti, level 2) + S2->S3 (flow, level 0) + S3->S4 (flow, level 0) + */ + + return a[100-1][100-1] + b[100-1][100-1] + c[100-1][100-1] + d[100-1][100-1]; +} + +/* FIXME: This is XFAILed because of a data dependence analysis + problem: the dependence test fails with a "don't know" relation. 
*/ + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" { xfail *-*-* } } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-6.c new file mode 100644 index 00000000000..7a38c86832b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-6.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i; + int a[1000], b[1000], c[1000], d[1000]; + + for (i = 2; i < (1000-1); i ++) { + a[i] = k * i; /* S1 */ + b[i] = a[i-2] + k; /* S2 */ + c[i] = b[i-1] + a[i+1]; /* S3 */ + d[i] = c[i-1] + k + i; /* S4 */ + } + /* Dependences: + S1->S2 (flow, level 1) + S2->S3 (flow, level 1) + S3->S1 (anti, level 1) + S3->S4 (flow, level 1) + + There are two partitions: {S1, S2, S3} and {S4}. + + {S1, S2, S3} have to be in the same partition because: + - S1 (i) has to be executed before S2 (i+2), as S1 produces a[i] that is then consumed 2 iterations later by S2. + - S2 (i) has to be executed before S3 (i+1), as S2 produces b[i] that is then consumed one iteration later by S3, + - S3 (i) has to be executed before S1 (i+1), as a[i+1] has to execute before the update to a[i], + + {S4} is the consumer partition: it consumes the values from array "c" produced in S3. + + The cost model should fuse all the tasks together as the cost of + fetching data from caches is too high. 
+ */ + + return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2]; +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c new file mode 100644 index 00000000000..124fcdedd03 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i, z; + int a[1000], b[1000], c[1000], d[1000]; + + for (i = 2; i < (1000-1); i ++) { + z = a[i+1]; /* S1 */ + a[i] = k * i; /* S2 */ + b[i] = a[i-2] + k; /* S3 */ + c[i] = b[i-1] + z; /* S4 */ + d[i] = c[i-1] + b[i+1] + k + i; /* S5 */ + } + /* Dependences: + S1->S2 (anti, level 1) + S1->S4 (flow, level 1, scalar) + S2->S3 (flow, level 1) + S3->S4 (flow, level 1) + S4->S5 (flow, level 1) + S5->S3 (anti, level 1) + + There is a single partition: {S1, S2, S3, S4, S5}, because of the + scalar dependence z between the two partitions {S1, S2} and {S3, S4, S5}. 
+ */ + + return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2]; +} + +/* { dg-final { scan-tree-dump-times "distributed" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-8.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-8.c new file mode 100644 index 00000000000..4a8e0660061 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-8.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i; + int a[1000], b[1000], c[1000], d[1000]; + + for (i = 2; i < (1000-1); i ++) + { + a[i] = k * i; /* S1 */ + b[i] = a[i+1] + k; /* S2 */ + c[i] = a[i-1] + b[i-1] + d[i-1]; /* S3 */ + d[i] = a[i-1] + b[i+1] + k + i; /* S4 */ + } + /* Dependences: + S1->S2 (anti, level 1) + S1->S3 (flow, level 1) + S1->S4 (flow, level 1) + S2->S3 (flow, level 1) + S2->S4 (anti, level 1) + S4->S3 (flow, level 1) + + Two partitions: {S1, S2, S4} produce information that is consumed in {S3}. + + So that means that the current cost model will also fuse these + two partitions into a single one for avoiding cache misses. 
+ */ + + return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2]; +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-9.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-9.c new file mode 100644 index 00000000000..ee8d023dee3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-9.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */ + +int loop1 (int k) +{ + unsigned int i; + int a[1000], b[1000]; + + for (i = 1; i < (1000-1); i ++) { + a[i] = a[i+1] + a[i-1]; /* S1 */ + b[i] = a[i-1] + k; /* S2 */ + } + /* + Dependences: + S1->S2 (flow, level 1) + S1->S1 (anti, level 1) + S1->S1 (flow, level 1) + + One partition, because of the cost of cache misses. + */ + + return a[1000-2] + b[1000-1]; +} + +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */ +/* { dg-final { cleanup-tree-dump "ldist" } } */ diff --git a/gcc/testsuite/gfortran.dg/ldist-1.f90 b/gcc/testsuite/gfortran.dg/ldist-1.f90 new file mode 100644 index 00000000000..dd1f02a176b --- /dev/null +++ b/gcc/testsuite/gfortran.dg/ldist-1.f90 @@ -0,0 +1,33 @@ +! { dg-do compile } +! { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } + +Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT) + IMPLICIT REAL*8 (A-H, O-Z) + DIMENSION DKS(*),DKDS(*),HVAR(*) + COMPLEX*16 WM(*),WG(*),FN(*),AN(*),BN(*),CN(*) + COMPLEX*16 H2,CONST + COMMON/STRCH/ALP,BET,DH,ZH,UG,VG,T1,T2,DT,TOL,ALPHA ,HAMP,BUMP + Parameter (F1 = .8333333333333333D0, F2 = .0833333333333333D0) + + SS=DT/(2.0D0) + + do J=2,NS + BS=SS*DKS(J)*HVAR(J)*HVAR(J) + AN(J)=F1+2.*BS + BN(J)=F2-BS + CN(J)=F2-BS + H2=WM(J+1) + + if(J.EQ.NS) then + CONST=CN(J)*H2 + else + CONST=(0.D0,0.D0) + endif + FN(J)=(BS+F2)*(H2)+(F1-2.D0*BS)-CONST + end do + + return +end Subroutine PADEC + +! 
{ dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } } +! { dg-final { cleanup-tree-dump "ldist" } } diff --git a/gcc/timevar.def b/gcc/timevar.def index e503894677b..1091724540c 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -123,6 +123,7 @@ DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling") DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops") DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization") DEFTIMEVAR (TV_TREE_LINEAR_TRANSFORM , "tree loop linear") +DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c index 6d848a0c2d8..844e7c14a0e 100644 --- a/gcc/tree-cfg.c +++ b/gcc/tree-cfg.c @@ -5646,22 +5646,30 @@ move_stmt_r (tree *tp, int *walk_subtrees, void *data) /* Marks virtual operands of all statements in basic blocks BBS for renaming. */ -static void -mark_virtual_ops_in_region (VEC (basic_block,heap) *bbs) +void +mark_virtual_ops_in_bb (basic_block bb) { tree phi; block_stmt_iterator bsi; + + for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) + mark_virtual_ops_for_renaming (phi); + + for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi)) + mark_virtual_ops_for_renaming (bsi_stmt (bsi)); +} + +/* Marks virtual operands of all statements in basic blocks BBS for + renaming. */ + +static void +mark_virtual_ops_in_region (VEC (basic_block,heap) *bbs) +{ basic_block bb; unsigned i; for (i = 0; VEC_iterate (basic_block, bbs, i, bb); i++) - { - for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) - mark_virtual_ops_for_renaming (phi); - - for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi)) - mark_virtual_ops_for_renaming (bsi_stmt (bsi)); - } + mark_virtual_ops_in_bb (bb); } /* Move basic block BB from function CFUN to function DEST_FN. 
The diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index 2f17ed1deb4..70266034aab 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -88,7 +88,6 @@ along with GCC; see the file COPYING3. If not see #include "tree-dump.h" #include "timevar.h" #include "cfgloop.h" -#include "tree-chrec.h" #include "tree-data-ref.h" #include "tree-scalar-evolution.h" #include "tree-pass.h" @@ -157,6 +156,14 @@ dump_data_references (FILE *file, VEC (data_reference_p, heap) *datarefs) dump_data_reference (file, dr); } +/* Dump to STDERR all the dependence relations from DDRS. */ + +void +debug_data_dependence_relations (VEC (ddr_p, heap) *ddrs) +{ + dump_data_dependence_relations (stderr, ddrs); +} + /* Dump into FILE all the dependence relations from DDRS. */ void @@ -354,6 +361,10 @@ dump_data_dependence_relation (FILE *outf, dra = DDR_A (ddr); drb = DDR_B (ddr); fprintf (outf, "(Data Dep: \n"); + + dump_data_reference (outf, dra); + dump_data_reference (outf, drb); + if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) fprintf (outf, " (don't know)\n"); @@ -808,7 +819,7 @@ dr_address_invariant_p (struct data_reference *dr) /* Frees data reference DR. */ -static void +void free_data_ref (data_reference_p dr) { BITMAP_FREE (DR_VOPS (dr)); @@ -2787,22 +2798,6 @@ build_classic_dist_vector_1 (struct data_dependence_relation *ddr, return true; } -/* Return true when the DDR contains two data references that have the - same access functions. */ - -static bool -same_access_functions (const struct data_dependence_relation *ddr) -{ - unsigned i; - - for (i = 0; i < DDR_NUM_SUBSCRIPTS (ddr); i++) - if (!eq_evolutions_p (DR_ACCESS_FN (DDR_A (ddr), i), - DR_ACCESS_FN (DDR_B (ddr), i))) - return false; - - return true; -} - /* Return true when the DDR contains only constant access functions. */ static bool @@ -4371,48 +4366,219 @@ free_data_refs (VEC (data_reference_p, heap) *datarefs) -/* Returns the index of STMT in RDG. */ +/* Dump vertex I in RDG to FILE. 
*/ -static int -find_vertex_for_stmt (const struct graph *rdg, const_tree stmt) +void +dump_rdg_vertex (FILE *file, struct graph *rdg, int i) +{ + struct vertex *v = &(rdg->vertices[i]); + struct graph_edge *e; + + fprintf (file, "(vertex %d: (%s%s) (in:", i, + RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "", + RDG_MEM_READS_STMT (rdg, i) ? "r" : ""); + + if (v->pred) + for (e = v->pred; e; e = e->pred_next) + fprintf (file, " %d", e->src); + + fprintf (file, ") (out:"); + + if (v->succ) + for (e = v->succ; e; e = e->succ_next) + fprintf (file, " %d", e->dest); + + fprintf (file, ") \n"); + print_generic_stmt (file, RDGV_STMT (v), TDF_VOPS|TDF_MEMSYMS); + fprintf (file, ")\n"); +} + +/* Call dump_rdg_vertex on stderr. */ + +void +debug_rdg_vertex (struct graph *rdg, int i) +{ + dump_rdg_vertex (stderr, rdg, i); +} + +/* Dump component C of RDG to FILE. If DUMPED is non-null, set the + dumped vertices to that bitmap. */ + +void dump_rdg_component (FILE *file, struct graph *rdg, int c, bitmap dumped) +{ + int i; + + fprintf (file, "(%d\n", c); + + for (i = 0; i < rdg->n_vertices; i++) + if (rdg->vertices[i].component == c) + { + if (dumped) + bitmap_set_bit (dumped, i); + + dump_rdg_vertex (file, rdg, i); + } + + fprintf (file, ")\n"); +} + +/* Call dump_rdg_vertex on stderr. */ + +void +debug_rdg_component (struct graph *rdg, int c) +{ + dump_rdg_component (stderr, rdg, c, NULL); +} + +/* Dump the reduced dependence graph RDG to FILE. */ + +void +dump_rdg (FILE *file, struct graph *rdg) { int i; + bitmap dumped = BITMAP_ALLOC (NULL); + + fprintf (file, "(rdg\n"); for (i = 0; i < rdg->n_vertices; i++) - if (RDGV_STMT (&(rdg->vertices[i])) == stmt) - return i; + if (!bitmap_bit_p (dumped, i)) + dump_rdg_component (file, rdg, rdg->vertices[i].component, dumped); - gcc_unreachable (); - return 0; + fprintf (file, ")\n"); + BITMAP_FREE (dumped); } -/* Creates an edge in RDG for each distance vector from DDR. */ +/* Call dump_rdg on stderr. 
*/ + +void +debug_rdg (struct graph *rdg) +{ + dump_rdg (stderr, rdg); +} static void -create_rdg_edge_for_ddr (struct graph *rdg, ddr_p ddr) +dot_rdg_1 (FILE *file, struct graph *rdg) { - int va, vb; - data_reference_p dra; - data_reference_p drb; - struct graph_edge *e; + int i; + + fprintf (file, "digraph RDG {\n"); - if (DDR_REVERSED_P (ddr)) + for (i = 0; i < rdg->n_vertices; i++) { - dra = DDR_B (ddr); - drb = DDR_A (ddr); + struct vertex *v = &(rdg->vertices[i]); + struct graph_edge *e; + + /* Highlight reads from memory. */ + if (RDG_MEM_READS_STMT (rdg, i)) + fprintf (file, "%d [style=filled, fillcolor=green]\n", i); + + /* Highlight stores to memory. */ + if (RDG_MEM_WRITE_STMT (rdg, i)) + fprintf (file, "%d [style=filled, fillcolor=red]\n", i); + + if (v->succ) + for (e = v->succ; e; e = e->succ_next) + switch (RDGE_TYPE (e)) + { + case input_dd: + fprintf (file, "%d -> %d [label=input] \n", i, e->dest); + break; + + case output_dd: + fprintf (file, "%d -> %d [label=output] \n", i, e->dest); + break; + + case flow_dd: + /* These are the most common dependences: don't print these. */ + fprintf (file, "%d -> %d \n", i, e->dest); + break; + + case anti_dd: + fprintf (file, "%d -> %d [label=anti] \n", i, e->dest); + break; + + default: + gcc_unreachable (); + } } - else + + fprintf (file, "}\n\n"); +} + +/* Display SCOP using dotty. */ + +void +dot_rdg (struct graph *rdg) +{ + FILE *file = fopen ("/tmp/rdg.dot", "w"); + gcc_assert (file != NULL); + + dot_rdg_1 (file, rdg); + fclose (file); + + system ("dotty /tmp/rdg.dot"); +} + + +/* This structure is used for recording the mapping statement index in + the RDG. */ + +struct rdg_vertex_info GTY(()) +{ + tree stmt; + int index; +}; + +/* Returns the index of STMT in RDG. 
*/ + +int +rdg_vertex_for_stmt (struct graph *rdg, tree stmt) +{ + struct rdg_vertex_info rvi, *slot; + + rvi.stmt = stmt; + slot = (struct rdg_vertex_info *) htab_find (rdg->indices, &rvi); + + if (!slot) + return -1; + + return slot->index; +} + +/* Creates an edge in RDG for each distance vector from DDR. The + order that we keep track of in the RDG is the order in which + statements have to be executed. */ + +static void +create_rdg_edge_for_ddr (struct graph *rdg, ddr_p ddr) +{ + struct graph_edge *e; + int va, vb; + data_reference_p dra = DDR_A (ddr); + data_reference_p drb = DDR_B (ddr); + unsigned level = ddr_dependence_level (ddr); + + /* For non scalar dependences, when the dependence is REVERSED, + statement B has to be executed before statement A. */ + if (level > 0 + && !DDR_REVERSED_P (ddr)) { - dra = DDR_A (ddr); - drb = DDR_B (ddr); + data_reference_p tmp = dra; + dra = drb; + drb = tmp; } - va = find_vertex_for_stmt (rdg, DR_STMT (dra)); - vb = find_vertex_for_stmt (rdg, DR_STMT (drb)); + va = rdg_vertex_for_stmt (rdg, DR_STMT (dra)); + vb = rdg_vertex_for_stmt (rdg, DR_STMT (drb)); + + if (va < 0 || vb < 0) + return; e = add_edge (rdg, va, vb); e->data = XNEW (struct rdg_edge); + RDGE_LEVEL (e) = level; + /* Determines the type of the data dependence. 
*/ if (DR_IS_READ (dra) && DR_IS_READ (drb)) RDGE_TYPE (e) = input_dd; @@ -4435,9 +4601,13 @@ create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef) FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def) { - int use = find_vertex_for_stmt (rdg, USE_STMT (imm_use_p)); - struct graph_edge *e = add_edge (rdg, idef, use); + struct graph_edge *e; + int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p)); + if (use < 0) + continue; + + e = add_edge (rdg, idef, use); e->data = XNEW (struct rdg_edge); RDGE_TYPE (e) = flow_dd; } @@ -4458,8 +4628,8 @@ create_rdg_edges (struct graph *rdg, VEC (ddr_p, heap) *ddrs) create_rdg_edge_for_ddr (rdg, ddr); for (i = 0; i < rdg->n_vertices; i++) - FOR_EACH_PHI_OR_STMT_DEF (def_p, RDGV_STMT (&(rdg->vertices[i])), - iter, SSA_OP_ALL_DEFS) + FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i), + iter, SSA_OP_DEF) create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i); } @@ -4468,19 +4638,50 @@ create_rdg_edges (struct graph *rdg, VEC (ddr_p, heap) *ddrs) static void create_rdg_vertices (struct graph *rdg, VEC (tree, heap) *stmts) { - int i; - tree s; + int i, j; + tree stmt; - for (i = 0; VEC_iterate (tree, stmts, i, s); i++) + for (i = 0; VEC_iterate (tree, stmts, i, stmt); i++) { + VEC (data_ref_loc, heap) *references; + data_ref_loc *ref; struct vertex *v = &(rdg->vertices[i]); + struct rdg_vertex_info *rvi = XNEW (struct rdg_vertex_info); + struct rdg_vertex_info **slot; + + rvi->stmt = stmt; + rvi->index = i; + slot = (struct rdg_vertex_info **) htab_find_slot (rdg->indices, rvi, INSERT); + + if (!*slot) + *slot = rvi; + else + free (rvi); v->data = XNEW (struct rdg_vertex); - RDGV_STMT (v) = s; + RDG_STMT (rdg, i) = stmt; + + RDG_MEM_WRITE_STMT (rdg, i) = false; + RDG_MEM_READS_STMT (rdg, i) = false; + if (TREE_CODE (stmt) == PHI_NODE) + continue; + + get_references_in_stmt (stmt, &references); + for (j = 0; VEC_iterate (data_ref_loc, references, j, ref); j++) + if (!ref->is_read) + RDG_MEM_WRITE_STMT (rdg, i) = true; + 
else + RDG_MEM_READS_STMT (rdg, i) = true; + + VEC_free (data_ref_loc, heap, references); } } -/* Initialize STMTS with all the statements and PHI nodes of LOOP. */ +/* Initialize STMTS with all the statements of LOOP. When + INCLUDE_PHIS is true, include also the PHI nodes. The order in + which we discover statements is important as + generate_loops_for_partition is using the same traversal for + identifying statements. */ static void stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts) @@ -4490,7 +4691,7 @@ stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts) for (i = 0; i < loop->num_nodes; i++) { - tree phi; + tree phi, stmt; basic_block bb = bbs[i]; block_stmt_iterator bsi; @@ -4498,7 +4699,8 @@ stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts) VEC_safe_push (tree, heap, *stmts, phi); for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi)) - VEC_safe_push (tree, heap, *stmts, bsi_stmt (bsi)); + if (TREE_CODE (stmt = bsi_stmt (bsi)) != LABEL_EXPR) + VEC_safe_push (tree, heap, *stmts, stmt); } free (bbs); @@ -4519,8 +4721,39 @@ known_dependences_p (VEC (ddr_p, heap) *dependence_relations) return true; } -/* Build a Reduced Dependence Graph with one vertex per statement of the - loop nest and one edge per data dependence or scalar dependence. */ +/* Computes a hash function for element ELT. */ + +static hashval_t +hash_stmt_vertex_info (const void *elt) +{ + struct rdg_vertex_info *rvi = (struct rdg_vertex_info *) elt; + tree stmt = rvi->stmt; + + return htab_hash_pointer (stmt); +} + +/* Compares database elements E1 and E2. */ + +static int +eq_stmt_vertex_info (const void *e1, const void *e2) +{ + const struct rdg_vertex_info *elt1 = (const struct rdg_vertex_info *) e1; + const struct rdg_vertex_info *elt2 = (const struct rdg_vertex_info *) e2; + + return elt1->stmt == elt2->stmt; +} + +/* Free the element E. 
*/ + +static void +hash_stmt_vertex_del (void *e) +{ + free (e); +} + +/* Build the Reduced Dependence Graph (RDG) with one vertex per + statement of the loop nest, and one edge per data dependence or + scalar dependence. */ struct graph * build_rdg (struct loop *loop) @@ -4529,7 +4762,7 @@ build_rdg (struct loop *loop) struct graph *rdg = NULL; VEC (ddr_p, heap) *dependence_relations; VEC (data_reference_p, heap) *datarefs; - VEC (tree, heap) *stmts = VEC_alloc (tree, heap, 10); + VEC (tree, heap) *stmts = VEC_alloc (tree, heap, nb_data_refs); dependence_relations = VEC_alloc (ddr_p, heap, nb_data_refs * nb_data_refs) ; datarefs = VEC_alloc (data_reference_p, heap, nb_data_refs); @@ -4537,12 +4770,15 @@ build_rdg (struct loop *loop) false, &datarefs, &dependence_relations); - + if (!known_dependences_p (dependence_relations)) goto end_rdg; stmts_from_loop (loop, &stmts); rdg = new_graph (VEC_length (tree, stmts)); + + rdg->indices = htab_create (nb_data_refs, hash_stmt_vertex_info, + eq_stmt_vertex_info, hash_stmt_vertex_del); create_rdg_vertices (rdg, stmts); create_rdg_edges (rdg, dependence_relations); @@ -4553,3 +4789,197 @@ build_rdg (struct loop *loop) return rdg; } + +/* Free the reduced dependence graph RDG. */ + +void +free_rdg (struct graph *rdg) +{ + int i; + + for (i = 0; i < rdg->n_vertices; i++) + free (rdg->vertices[i].data); + + htab_delete (rdg->indices); + free_graph (rdg); +} + +/* Initialize STMTS with all the statements of LOOP that contain a + store to memory. 
*/ + +void +stores_from_loop (struct loop *loop, VEC (tree, heap) **stmts) +{ + unsigned int i; + basic_block *bbs = get_loop_body_in_dom_order (loop); + + for (i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + block_stmt_iterator bsi; + + for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi)) + if (!ZERO_SSA_OPERANDS (bsi_stmt (bsi), SSA_OP_VDEF)) + VEC_safe_push (tree, heap, *stmts, bsi_stmt (bsi)); + } + + free (bbs); +} + +/* For a data reference REF, return the declaration of its base + address or NULL_TREE if the base is not determined. */ + +static inline tree +ref_base_address (tree stmt, data_ref_loc *ref) +{ + tree base = NULL_TREE; + tree base_address; + struct data_reference *dr = XCNEW (struct data_reference); + + DR_STMT (dr) = stmt; + DR_REF (dr) = *ref->pos; + dr_analyze_innermost (dr); + base_address = DR_BASE_ADDRESS (dr); + + if (!base_address) + goto end; + + switch (TREE_CODE (base_address)) + { + case ADDR_EXPR: + base = TREE_OPERAND (base_address, 0); + break; + + default: + base = base_address; + break; + } + + end: + free_data_ref (dr); + return base; +} + +/* Determines whether the statement from vertex V of the RDG has a + definition used outside the loop that contains this statement. */ + +bool +rdg_defs_used_in_other_loops_p (struct graph *rdg, int v) +{ + tree stmt = RDG_STMT (rdg, v); + struct loop *loop = loop_containing_stmt (stmt); + use_operand_p imm_use_p; + imm_use_iterator iterator; + ssa_op_iter it; + def_operand_p def_p; + + if (!loop) + return true; + + FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, it, SSA_OP_DEF) + { + FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, DEF_FROM_PTR (def_p)) + { + if (loop_containing_stmt (USE_STMT (imm_use_p)) != loop) + return true; + } + } + + return false; +} + +/* Determines whether statements S1 and S2 access to similar memory + locations. Two memory accesses are considered similar when they + have the same base address declaration, i.e. 
when their + ref_base_address is the same. */ + +bool +have_similar_memory_accesses (tree s1, tree s2) +{ + bool res = false; + unsigned i, j; + VEC (data_ref_loc, heap) *refs1, *refs2; + data_ref_loc *ref1, *ref2; + + get_references_in_stmt (s1, &refs1); + get_references_in_stmt (s2, &refs2); + + for (i = 0; VEC_iterate (data_ref_loc, refs1, i, ref1); i++) + { + tree base1 = ref_base_address (s1, ref1); + + if (base1) + for (j = 0; VEC_iterate (data_ref_loc, refs2, j, ref2); j++) + if (base1 == ref_base_address (s2, ref2)) + { + res = true; + goto end; + } + } + + end: + VEC_free (data_ref_loc, heap, refs1); + VEC_free (data_ref_loc, heap, refs2); + return res; +} + +/* Helper function for the hashtab. */ + +static int +have_similar_memory_accesses_1 (const void *s1, const void *s2) +{ + return have_similar_memory_accesses ((tree) s1, (tree) s2); +} + +/* Helper function for the hashtab. */ + +static hashval_t +ref_base_address_1 (const void *s) +{ + tree stmt = (tree) s; + unsigned i; + VEC (data_ref_loc, heap) *refs; + data_ref_loc *ref; + hashval_t res = 0; + + get_references_in_stmt (stmt, &refs); + + for (i = 0; VEC_iterate (data_ref_loc, refs, i, ref); i++) + if (!ref->is_read) + { + res = htab_hash_pointer (ref_base_address (stmt, ref)); + break; + } + + VEC_free (data_ref_loc, heap, refs); + return res; +} + +/* Try to remove duplicated write data references from STMTS. 
*/ + +void +remove_similar_memory_refs (VEC (tree, heap) **stmts) +{ + unsigned i; + tree stmt; + htab_t seen = htab_create (VEC_length (tree, *stmts), ref_base_address_1, + have_similar_memory_accesses_1, NULL); + + for (i = 0; VEC_iterate (tree, *stmts, i, stmt); ) + { + void **slot; + + slot = htab_find_slot (seen, stmt, INSERT); + + if (*slot) + VEC_ordered_remove (tree, *stmts, i); + else + { + *slot = (void *) stmt; + i++; + } + } + + htab_delete (seen); +} + diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index 2ae58266db6..b24fd63095f 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -24,6 +24,7 @@ along with GCC; see the file COPYING3. If not see #include "graphds.h" #include "lambda.h" #include "omega.h" +#include "tree-chrec.h" /* innermost_loop_behavior describes the evolution of the address of the memory @@ -38,6 +39,7 @@ along with GCC; see the file COPYING3. If not see Example 1 Example 2 data-ref a[j].b[i][j] *(p + x + 16B + 4B * j) + innermost_loop_behavior base_address &a p offset i * D_i x @@ -319,26 +321,107 @@ extern void debug_data_dependence_relation (struct data_dependence_relation *); extern void dump_data_dependence_relation (FILE *, struct data_dependence_relation *); extern void dump_data_dependence_relations (FILE *, VEC (ddr_p, heap) *); +extern void debug_data_dependence_relations (VEC (ddr_p, heap) *); extern void dump_data_dependence_direction (FILE *, enum data_dependence_direction); extern void free_dependence_relation (struct data_dependence_relation *); extern void free_dependence_relations (VEC (ddr_p, heap) *); +extern void free_data_ref (data_reference_p); extern void free_data_refs (VEC (data_reference_p, heap) *); struct data_reference *create_data_ref (struct loop *, tree, tree, bool); bool find_loop_nest (struct loop *, VEC (loop_p, heap) **); void compute_all_dependences (VEC (data_reference_p, heap) *, VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool); +/* Return true when the DDR contains two data 
references that have the + same access functions. */ + +static inline bool +same_access_functions (const struct data_dependence_relation *ddr) +{ + unsigned i; + + for (i = 0; i < DDR_NUM_SUBSCRIPTS (ddr); i++) + if (!eq_evolutions_p (DR_ACCESS_FN (DDR_A (ddr), i), + DR_ACCESS_FN (DDR_B (ddr), i))) + return false; + + return true; +} + +/* Return true when DDR is an anti-dependence relation. */ + +static inline bool +ddr_is_anti_dependent (ddr_p ddr) +{ + return (DDR_ARE_DEPENDENT (ddr) == NULL_TREE + && DR_IS_READ (DDR_A (ddr)) + && !DR_IS_READ (DDR_B (ddr)) + && !same_access_functions (ddr)); +} + +/* Return true when DEPENDENCE_RELATIONS contains an anti-dependence. */ + +static inline bool +ddrs_have_anti_deps (VEC (ddr_p, heap) *dependence_relations) +{ + unsigned i; + ddr_p ddr; + + for (i = 0; VEC_iterate (ddr_p, dependence_relations, i, ddr); i++) + if (ddr_is_anti_dependent (ddr)) + return true; + + return false; +} + +/* Return the dependence level for the DDR relation. */ + +static inline unsigned +ddr_dependence_level (ddr_p ddr) +{ + unsigned vector; + unsigned level = 0; + + if (DDR_DIST_VECTS (ddr)) + level = dependence_level (DDR_DIST_VECT (ddr, 0), DDR_NB_LOOPS (ddr)); + + for (vector = 1; vector < DDR_NUM_DIST_VECTS (ddr); vector++) + level = MIN (level, dependence_level (DDR_DIST_VECT (ddr, vector), + DDR_NB_LOOPS (ddr))); + return level; +} + -/* A RDG vertex representing a statement. */ +/* A Reduced Dependence Graph (RDG) vertex representing a statement. */ typedef struct rdg_vertex { /* The statement represented by this vertex. */ tree stmt; + + /* True when the statement contains a write to memory. */ + bool has_mem_write; + + /* True when the statement contains a read from memory. 
*/ + bool has_mem_reads; } *rdg_vertex_p; -#define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt +#define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt +#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write +#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads +#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I])) +#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I])) +#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I])) + +void dump_rdg_vertex (FILE *, struct graph *, int); +void debug_rdg_vertex (struct graph *, int); +void dump_rdg_component (FILE *, struct graph *, int, bitmap); +void debug_rdg_component (struct graph *, int); +void dump_rdg (FILE *, struct graph *); +void debug_rdg (struct graph *); +void dot_rdg (struct graph *); +int rdg_vertex_for_stmt (struct graph *, tree); /* Data dependence type. */ @@ -363,11 +446,17 @@ typedef struct rdg_edge { /* Type of the dependence. */ enum rdg_dep_type type; + + /* Levels of the dependence: the depth of the loops that + carry the dependence. */ + unsigned level; } *rdg_edge_p; #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type +#define RDGE_LEVEL(E) ((struct rdg_edge *) ((E)->data))->level struct graph *build_rdg (struct loop *); +void free_rdg (struct graph *); /* Return the index of the variable VAR in the LOOP_NEST array. */ @@ -385,6 +474,21 @@ index_in_loop_nest (int var, VEC (loop_p, heap) *loop_nest) return var_index; } +void stores_from_loop (struct loop *, VEC (tree, heap) **); +void remove_similar_memory_refs (VEC (tree, heap) **); +bool rdg_defs_used_in_other_loops_p (struct graph *, int); +bool have_similar_memory_accesses (tree, tree); + +/* Determines whether RDG vertices V1 and V2 access to similar memory + locations, in which case they have to be in the same partition. 
*/ + +static inline bool +rdg_has_similar_memory_accesses (struct graph *rdg, int v1, int v2) +{ + return have_similar_memory_accesses (RDG_STMT (rdg, v1), + RDG_STMT (rdg, v2)); +} + /* In lambda-code.c */ bool lambda_transform_legal_p (lambda_trans_matrix, int, VEC (ddr_p, heap) *); diff --git a/gcc/tree-flow.h b/gcc/tree-flow.h index 286c60bce20..555d3a32a57 100644 --- a/gcc/tree-flow.h +++ b/gcc/tree-flow.h @@ -792,6 +792,7 @@ extern void end_recording_case_labels (void); extern basic_block move_sese_region_to_fn (struct function *, basic_block, basic_block); void remove_edge_and_dominated_blocks (edge); +void mark_virtual_ops_in_bb (basic_block); /* In tree-cfgcleanup.c */ extern bitmap cfgcleanup_altered_bbs; @@ -1022,6 +1023,8 @@ bool tree_duplicate_loop_to_header_edge (struct loop *, edge, unsigned int, sbitmap, edge, VEC (edge, heap) **, int); +struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); +void rename_variables_in_loop (struct loop *); struct loop *tree_ssa_loop_version (struct loop *, tree, basic_block *); tree expand_simple_operations (tree); diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c new file mode 100644 index 00000000000..2d4a5d69373 --- /dev/null +++ b/gcc/tree-loop-distribution.c @@ -0,0 +1,1173 @@ +/* Loop distribution. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. + Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> + and Sebastian Pop <sebastian.pop@amd.com>. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +GCC is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* This pass performs loop distribution: for example, the loop + + |DO I = 2, N + | A(I) = B(I) + C + | D(I) = A(I-1)*E + |ENDDO + + is transformed to + + |DOALL I = 2, N + | A(I) = B(I) + C + |ENDDO + | + |DOALL I = 2, N + | D(I) = A(I-1)*E + |ENDDO + + This pass uses an RDG, Reduced Dependence Graph built on top of the + data dependence relations. The RDG is then topologically sorted to + obtain a map of information producers/consumers based on which it + generates the new loops. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "ggc.h" +#include "tree.h" +#include "target.h" + +#include "rtl.h" +#include "basic-block.h" +#include "diagnostic.h" +#include "tree-flow.h" +#include "tree-dump.h" +#include "timevar.h" +#include "cfgloop.h" +#include "expr.h" +#include "optabs.h" +#include "tree-chrec.h" +#include "tree-data-ref.h" +#include "tree-scalar-evolution.h" +#include "tree-pass.h" +#include "lambda.h" +#include "langhooks.h" +#include "tree-vectorizer.h" + +/* If bit I is not set, it means that this node represents an + operation that has already been performed, and that should not be + performed again. This is the subgraph of remaining important + computations that is passed to the DFS algorithm for avoiding to + include several times the same stores in different loops. */ +static bitmap remaining_stmts; + +/* A node of the RDG is marked in this bitmap when it has as a + predecessor a node that writes to memory. */ +static bitmap upstream_mem_writes; + +/* Update the PHI nodes of NEW_LOOP. NEW_LOOP is a duplicate of + ORIG_LOOP. 
*/ + +static void +update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop) +{ + tree new_ssa_name; + tree phi_new, phi_orig; + edge orig_loop_latch = loop_latch_edge (orig_loop); + edge orig_entry_e = loop_preheader_edge (orig_loop); + edge new_loop_entry_e = loop_preheader_edge (new_loop); + + /* Scan the phis in the headers of the old and new loops + (they are organized in exactly the same order). */ + + for (phi_new = phi_nodes (new_loop->header), + phi_orig = phi_nodes (orig_loop->header); + phi_new && phi_orig; + phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig)) + { + /* Add the first phi argument for the phi in NEW_LOOP (the one + associated with the entry of NEW_LOOP) */ + tree def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e); + add_phi_arg (phi_new, def, new_loop_entry_e); + + /* Add the second phi argument for the phi in NEW_LOOP (the one + associated with the latch of NEW_LOOP) */ + def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch); + + if (TREE_CODE (def) == SSA_NAME) + { + new_ssa_name = get_current_def (def); + + if (!new_ssa_name) + /* This only happens if there are no definitions inside the + loop. Use the phi_result in this case. */ + new_ssa_name = PHI_RESULT (phi_new); + } + else + /* Could be an integer. */ + new_ssa_name = def; + + add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop)); + } +} + +/* Return a copy of LOOP placed before LOOP. */ + +static struct loop * +copy_loop_before (struct loop *loop) +{ + struct loop *res; + edge preheader = loop_preheader_edge (loop); + + if (!single_exit (loop)) + return NULL; + + initialize_original_copy_tables (); + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader); + free_original_copy_tables (); + + if (!res) + return NULL; + + update_phis_for_loop_copy (loop, res); + rename_variables_in_loop (res); + + return res; +} + +/* Creates an empty basic block after LOOP. 
*/ + +static void +create_bb_after_loop (struct loop *loop) +{ + edge exit = single_exit (loop); + + if (!exit) + return; + + split_edge (exit); +} + +/* Generate code for PARTITION from the code in LOOP. The loop is + copied when COPY_P is true. All the statements not flagged in the + PARTITION bitmap are removed from the loop or from its copy. The + statements are indexed in sequence inside a basic block, and the + basic blocks of a loop are taken in dom order. Returns true when + the code gen succeeded. */ + +static bool +generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p) +{ + unsigned i, x; + block_stmt_iterator bsi; + basic_block *bbs; + + if (copy_p) + { + loop = copy_loop_before (loop); + create_preheader (loop, CP_SIMPLE_PREHEADERS); + create_bb_after_loop (loop); + } + + if (loop == NULL) + return false; + + /* Remove stmts not in the PARTITION bitmap. The order in which we + visit the phi nodes and the statements is exactly as in + stmts_from_loop. */ + bbs = get_loop_body_in_dom_order (loop); + + for (x = 0, i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + tree phi, prev = NULL_TREE, next; + + for (phi = phi_nodes (bb); phi;) + if (!bitmap_bit_p (partition, x++)) + { + next = PHI_CHAIN (phi); + remove_phi_node (phi, prev, true); + phi = next; + } + else + { + prev = phi; + phi = PHI_CHAIN (phi); + } + + for (bsi = bsi_start (bb); !bsi_end_p (bsi);) + if (TREE_CODE (bsi_stmt (bsi)) != LABEL_EXPR + && !bitmap_bit_p (partition, x++)) + bsi_remove (&bsi, false); + else + bsi_next (&bsi); + + mark_virtual_ops_in_bb (bb); + } + + free (bbs); + return true; +} + +/* Generate a call to memset. Return true when the operation succeeded. 
*/ + +static bool +generate_memset_zero (tree stmt, tree op0, tree nb_iter, + block_stmt_iterator bsi) +{ + tree s, t, stmts, nb_bytes, addr_base; + bool res = false; + tree stmt_list = NULL_TREE; + tree args [3]; + tree fn_call, mem, fndecl, fntype, fn; + tree_stmt_iterator i; + ssa_op_iter iter; + struct data_reference *dr = XCNEW (struct data_reference); + + nb_bytes = fold_build2 (MULT_EXPR, TREE_TYPE (nb_iter), + nb_iter, TYPE_SIZE_UNIT (TREE_TYPE (op0))); + nb_bytes = force_gimple_operand (nb_bytes, &stmts, true, NULL); + append_to_statement_list_force (stmts, &stmt_list); + + DR_STMT (dr) = stmt; + DR_REF (dr) = op0; + dr_analyze_innermost (dr); + + /* Test for a positive stride, iterating over every element. */ + if (integer_zerop (fold_build2 (MINUS_EXPR, integer_type_node, DR_STEP (dr), + TYPE_SIZE_UNIT (TREE_TYPE (op0))))) + addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_BASE_ADDRESS (dr)), + DR_BASE_ADDRESS (dr), + size_binop (PLUS_EXPR, + DR_OFFSET (dr), DR_INIT (dr))); + + /* Test for a negative stride, iterating over every element. 
*/ + else if (integer_zerop (fold_build2 (PLUS_EXPR, integer_type_node, + TYPE_SIZE_UNIT (TREE_TYPE (op0)), + DR_STEP (dr)))) + { + addr_base = size_binop (PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr)); + addr_base = fold_build2 (MINUS_EXPR, sizetype, addr_base, nb_bytes); + addr_base = force_gimple_operand (addr_base, &stmts, true, NULL); + append_to_statement_list_force (stmts, &stmt_list); + + addr_base = fold_build2 (POINTER_PLUS_EXPR, + TREE_TYPE (DR_BASE_ADDRESS (dr)), + DR_BASE_ADDRESS (dr), addr_base); + } + else + goto end; + + mem = force_gimple_operand (addr_base, &stmts, true, NULL); + append_to_statement_list_force (stmts, &stmt_list); + + + fndecl = implicit_built_in_decls [BUILT_IN_MEMSET]; + fntype = TREE_TYPE (fndecl); + fn = build1 (ADDR_EXPR, build_pointer_type (fntype), fndecl); + + args[0] = mem; + args[1] = integer_zero_node; + args[2] = nb_bytes; + + fn_call = build_call_array (fntype, fn, 3, args); + append_to_statement_list_force (fn_call, &stmt_list); + + for (i = tsi_start (stmt_list); !tsi_end_p (i); tsi_next (&i)) + { + s = tsi_stmt (i); + update_stmt_if_modified (s); + + FOR_EACH_SSA_TREE_OPERAND (t, s, iter, SSA_OP_VIRTUAL_DEFS) + { + if (TREE_CODE (t) == SSA_NAME) + t = SSA_NAME_VAR (t); + mark_sym_for_renaming (t); + } + } + + /* Mark also the uses of the VDEFS of STMT to be renamed. */ + FOR_EACH_SSA_TREE_OPERAND (t, stmt, iter, SSA_OP_VIRTUAL_DEFS) + { + if (TREE_CODE (t) == SSA_NAME) + { + imm_use_iterator imm_iter; + + FOR_EACH_IMM_USE_STMT (s, imm_iter, t) + update_stmt (s); + + t = SSA_NAME_VAR (t); + } + mark_sym_for_renaming (t); + } + + bsi_insert_after (&bsi, stmt_list, BSI_CONTINUE_LINKING); + res = true; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "generated memset zero\n"); + + end: + free_data_ref (dr); + return res; +} + +/* Tries to generate a builtin function for the instructions of LOOP + pointed to by the bits set in PARTITION. Returns true when the + operation succeeded. 
*/ + +static bool +generate_builtin (struct loop *loop, bitmap partition, bool copy_p) +{ + bool res = false; + unsigned i, x = 0; + basic_block *bbs; + tree write = NULL_TREE; + tree op0, op1; + block_stmt_iterator bsi; + tree nb_iter = number_of_exit_cond_executions (loop); + + if (!nb_iter || nb_iter == chrec_dont_know) + return false; + + bbs = get_loop_body_in_dom_order (loop); + + for (i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + tree phi; + + for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) + x++; + + for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi)) + { + tree stmt = bsi_stmt (bsi); + + if (bitmap_bit_p (partition, x++) + && TREE_CODE (stmt) == GIMPLE_MODIFY_STMT + && !is_gimple_reg (GIMPLE_STMT_OPERAND (stmt, 0))) + { + /* Don't generate the builtins when there are more than + one memory write. */ + if (write != NULL) + goto end; + + write = stmt; + } + } + } + + if (!write) + goto end; + + op0 = GIMPLE_STMT_OPERAND (write, 0); + op1 = GIMPLE_STMT_OPERAND (write, 1); + + if (!(TREE_CODE (op0) == ARRAY_REF + || TREE_CODE (op0) == INDIRECT_REF)) + goto end; + + /* The new statements will be placed before LOOP. */ + bsi = bsi_last (loop_preheader_edge (loop)->src); + + if (integer_zerop (op1) || real_zerop (op1)) + res = generate_memset_zero (write, op0, nb_iter, bsi); + + /* If this is the last partition for which we generate code, we have + to destroy the loop. */ + if (res && !copy_p) + { + unsigned nbbs = loop->num_nodes; + basic_block src = loop_preheader_edge (loop)->src; + basic_block dest = single_exit (loop)->dest; + make_edge (src, dest, EDGE_FALLTHRU); + set_immediate_dominator (CDI_DOMINATORS, dest, src); + cancel_loop_tree (loop); + + for (i = 0; i < nbbs; i++) + delete_basic_block (bbs[i]); + } + + end: + free (bbs); + return res; +} + +/* Generates code for PARTITION. For simple loops, this function can + generate a built-in. 
*/ + +static bool +generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p) +{ + if (generate_builtin (loop, partition, copy_p)) + return true; + + return generate_loops_for_partition (loop, partition, copy_p); +} + + +/* Returns true if the node V of RDG cannot be recomputed. */ + +static bool +rdg_cannot_recompute_vertex_p (struct graph *rdg, int v) +{ + if (RDG_MEM_WRITE_STMT (rdg, v)) + return true; + + return false; +} + +/* Returns true when the vertex V has already been generated in the + current partition (V is in PROCESSED), or when V belongs to another + partition and cannot be recomputed (V is not in REMAINING_STMTS). */ + +static inline bool +already_processed_vertex_p (bitmap processed, int v) +{ + return (bitmap_bit_p (processed, v) + || !bitmap_bit_p (remaining_stmts, v)); +} + +/* Returns NULL when there is no anti-dependence among the successors + of vertex V, otherwise returns the edge with the anti-dep. */ + +static struct graph_edge * +has_anti_dependence (struct vertex *v) +{ + struct graph_edge *e; + + if (v->succ) + for (e = v->succ; e; e = e->succ_next) + if (RDGE_TYPE (e) == anti_dd) + return e; + + return NULL; +} + +/* Returns true when V has an anti-dependence edge among its successors. */ + +static bool +predecessor_has_mem_write (struct graph *rdg, struct vertex *v) +{ + struct graph_edge *e; + + if (v->pred) + for (e = v->pred; e; e = e->pred_next) + if (bitmap_bit_p (upstream_mem_writes, e->src) + /* Don't consider flow channels: a write to memory followed + by a read from memory. These channels allow the split of + the RDG in different partitions. */ + && !RDG_MEM_WRITE_STMT (rdg, e->src)) + return true; + + return false; +} + +/* Initializes the upstream_mem_writes bitmap following the + information from RDG. 
*/ + +static void +mark_nodes_having_upstream_mem_writes (struct graph *rdg) +{ + int v, x; + bitmap seen = BITMAP_ALLOC (NULL); + + for (v = rdg->n_vertices - 1; v >= 0; v--) + if (!bitmap_bit_p (seen, v)) + { + unsigned i; + VEC (int, heap) *nodes = VEC_alloc (int, heap, 3); + bool has_upstream_mem_write_p = false; + + graphds_dfs (rdg, &v, 1, &nodes, false, NULL); + + for (i = 0; VEC_iterate (int, nodes, i, x); i++) + { + if (bitmap_bit_p (seen, x)) + continue; + + bitmap_set_bit (seen, x); + + if (RDG_MEM_WRITE_STMT (rdg, x) + || predecessor_has_mem_write (rdg, &(rdg->vertices[x])) + /* In anti dependences the read should occur before + the write, this is why both the read and the write + should be placed in the same partition. */ + || has_anti_dependence (&(rdg->vertices[x]))) + { + has_upstream_mem_write_p = true; + bitmap_set_bit (upstream_mem_writes, x); + } + } + + VEC_free (int, heap, nodes); + } +} + +/* Returns true when vertex u has a memory write node as a predecessor + in RDG. */ + +static bool +has_upstream_mem_writes (int u) +{ + return bitmap_bit_p (upstream_mem_writes, u); +} + +static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap, + bitmap, bool *); + +/* Flag all the uses of U. */ + +static void +rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops, + bitmap processed, bool *part_has_writes) +{ + struct graph_edge *e; + + for (e = rdg->vertices[u].succ; e; e = e->succ_next) + if (!bitmap_bit_p (processed, e->dest)) + { + rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops, + processed, part_has_writes); + rdg_flag_all_uses (rdg, e->dest, partition, loops, processed, + part_has_writes); + } +} + +/* Flag the uses of U stopping following the information from + upstream_mem_writes. 
*/ + +static void +rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops, + bitmap processed, bool *part_has_writes) +{ + ssa_op_iter iter; + use_operand_p use_p; + struct vertex *x = &(rdg->vertices[u]); + tree stmt = RDGV_STMT (x); + struct graph_edge *anti_dep = has_anti_dependence (x); + + /* Keep in the same partition the destination of an antidependence, + because this is a store to the exact same location. Putting this + in another partition is bad for cache locality. */ + if (anti_dep) + { + int v = anti_dep->dest; + + if (!already_processed_vertex_p (processed, v)) + rdg_flag_vertex_and_dependent (rdg, v, partition, loops, + processed, part_has_writes); + } + + if (TREE_CODE (stmt) != PHI_NODE) + { + FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VIRTUAL_USES) + { + tree use = USE_FROM_PTR (use_p); + + if (TREE_CODE (use) == SSA_NAME) + { + tree def_stmt = SSA_NAME_DEF_STMT (use); + int v = rdg_vertex_for_stmt (rdg, def_stmt); + + if (v >= 0 + && !already_processed_vertex_p (processed, v)) + rdg_flag_vertex_and_dependent (rdg, v, partition, loops, + processed, part_has_writes); + } + } + } + + if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT + && has_upstream_mem_writes (u)) + { + tree op0 = GIMPLE_STMT_OPERAND (stmt, 0); + + /* Scalar channels don't have enough space for transmitting data + between tasks, unless we add more storage by privatizing. */ + if (is_gimple_reg (op0)) + { + use_operand_p use_p; + imm_use_iterator iter; + + FOR_EACH_IMM_USE_FAST (use_p, iter, op0) + { + int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p)); + + if (!already_processed_vertex_p (processed, v)) + rdg_flag_vertex_and_dependent (rdg, v, partition, loops, + processed, part_has_writes); + } + } + } +} + +/* Flag V from RDG as part of PARTITION, and also flag its loop number + in LOOPS. 
*/ + +static void +rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops, + bool *part_has_writes) +{ + struct loop *loop; + + if (bitmap_bit_p (partition, v)) + return; + + loop = loop_containing_stmt (RDG_STMT (rdg, v)); + bitmap_set_bit (loops, loop->num); + bitmap_set_bit (partition, v); + + if (rdg_cannot_recompute_vertex_p (rdg, v)) + { + *part_has_writes = true; + bitmap_clear_bit (remaining_stmts, v); + } +} + +/* Flag in the bitmap PARTITION the vertex V and all its predecessors. + Alse flag their loop number in LOOPS. */ + +static void +rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition, + bitmap loops, bitmap processed, + bool *part_has_writes) +{ + unsigned i; + VEC (int, heap) *nodes = VEC_alloc (int, heap, 3); + int x; + + bitmap_set_bit (processed, v); + rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes); + graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts); + rdg_flag_vertex (rdg, v, partition, loops, part_has_writes); + + for (i = 0; VEC_iterate (int, nodes, i, x); i++) + if (!already_processed_vertex_p (processed, x)) + rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed, + part_has_writes); + + VEC_free (int, heap, nodes); +} + +/* Initialize CONDS with all the condition statements from the basic + blocks of LOOP. */ + +static void +collect_condition_stmts (struct loop *loop, VEC (tree, heap) **conds) +{ + unsigned i; + edge e; + VEC (edge, heap) *exits = get_loop_exit_edges (loop); + + for (i = 0; VEC_iterate (edge, exits, i, e); i++) + { + tree cond = last_stmt (e->src); + + if (cond) + VEC_safe_push (tree, heap, *conds, cond); + } + + VEC_free (edge, heap, exits); +} + +/* Add to PARTITION all the exit condition statements for LOOPS + together with all their dependent statements determined from + RDG. 
*/ + +static void +rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition, + bitmap processed, bool *part_has_writes) +{ + unsigned i; + bitmap_iterator bi; + VEC (tree, heap) *conds = VEC_alloc (tree, heap, 3); + + EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi) + collect_condition_stmts (get_loop (i), &conds); + + while (!VEC_empty (tree, conds)) + { + tree cond = VEC_pop (tree, conds); + int v = rdg_vertex_for_stmt (rdg, cond); + bitmap new_loops = BITMAP_ALLOC (NULL); + + if (!already_processed_vertex_p (processed, v)) + rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed, + part_has_writes); + + EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi) + if (!bitmap_bit_p (loops, i)) + { + bitmap_set_bit (loops, i); + collect_condition_stmts (get_loop (i), &conds); + } + + BITMAP_FREE (new_loops); + } +} + +/* Strongly connected components of the reduced data dependence graph. */ + +typedef struct rdg_component +{ + int num; + VEC (int, heap) *vertices; +} *rdgc; + +DEF_VEC_P (rdgc); +DEF_VEC_ALLOC_P (rdgc, heap); + +/* Flag all the nodes of RDG containing memory accesses that could + potentially belong to arrays already accessed in the current + PARTITION. */ + +static void +rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition, + bitmap loops, bitmap processed, + VEC (int, heap) **other_stores) +{ + bool foo; + unsigned i, n; + int j, k, kk; + bitmap_iterator ii; + struct graph_edge *e; + + EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii) + if (RDG_MEM_WRITE_STMT (rdg, i) + || RDG_MEM_READS_STMT (rdg, i)) + { + for (j = 0; j < rdg->n_vertices; j++) + if (!bitmap_bit_p (processed, j) + && (RDG_MEM_WRITE_STMT (rdg, j) + || RDG_MEM_READS_STMT (rdg, j)) + && rdg_has_similar_memory_accesses (rdg, i, j)) + { + /* Flag first the node J itself, and all the nodes that + are needed to compute J. 
*/ + rdg_flag_vertex_and_dependent (rdg, j, partition, loops, + processed, &foo); + + /* When J is a read, we want to coalesce in the same + PARTITION all the nodes that are using J: this is + needed for better cache locality. */ + rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo); + + /* Remove from OTHER_STORES the vertex that we flagged. */ + if (RDG_MEM_WRITE_STMT (rdg, j)) + for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++) + if (kk == j) + { + VEC_unordered_remove (int, *other_stores, k); + break; + } + } + + /* If the node I has two uses, then keep these together in the + same PARTITION. */ + for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++); + + if (n > 1) + rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo); + } +} + +/* Returns a bitmap in which all the statements needed for computing + the strongly connected component C of the RDG are flagged, also + including the loop exit conditions. */ + +static bitmap +build_rdg_partition_for_component (struct graph *rdg, rdgc c, + bool *part_has_writes, + VEC (int, heap) **other_stores) +{ + int i, v; + bitmap partition = BITMAP_ALLOC (NULL); + bitmap loops = BITMAP_ALLOC (NULL); + bitmap processed = BITMAP_ALLOC (NULL); + + for (i = 0; VEC_iterate (int, c->vertices, i, v); i++) + if (!already_processed_vertex_p (processed, v)) + rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed, + part_has_writes); + + /* Also iterate on the array of stores not in the starting vertices, + and determine those vertices that have some memory affinity with + the current nodes in the component: these are stores to the same + arrays, i.e. we're taking care of cache locality. */ + rdg_flag_similar_memory_accesses (rdg, partition, loops, processed, + other_stores); + + rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes); + + BITMAP_FREE (processed); + BITMAP_FREE (loops); + return partition; +} + +/* Free memory for COMPONENTS. 
*/ + +static void +free_rdg_components (VEC (rdgc, heap) *components) +{ + int i; + rdgc x; + + for (i = 0; VEC_iterate (rdgc, components, i, x); i++) + { + VEC_free (int, heap, x->vertices); + free (x); + } +} + +/* Build the COMPONENTS vector with the strongly connected components + of RDG in which the STARTING_VERTICES occur. */ + +static void +rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices, + VEC (rdgc, heap) **components) +{ + int i, v; + bitmap saved_components = BITMAP_ALLOC (NULL); + int n_components = graphds_scc (rdg, NULL); + VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components); + + for (i = 0; i < n_components; i++) + all_components[i] = VEC_alloc (int, heap, 3); + + for (i = 0; i < rdg->n_vertices; i++) + VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i); + + for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++) + { + int c = rdg->vertices[v].component; + + if (!bitmap_bit_p (saved_components, c)) + { + rdgc x = XCNEW (struct rdg_component); + x->num = c; + x->vertices = all_components[c]; + + VEC_safe_push (rdgc, heap, *components, x); + bitmap_set_bit (saved_components, c); + } + } + + for (i = 0; i < n_components; i++) + if (!bitmap_bit_p (saved_components, i)) + VEC_free (int, heap, all_components[i]); + + free (all_components); + BITMAP_FREE (saved_components); +} + +DEF_VEC_P (bitmap); +DEF_VEC_ALLOC_P (bitmap, heap); + +/* Aggregate several components into a useful partition that is + registered in the PARTITIONS vector. Partitions will be + distributed in different loops. 
*/ + +static void +rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components, + VEC (int, heap) **other_stores, + VEC (bitmap, heap) **partitions, bitmap processed) +{ + int i; + rdgc x; + bitmap partition = BITMAP_ALLOC (NULL); + + for (i = 0; VEC_iterate (rdgc, components, i, x); i++) + { + bitmap np; + bool part_has_writes = false; + int v = VEC_index (int, x->vertices, 0); + + if (bitmap_bit_p (processed, v)) + continue; + + np = build_rdg_partition_for_component (rdg, x, &part_has_writes, + other_stores); + bitmap_ior_into (partition, np); + bitmap_ior_into (processed, np); + BITMAP_FREE (np); + + if (part_has_writes) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ldist useful partition:\n"); + dump_bitmap (dump_file, partition); + } + + VEC_safe_push (bitmap, heap, *partitions, partition); + partition = BITMAP_ALLOC (NULL); + } + } + + /* Add the nodes from the RDG that were not marked as processed, and + that are used outside the current loop. These are scalar + computations that are not yet part of previous partitions. */ + for (i = 0; i < rdg->n_vertices; i++) + if (!bitmap_bit_p (processed, i) + && rdg_defs_used_in_other_loops_p (rdg, i)) + VEC_safe_push (int, heap, *other_stores, i); + + /* If there are still statements left in the OTHER_STORES array, + create other components and partitions with these stores and + their dependences. */ + if (VEC_length (int, *other_stores) > 0) + { + VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3); + VEC (int, heap) *foo = VEC_alloc (int, heap, 3); + + rdg_build_components (rdg, *other_stores, &comps); + rdg_build_partitions (rdg, comps, &foo, partitions, processed); + + VEC_free (int, heap, foo); + free_rdg_components (comps); + } + + /* If there is something left in the last partition, save it. */ + if (bitmap_count_bits (partition) > 0) + VEC_safe_push (bitmap, heap, *partitions, partition); + else + BITMAP_FREE (partition); +} + +/* Dump to FILE the PARTITIONS. 
*/ + +static void +dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions) +{ + int i; + bitmap partition; + + for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++) + debug_bitmap_file (file, partition); +} + +/* Debug PARTITIONS. */ +extern void debug_rdg_partitions (VEC (bitmap, heap) *); + +void +debug_rdg_partitions (VEC (bitmap, heap) *partitions) +{ + dump_rdg_partitions (stderr, partitions); +} + +/* Generate code from STARTING_VERTICES in RDG. Returns the number of + distributed loops. */ + +static int +ldist_gen (struct loop *loop, struct graph *rdg, + VEC (int, heap) *starting_vertices) +{ + int i, nbp; + VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3); + VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3); + VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3); + bitmap partition, processed = BITMAP_ALLOC (NULL); + + remaining_stmts = BITMAP_ALLOC (NULL); + upstream_mem_writes = BITMAP_ALLOC (NULL); + + for (i = 0; i < rdg->n_vertices; i++) + { + bitmap_set_bit (remaining_stmts, i); + + /* Save in OTHER_STORES all the memory writes that are not in + STARTING_VERTICES. 
*/ + if (RDG_MEM_WRITE_STMT (rdg, i)) + { + int v; + unsigned j; + bool found = false; + + for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++) + if (i == v) + { + found = true; + break; + } + + if (!found) + VEC_safe_push (int, heap, other_stores, i); + } + } + + mark_nodes_having_upstream_mem_writes (rdg); + rdg_build_components (rdg, starting_vertices, &components); + rdg_build_partitions (rdg, components, &other_stores, &partitions, + processed); + BITMAP_FREE (processed); + nbp = VEC_length (bitmap, partitions); + + if (nbp <= 1) + goto ldist_done; + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_rdg_partitions (dump_file, partitions); + + for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++) + if (!generate_code_for_partition (loop, partition, i < nbp - 1)) + goto ldist_done; + + rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); + update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa); + + ldist_done: + + BITMAP_FREE (remaining_stmts); + BITMAP_FREE (upstream_mem_writes); + + for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++) + BITMAP_FREE (partition); + + VEC_free (int, heap, other_stores); + VEC_free (bitmap, heap, partitions); + free_rdg_components (components); + return nbp; +} + +/* Distributes the code from LOOP in such a way that producer + statements are placed before consumer statements. When STMTS is + NULL, performs the maximal distribution, if STMTS is not NULL, + tries to separate only these statements from the LOOP's body. + Returns the number of distributed loops. 
*/ + +static int +distribute_loop (struct loop *loop, VEC (tree, heap) *stmts) +{ + bool res = false; + struct graph *rdg; + tree s; + unsigned i; + VEC (int, heap) *vertices; + + if (loop->num_nodes > 2) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "FIXME: Loop %d not distributed: it has more than two basic blocks.\n", + loop->num); + + return res; + } + + rdg = build_rdg (loop); + + if (!rdg) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "FIXME: Loop %d not distributed: failed to build the RDG.\n", + loop->num); + + return res; + } + + vertices = VEC_alloc (int, heap, 3); + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_rdg (dump_file, rdg); + + for (i = 0; VEC_iterate (tree, stmts, i, s); i++) + { + int v = rdg_vertex_for_stmt (rdg, s); + + if (v >= 0) + { + VEC_safe_push (int, heap, vertices, v); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "ldist asked to generate code for vertex %d\n", v); + } + } + + res = ldist_gen (loop, rdg, vertices); + VEC_free (int, heap, vertices); + free_rdg (rdg); + + return res; +} + +/* Distribute all loops in the current function. */ + +static unsigned int +tree_loop_distribution (void) +{ + struct loop *loop; + loop_iterator li; + int nb_generated_loops = 0; + + FOR_EACH_LOOP (li, loop, 0) + { + VEC (tree, heap) *work_list = VEC_alloc (tree, heap, 3); + + /* With the following working list, we're asking distribute_loop + to separate the stores of the loop: when dependences allow, + it will end on having one store per loop. */ + stores_from_loop (loop, &work_list); + + /* A simple heuristic for cache locality is to not split stores + to the same array. Without this call, an unrolled loop would + be split into as many loops as unroll factor, each loop + storing in the same array. 
*/ + remove_similar_memory_refs (&work_list); + + nb_generated_loops = distribute_loop (loop, work_list); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + if (nb_generated_loops > 1) + fprintf (dump_file, "Loop %d distributed: split to %d loops.\n", + loop->num, nb_generated_loops); + else + fprintf (dump_file, "Loop %d is the same.\n", loop->num); + } + + verify_loop_structure (); + + VEC_free (tree, heap, work_list); + } + + return 0; +} + +static bool +gate_tree_loop_distribution (void) +{ + return flag_tree_loop_distribution != 0; +} + +struct tree_opt_pass pass_loop_distribution = +{ + "ldist", /* name */ + gate_tree_loop_distribution, /* gate */ + tree_loop_distribution, /* execute */ + NULL, /* sub */ + NULL, /* next */ + 0, /* static_pass_number */ + TV_TREE_LOOP_DISTRIBUTION, /* tv_id */ + PROP_cfg | PROP_ssa, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_dump_func | TODO_verify_loops, /* todo_flags_finish */ + 0 /* letter */ +}; diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 01056827c07..55aa5fcf77f 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -265,6 +265,7 @@ extern struct tree_opt_pass pass_scev_cprop; extern struct tree_opt_pass pass_empty_loop; extern struct tree_opt_pass pass_record_bounds; extern struct tree_opt_pass pass_if_conversion; +extern struct tree_opt_pass pass_loop_distribution; extern struct tree_opt_pass pass_vectorize; extern struct tree_opt_pass pass_complete_unroll; extern struct tree_opt_pass pass_parallelize_loops; diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c index 3e8002f172b..9f9ea823f20 100644 --- a/gcc/tree-vect-analyze.c +++ b/gcc/tree-vect-analyze.c @@ -41,28 +41,7 @@ along with GCC; see the file COPYING3. If not see #include "toplev.h" #include "recog.h" -/* Main analysis functions. 
*/ -static bool vect_analyze_data_refs (loop_vec_info); -static bool vect_mark_stmts_to_be_vectorized (loop_vec_info); -static void vect_analyze_scalar_cycles (loop_vec_info); -static bool vect_analyze_data_ref_accesses (loop_vec_info); -static bool vect_analyze_data_ref_dependences (loop_vec_info); -static bool vect_analyze_data_refs_alignment (loop_vec_info); -static bool vect_compute_data_refs_alignment (loop_vec_info); -static bool vect_enhance_data_refs_alignment (loop_vec_info); -static bool vect_analyze_operations (loop_vec_info); -static bool vect_determine_vectorization_factor (loop_vec_info); - -/* Utility functions for the analyses. */ -static bool exist_non_indexing_operands_for_use_p (tree, tree); -static tree vect_get_loop_niters (struct loop *, tree *); -static bool vect_analyze_data_ref_dependence - (struct data_dependence_relation *, loop_vec_info); -static bool vect_compute_data_ref_alignment (struct data_reference *); -static bool vect_analyze_data_ref_access (struct data_reference *); static bool vect_can_advance_ivs_p (loop_vec_info); -static void vect_update_misalignment_for_peel - (struct data_reference *, struct data_reference *, int npeel); /* Function vect_determine_vectorization_factor diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 16b0ecacdb2..f657eeaa198 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -147,24 +147,8 @@ along with GCC; see the file COPYING3. 
If not see #include "tree-pass.h" /************************************************************************* - Simple Loop Peeling Utilities - *************************************************************************/ -static void slpeel_update_phis_for_duplicate_loop - (struct loop *, struct loop *, bool after); -static void slpeel_update_phi_nodes_for_guard1 - (edge, struct loop *, bool, basic_block *, bitmap *); -static void slpeel_update_phi_nodes_for_guard2 - (edge, struct loop *, bool, basic_block *); -static edge slpeel_add_loop_guard (basic_block, tree, basic_block, basic_block); - -static void rename_use_op (use_operand_p); -static void rename_variables_in_bb (basic_block); -static void rename_variables_in_loop (struct loop *); - -/************************************************************************* General Vectorization Utilities *************************************************************************/ -static void vect_set_dump_settings (void); /* vect_dump will be set to stderr or dump_file if exist. */ FILE *vect_dump; @@ -241,7 +225,7 @@ rename_variables_in_bb (basic_block bb) /* Renames variables in new generated LOOP. */ -static void +void rename_variables_in_loop (struct loop *loop) { unsigned i; @@ -806,7 +790,7 @@ slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters) /* Given LOOP this function generates a new copy of it and puts it on E which is either the entry or exit of LOOP. */ -static struct loop * +struct loop * slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) { struct loop *new_loop; @@ -871,6 +855,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) if (at_exit) /* Add the loop copy at exit. 
*/ { redirect_edge_and_branch_force (e, new_loop->header); + PENDING_STMT (e) = NULL; set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src); if (was_imm_dom) set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); @@ -888,6 +873,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) new_exit_e = EDGE_SUCC (new_loop->header, 1); redirect_edge_and_branch_force (new_exit_e, loop->header); + PENDING_STMT (new_exit_e) = NULL; set_immediate_dominator (CDI_DOMINATORS, loop->header, new_exit_e->src); @@ -901,6 +887,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) } redirect_edge_and_branch_force (entry_e, new_loop->header); + PENDING_STMT (entry_e) = NULL; set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader); } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 93aa7978dec..7b2be7457b6 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -630,6 +630,7 @@ extern struct loop *slpeel_tree_peel_loop_to_edge (struct loop *, edge, tree, tree, bool, unsigned int, bool); extern void set_prologue_iterations (basic_block, tree, struct loop *, unsigned int); +struct loop *tree_duplicate_loop_on_edge (struct loop *, edge); extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); #ifdef ENABLE_CHECKING |