diff options
author | vries <vries@138bc75d-0d04-0410-961f-82ee72b054a4> | 2016-01-18 12:52:32 +0000 |
---|---|---|
committer | vries <vries@138bc75d-0d04-0410-961f-82ee72b054a4> | 2016-01-18 12:52:32 +0000 |
commit | 31c2b13becdad09c84c010f085ada469bf67b7b2 (patch) | |
tree | 139f5264c912aebb738b36613946a5f9c12f9595 /gcc | |
parent | 7c5e2ac77a2e5db3d2a99593c59142d15b473af1 (diff) | |
download | gcc-31c2b13becdad09c84c010f085ada469bf67b7b2.tar.gz |
Add oacc_kernels_p argument to pass_parallelize_loops
2016-01-18 Tom de Vries <tom@codesourcery.com>
* omp-low.c (set_oacc_fn_attrib): Make extern.
* omp-low.h (set_oacc_fn_attrib): Declare.
* tree-parloops.c (struct reduction_info): Add reduc_addr field.
(create_call_for_reduction_1): Handle case that reduc_addr is non-NULL.
(create_parallel_loop, gen_parallel_loop, try_create_reduction_list):
Add and handle function parameter oacc_kernels_p.
(find_reduc_addr, get_omp_data_i_param): New function.
(ref_conflicts_with_region, oacc_entry_exit_ok_1)
(oacc_entry_exit_single_gang, oacc_entry_exit_ok): New function.
(parallelize_loops): Add and handle function parameter oacc_kernels_p.
Calculate dominance info. Skip loops that are not in a kernels region
in oacc_kernels_p mode. Skip inner loops of parallelized loops.
(pass_parallelize_loops::execute): Call parallelize_loops with
oacc_kernels_p argument.
(pass_parallelize_loops::clone, pass_parallelize_loops::set_pass_param):
New member function.
(pass_parallelize_loops::bool oacc_kernels_p): New member var.
* passes.def: Add argument to pass_parallelize_loops instantation.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@232512 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 21 | ||||
-rw-r--r-- | gcc/omp-low.c | 2 | ||||
-rw-r--r-- | gcc/omp-low.h | 1 | ||||
-rw-r--r-- | gcc/passes.def | 2 | ||||
-rw-r--r-- | gcc/tree-parloops.c | 744 |
5 files changed, 662 insertions, 108 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 19a9af1befd..ea8645dccaf 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,26 @@ 2016-01-18 Tom de Vries <tom@codesourcery.com> + * omp-low.c (set_oacc_fn_attrib): Make extern. + * omp-low.h (set_oacc_fn_attrib): Declare. + * tree-parloops.c (struct reduction_info): Add reduc_addr field. + (create_call_for_reduction_1): Handle case that reduc_addr is non-NULL. + (create_parallel_loop, gen_parallel_loop, try_create_reduction_list): + Add and handle function parameter oacc_kernels_p. + (find_reduc_addr, get_omp_data_i_param): New function. + (ref_conflicts_with_region, oacc_entry_exit_ok_1) + (oacc_entry_exit_single_gang, oacc_entry_exit_ok): New function. + (parallelize_loops): Add and handle function parameter oacc_kernels_p. + Calculate dominance info. Skip loops that are not in a kernels region + in oacc_kernels_p mode. Skip inner loops of parallelized loops. + (pass_parallelize_loops::execute): Call parallelize_loops with + oacc_kernels_p argument. + (pass_parallelize_loops::clone, pass_parallelize_loops::set_pass_param): + New member function. + (pass_parallelize_loops::bool oacc_kernels_p): New member var. + * passes.def: Add argument to pass_parallelize_loops instantation. + +2016-01-18 Tom de Vries <tom@codesourcery.com> + * tree-parloops.c (pass_parallelize_loops::execute): Allow pass_parallelize_loops to be run outside the loop pipeline. diff --git a/gcc/omp-low.c b/gcc/omp-low.c index be472c03536..d7df3dbfc63 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -12401,7 +12401,7 @@ replace_oacc_fn_attrib (tree fn, tree dims) function attribute. Push any that are non-constant onto the ARGS list, along with an appropriate GOMP_LAUNCH_DIM tag. */ -static void +void set_oacc_fn_attrib (tree fn, tree clauses, vec<tree> *args) { /* Must match GOMP_DIM ordering. */ diff --git a/gcc/omp-low.h b/gcc/omp-low.h index 3459c1b7c7a..64caef8db69 100644 --- a/gcc/omp-low.h +++ b/gcc/omp-low.h @@ -33,6 +33,7 @@ extern tree omp_member_access_dummy_var (tree); extern void replace_oacc_fn_attrib (tree, tree); extern tree build_oacc_routine_dims (tree); extern tree get_oacc_fn_attrib (tree); +extern void set_oacc_fn_attrib (tree, tree, vec<tree> *); extern int get_oacc_ifn_dim_arg (const gimple *); extern int get_oacc_fn_dim_size (tree, int); diff --git a/gcc/passes.def b/gcc/passes.def index 392a9bce323..d9a8c4e9cf7 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -272,7 +272,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_dce); POP_INSERT_PASSES () NEXT_PASS (pass_iv_canon); - NEXT_PASS (pass_parallelize_loops); + NEXT_PASS (pass_parallelize_loops, false /* oacc_kernels_p */); NEXT_PASS (pass_expand_omp_ssa); NEXT_PASS (pass_ch_vect); NEXT_PASS (pass_if_conversion); diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c index 885103e1016..7749d34fab3 100644 --- a/gcc/tree-parloops.c +++ b/gcc/tree-parloops.c @@ -53,6 +53,10 @@ along with GCC; see the file COPYING3. If not see #include "tree-ssa.h" #include "params.h" #include "params-enum.h" +#include "tree-ssa-alias.h" +#include "tree-eh.h" +#include "gomp-constants.h" +#include "tree-dfa.h" /* This pass tries to distribute iterations of loops into several threads. The implementation is straightforward -- for each loop we test whether its @@ -192,6 +196,8 @@ struct reduction_info of the reduction variable when existing the loop. */ tree initial_value; /* The initial value of the reduction var before entering the loop. */ tree field; /* the name of the field in the parloop data structure intended for reduction. */ + tree reduc_addr; /* The address of the reduction variable for + openacc reductions. */ tree init; /* reduction initialization value. */ gphi *new_phi; /* (helper field) Newly created phi node whose result will be passed to the atomic operation. Represents @@ -1085,10 +1091,29 @@ create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data) tree tmp_load, name; gimple *load; - load_struct = build_simple_mem_ref (clsn_data->load); - t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE); + if (reduc->reduc_addr == NULL_TREE) + { + load_struct = build_simple_mem_ref (clsn_data->load); + t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE); - addr = build_addr (t); + addr = build_addr (t); + } + else + { + /* Set the address for the atomic store. */ + addr = reduc->reduc_addr; + + /* Remove the non-atomic store '*addr = sum'. */ + tree res = PHI_RESULT (reduc->keep_res); + use_operand_p use_p; + gimple *stmt; + bool single_use_p = single_imm_use (res, &use_p, &stmt); + gcc_assert (single_use_p); + replace_uses_by (gimple_vdef (stmt), + gimple_vuse (stmt)); + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_remove (&gsi, true); + } /* Create phi node. */ bb = clsn_data->load_bb; @@ -1994,10 +2019,11 @@ transform_to_exit_first_loop (struct loop *loop, static void create_parallel_loop (struct loop *loop, tree loop_fn, tree data, - tree new_data, unsigned n_threads, location_t loc) + tree new_data, unsigned n_threads, location_t loc, + bool oacc_kernels_p) { gimple_stmt_iterator gsi; - basic_block bb, paral_bb, for_bb, ex_bb, continue_bb; + basic_block for_bb, ex_bb, continue_bb; tree t, param; gomp_parallel *omp_par_stmt; gimple *omp_return_stmt1, *omp_return_stmt2; @@ -2009,40 +2035,50 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, edge exit, nexit, guard, end, e; /* Prepare the GIMPLE_OMP_PARALLEL statement. */ - bb = loop_preheader_edge (loop)->src; - paral_bb = single_pred (bb); - gsi = gsi_last_bb (paral_bb); + if (oacc_kernels_p) + { + tree clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS); + OMP_CLAUSE_NUM_GANGS_EXPR (clause) + = build_int_cst (integer_type_node, n_threads); + set_oacc_fn_attrib (cfun->decl, clause, NULL); + } + else + { + basic_block bb = loop_preheader_edge (loop)->src; + basic_block paral_bb = single_pred (bb); + gsi = gsi_last_bb (paral_bb); - t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS); - OMP_CLAUSE_NUM_THREADS_EXPR (t) - = build_int_cst (integer_type_node, n_threads); - omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data); - gimple_set_location (omp_par_stmt, loc); + t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS); + OMP_CLAUSE_NUM_THREADS_EXPR (t) + = build_int_cst (integer_type_node, n_threads); + omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data); + gimple_set_location (omp_par_stmt, loc); - gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT); + gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT); - /* Initialize NEW_DATA. */ - if (data) - { - gassign *assign_stmt; + /* Initialize NEW_DATA. */ + if (data) + { + gassign *assign_stmt; - gsi = gsi_after_labels (bb); + gsi = gsi_after_labels (bb); - param = make_ssa_name (DECL_ARGUMENTS (loop_fn)); - assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data)); - gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); + param = make_ssa_name (DECL_ARGUMENTS (loop_fn)); + assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data)); + gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); - assign_stmt = gimple_build_assign (new_data, - fold_convert (TREE_TYPE (new_data), param)); - gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); - } + assign_stmt = gimple_build_assign (new_data, + fold_convert (TREE_TYPE (new_data), param)); + gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); + } - /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */ - bb = split_loop_exit_edge (single_dom_exit (loop)); - gsi = gsi_last_bb (bb); - omp_return_stmt1 = gimple_build_omp_return (false); - gimple_set_location (omp_return_stmt1, loc); - gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT); + /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */ + bb = split_loop_exit_edge (single_dom_exit (loop)); + gsi = gsi_last_bb (bb); + omp_return_stmt1 = gimple_build_omp_return (false); + gimple_set_location (omp_return_stmt1, loc); + gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT); + } /* Extract data for GIMPLE_OMP_FOR. */ gcc_assert (loop->header == single_dom_exit (loop)->src); @@ -2107,39 +2143,50 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, PENDING_STMT (e) = NULL; /* Emit GIMPLE_OMP_FOR. */ - gimple_cond_set_lhs (cond_stmt, cvar_base); - type = TREE_TYPE (cvar); - t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE); - int chunk_size = PARAM_VALUE (PARAM_PARLOOPS_CHUNK_SIZE); - enum PARAM_PARLOOPS_SCHEDULE_KIND schedule_type \ - = (enum PARAM_PARLOOPS_SCHEDULE_KIND) PARAM_VALUE (PARAM_PARLOOPS_SCHEDULE); - switch (schedule_type) + if (oacc_kernels_p) + /* In combination with the NUM_GANGS on the parallel. */ + t = build_omp_clause (loc, OMP_CLAUSE_GANG); + else { - case PARAM_PARLOOPS_SCHEDULE_KIND_static: - OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC; - break; - case PARAM_PARLOOPS_SCHEDULE_KIND_dynamic: - OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC; - break; - case PARAM_PARLOOPS_SCHEDULE_KIND_guided: - OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED; - break; - case PARAM_PARLOOPS_SCHEDULE_KIND_auto: - OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO; - chunk_size = 0; - break; - case PARAM_PARLOOPS_SCHEDULE_KIND_runtime: - OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME; - chunk_size = 0; - break; - default: - gcc_unreachable (); + t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE); + int chunk_size = PARAM_VALUE (PARAM_PARLOOPS_CHUNK_SIZE); + enum PARAM_PARLOOPS_SCHEDULE_KIND schedule_type \ + = (enum PARAM_PARLOOPS_SCHEDULE_KIND) PARAM_VALUE (PARAM_PARLOOPS_SCHEDULE); + switch (schedule_type) + { + case PARAM_PARLOOPS_SCHEDULE_KIND_static: + OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC; + break; + case PARAM_PARLOOPS_SCHEDULE_KIND_dynamic: + OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC; + break; + case PARAM_PARLOOPS_SCHEDULE_KIND_guided: + OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED; + break; + case PARAM_PARLOOPS_SCHEDULE_KIND_auto: + OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO; + chunk_size = 0; + break; + case PARAM_PARLOOPS_SCHEDULE_KIND_runtime: + OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME; + chunk_size = 0; + break; + default: + gcc_unreachable (); + } + if (chunk_size != 0) + OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t) + = build_int_cst (integer_type_node, chunk_size); } - if (chunk_size != 0) - OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t) - = build_int_cst (integer_type_node, chunk_size); - for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL); + for_stmt = gimple_build_omp_for (NULL, + (oacc_kernels_p + ? GF_OMP_FOR_KIND_OACC_LOOP + : GF_OMP_FOR_KIND_FOR), + t, 1, NULL); + + gimple_cond_set_lhs (cond_stmt, cvar_base); + type = TREE_TYPE (cvar); gimple_set_location (for_stmt, loc); gimple_omp_for_set_index (for_stmt, 0, initvar); gimple_omp_for_set_initial (for_stmt, 0, cvar_init); @@ -2181,7 +2228,8 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, static void gen_parallel_loop (struct loop *loop, reduction_info_table_type *reduction_list, - unsigned n_threads, struct tree_niter_desc *niter) + unsigned n_threads, struct tree_niter_desc *niter, + bool oacc_kernels_p) { tree many_iterations_cond, type, nit; tree arg_struct, new_arg_struct; @@ -2262,40 +2310,44 @@ gen_parallel_loop (struct loop *loop, if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - if (loop->inner) - m_p_thread=2; - else - m_p_thread=MIN_PER_THREAD; - - many_iterations_cond = - fold_build2 (GE_EXPR, boolean_type_node, - nit, build_int_cst (type, m_p_thread * n_threads)); - - many_iterations_cond - = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, - invert_truthvalue (unshare_expr (niter->may_be_zero)), - many_iterations_cond); - many_iterations_cond - = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE); - if (stmts) - gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - if (!is_gimple_condexpr (many_iterations_cond)) + if (!oacc_kernels_p) { + if (loop->inner) + m_p_thread=2; + else + m_p_thread=MIN_PER_THREAD; + + many_iterations_cond = + fold_build2 (GE_EXPR, boolean_type_node, + nit, build_int_cst (type, m_p_thread * n_threads)); + many_iterations_cond - = force_gimple_operand (many_iterations_cond, &stmts, - true, NULL_TREE); + = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, + invert_truthvalue (unshare_expr (niter->may_be_zero)), + many_iterations_cond); + many_iterations_cond + = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE); if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - } + if (!is_gimple_condexpr (many_iterations_cond)) + { + many_iterations_cond + = force_gimple_operand (many_iterations_cond, &stmts, + true, NULL_TREE); + if (stmts) + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), + stmts); + } - initialize_original_copy_tables (); + initialize_original_copy_tables (); - /* We assume that the loop usually iterates a lot. */ - prob = 4 * REG_BR_PROB_BASE / 5; - loop_version (loop, many_iterations_cond, NULL, - prob, prob, REG_BR_PROB_BASE - prob, true); - update_ssa (TODO_update_ssa); - free_original_copy_tables (); + /* We assume that the loop usually iterates a lot. */ + prob = 4 * REG_BR_PROB_BASE / 5; + loop_version (loop, many_iterations_cond, NULL, + prob, prob, REG_BR_PROB_BASE - prob, true); + update_ssa (TODO_update_ssa); + free_original_copy_tables (); + } /* Base all the induction variables in LOOP on a single control one. */ canonicalize_loop_ivs (loop, &nit, true); @@ -2315,6 +2367,9 @@ gen_parallel_loop (struct loop *loop, } else { + if (oacc_kernels_p) + n_threads = 1; + /* Fall back on the method that handles more cases, but duplicates the loop body: move the exit condition of LOOP to the beginning of its header, and duplicate the part of the last iteration that gets disabled @@ -2331,19 +2386,34 @@ gen_parallel_loop (struct loop *loop, entry = loop_preheader_edge (loop); exit = single_dom_exit (loop); - eliminate_local_variables (entry, exit); - /* In the old loop, move all variables non-local to the loop to a structure - and back, and create separate decls for the variables used in loop. */ - separate_decls_in_region (entry, exit, reduction_list, &arg_struct, - &new_arg_struct, &clsn_data); + /* This rewrites the body in terms of new variables. This has already + been done for oacc_kernels_p in pass_lower_omp/lower_omp (). */ + if (!oacc_kernels_p) + { + eliminate_local_variables (entry, exit); + /* In the old loop, move all variables non-local to the loop to a + structure and back, and create separate decls for the variables used in + loop. */ + separate_decls_in_region (entry, exit, reduction_list, &arg_struct, + &new_arg_struct, &clsn_data); + } + else + { + arg_struct = NULL_TREE; + new_arg_struct = NULL_TREE; + clsn_data.load = NULL_TREE; + clsn_data.load_bb = exit->dest; + clsn_data.store = NULL_TREE; + clsn_data.store_bb = NULL; + } /* Create the parallel constructs. */ loc = UNKNOWN_LOCATION; cond_stmt = last_stmt (loop->header); if (cond_stmt) loc = gimple_location (cond_stmt); - create_parallel_loop (loop, create_loop_fn (loc), arg_struct, - new_arg_struct, n_threads, loc); + create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct, + n_threads, loc, oacc_kernels_p); if (reduction_list->elements () > 0) create_call_for_reduction (loop, reduction_list, &clsn_data); @@ -2542,12 +2612,65 @@ try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter) return true; } +/* Return the default def of the first function argument. */ + +static tree +get_omp_data_i_param (void) +{ + tree decl = DECL_ARGUMENTS (cfun->decl); + gcc_assert (DECL_CHAIN (decl) == NULL_TREE); + return ssa_default_def (cfun, decl); +} + +/* For PHI in loop header of LOOP, look for pattern: + + <bb preheader> + .omp_data_i = &.omp_data_arr; + addr = .omp_data_i->sum; + sum_a = *addr; + + <bb header>: + sum_b = PHI <sum_a (preheader), sum_c (latch)> + + and return addr. Otherwise, return NULL_TREE. */ + +static tree +find_reduc_addr (struct loop *loop, gphi *phi) +{ + edge e = loop_preheader_edge (loop); + tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e); + gimple *stmt = SSA_NAME_DEF_STMT (arg); + if (!gimple_assign_single_p (stmt)) + return NULL_TREE; + tree memref = gimple_assign_rhs1 (stmt); + if (TREE_CODE (memref) != MEM_REF) + return NULL_TREE; + tree addr = TREE_OPERAND (memref, 0); + + gimple *stmt2 = SSA_NAME_DEF_STMT (addr); + if (!gimple_assign_single_p (stmt2)) + return NULL_TREE; + tree compref = gimple_assign_rhs1 (stmt2); + if (TREE_CODE (compref) != COMPONENT_REF) + return NULL_TREE; + tree addr2 = TREE_OPERAND (compref, 0); + if (TREE_CODE (addr2) != MEM_REF) + return NULL_TREE; + addr2 = TREE_OPERAND (addr2, 0); + if (TREE_CODE (addr2) != SSA_NAME + || addr2 != get_omp_data_i_param ()) + return NULL_TREE; + + return addr; +} + /* Try to initialize REDUCTION_LIST for code generation part. REDUCTION_LIST describes the reductions. */ static bool try_create_reduction_list (loop_p loop, - reduction_info_table_type *reduction_list) + reduction_info_table_type *reduction_list, + bool oacc_kernels_p) { edge exit = single_dom_exit (loop); gphi_iterator gsi; @@ -2647,6 +2770,26 @@ try_create_reduction_list (loop_p loop, } } + if (oacc_kernels_p) + { + for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + tree def = PHI_RESULT (phi); + affine_iv iv; + + if (!virtual_operand_p (def) + && !simple_iv (loop, loop, def, &iv, true)) + { + tree addr = find_reduc_addr (loop, phi); + if (addr == NULL_TREE) + return false; + struct reduction_info *red = reduction_phi (reduction_list, phi); + red->reduc_addr = addr; + } + } + } return true; } @@ -2679,6 +2822,350 @@ loop_has_phi_with_address_arg (struct loop *loop) } end: free (bbs); + + return res; +} + +/* Return true if memory ref REF (corresponding to the stmt at GSI in + REGIONS_BB[I]) conflicts with the statements in REGIONS_BB[I] after gsi, + or the statements in REGIONS_BB[I + n]. REF_IS_STORE indicates if REF is a + store. Ignore conflicts with SKIP_STMT. */ + +static bool +ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref, + bool ref_is_store, vec<basic_block> region_bbs, + unsigned int i, gimple *skip_stmt) +{ + basic_block bb = region_bbs[i]; + gsi_next (&gsi); + + while (true) + { + for (; !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (stmt == skip_stmt) + { + if (dump_file) + { + fprintf (dump_file, "skipping reduction store: "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + continue; + } + + if (!gimple_vdef (stmt) + && !gimple_vuse (stmt)) + continue; + + if (gimple_code (stmt) == GIMPLE_RETURN) + continue; + + if (ref_is_store) + { + if (ref_maybe_used_by_stmt_p (stmt, ref)) + { + if (dump_file) + { + fprintf (dump_file, "Stmt "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + return true; + } + } + else + { + if (stmt_may_clobber_ref_p_1 (stmt, ref)) + { + if (dump_file) + { + fprintf (dump_file, "Stmt "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + return true; + } + } + } + i++; + if (i == region_bbs.length ()) + break; + bb = region_bbs[i]; + gsi = gsi_start_bb (bb); + } + + return false; +} + +/* Return true if the bbs in REGION_BBS but not in in_loop_bbs can be executed + in parallel with REGION_BBS containing the loop. Return the stores of + reduction results in REDUCTION_STORES. */ + +static bool +oacc_entry_exit_ok_1 (bitmap in_loop_bbs, vec<basic_block> region_bbs, + reduction_info_table_type *reduction_list, + bitmap reduction_stores) +{ + tree omp_data_i = get_omp_data_i_param (); + + unsigned i; + basic_block bb; + FOR_EACH_VEC_ELT (region_bbs, i, bb) + { + if (bitmap_bit_p (in_loop_bbs, bb->index)) + continue; + + gimple_stmt_iterator gsi; + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + gimple *skip_stmt = NULL; + + if (is_gimple_debug (stmt) + || gimple_code (stmt) == GIMPLE_COND) + continue; + + ao_ref ref; + bool ref_is_store = false; + if (gimple_assign_load_p (stmt)) + { + tree rhs = gimple_assign_rhs1 (stmt); + tree base = get_base_address (rhs); + if (TREE_CODE (base) == MEM_REF + && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0)) + continue; + + tree lhs = gimple_assign_lhs (stmt); + if (TREE_CODE (lhs) == SSA_NAME + && has_single_use (lhs)) + { + use_operand_p use_p; + gimple *use_stmt; + single_imm_use (lhs, &use_p, &use_stmt); + if (gimple_code (use_stmt) == GIMPLE_PHI) + { + struct reduction_info *red; + red = reduction_phi (reduction_list, use_stmt); + tree val = PHI_RESULT (red->keep_res); + if (has_single_use (val)) + { + single_imm_use (val, &use_p, &use_stmt); + if (gimple_store_p (use_stmt)) + { + unsigned int id + = SSA_NAME_VERSION (gimple_vdef (use_stmt)); + bitmap_set_bit (reduction_stores, id); + skip_stmt = use_stmt; + if (dump_file) + { + fprintf (dump_file, "found reduction load: "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + } + } + } + } + + ao_ref_init (&ref, rhs); + } + else if (gimple_store_p (stmt)) + { + ao_ref_init (&ref, gimple_assign_lhs (stmt)); + ref_is_store = true; + } + else if (gimple_code (stmt) == GIMPLE_OMP_RETURN) + continue; + else if (!gimple_has_side_effects (stmt) + && !gimple_could_trap_p (stmt) + && !stmt_could_throw_p (stmt) + && !gimple_vdef (stmt) + && !gimple_vuse (stmt)) + continue; + else if (is_gimple_call (stmt) + && gimple_call_internal_p (stmt) + && gimple_call_internal_fn (stmt) == IFN_GOACC_DIM_POS) + continue; + else if (gimple_code (stmt) == GIMPLE_RETURN) + continue; + else + { + if (dump_file) + { + fprintf (dump_file, "Unhandled stmt in entry/exit: "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + return false; + } + + if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs, + i, skip_stmt)) + { + if (dump_file) + { + fprintf (dump_file, "conflicts with entry/exit stmt: "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + return false; + } + } + } + + return true; +} + +/* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with + gang_pos == 0, except when the stores are REDUCTION_STORES. Return true + if any changes were made. */ + +static bool +oacc_entry_exit_single_gang (bitmap in_loop_bbs, vec<basic_block> region_bbs, + bitmap reduction_stores) +{ + tree gang_pos = NULL_TREE; + bool changed = false; + + unsigned i; + basic_block bb; + FOR_EACH_VEC_ELT (region_bbs, i, bb) + { + if (bitmap_bit_p (in_loop_bbs, bb->index)) + continue; + + gimple_stmt_iterator gsi; + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);) + { + gimple *stmt = gsi_stmt (gsi); + + if (!gimple_store_p (stmt)) + { + /* Update gsi to point to next stmt. */ + gsi_next (&gsi); + continue; + } + + if (bitmap_bit_p (reduction_stores, + SSA_NAME_VERSION (gimple_vdef (stmt)))) + { + if (dump_file) + { + fprintf (dump_file, + "skipped reduction store for single-gang" + " neutering: "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + + /* Update gsi to point to next stmt. */ + gsi_next (&gsi); + continue; + } + + changed = true; + + if (gang_pos == NULL_TREE) + { + tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG); + gcall *gang_single + = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg); + gang_pos = make_ssa_name (integer_type_node); + gimple_call_set_lhs (gang_single, gang_pos); + gimple_stmt_iterator start + = gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun))); + tree vuse = ssa_default_def (cfun, gimple_vop (cfun)); + gimple_set_vuse (gang_single, vuse); + gsi_insert_before (&start, gang_single, GSI_SAME_STMT); + } + + if (dump_file) + { + fprintf (dump_file, + "found store that needs single-gang neutering: "); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + + { + /* Split block before store. */ + gimple_stmt_iterator gsi2 = gsi; + gsi_prev (&gsi2); + edge e; + if (gsi_end_p (gsi2)) + { + e = split_block_after_labels (bb); + gsi2 = gsi_last_bb (bb); + } + else + e = split_block (bb, gsi_stmt (gsi2)); + basic_block bb2 = e->dest; + + /* Split block after store. */ + gimple_stmt_iterator gsi3 = gsi_start_bb (bb2); + edge e2 = split_block (bb2, gsi_stmt (gsi3)); + basic_block bb3 = e2->dest; + + gimple *cond + = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node, + NULL_TREE, NULL_TREE); + gsi_insert_after (&gsi2, cond, GSI_NEW_STMT); + + edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE); + e->flags = EDGE_TRUE_VALUE; + + tree vdef = gimple_vdef (stmt); + tree vuse = gimple_vuse (stmt); + + tree phi_res = copy_ssa_name (vdef); + gphi *new_phi = create_phi_node (phi_res, bb3); + replace_uses_by (vdef, phi_res); + add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION); + add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION); + + /* Update gsi to point to next stmt. */ + bb = bb3; + gsi = gsi_start_bb (bb); + } + } + } + + return changed; +} + +/* Return true if the statements before and after the LOOP can be executed in + parallel with the function containing the loop. Resolve conflicting stores + outside LOOP by guarding them such that only a single gang executes them. */ + +static bool +oacc_entry_exit_ok (struct loop *loop, + reduction_info_table_type *reduction_list) +{ + basic_block *loop_bbs = get_loop_body_in_dom_order (loop); + vec<basic_block> region_bbs + = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun)); + + bitmap in_loop_bbs = BITMAP_ALLOC (NULL); + bitmap_clear (in_loop_bbs); + for (unsigned int i = 0; i < loop->num_nodes; i++) + bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index); + + bitmap reduction_stores = BITMAP_ALLOC (NULL); + bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list, + reduction_stores); + + if (res) + { + bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs, + reduction_stores); + if (changed) + { + free_dominance_info (CDI_DOMINATORS); + calculate_dominance_info (CDI_DOMINATORS); + } + } + + free (loop_bbs); + + BITMAP_FREE (in_loop_bbs); + BITMAP_FREE (reduction_stores); + return res; } @@ -2687,7 +3174,7 @@ loop_has_phi_with_address_arg (struct loop *loop) otherwise. */ static bool -parallelize_loops (void) +parallelize_loops (bool oacc_kernels_p) { unsigned n_threads = flag_tree_parallelize_loops; bool changed = false; @@ -2699,19 +3186,29 @@ parallelize_loops (void) source_location loop_loc; /* Do not parallelize loops in the functions created by parallelization. */ - if (parallelized_function_p (cfun->decl)) + if (!oacc_kernels_p + && parallelized_function_p (cfun->decl)) return false; + + /* Do not parallelize loops in offloaded functions. */ + if (!oacc_kernels_p + && get_oacc_fn_attrib (cfun->decl) != NULL) + return false; + if (cfun->has_nonlocal_label) return false; gcc_obstack_init (&parloop_obstack); reduction_info_table_type reduction_list (10); + calculate_dominance_info (CDI_DOMINATORS); + FOR_EACH_LOOP (loop, 0) { if (loop == skip_loop) { - if (dump_file && (dump_flags & TDF_DETAILS)) + if (!loop->in_oacc_kernels_region + && dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "Skipping loop %d as inner loop of parallelized loop\n", loop->num); @@ -2723,6 +3220,22 @@ parallelize_loops (void) skip_loop = NULL; reduction_list.empty (); + + if (oacc_kernels_p) + { + if (!loop->in_oacc_kernels_region) + continue; + + /* Don't try to parallelize inner loops in an oacc kernels region. */ + if (loop->inner) + skip_loop = loop->inner; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Trying loop %d with header bb %d in oacc kernels" + " region\n", loop->num, loop->header->index); + } + if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "Trying loop %d as candidate\n",loop->num); @@ -2764,6 +3277,7 @@ parallelize_loops (void) /* FIXME: Bypass this check as graphite doesn't update the count and frequency correctly now. */ if (!flag_loop_parallelize_all + && !oacc_kernels_p && ((estimated != -1 && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD) /* Do not bother with loops in cold areas. */ @@ -2773,7 +3287,7 @@ parallelize_loops (void) if (!try_get_loop_niter (loop, &niter_desc)) continue; - if (!try_create_reduction_list (loop, &reduction_list)) + if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p)) continue; if (loop_has_phi_with_address_arg (loop)) @@ -2783,6 +3297,14 @@ parallelize_loops (void) && !loop_parallel_p (loop, &parloop_obstack)) continue; + if (oacc_kernels_p + && !oacc_entry_exit_ok (loop, &reduction_list)) + { + if (dump_file) + fprintf (dump_file, "entry/exit not ok: FAILED\n"); + continue; + } + changed = true; skip_loop = loop->inner; if (dump_file && (dump_flags & TDF_DETAILS)) @@ -2796,8 +3318,9 @@ parallelize_loops (void) fprintf (dump_file, "\nloop at %s:%d: ", LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc)); } + gen_parallel_loop (loop, &reduction_list, - n_threads, &niter_desc); + n_threads, &niter_desc, oacc_kernels_p); } obstack_free (&parloop_obstack, NULL); @@ -2832,13 +3355,22 @@ class pass_parallelize_loops : public gimple_opt_pass { public: pass_parallelize_loops (gcc::context *ctxt) - : gimple_opt_pass (pass_data_parallelize_loops, ctxt) + : gimple_opt_pass (pass_data_parallelize_loops, ctxt), + oacc_kernels_p (false) {} /* opt_pass methods: */ virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; } virtual unsigned int execute (function *); + opt_pass * clone () { return new pass_parallelize_loops (m_ctxt); } + void set_pass_param (unsigned int n, bool param) + { + gcc_assert (n == 0); + oacc_kernels_p = param; + } + private: + bool oacc_kernels_p; }; // class pass_parallelize_loops unsigned @@ -2863,7 +3395,7 @@ pass_parallelize_loops::execute (function *fun) } unsigned int todo = 0; - if (parallelize_loops ()) + if (parallelize_loops (oacc_kernels_p)) { fun->curr_properties &= ~(PROP_gimple_eomp); |