diff options
79 files changed, 3983 insertions, 587 deletions
diff --git a/gcc/ChangeLog.gomp-nvptx b/gcc/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..8b555f25414 --- /dev/null +++ b/gcc/ChangeLog.gomp-nvptx @@ -0,0 +1,269 @@ +2016-11-08 Alexander Monakov <amonakov@ispras.ru> + + * omp-low.c (lower_lastprivate_clauses): Add missing call to + unshare_expr. + +2016-07-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_file_end): Do emit element count in + external declaration of __nvptx_stacks. + +2016-06-02 Alexander Monakov <amonakov@ispras.ru> + + * doc/invoke.texi (msoft-stack): Reword, fix errors. + +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_declare_function_name): Expand comments. + (nvptx_file_end): Do not emit element count in external declaration of + __nvptx_stacks. + +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * doc/invoke.texi (msoft-stack): Rewrite. + +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_encode_section_info): Diagnose explicit + static initialization of variables in .shared memory. + (nvptx_handle_shared_attribute): Reword diagnostic message. + +2016-04-19 Alexander Monakov <amonakov@ispras.ru> + + * doc/extend.texi (Nvidia PTX Variable Attributes): New section. + +2016-04-18 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (write_omp_entry): Adjust. + (nvptx_declare_function_name): Adjust. + +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.h (STACK_SIZE_MODE): Define. + +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Emit + 'mul.lo.u32' instead of 'mul.u32' for 32-bit ABI target. + (nvptx_declare_function_name): Ditto. + +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use + POINTER_SIZE instead of BITS_PER_WORD. + (nvptx_declare_function_name): Ditto. + (nvptx_output_return): Ditto. + (nvptx_file_end): Ditto. 
+ +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_option_override): Remove custom handling + of debug info options. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (write_omp_entry): Expand entry code to + initialize __nvptx_uni and __nvptx_stacks (based on pointer to storage + allocated by the libgomp plugin). + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_encode_section_info): Handle "shared" + attribute. + (nvptx_handle_shared_attribute): New. Use it... + (nvptx_attribute_table): ... here (new entry). + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_declare_function_name): Fix warning. + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * internal-fn.c (expand_GOMP_SIMT_LANE): Update. + (expand_GOMP_SIMT_LAST_LANE): New. + (expand_GOMP_SIMT_ORDERED_PRED): New. + (expand_GOMP_SIMT_VOTE_ANY): New. + (expand_GOMP_SIMT_XCHG_BFLY): New. + (expand_GOMP_SIMT_XCHG_IDX): New. + * internal-fn.def (GOMP_SIMT_LAST_LANE): New. + (GOMP_SIMT_ORDERED_PRED): New. + (GOMP_SIMT_VOTE_ANY): New. + (GOMP_SIMT_XCHG_BFLY): New. + (GOMP_SIMT_XCHG_IDX): New. + * omp-low.c (omp_maybe_offloaded_ctx): New, outlined from... + (create_omp_child_function): ...here. Simplify. + (omp_max_simt_vf): New. Use it... + (omp_max_vf): ...here. + (lower_rec_input_clauses): Add reduction lowering for SIMT execution. + (lower_lastprivate_clauses): Likewise, for lastprivate lowering. + (lower_omp_ordered): Likewise, for "ordered" lowering. + (expand_omp_simd): Update SIMT transforms. + (execute_omp_device_lower): Update. Fold SIMD ifns on SIMT targets. + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx-protos.h (nvptx_shuffle_kind): Move enum + declaration from nvptx.c. + (nvptx_gen_shuffle): Declare. + * config/nvptx/nvptx.c (nvptx_shuffle_kind): Moved to nvptx-protos.h. + (nvptx_gen_shuffle): No longer static. 
+ * config/nvptx/nvptx.md (UNSPEC_VOTE_BALLOT): New unspec. + (UNSPEC_LANEID): Ditto. + (UNSPECV_NOUNROLL): Ditto. + (nvptx_vote_ballot): New pattern. + (omp_simt_lane): Ditto. + (nvptx_nounroll): Ditto. + (omp_simt_last_lane): Ditto. + (omp_simt_ordered): Ditto. + (omp_simt_vote_any): Ditto. + (omp_simt_xchg_bfly): Ditto. + (omp_simt_xchg_idx): Ditto. + * target-insns.def (omp_simt_lane): New. + (omp_simt_last_lane): New. + (omp_simt_ordered): New. + (omp_simt_vote_any): New. + (omp_simt_xchg_bfly): New. + (omp_simt_xchg_idx): New. + +2015-12-16 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (write_omp_entry): Work around failure on sm_5x + devices by loading ...$impl function address outside of call sequence. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_declare_function_name): Emit %outargs + using .local %outargs_ar only if not TARGET_SOFT_STACK. Emit %outargs + under TARGET_SOFT_STACK by offsetting from %frame. + (nvptx_get_drap_rtx): Return %argp as the DRAP if needed. + * config/nvptx/nvptx.md (nvptx_register_operand): Allow %outargs under + TARGET_SOFT_STACK. + (nvptx_nonimmediate_operand): Ditto. + (allocate_stack): Implement for TARGET_SOFT_STACK. Remove unused code. + (allocate_stack_<mode>): Remove unused pattern. + (set_softstack_insn): New pattern. + (restore_stack_block): Handle for TARGET_SOFT_STACK. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_record_offload_symbol): Allow NULL attrs + for OpenMP offloading. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * internal-fn.c (expand_GOMP_SIMT_LANE): New. + (expand_GOMP_SIMT_VF): New. + * internal-fn.def (GOMP_SIMT_LANE): New. + (GOMP_SIMT_VF): New. + * omp-low.c (expand_omp_simd): Do SIMT transforms. + (pass_data_lower_omp): Add PROP_gimple_lomp_dev. + (execute_omp_device_lower): New. + (pass_data_omp_device_lower): New. + (pass_omp_device_lower): New pass. + (make_pass_omp_device_lower): New. 
+ * passes.def (pass_omp_device_lower): Position new pass. + * tree-pass.h (PROP_gimple_lomp_dev): Define. + (make_pass_omp_device_lower): Declare. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/mkoffload.c (main): Check that either OpenACC or OpenMP + is selected. Pass -mgomp to offload compiler in OpenMP case. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_simt_vf): New. + (TARGET_SIMT_VF): Define. + * doc/tm.texi: Regenerate. + * doc/tm.texi.in: (TARGET_SIMT_VF): New hook. + * target.def: Define it. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_option_override): Handle TARGET_GOMP. + * config/nvptx/nvptx.opt (mgomp): New option. + * config/nvptx/t-nvptx (MULTILIB_OPTIONS): New. + * doc/invoke.texi (mgomp): Document. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_name_replacement): Rewrite. Add + __nvptx_real_malloc -> malloc and __nvptx_real_free -> free + replacements. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (need_unisimt_decl): New variable. Set it... + (nvptx_init_unisimt_predicate): ...here (new function) and use it... + (nvptx_file_end): ...here to emit declaration of __nvptx_uni array. + (nvptx_declare_function_name): Call nvptx_init_unisimt_predicate. + (nvptx_get_unisimt_master): New helper function. + (nvptx_get_unisimt_predicate): Ditto. + (nvptx_call_insn_is_syscall_p): Ditto. + (nvptx_unisimt_handle_set): Ditto. + (nvptx_reorg_uniform_simt): New. Transform code for -muniform-simt. + (nvptx_get_axis_predicate): New helper function, factored out from... + (nvptx_single): ...here. + (nvptx_reorg): Call nvptx_reorg_uniform_simt. + * config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define + __nvptx_unisimt__ when -muniform-simt option is active. + (struct machine_function): Add unisimt_master, unisimt_predicate + rtx fields. 
+ * config/nvptx/nvptx.md (divergent): New attribute. + (atomic_compare_and_swap<mode>_1): Mark as divergent. + (atomic_exchange<mode>): Ditto. + (atomic_fetch_add<mode>): Ditto. + (atomic_fetch_addsf): Ditto. + (atomic_fetch_<logic><mode>): Ditto. + * config/nvptx/nvptx.opt (muniform-simt): New option. + * doc/invoke.texi (-muniform-simt): Document. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_output_call_insn): Handle COND_EXEC + patterns. Emit instruction predicate. + (nvptx_print_operand): Unbreak handling of instruction predicates. + * config/nvptx/nvptx.md (predicable): New attribute. Generate + predicated forms via define_cond_exec. + (br_true): Mark as not predicable. + (br_false): Ditto. + (br_true_uni): Ditto. + (br_false_uni): Ditto. + (return): Ditto. + (trap_if_true): Ditto. + (trap_if_false): Ditto. + (nvptx_fork): Ditto. + (nvptx_forked): Ditto. + (nvptx_joining): Ditto. + (nvptx_join): Ditto. + (nvptx_barsync): Ditto. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c: (need_softstack_decl): New variable. + (nvptx_declare_function_name): Handle TARGET_SOFT_STACK. + (nvptx_output_return): Emit stack restore if needed. + (nvptx_file_end): Handle need_softstack_decl. + * config/nvptx/nvptx.h: (TARGET_CPU_CPP_BUILTINS): Define + __nvptx_softstack__ when -msoft-stack is active. + (struct machine_function): New bool field using_softstack. + * config/nvptx/nvptx.opt: (msoft-stack): New option. + * doc/invoke.texi (msoft-stack): Document. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c: (write_omp_entry): New. Use it... + (nvptx_declare_function_name): ...here to emit pointers for libgomp. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * omp-low.c (create_omp_child_function): Set "omp target entrypoint", + "omp acc target entrypoint" or "omp declare target" attribute based on + is_gimple_omp_offloaded and is_gimple_omp_oacc. 
+ * config/nvptx/nvptx.c (write_as_kernel): Test OpenACC-specific + attribute "omp acc target entrypoint". Add a comment about the OpenMP + attribute handling. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/mkoffload.c (main): Allow -fopenmp. diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c index c8eed451078..e99ef373770 100644 --- a/gcc/config/nvptx/mkoffload.c +++ b/gcc/config/nvptx/mkoffload.c @@ -460,6 +460,7 @@ main (int argc, char **argv) /* Scan the argument vector. */ bool fopenmp = false; + bool fopenacc = false; for (int i = 1; i < argc; i++) { #define STR "-foffload-abi=" @@ -476,11 +477,15 @@ main (int argc, char **argv) #undef STR else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; + else if (strcmp (argv[i], "-fopenacc") == 0) + fopenacc = true; else if (strcmp (argv[i], "-save-temps") == 0) save_temps = true; else if (strcmp (argv[i], "-v") == 0) verbose = true; } + if (!(fopenacc ^ fopenmp)) + fatal_error (input_location, "either -fopenacc or -fopenmp must be set"); struct obstack argv_obstack; obstack_init (&argv_obstack); @@ -501,6 +506,8 @@ main (int argc, char **argv) default: gcc_unreachable (); } + if (fopenmp) + obstack_ptr_grow (&argv_obstack, "-mgomp"); for (int ix = 1; ix != argc; ix++) { @@ -517,8 +524,8 @@ main (int argc, char **argv) fatal_error (input_location, "cannot open '%s'", ptx_cfile_name); /* PR libgomp/65099: Currently, we only support offloading in 64-bit - configurations. PR target/67822: OpenMP offloading to nvptx fails. */ - if (offload_abi == OFFLOAD_ABI_LP64 && !fopenmp) + configurations. 
*/ + if (offload_abi == OFFLOAD_ABI_LP64) { ptx_name = make_temp_file (".mkoffload"); obstack_ptr_grow (&argv_obstack, "-o"); diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index ec4588e6dc0..331ec0af6bf 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -21,6 +21,16 @@ #ifndef GCC_NVPTX_PROTOS_H #define GCC_NVPTX_PROTOS_H +/* The kind of shuffe instruction. */ +enum nvptx_shuffle_kind +{ + SHUFFLE_UP, + SHUFFLE_DOWN, + SHUFFLE_BFLY, + SHUFFLE_IDX, + SHUFFLE_MAX +}; + extern void nvptx_declare_function_name (FILE *, const char *, const_tree decl); extern void nvptx_declare_object_name (FILE *file, const char *name, const_tree decl); @@ -36,10 +46,12 @@ extern void nvptx_register_pragmas (void); extern void nvptx_expand_oacc_fork (unsigned); extern void nvptx_expand_oacc_join (unsigned); extern void nvptx_expand_call (rtx, rtx); +extern rtx nvptx_gen_shuffle (rtx, rtx, rtx, nvptx_shuffle_kind); extern rtx nvptx_expand_compare (rtx); extern const char *nvptx_ptx_type_from_mode (machine_mode, bool); extern const char *nvptx_output_mov_insn (rtx, rtx); extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx); extern const char *nvptx_output_return (void); +extern const char *nvptx_output_set_softstack (unsigned); #endif #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 782bbdecb37..68bf0a89a81 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -72,16 +72,6 @@ /* This file should be included last. */ #include "target-def.h" -/* The kind of shuffe instruction. */ -enum nvptx_shuffle_kind -{ - SHUFFLE_UP, - SHUFFLE_DOWN, - SHUFFLE_BFLY, - SHUFFLE_IDX, - SHUFFLE_MAX -}; - /* The various PTX memory areas an object might reside in. */ enum nvptx_data_area { @@ -141,6 +131,12 @@ static GTY(()) rtx worker_red_sym; /* Global lock variable, needed for 128bit worker & gang reductions. 
*/ static GTY(()) tree global_lock_var; +/* True if any function references __nvptx_stacks. */ +static bool need_softstack_decl; + +/* True if any function references __nvptx_uni. */ +static bool need_unisimt_decl; + /* Allocate a new, cleared machine_function structure. */ static struct machine_function * @@ -151,6 +147,16 @@ nvptx_init_machine_status (void) return p; } +/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL) + and -fopenacc is also enabled. */ + +static void +diagnose_openacc_conflict (bool optval, const char *optname) +{ + if (flag_openacc && optval) + error ("option %s is not supported together with -fopenacc", optname); +} + /* Implement TARGET_OPTION_OVERRIDE. */ static void @@ -188,6 +194,13 @@ nvptx_option_override (void) worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + + diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); + diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); + diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); + + if (TARGET_GOMP) + target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT; } /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to @@ -238,9 +251,17 @@ nvptx_encode_section_info (tree decl, rtx rtl, int first) if (TREE_CONSTANT (decl)) area = DATA_AREA_CONST; else if (TREE_CODE (decl) == VAR_DECL) - /* TODO: This would be a good place to check for a .shared or - other section name. */ - area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL; + { + if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl))) + { + area = DATA_AREA_SHARED; + if (DECL_INITIAL (decl)) + error ("static initialization of variable %q+D in %<.shared%>" + " memory is not supported", decl); + } + else + area = TREE_READONLY (decl) ? 
DATA_AREA_CONST : DATA_AREA_GLOBAL; + } SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area); } @@ -718,7 +739,10 @@ static bool write_as_kernel (tree attrs) { return (lookup_attribute ("kernel", attrs) != NULL_TREE - || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE); + || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE + && lookup_attribute ("oacc function", attrs) != NULL_TREE)); + /* For OpenMP target regions, the corresponding kernel entry is emitted from + write_omp_entry as a separate function. */ } /* Emit a linker marker for a function decl or defn. */ @@ -973,6 +997,67 @@ init_frame (FILE *file, int regno, unsigned align, unsigned size) POINTER_SIZE, reg_names[regno], reg_names[regno]); } +/* Emit soft stack frame setup sequence. */ + +static void +init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size) +{ + /* Maintain 64-bit stack alignment. */ + unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT; + size = ROUND_UP (size, keep_align); + int bits = POINTER_SIZE; + const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; + const char *reg_frame = reg_names[FRAME_POINTER_REGNUM]; + const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM]; + const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM]; + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack); + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame); + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot); + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev); + fprintf (file, "\t{\n"); + fprintf (file, "\t\t.reg.u32 %%fstmp0;\n"); + fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits); + fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits); + fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n"); + fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n", + bits == 64 ? ".wide" : ".lo", bits / 8); + fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits); + + /* Initialize %sspslot = &__nvptx_stacks[tid.y]. 
*/ + fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot); + + /* Initialize %sspprev = __nvptx_stacks[tid.y]. */ + fprintf (file, "\t\tld.shared.u%d %s, [%s];\n", + bits, reg_sspprev, reg_sspslot); + + /* Initialize %frame = %sspprev - size. */ + fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", + bits, reg_frame, reg_sspprev, size); + + /* Apply alignment, if larger than 64. */ + if (alignment > keep_align) + fprintf (file, "\t\tand.b%d %s, %s, %d;\n", + bits, reg_frame, reg_frame, -alignment); + + size = crtl->outgoing_args_size; + gcc_assert (size % keep_align == 0); + + /* Initialize %stack. */ + fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", + bits, reg_stack, reg_frame, size); + + /* Usually 'crtl->is_leaf' is computed during register allocator + initialization, which is not done on NVPTX. Compute it now. */ + gcc_assert (!crtl->is_leaf); + crtl->is_leaf = leaf_function_p (); + if (!crtl->is_leaf) + fprintf (file, "\t\tst.shared.u%d [%s], %s;\n", + bits, reg_sspslot, reg_stack); + fprintf (file, "\t}\n"); + cfun->machine->has_softstack = true; + need_softstack_decl = true; +} + /* Emit code to initialize the REGNO predicate register to indicate whether we are not lane zero on the NAME axis. */ @@ -986,6 +1071,97 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name) fprintf (file, "\t}\n"); } +/* Emit code to initialize predicate and master lane index registers for + -muniform-simt code generation variant. 
*/ + +static void +nvptx_init_unisimt_predicate (FILE *file) +{ + int bits = POINTER_SIZE; + int master = REGNO (cfun->machine->unisimt_master); + int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t{\n"); + fprintf (file, "\t\t.reg.u32 %%ustmp0;\n"); + fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits); + fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits); + fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); + fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", + bits == 64 ? ".wide" : ".lo"); + fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits); + fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits); + fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master); + fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n"); + /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */ + fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); + /* Compute predicate as 'tid.x == master'. */ + fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); + fprintf (file, "\t}\n"); + need_unisimt_decl = true; +} + +/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region: + + extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg); + void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize) + { + __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1); + __nvptx_uni[tid.y] = 0; + gomp_nvptx_main (ORIG, arg); + } + ORIG itself should not be emitted as a PTX .entry function. 
*/ + +static void +write_omp_entry (FILE *file, const char *name, const char *orig) +{ + static bool gomp_nvptx_main_declared; + if (!gomp_nvptx_main_declared) + { + gomp_nvptx_main_declared = true; + write_fn_marker (func_decls, false, true, "gomp_nvptx_main"); + func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE + << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n"; + } +#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\ + (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\ +{\n\ + .reg.u32 %r<3>;\n\ + .reg.u" PS " %R<4>;\n\ + mov.u32 %r0, %tid.y;\n\ + mov.u32 %r1, %ntid.y;\n\ + mov.u32 %r2, %ctaid.x;\n\ + cvt.u" PS ".u32 %R1, %r0;\n\ + " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\ + mov.u" PS " %R0, __nvptx_stacks;\n\ + " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\ + ld.param.u" PS " %R2, [%stack];\n\ + ld.param.u" PS " %R3, [%sz];\n\ + add.u" PS " %R2, %R2, %R3;\n\ + mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\ + st.shared.u" PS " [%R0], %R2;\n\ + mov.u" PS " %R0, __nvptx_uni;\n\ + " MAD_PS_32 " %R0, %r0, 4, %R0;\n\ + mov.u32 %r0, 0;\n\ + st.shared.u32 [%R0], %r0;\n\ + mov.u" PS " %R0, \0;\n\ + ld.param.u" PS " %R1, [%arg];\n\ + {\n\ + .param.u" PS " %P<2>;\n\ + st.param.u" PS " [%P0], %R0;\n\ + st.param.u" PS " [%P1], %R1;\n\ + call.uni gomp_nvptx_main, (%P0, %P1);\n\ + }\n\ + ret.uni;\n\ +}\n" + static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32"); + static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 "); +#undef ENTRY_TEMPLATE + const char *entry_1 = TARGET_ABI64 ? entry64 : entry32; + /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */ + const char *entry_2 = entry_1 + strlen (entry64) + 1; + fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2); + need_softstack_decl = need_unisimt_decl = true; +} + /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx function, including local var decls and copies from the arguments to local regs. 
*/ @@ -997,6 +1173,14 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) tree result_type = TREE_TYPE (fntype); int argno = 0; + if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl)) + && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl))) + { + char *buf = (char *) alloca (strlen (name) + sizeof ("$impl")); + sprintf (buf, "%s$impl", name); + write_omp_entry (file, name, buf); + name = buf; + } /* We construct the initial part of the function into a string stream, in order to share the prototype writing code. */ std::stringstream s; @@ -1034,19 +1218,24 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) fprintf (file, "%s", s.str().c_str()); - /* Declare a local var for outgoing varargs. */ - if (cfun->machine->has_varadic) - init_frame (file, STACK_POINTER_REGNUM, - UNITS_PER_WORD, crtl->outgoing_args_size); - - /* Declare a local variable for the frame. Force its size to be - DImode-compatible. */ HOST_WIDE_INT sz = get_frame_size (); - if (sz || cfun->machine->has_chain) - init_frame (file, FRAME_POINTER_REGNUM, - crtl->stack_alignment_needed / BITS_PER_UNIT, - (sz + GET_MODE_SIZE (DImode) - 1) - & ~(HOST_WIDE_INT)(GET_MODE_SIZE (DImode) - 1)); + bool need_frameptr = sz || cfun->machine->has_chain; + int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; + if (!TARGET_SOFT_STACK) + { + /* Declare a local var for outgoing varargs. */ + if (cfun->machine->has_varadic) + init_frame (file, STACK_POINTER_REGNUM, + UNITS_PER_WORD, crtl->outgoing_args_size); + + /* Declare a local variable for the frame. Force its size to be + DImode-compatible. */ + if (need_frameptr) + init_frame (file, FRAME_POINTER_REGNUM, alignment, + ROUND_UP (sz, GET_MODE_SIZE (DImode))); + } + else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca) + init_softstack_frame (file, alignment, sz); /* Declare the pseudos we have as ptx registers. 
*/ int maxregs = max_reg_num (); @@ -1072,8 +1261,25 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) if (cfun->machine->axis_predicate[1]) nvptx_init_axis_predicate (file, REGNO (cfun->machine->axis_predicate[1]), "x"); + if (cfun->machine->unisimt_predicate) + nvptx_init_unisimt_predicate (file); } +/* Output instruction that sets soft stack pointer in shared memory to the + value in register given by SRC_REGNO. */ + +const char * +nvptx_output_set_softstack (unsigned src_regno) +{ + if (cfun->machine->has_softstack && !crtl->is_leaf) + { + fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ", + POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]); + output_reg (asm_out_file, src_regno, VOIDmode); + fprintf (asm_out_file, ";\n"); + } + return ""; +} /* Output a return instruction. Also copy the return value to its outgoing location. */ @@ -1113,6 +1319,8 @@ nvptx_function_ok_for_sibcall (tree, tree) static rtx nvptx_get_drap_rtx (void) { + if (TARGET_SOFT_STACK && stack_realign_drap) + return arg_pointer_rtx; return NULL_RTX; } @@ -1311,7 +1519,7 @@ nvptx_gen_pack (rtx dst, rtx src0, rtx src1) /* Generate an instruction or sequence to broadcast register REG across the vectors of a single warp. */ -static rtx +rtx nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) { rtx res; @@ -1833,6 +2041,8 @@ nvptx_output_mov_insn (rtx dst, rtx src) return "%.\tcvt%t0%t1\t%0, %1;"; } +static void nvptx_print_operand (FILE *, rtx, int); + /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this involves writing .param declarations and in/out copies into them. For indirect calls, also write the .callprototype. 
*/ @@ -1844,6 +2054,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) static int labelno; bool needs_tgt = register_operand (callee, Pmode); rtx pat = PATTERN (insn); + if (GET_CODE (pat) == COND_EXEC) + pat = COND_EXEC_CODE (pat); int arg_end = XVECLEN (pat, 0); tree decl = NULL_TREE; @@ -1888,6 +2100,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) fprintf (asm_out_file, ";\n"); } + /* The '.' stands for the call's predicate, if any. */ + nvptx_print_operand (asm_out_file, NULL_RTX, '.'); fprintf (asm_out_file, "\t\tcall "); if (result != NULL_RTX) fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]); @@ -1951,8 +2165,6 @@ nvptx_print_operand_punct_valid_p (unsigned char c) return c == '.' || c== '#'; } -static void nvptx_print_operand (FILE *, rtx, int); - /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */ static void @@ -2013,12 +2225,10 @@ nvptx_print_operand (FILE *file, rtx x, int code) x = current_insn_predicate; if (x) { - unsigned int regno = REGNO (XEXP (x, 0)); - fputs ("[", file); + fputs ("@", file); if (GET_CODE (x) == EQ) fputs ("!", file); - fputs (reg_names [regno], file); - fputs ("]", file); + output_reg (file, REGNO (XEXP (x, 0)), VOIDmode); } return; } @@ -2313,6 +2523,89 @@ nvptx_reorg_subreg (void) } } +/* Return a SImode "master lane index" register for uniform-simt, allocating on + first use. */ + +static rtx +nvptx_get_unisimt_master () +{ + rtx &master = cfun->machine->unisimt_master; + return master ? master : master = gen_reg_rtx (SImode); +} + +/* Return a BImode "predicate" register for uniform-simt, similar to above. */ + +static rtx +nvptx_get_unisimt_predicate () +{ + rtx &pred = cfun->machine->unisimt_predicate; + return pred ? pred : pred = gen_reg_rtx (BImode); +} + +/* Return true if given call insn references one of the functions provided by + the CUDA runtime: malloc, free, vprintf. 
*/ + +static bool +nvptx_call_insn_is_syscall_p (rtx_insn *insn) +{ + rtx pat = PATTERN (insn); + gcc_checking_assert (GET_CODE (pat) == PARALLEL); + pat = XVECEXP (pat, 0, 0); + if (GET_CODE (pat) == SET) + pat = SET_SRC (pat); + gcc_checking_assert (GET_CODE (pat) == CALL + && GET_CODE (XEXP (pat, 0)) == MEM); + rtx addr = XEXP (XEXP (pat, 0), 0); + if (GET_CODE (addr) != SYMBOL_REF) + return false; + const char *name = XSTR (addr, 0); + /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the + references with forced assembler name refer to PTX syscalls. For vprintf, + accept both normal and forced-assembler-name references. */ + return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf") + || !strcmp (name, "*malloc") + || !strcmp (name, "*free")); +} + +/* If SET subexpression of INSN sets a register, emit a shuffle instruction to + propagate its value from lane MASTER to current lane. */ + +static void +nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master) +{ + rtx reg; + if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set))) + emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn); +} + +/* Adjust code for uniform-simt code generation variant by making atomics and + "syscalls" conditionally executed, and inserting shuffle-based propagation + for registers being set. 
*/ + +static void +nvptx_reorg_uniform_simt () +{ + rtx_insn *insn, *next; + + for (insn = get_insns (); insn; insn = next) + { + next = NEXT_INSN (insn); + if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) + && !(NONJUMP_INSN_P (insn) + && GET_CODE (PATTERN (insn)) == PARALLEL + && get_attr_atomic (insn))) + continue; + rtx pat = PATTERN (insn); + rtx master = nvptx_get_unisimt_master (); + for (int i = 0; i < XVECLEN (pat, 0); i++) + nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + rtx pred = nvptx_get_unisimt_predicate (); + pred = gen_rtx_NE (BImode, pred, const0_rtx); + pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); + validate_change (insn, &PATTERN (insn), pat, false); + } +} + /* Loop structure of the function. The entire function is described as a NULL loop. */ @@ -3091,11 +3384,17 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) int ix; /* First clear each BB of the whole function. */ - FOR_ALL_BB_FN (block, cfun) + FOR_EACH_BB_FN (block, cfun) { block->flags &= ~BB_VISITED; BB_SET_SESE (block, 0); } + block = EXIT_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + BB_SET_SESE (block, 0); + block = ENTRY_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + BB_SET_SESE (block, 0); /* Mark blocks in the function that are in this graph. */ for (ix = 0; blocks.iterate (ix, &block); ix++) @@ -3829,6 +4128,9 @@ nvptx_reorg (void) /* Replace subregs. */ nvptx_reorg_subreg (); + if (TARGET_UNIFORM_SIMT) + nvptx_reorg_uniform_simt (); + regstat_free_n_sets_and_refs (); df_finish_pass (true); @@ -3857,12 +4159,36 @@ nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args), return NULL_TREE; } +/* Handle a "shared" attribute; arguments as in + struct attribute_spec.handler. 
*/ + +static tree +nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args), + int ARG_UNUSED (flags), bool *no_add_attrs) +{ + tree decl = *node; + + if (TREE_CODE (decl) != VAR_DECL) + { + error ("%qE attribute only applies to variables", name); + *no_add_attrs = true; + } + else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl))) + { + error ("%qE attribute not allowed with auto storage class", name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + /* Table of valid machine attributes. */ static const struct attribute_spec nvptx_attribute_table[] = { /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler, affects_type_identity } */ { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false }, + { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false }, { NULL, 0, 0, false, false, false, NULL, false } }; @@ -3924,13 +4250,13 @@ nvptx_record_offload_symbol (tree decl) case FUNCTION_DECL: { tree attr = get_oacc_fn_attrib (decl); - tree dims = TREE_VALUE (attr); - unsigned ix; + /* OpenMP offloading does not set this attribute. */ + tree dims = attr ? TREE_VALUE (attr) : NULL_TREE; fprintf (asm_out_file, "//:FUNC_MAP \"%s\"", IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl))); - for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) + for (; dims; dims = TREE_CHAIN (dims)) { int size = TREE_INT_CST_LOW (TREE_VALUE (dims)); @@ -3991,6 +4317,21 @@ nvptx_file_end (void) if (worker_red_size) write_worker_buffer (asm_out_file, worker_red_sym, worker_red_align, worker_red_size); + + if (need_softstack_decl) + { + write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); + /* 32 is the maximum number of warps in a block. Even though it's an + external declaration, emit the array size explicitly; otherwise, it + may fail at PTX JIT time if the definition is later in link order. 
*/ + fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n", + POINTER_SIZE); + } + if (need_unisimt_decl) + { + write_var_marker (asm_out_file, false, true, "__nvptx_uni"); + fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n"); + } } /* Expander for the shuffle builtins. */ @@ -4176,6 +4517,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), #define PTX_WORKER_LENGTH 32 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */ +/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ + +static int +nvptx_simt_vf () +{ + return PTX_VECTOR_LENGTH; +} + /* Validate compute dimensions of an OpenACC offload or routine, fill in non-unity defaults. FN_LEVEL indicates the level at which a routine might spawn a loop. It is negative for non-routines. If @@ -4944,6 +5293,9 @@ nvptx_goacc_reduction (gcall *call) #undef TARGET_BUILTIN_DECL #define TARGET_BUILTIN_DECL nvptx_builtin_decl +#undef TARGET_SIMT_VF +#define TARGET_SIMT_VF nvptx_simt_vf + #undef TARGET_GOACC_VALIDATE_DIMS #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 381269e3bcc..1702178eeb9 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -31,6 +31,10 @@ builtin_assert ("machine=nvptx"); \ builtin_assert ("cpu=nvptx"); \ builtin_define ("__nvptx__"); \ + if (TARGET_SOFT_STACK) \ + builtin_define ("__nvptx_softstack__"); \ + if (TARGET_UNIFORM_SIMT) \ + builtin_define ("__nvptx_unisimt__"); \ } while (0) /* Avoid the default in ../../gcc.c, which adds "-pthread", which is not @@ -79,13 +83,14 @@ #define POINTER_SIZE (TARGET_ABI64 ? 64 : 32) #define Pmode (TARGET_ABI64 ? DImode : SImode) +#define STACK_SIZE_MODE Pmode /* Registers. Since ptx is a virtual target, we just define a few hard registers for special purposes and leave pseudos unallocated. We have to have some available hard registers, to keep gcc setup happy. 
*/ #define FIRST_PSEUDO_REGISTER 16 -#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } +#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 } #define CALL_USED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } #define HARD_REGNO_NREGS(REG, MODE) \ @@ -133,10 +138,17 @@ enum reg_class { NO_REGS, ALL_REGS, LIM_REG_CLASSES }; #define FRAME_POINTER_REGNUM 2 #define ARG_POINTER_REGNUM 3 #define STATIC_CHAIN_REGNUM 4 +/* This register points to the shared memory location with the current warp's + soft stack pointer (__nvptx_stacks[tid.y]). */ +#define SOFTSTACK_SLOT_REGNUM 5 +/* This register is used to save the previous value of the soft stack pointer + in the prologue and restore it when returning. */ +#define SOFTSTACK_PREV_REGNUM 6 #define REGISTER_NAMES \ { \ - "%value", "%stack", "%frame", "%args", "%chain", "%hr5", "%hr6", "%hr7", \ + "%value", "%stack", "%frame", "%args", \ + "%chain", "%sspslot", "%sspprev", "%hr7", \ "%hr8", "%hr9", "%hr10", "%hr11", "%hr12", "%hr13", "%hr14", "%hr15" \ } @@ -200,10 +212,13 @@ struct GTY(()) machine_function bool is_varadic; /* This call is varadic */ bool has_varadic; /* Current function has a varadic call. */ bool has_chain; /* Current function has outgoing static chain. */ + bool has_softstack; /* Current function has a soft stack frame. */ int num_args; /* Number of args of current call. */ int return_mode; /* Return mode of current fn. (machine_mode not defined yet.) */ rtx axis_predicate[2]; /* Neutering predicates. */ + rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */ + rtx unisimt_predicate; /* Predicate for -muniform-simt. 
*/ }; #endif diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index d117343c531..91d11290860 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -36,10 +36,16 @@ UNSPEC_ALLOCA + UNSPEC_SET_SOFTSTACK + UNSPEC_DIM_SIZE UNSPEC_BIT_CONV + UNSPEC_VOTE_BALLOT + + UNSPEC_LANEID + UNSPEC_SHUFFLE UNSPEC_BR_UNIFIED ]) @@ -55,11 +61,16 @@ UNSPECV_FORKED UNSPECV_JOINING UNSPECV_JOIN + + UNSPECV_NOUNROLL ]) (define_attr "subregs_ok" "false,true" (const_string "false")) +(define_attr "atomic" "false,true" + (const_string "false")) + ;; The nvptx operand predicates, in general, don't permit subregs and ;; only literal constants, which differ from the generic ones, which ;; permit subregs and symbolc constants (as appropriate) @@ -124,6 +135,17 @@ return true; }) +(define_attr "predicable" "false,true" + (const_string "true")) + +(define_cond_exec + [(match_operator 0 "predicate_operator" + [(match_operand:BI 1 "nvptx_register_operand" "") + (match_operand:BI 2 "const0_operand" "")])] + "" + "" + ) + (define_constraint "P0" "An integer with the value 0." 
(and (match_code "const_int") @@ -509,7 +531,8 @@ (label_ref (match_operand 1 "" "")) (pc)))] "" - "%j0\\tbra\\t%l1;") + "%j0\\tbra\\t%l1;" + [(set_attr "predicable" "false")]) (define_insn "br_false" [(set (pc) @@ -518,7 +541,8 @@ (label_ref (match_operand 1 "" "")) (pc)))] "" - "%J0\\tbra\\t%l1;") + "%J0\\tbra\\t%l1;" + [(set_attr "predicable" "false")]) ;; unified conditional branch (define_insn "br_true_uni" @@ -527,7 +551,8 @@ UNSPEC_BR_UNIFIED) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" - "%j0\\tbra.uni\\t%l1;") + "%j0\\tbra.uni\\t%l1;" + [(set_attr "predicable" "false")]) (define_insn "br_false_uni" [(set (pc) (if_then_else @@ -535,7 +560,8 @@ UNSPEC_BR_UNIFIED) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" - "%J0\\tbra.uni\\t%l1;") + "%J0\\tbra.uni\\t%l1;" + [(set_attr "predicable" "false")]) (define_expand "cbranch<mode>4" [(set (pc) @@ -938,12 +964,16 @@ "" { return nvptx_output_return (); -}) +} + [(set_attr "predicable" "false")]) (define_expand "epilogue" [(clobber (const_int 0))] "" { + if (TARGET_SOFT_STACK) + emit_insn (gen_set_softstack_insn (gen_rtx_REG (Pmode, + SOFTSTACK_PREV_REGNUM))); emit_jump_insn (gen_return ()); DONE; }) @@ -972,31 +1002,40 @@ (match_operand 1 "nvptx_register_operand")] "" { + if (TARGET_SOFT_STACK) + { + emit_move_insn (stack_pointer_rtx, + gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1])); + emit_insn (gen_set_softstack_insn (stack_pointer_rtx)); + emit_move_insn (operands[0], virtual_stack_dynamic_rtx); + DONE; + } /* The ptx documentation specifies an alloca intrinsic (for 32 bit only) but notes it is not implemented. The assembler emits a confused error message. Issue a blunt one now instead. 
*/ sorry ("target cannot support alloca."); emit_insn (gen_nop ()); DONE; - if (TARGET_ABI64) - emit_insn (gen_allocate_stack_di (operands[0], operands[1])); - else - emit_insn (gen_allocate_stack_si (operands[0], operands[1])); - DONE; }) -(define_insn "allocate_stack_<mode>" - [(set (match_operand:P 0 "nvptx_register_operand" "=R") - (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")] - UNSPEC_ALLOCA))] - "" - "%.\\tcall (%0), %%alloca, (%1);") +(define_insn "set_softstack_insn" + [(unspec [(match_operand 0 "nvptx_register_operand" "R")] + UNSPEC_SET_SOFTSTACK)] + "TARGET_SOFT_STACK" +{ + return nvptx_output_set_softstack (REGNO (operands[0])); +}) (define_expand "restore_stack_block" [(match_operand 0 "register_operand" "") (match_operand 1 "register_operand" "")] "" { + if (TARGET_SOFT_STACK) + { + emit_move_insn (operands[0], operands[1]); + emit_insn (gen_set_softstack_insn (operands[0])); + } DONE; }) @@ -1018,14 +1057,16 @@ (const_int 0)) (const_int 0))] "" - "%j0 trap;") + "%j0 trap;" + [(set_attr "predicable" "false")]) (define_insn "trap_if_false" [(trap_if (eq (match_operand:BI 0 "nvptx_register_operand" "R") (const_int 0)) (const_int 0))] "" - "%J0 trap;") + "%J0 trap;" + [(set_attr "predicable" "false")]) (define_expand "ctrap<mode>4" [(trap_if (match_operator 0 "nvptx_comparison_operator" @@ -1074,28 +1115,28 @@ UNSPECV_FORK)] "" "// fork %0;" -) + [(set_attr "predicable" "false")]) (define_insn "nvptx_forked" [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_FORKED)] "" "// forked %0;" -) + [(set_attr "predicable" "false")]) (define_insn "nvptx_joining" [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_JOINING)] "" "// joining %0;" -) + [(set_attr "predicable" "false")]) (define_insn "nvptx_join" [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_JOIN)] "" "// join %0;" -) + [(set_attr "predicable" "false")]) (define_expand "oacc_fork" [(set (match_operand:SI 0 
"nvptx_nonmemory_operand" "") @@ -1134,6 +1175,88 @@ "" "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;") +(define_insn "nvptx_vote_ballot" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (unspec:SI [(match_operand:BI 1 "nvptx_register_operand" "R")] + UNSPEC_VOTE_BALLOT))] + "" + "%.\\tvote.ballot.b32\\t%0, %1;") + +;; Patterns for OpenMP SIMD-via-SIMT lowering + +;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index +(define_insn "omp_simt_lane" + [(set (match_operand:SI 0 "nvptx_register_operand" "") + (unspec:SI [(const_int 0)] UNSPEC_LANEID))] + "" + "%.\\tmov.u32\\t%0, %%laneid;") + +;; Implement IFN_GOMP_SIMT_ORDERED: copy operand 1 to operand 0 and +;; place a compiler barrier to disallow unrolling/peeling the containing loop +(define_expand "omp_simt_ordered" + [(match_operand:SI 0 "nvptx_register_operand" "=R") + (match_operand:SI 1 "nvptx_register_operand" "R")] + "" +{ + emit_move_insn (operands[0], operands[1]); + emit_insn (gen_nvptx_nounroll ()); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_XCHG_BFLY: perform a "butterfly" exchange +;; across lanes +(define_expand "omp_simt_xchg_bfly" + [(match_operand 0 "nvptx_register_operand" "=R") + (match_operand 1 "nvptx_register_operand" "R") + (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] + "" +{ + emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2], + SHUFFLE_BFLY)); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_XCHG_IDX: broadcast value in operand 1 +;; from lane given by index in operand 2 to operand 0 in all lanes +(define_expand "omp_simt_xchg_idx" + [(match_operand 0 "nvptx_register_operand" "=R") + (match_operand 1 "nvptx_register_operand" "R") + (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] + "" +{ + emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2], + SHUFFLE_IDX)); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_VOTE_ANY: +;; set operand 0 to zero iff all lanes supply zero in operand 1 +(define_expand "omp_simt_vote_any" + [(match_operand:SI 0 
"nvptx_register_operand" "=R") + (match_operand:SI 1 "nvptx_register_operand" "R")] + "" +{ + rtx pred = gen_reg_rtx (BImode); + emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx)); + emit_insn (gen_nvptx_vote_ballot (operands[0], pred)); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_LAST_LANE: +;; set operand 0 to the lowest lane index that passed non-zero in operand 1 +(define_expand "omp_simt_last_lane" + [(match_operand:SI 0 "nvptx_register_operand" "=R") + (match_operand:SI 1 "nvptx_register_operand" "R")] + "" +{ + rtx pred = gen_reg_rtx (BImode); + rtx tmp = gen_reg_rtx (SImode); + emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx)); + emit_insn (gen_nvptx_vote_ballot (tmp, pred)); + emit_insn (gen_ctzsi2 (operands[0], tmp)); + DONE; +}) + ;; extract parts of a 64 bit object into 2 32-bit ints (define_insn "unpack<mode>si2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") @@ -1186,7 +1309,8 @@ (set (match_dup 1) (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] "" - "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;") + "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;" + [(set_attr "atomic" "true")]) (define_insn "atomic_exchange<mode>" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output @@ -1197,7 +1321,8 @@ (set (match_dup 1) (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input "" - "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;") + "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_insn "atomic_fetch_add<mode>" [(set (match_operand:SDIM 1 "memory_operand" "+m") @@ -1209,7 +1334,8 @@ (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] "" - "%.\\tatom%A1.add%t0\\t%0, %1, %2;") + "%.\\tatom%A1.add%t0\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_insn "atomic_fetch_addsf" [(set (match_operand:SF 1 "memory_operand" "+m") @@ -1221,7 +1347,8 @@ (set (match_operand:SF 0 "nvptx_register_operand" "=R") (match_dup 1))] "" - "%.\\tatom%A1.add%t0\\t%0, %1, %2;") + 
"%.\\tatom%A1.add%t0\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_code_iterator any_logic [and ior xor]) (define_code_attr logic [(and "and") (ior "or") (xor "xor")]) @@ -1237,10 +1364,18 @@ (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] "0" - "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;") + "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_insn "nvptx_barsync" [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_BARSYNC)] "" - "\\tbar.sync\\t%0;") + "\\tbar.sync\\t%0;" + [(set_attr "predicable" "false")]) + +(define_insn "nvptx_nounroll" + [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)] + "" + "\\t.pragma \\\"nounroll\\\";" + [(set_attr "predicable" "false")]) diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 601cf124e36..cb6194da9dc 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -32,3 +32,15 @@ Link in code for a __main kernel. moptimize Target Report Var(nvptx_optimize) Init(-1) Optimize partition neutering. + +msoft-stack +Target Report Mask(SOFT_STACK) +Use custom stacks instead of local memory for automatic storage. + +muniform-simt +Target Report Mask(UNIFORM_SIMT) +Generate code that can keep local state uniform across all lanes. + +mgomp +Target Report Mask(GOMP) +Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt. 
diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index e2580c956e0..6c1010ddd66 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -8,3 +8,5 @@ ALL_HOST_OBJS += mkoffload.o mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBDEPS) +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS) + +MULTILIB_OPTIONS = mgomp diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 0669f7999be..4dcc7f6b555 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -5576,6 +5576,7 @@ attributes. * MeP Variable Attributes:: * Microsoft Windows Variable Attributes:: * MSP430 Variable Attributes:: +* Nvidia PTX Variable Attributes:: * PowerPC Variable Attributes:: * RL78 Variable Attributes:: * SPU Variable Attributes:: @@ -6257,6 +6258,20 @@ same name (@pxref{MSP430 Function Attributes}). These attributes can be applied to both functions and variables. @end table +@node Nvidia PTX Variable Attributes +@subsection Nvidia PTX Variable Attributes + +These variable attributes are supported by the Nvidia PTX back end: + +@table @code +@item shared +@cindex @code{shared} attribute, Nvidia PTX +Use this attribute to place a variable in the @code{.shared} memory space. +This memory space is private to each cooperative thread array; only threads +within one thread block refer to the same instance of the variable. +The runtime does not initialize variables in this memory space. +@end table + @node PowerPC Variable Attributes @subsection PowerPC Variable Attributes diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 17c5c225d0c..f06e2b8206d 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -20462,6 +20462,37 @@ offloading execution. Apply partitioned execution optimizations. This is the default when any level of optimization is selected. 
+@item -msoft-stack +@opindex msoft-stack +Generate code that does not use @code{.local} memory +directly for stack storage. Instead, a per-warp stack pointer is +maintained explicitly. This enables variable-length stack allocation (with +variable-length arrays or @code{alloca}), and when global memory is used for +underlying storage, makes it possible to access automatic variables from other +threads, or with atomic instructions. This code generation variant is used +for OpenMP offloading, but the option is exposed on its own for the purpose +of testing the compiler; to generate code suitable for linking into programs +using OpenMP offloading, use option @option{-mgomp}. + +@item -muniform-simt +@opindex muniform-simt +Switch to code generation variant that allows to execute all threads in each +warp, while maintaining memory state and side effects as if only one thread +in each warp was active outside of OpenMP SIMD regions. All atomic operations +and calls to runtime (malloc, free, vprintf) are conditionally executed (iff +current lane index equals the master lane index), and the register being +assigned is copied via a shuffle instruction from the master lane. Outside of +SIMD regions lane 0 is the master; inside, each thread sees itself as the +master. Shared memory array @code{int __nvptx_uni[]} stores all-zeros or +all-ones bitmasks for each warp, indicating current mode (0 outside of SIMD +regions). Each thread can bitwise-and the bitmask at position @code{tid.y} +with current lane index to compute the master lane index. + +@item -mgomp +@opindex mgomp +Generate code for use in OpenMP offloading: enables @option{-msoft-stack} and +@option{-muniform-simt} options, and selects corresponding multilib variant. + @end table @node PDP-11 Options diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 81c63b71bff..10122439c30 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5858,6 +5858,10 @@ usable. 
In that case, the smaller the number is, the more desirable it is to use it. @end deftypefn +@deftypefn {Target Hook} int TARGET_SIMT_VF (void) +Return number of threads in SIMT thread group on the target. +@end deftypefn + @deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level}) This hook should check the launch dimensions provided for an OpenACC compute region, or routine. Defaulted values are represented as -1 diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 74e9200f749..0a7eca32639 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4293,6 +4293,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_SIMD_CLONE_USABLE +@hook TARGET_SIMT_VF + @hook TARGET_GOACC_VALIDATE_DIMS @hook TARGET_GOACC_DIM_LIMIT diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index cbee97ea82c..fd1cd8b6445 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -157,6 +157,132 @@ expand_ANNOTATE (internal_fn, gcall *) gcc_unreachable (); } +/* Lane index on SIMT targets: thread index in the warp on NVPTX. On targets + without SIMT execution this should be expanded in omp_device_lower pass. */ + +static void +expand_GOMP_SIMT_LANE (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + gcc_assert (targetm.have_omp_simt_lane ()); + emit_insn (targetm.gen_omp_simt_lane (target)); +} + +/* This should get expanded in omp_device_lower pass. */ + +static void +expand_GOMP_SIMT_VF (internal_fn, gcall *) +{ + gcc_unreachable (); +} + +/* Lane index of the first SIMT lane that supplies a non-zero argument. + This is a SIMT counterpart to GOMP_SIMD_LAST_LANE, used to represent the + lane that executed the last iteration for handling OpenMP lastprivate. 
*/ + +static void +expand_GOMP_SIMT_LAST_LANE (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx cond = expand_normal (gimple_call_arg (stmt, 0)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], cond, mode); + gcc_assert (targetm.have_omp_simt_last_lane ()); + expand_insn (targetm.code_for_omp_simt_last_lane, 2, ops); +} + +/* Non-transparent predicate used in SIMT lowering of OpenMP "ordered". */ + +static void +expand_GOMP_SIMT_ORDERED_PRED (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx ctr = expand_normal (gimple_call_arg (stmt, 0)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], ctr, mode); + gcc_assert (targetm.have_omp_simt_ordered ()); + expand_insn (targetm.code_for_omp_simt_ordered, 2, ops); +} + +/* "Or" boolean reduction across SIMT lanes: return non-zero in all lanes if + any lane supplies a non-zero argument. */ + +static void +expand_GOMP_SIMT_VOTE_ANY (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx cond = expand_normal (gimple_call_arg (stmt, 0)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], cond, mode); + gcc_assert (targetm.have_omp_simt_vote_any ()); + expand_insn (targetm.code_for_omp_simt_vote_any, 2, ops); +} + +/* Exchange between SIMT lanes with a "butterfly" pattern: source lane index + is destination lane index XOR given offset. 
*/ + +static void +expand_GOMP_SIMT_XCHG_BFLY (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_normal (gimple_call_arg (stmt, 0)); + rtx idx = expand_normal (gimple_call_arg (stmt, 1)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[3]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], src, mode); + create_input_operand (&ops[2], idx, SImode); + gcc_assert (targetm.have_omp_simt_xchg_bfly ()); + expand_insn (targetm.code_for_omp_simt_xchg_bfly, 3, ops); +} + +/* Exchange between SIMT lanes according to given source lane index. */ + +static void +expand_GOMP_SIMT_XCHG_IDX (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_normal (gimple_call_arg (stmt, 0)); + rtx idx = expand_normal (gimple_call_arg (stmt, 1)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[3]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], src, mode); + create_input_operand (&ops[2], idx, SImode); + gcc_assert (targetm.have_omp_simt_xchg_idx ()); + expand_insn (targetm.code_for_omp_simt_xchg_idx, 3, ops); +} + /* This should get expanded in adjust_simduid_builtins. 
*/ static void diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 0869b2fd52d..77ce63a6408 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -141,6 +141,13 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST, ffs, unary) DEF_INTERNAL_INT_FN (PARITY, ECF_CONST, parity, unary) DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST, popcount, unary) +DEF_INTERNAL_FN (GOMP_SIMT_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_LAST_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_ORDERED_PRED, ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_VOTE_ANY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_XCHG_BFLY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_XCHG_IDX, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index e5b9e4c1091..da5476bc99f 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -2427,6 +2427,20 @@ cilk_for_check_loop_diff_type (tree type) } } +/* Return true if CTX may belong to offloaded code: either if current function + is offloaded, or any enclosing context corresponds to a target region. */ + +static bool +omp_maybe_offloaded_ctx (omp_context *ctx) +{ + if (cgraph_node::get (current_function_decl)->offloadable) + return true; + for (; ctx; ctx = ctx->outer) + if (is_gimple_omp_offloaded (ctx->stmt)) + return true; + return false; +} + /* Build a decl for the omp child function. It'll not contain a body yet, just the bare decl. 
*/ @@ -2475,28 +2489,24 @@ create_omp_child_function (omp_context *ctx, bool task_copy) DECL_CONTEXT (decl) = NULL_TREE; DECL_INITIAL (decl) = make_node (BLOCK); BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl; - if (cgraph_node::get (current_function_decl)->offloadable) - cgraph_node::get_create (decl)->offloadable = 1; - else + if (omp_maybe_offloaded_ctx (ctx)) { - omp_context *octx; - for (octx = ctx; octx; octx = octx->outer) - if (is_gimple_omp_offloaded (octx->stmt)) - { - cgraph_node::get_create (decl)->offloadable = 1; - if (ENABLE_OFFLOADING) - g->have_offload = true; - - break; - } + cgraph_node::get_create (decl)->offloadable = 1; + if (ENABLE_OFFLOADING) + g->have_offload = true; } if (cgraph_node::get_create (decl)->offloadable && !lookup_attribute ("omp declare target", DECL_ATTRIBUTES (current_function_decl))) - DECL_ATTRIBUTES (decl) - = tree_cons (get_identifier ("omp target entrypoint"), - NULL_TREE, DECL_ATTRIBUTES (decl)); + { + const char *target_attr = (is_gimple_omp_offloaded (ctx->stmt) + ? "omp target entrypoint" + : "omp declare target"); + DECL_ATTRIBUTES (decl) + = tree_cons (get_identifier (target_attr), + NULL_TREE, DECL_ATTRIBUTES (decl)); + } t = build_decl (DECL_SOURCE_LOCATION (decl), RESULT_DECL, NULL_TREE, void_type_node); @@ -4264,6 +4274,25 @@ omp_clause_aligned_alignment (tree clause) return build_int_cst (integer_type_node, al); } + +/* Return maximum SIMT width if offloading may target SIMT hardware. */ + +static int +omp_max_simt_vf (void) +{ + if (!optimize) + return 0; + if (ENABLE_OFFLOADING) + for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c; ) + { + if (!strncmp (c, "nvptx", strlen ("nvptx"))) + return 32; + else if ((c = strchr (c, ','))) + c++; + } + return 0; +} + /* Return maximum possible vectorization factor for the target. 
*/ static int @@ -4277,16 +4306,18 @@ omp_max_vf (void) || global_options_set.x_flag_tree_vectorize))) return 1; + int vf = 1; int vs = targetm.vectorize.autovectorize_vector_sizes (); if (vs) + vf = 1 << floor_log2 (vs); + else { - vs = 1 << floor_log2 (vs); - return vs; + machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode); + if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT) + vf = GET_MODE_NUNITS (vqimode); } - machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode); - if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT) - return GET_MODE_NUNITS (vqimode); - return 1; + int svf = omp_max_simt_vf (); + return MAX (vf, svf); } /* Helper function of lower_rec_input_clauses, used for #pragma omp simd @@ -4374,10 +4405,13 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, int pass; bool is_simd = (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD); + bool maybe_simt + = is_simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1; int max_vf = 0; tree lane = NULL_TREE, idx = NULL_TREE; + tree simt_lane = NULL_TREE; tree ivar = NULL_TREE, lvar = NULL_TREE; - gimple_seq llist[2] = { NULL, NULL }; + gimple_seq llist[3] = { }; copyin_seq = NULL; @@ -5251,6 +5285,16 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, gimplify_assign (unshare_expr (ivar), x, &llist[0]); + if (maybe_simt) + { + if (!simt_lane) + simt_lane = create_tmp_var (unsigned_type_node); + x = build_call_expr_internal_loc + (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_BFLY, + TREE_TYPE (ivar), 2, ivar, simt_lane); + x = build2 (code, TREE_TYPE (ivar), ivar, x); + gimplify_assign (ivar, x, &llist[2]); + } x = build2 (code, TREE_TYPE (ref), ref, ivar); ref = build_outer_var_ref (var, ctx); gimplify_assign (ref, x, &llist[1]); @@ -5303,6 +5347,39 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, g = gimple_build_assign (lane, INTEGER_CST, 
build_int_cst (unsigned_type_node, 0)); gimple_seq_add_stmt (ilist, g); + /* Emit reductions across SIMT lanes in log_2(simt_vf) steps. */ + if (llist[2]) + { + tree simt_vf = create_tmp_var (unsigned_type_node); + g = gimple_build_call_internal (IFN_GOMP_SIMT_VF, 0); + gimple_call_set_lhs (g, simt_vf); + gimple_seq_add_stmt (dlist, g); + + tree t = build_int_cst (unsigned_type_node, 1); + g = gimple_build_assign (simt_lane, INTEGER_CST, t); + gimple_seq_add_stmt (dlist, g); + + t = build_int_cst (unsigned_type_node, 0); + g = gimple_build_assign (idx, INTEGER_CST, t); + gimple_seq_add_stmt (dlist, g); + + tree body = create_artificial_label (UNKNOWN_LOCATION); + tree header = create_artificial_label (UNKNOWN_LOCATION); + tree end = create_artificial_label (UNKNOWN_LOCATION); + gimple_seq_add_stmt (dlist, gimple_build_goto (header)); + gimple_seq_add_stmt (dlist, gimple_build_label (body)); + + gimple_seq_add_seq (dlist, llist[2]); + + g = gimple_build_assign (simt_lane, LSHIFT_EXPR, simt_lane, integer_one_node); + gimple_seq_add_stmt (dlist, g); + + gimple_seq_add_stmt (dlist, gimple_build_label (header)); + g = gimple_build_cond (LT_EXPR, simt_lane, simt_vf, body, end); + gimple_seq_add_stmt (dlist, g); + + gimple_seq_add_stmt (dlist, gimple_build_label (end)); + } for (int i = 0; i < 2; i++) if (llist[i]) { @@ -5389,7 +5466,7 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, { tree x, c, label = NULL, orig_clauses = clauses; bool par_clauses = false; - tree simduid = NULL, lastlane = NULL; + tree simduid = NULL, lastlane = NULL, simtcond = NULL, simtlast = NULL; /* Early exit if there are no lastprivate or linear clauses. 
*/ for (; clauses ; clauses = OMP_CLAUSE_CHAIN (clauses)) @@ -5416,6 +5493,16 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, par_clauses = true; } + bool maybe_simt = false; + if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR + && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD) + { + maybe_simt = omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1; + simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_); + if (simduid) + simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); + } + if (predicate) { gcond *stmt; @@ -5427,20 +5514,27 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, arm2 = TREE_OPERAND (predicate, 1); gimplify_expr (&arm1, stmt_list, NULL, is_gimple_val, fb_rvalue); gimplify_expr (&arm2, stmt_list, NULL, is_gimple_val, fb_rvalue); - stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2, - label_true, label); + if (maybe_simt) + { + c = build2 (TREE_CODE (predicate), boolean_type_node, arm1, arm2); + c = fold_convert (integer_type_node, c); + simtcond = create_tmp_var (integer_type_node); + gimplify_assign (simtcond, c, stmt_list); + gcall *g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY, + 1, simtcond); + c = create_tmp_var (integer_type_node); + gimple_call_set_lhs (g, c); + gimple_seq_add_stmt (stmt_list, g); + stmt = gimple_build_cond (NE_EXPR, c, integer_zero_node, + label_true, label); + } + else + stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2, + label_true, label); gimple_seq_add_stmt (stmt_list, stmt); gimple_seq_add_stmt (stmt_list, gimple_build_label (label_true)); } - if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR - && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD) - { - simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_); - if (simduid) - simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); - } - for (c = clauses; c ;) { tree var, new_var; @@ -5491,6 +5585,24 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq 
*stmt_list, new_var = build4 (ARRAY_REF, TREE_TYPE (val), TREE_OPERAND (val, 0), lastlane, NULL_TREE, NULL_TREE); + if (maybe_simt) + { + gcall *g; + if (simtlast == NULL) + { + simtlast = create_tmp_var (unsigned_type_node); + g = gimple_build_call_internal + (IFN_GOMP_SIMT_LAST_LANE, 1, simtcond); + gimple_call_set_lhs (g, simtlast); + gimple_seq_add_stmt (stmt_list, g); + } + x = build_call_expr_internal_loc + (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_IDX, + TREE_TYPE (new_var), 2, new_var, simtlast); + new_var = unshare_expr (new_var); + gimplify_assign (new_var, x, stmt_list); + new_var = unshare_expr (new_var); + } } } @@ -10498,12 +10610,23 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) edge e, ne; tree *counts = NULL; int i; + int safelen_int = INT_MAX; tree safelen = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt), OMP_CLAUSE_SAFELEN); tree simduid = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt), OMP_CLAUSE__SIMDUID_); tree n1, n2; + if (safelen) + { + safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen); + if (TREE_CODE (safelen) != INTEGER_CST) + safelen_int = 0; + else if (tree_fits_uhwi_p (safelen) && tree_to_uhwi (safelen) < INT_MAX) + safelen_int = tree_to_uhwi (safelen); + if (safelen_int == 1) + safelen_int = 0; + } type = TREE_TYPE (fd->loop.v); entry_bb = region->entry; cont_bb = region->cont; @@ -10557,20 +10680,53 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) OMP_CLAUSE__LOOPTEMP_); gcc_assert (innerc); n2 = OMP_CLAUSE_DECL (innerc); - expand_omp_build_assign (&gsi, fd->loop.v, - fold_convert (type, n1)); + } + tree step = fd->loop.step; + + bool offloaded = cgraph_node::get (current_function_decl)->offloadable; + for (struct omp_region *rgn = region; !offloaded && rgn; rgn = rgn->outer) + offloaded = rgn->type == GIMPLE_OMP_TARGET; + bool is_simt = offloaded && omp_max_simt_vf () > 1 && safelen_int > 1; + tree simt_lane = NULL_TREE, simt_maxlane = NULL_TREE; + if (is_simt) + { + 
cfun->curr_properties &= ~PROP_gimple_lomp_dev; + simt_lane = create_tmp_var (unsigned_type_node); + gimple *g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0); + gimple_call_set_lhs (g, simt_lane); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + tree offset = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, + fold_convert (TREE_TYPE (step), simt_lane)); + n1 = fold_convert (type, n1); + if (POINTER_TYPE_P (type)) + n1 = fold_build_pointer_plus (n1, offset); + else + n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, offset)); + + /* Collapsed loops not handled for SIMT yet: limit to one lane only. */ if (fd->collapse > 1) + simt_maxlane = build_one_cst (unsigned_type_node); + else if (safelen_int < omp_max_simt_vf ()) + simt_maxlane = build_int_cst (unsigned_type_node, safelen_int); + tree vf + = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_VF, + unsigned_type_node, 0); + if (simt_maxlane) + vf = fold_build2 (MIN_EXPR, unsigned_type_node, vf, simt_maxlane); + vf = fold_convert (TREE_TYPE (step), vf); + step = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, vf); + } + + expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1)); + if (fd->collapse > 1) + { + if (gimple_omp_for_combined_into_p (fd->for_stmt)) { gsi_prev (&gsi); expand_omp_for_init_vars (fd, &gsi, counts, NULL, n1); gsi_next (&gsi); } - } - else - { - expand_omp_build_assign (&gsi, fd->loop.v, - fold_convert (type, fd->loop.n1)); - if (fd->collapse > 1) + else for (i = 0; i < fd->collapse; i++) { tree itype = TREE_TYPE (fd->loops[i].v); @@ -10579,7 +10735,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) t = fold_convert (TREE_TYPE (fd->loops[i].v), fd->loops[i].n1); expand_omp_build_assign (&gsi, fd->loops[i].v, t); } - } + } /* Remove the GIMPLE_OMP_FOR statement. 
*/ gsi_remove (&gsi, true); @@ -10592,9 +10748,9 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE); if (POINTER_TYPE_P (type)) - t = fold_build_pointer_plus (fd->loop.v, fd->loop.step); + t = fold_build_pointer_plus (fd->loop.v, step); else - t = fold_build2 (PLUS_EXPR, type, fd->loop.v, fd->loop.step); + t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step); expand_omp_build_assign (&gsi, fd->loop.v, t); if (fd->collapse > 1) @@ -10668,6 +10824,18 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) gimple_regimplify_operands (cond_stmt, &gsi); } + /* Add 'V -= STEP * (SIMT_VF - 1)' after the loop. */ + if (is_simt) + { + gsi = gsi_start_bb (l2_bb); + step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), fd->loop.step, step); + if (POINTER_TYPE_P (type)) + t = fold_build_pointer_plus (fd->loop.v, step); + else + t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step); + expand_omp_build_assign (&gsi, fd->loop.v, t); + } + /* Remove GIMPLE_OMP_RETURN. 
*/ gsi = gsi_last_bb (exit_bb); gsi_remove (&gsi, true); @@ -10697,30 +10865,29 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) ne->probability = REG_BR_PROB_BASE / 8; set_immediate_dominator (CDI_DOMINATORS, l1_bb, entry_bb); - set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb); set_immediate_dominator (CDI_DOMINATORS, l0_bb, l1_bb); + if (simt_maxlane) + { + cond_stmt = gimple_build_cond (LT_EXPR, simt_lane, simt_maxlane, + NULL_TREE, NULL_TREE); + gsi = gsi_last_bb (entry_bb); + gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT); + make_edge (entry_bb, l2_bb, EDGE_FALSE_VALUE); + FALLTHRU_EDGE (entry_bb)->flags = EDGE_TRUE_VALUE; + FALLTHRU_EDGE (entry_bb)->probability = REG_BR_PROB_BASE * 7 / 8; + BRANCH_EDGE (entry_bb)->probability = REG_BR_PROB_BASE / 8; + l2_dom_bb = entry_bb; + } + set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb); + if (!broken_loop) { struct loop *loop = alloc_loop (); loop->header = l1_bb; loop->latch = cont_bb; add_loop (loop, l1_bb->loop_father); - if (safelen == NULL_TREE) - loop->safelen = INT_MAX; - else - { - safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen); - if (TREE_CODE (safelen) != INTEGER_CST) - loop->safelen = 0; - else if (!tree_fits_uhwi_p (safelen) - || tree_to_uhwi (safelen) > INT_MAX) - loop->safelen = INT_MAX; - else - loop->safelen = tree_to_uhwi (safelen); - if (loop->safelen == 1) - loop->safelen = 0; - } + loop->safelen = safelen_int; if (simduid) { loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); @@ -13885,7 +14052,6 @@ expand_omp (struct omp_region *region) } } - /* Helper for build_omp_regions. Scan the dominator tree starting at block BB. PARENT is the region that contains BB. 
If SINGLE_TREE is true, the function ends once a single tree is built (otherwise, whole @@ -14768,12 +14934,14 @@ static void lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx) { tree block; - gimple *stmt = gsi_stmt (*gsi_p); + gimple *stmt = gsi_stmt (*gsi_p), *g; gomp_ordered *ord_stmt = as_a <gomp_ordered *> (stmt); gcall *x; gbind *bind; bool simd = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), OMP_CLAUSE_SIMD); + bool maybe_simt + = simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1; bool threads = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), OMP_CLAUSE_THREADS); @@ -14807,11 +14975,56 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx) 0); gimple_bind_add_stmt (bind, x); + tree counter = NULL_TREE, test = NULL_TREE, body = NULL_TREE; + if (maybe_simt) + { + counter = create_tmp_var (integer_type_node); + g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0); + gimple_call_set_lhs (g, counter); + gimple_bind_add_stmt (bind, g); + + body = create_artificial_label (UNKNOWN_LOCATION); + test = create_artificial_label (UNKNOWN_LOCATION); + gimple_bind_add_stmt (bind, gimple_build_label (body)); + + tree simt_pred = create_tmp_var (integer_type_node); + g = gimple_build_call_internal (IFN_GOMP_SIMT_ORDERED_PRED, 1, counter); + gimple_call_set_lhs (g, simt_pred); + gimple_bind_add_stmt (bind, g); + + tree t = create_artificial_label (UNKNOWN_LOCATION); + g = gimple_build_cond (EQ_EXPR, simt_pred, integer_zero_node, t, test); + gimple_bind_add_stmt (bind, g); + + gimple_bind_add_stmt (bind, gimple_build_label (t)); + } lower_omp (gimple_omp_body_ptr (stmt), ctx); gimple_omp_set_body (stmt, maybe_catch_exception (gimple_omp_body (stmt))); gimple_bind_add_seq (bind, gimple_omp_body (stmt)); gimple_omp_set_body (stmt, NULL); + if (maybe_simt) + { + gimple_bind_add_stmt (bind, gimple_build_label (test)); + g = gimple_build_assign (counter, MINUS_EXPR, counter, integer_one_node); + gimple_bind_add_stmt 
(bind, g); + + tree c = build2 (GE_EXPR, boolean_type_node, counter, integer_zero_node); + tree nonneg = create_tmp_var (integer_type_node); + gimple_seq tseq = NULL; + gimplify_assign (nonneg, fold_convert (integer_type_node, c), &tseq); + gimple_bind_add_seq (bind, tseq); + + g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY, 1, nonneg); + gimple_call_set_lhs (g, nonneg); + gimple_bind_add_stmt (bind, g); + + tree end = create_artificial_label (UNKNOWN_LOCATION); + g = gimple_build_cond (NE_EXPR, nonneg, integer_zero_node, body, end); + gimple_bind_add_stmt (bind, g); + + gimple_bind_add_stmt (bind, gimple_build_label (end)); + } if (simd) x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 1, build_int_cst (NULL_TREE, threads)); @@ -17932,7 +18145,7 @@ const pass_data pass_data_lower_omp = OPTGROUP_NONE, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_gimple_any, /* properties_required */ - PROP_gimple_lomp, /* properties_provided */ + PROP_gimple_lomp | PROP_gimple_lomp_dev, /* properties_provided */ 0, /* properties_destroyed */ 0, /* todo_flags_start */ 0, /* todo_flags_finish */ @@ -19864,6 +20077,109 @@ make_pass_oacc_device_lower (gcc::context *ctxt) { return new pass_oacc_device_lower (ctxt); } + + +/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets, + VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and + LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT + internal functions on non-SIMT targets, and likewise some SIMD internal + functions on SIMT targets. */ + +static unsigned int +execute_omp_device_lower () +{ + int vf = targetm.simt.vf ? 
targetm.simt.vf () : 1; + basic_block bb; + gimple_stmt_iterator gsi; + FOR_EACH_BB_FN (bb, cfun) + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt)) + continue; + tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE; + tree type = lhs ? TREE_TYPE (lhs) : integer_type_node; + switch (gimple_call_internal_fn (stmt)) + { + case IFN_GOMP_SIMT_LANE: + case IFN_GOMP_SIMT_LAST_LANE: + rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE; + break; + case IFN_GOMP_SIMT_VF: + rhs = build_int_cst (type, vf); + break; + case IFN_GOMP_SIMT_ORDERED_PRED: + rhs = vf == 1 ? integer_zero_node : NULL_TREE; + if (rhs || !lhs) + unlink_stmt_vdef (stmt); + break; + case IFN_GOMP_SIMT_VOTE_ANY: + case IFN_GOMP_SIMT_XCHG_BFLY: + case IFN_GOMP_SIMT_XCHG_IDX: + rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE; + break; + case IFN_GOMP_SIMD_LANE: + case IFN_GOMP_SIMD_LAST_LANE: + rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE; + break; + case IFN_GOMP_SIMD_VF: + rhs = vf != 1 ? build_one_cst (type) : NULL_TREE; + break; + default: + continue; + } + if (lhs && !rhs) + continue; + stmt = lhs ? 
gimple_build_assign (lhs, rhs) : gimple_build_nop (); + gsi_replace (&gsi, stmt, false); + } + if (vf != 1) + cfun->has_force_vectorize_loops = false; + return 0; +} + +namespace { + +const pass_data pass_data_omp_device_lower = +{ + GIMPLE_PASS, /* type */ + "ompdevlow", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + PROP_gimple_lomp_dev, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa, /* todo_flags_finish */ +}; + +class pass_omp_device_lower : public gimple_opt_pass +{ +public: + pass_omp_device_lower (gcc::context *ctxt) + : gimple_opt_pass (pass_data_omp_device_lower, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *fun) + { + /* FIXME: inlining does not propagate the lomp_dev property. */ + return 1 || !(fun->curr_properties & PROP_gimple_lomp_dev); + } + virtual unsigned int execute (function *) + { + return execute_omp_device_lower (); + } + +}; // class pass_expand_omp_ssa + +} // anon namespace + +gimple_opt_pass * +make_pass_omp_device_lower (gcc::context *ctxt) +{ + return new pass_omp_device_lower (ctxt); +} /* "omp declare target link" handling pass. */ diff --git a/gcc/passes.def b/gcc/passes.def index 85a5af088f3..2a470a7de07 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -183,6 +183,7 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_fixup_cfg); NEXT_PASS (pass_lower_eh_dispatch); NEXT_PASS (pass_oacc_device_lower); + NEXT_PASS (pass_omp_device_lower); NEXT_PASS (pass_omp_target_link); NEXT_PASS (pass_all_optimizations); PUSH_INSERT_PASSES_WITHIN (pass_all_optimizations) diff --git a/gcc/target-insns.def b/gcc/target-insns.def index a6a040eb173..e011a5a7f26 100644 --- a/gcc/target-insns.def +++ b/gcc/target-insns.def @@ -68,6 +68,12 @@ DEF_TARGET_INSN (oacc_dim_pos, (rtx x0, rtx x1)) DEF_TARGET_INSN (oacc_dim_size, (rtx x0, rtx x1)) DEF_TARGET_INSN (oacc_fork, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (oacc_join, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (omp_simt_lane, (rtx x0)) +DEF_TARGET_INSN (omp_simt_last_lane, (rtx x0, rtx x1)) +DEF_TARGET_INSN (omp_simt_ordered, (rtx x0, rtx x1)) +DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1)) +DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (probe_stack, (rtx x0)) DEF_TARGET_INSN (probe_stack_address, (rtx x0)) diff --git a/gcc/target.def b/gcc/target.def index bcdbc0e2452..2b357494ede 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1648,6 +1648,18 @@ int, (struct cgraph_node *), NULL) HOOK_VECTOR_END (simd_clone) +/* Functions relating to OpenMP SIMT vectorization transform. */ +#undef HOOK_PREFIX +#define HOOK_PREFIX "TARGET_SIMT_" +HOOK_VECTOR (TARGET_SIMT, simt) + +DEFHOOK +(vf, +"Return number of threads in SIMT thread group on the target.", +int, (void), NULL) + +HOOK_VECTOR_END (simt) + /* Functions relating to openacc. 
*/ #undef HOOK_PREFIX #define HOOK_PREFIX "TARGET_GOACC_" diff --git a/gcc/testsuite/ChangeLog.gomp-nvptx b/gcc/testsuite/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..24d26edeb64 --- /dev/null +++ b/gcc/testsuite/ChangeLog.gomp-nvptx @@ -0,0 +1,13 @@ +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * lib/target-supports.exp (check_effective_target_alloca): Use a + compile test. + +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * gcc.target/nvptx/softstack.c: New test. + +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * gcc.target/nvptx/decl-shared.c: New test. + * gcc.target/nvptx/decl-shared-init.c: New test. diff --git a/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c b/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c new file mode 100644 index 00000000000..6a99b1c338a --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c @@ -0,0 +1 @@ +int var __attribute__((shared)) = 0; /* { dg-error "static initialization .* not supported" } */ diff --git a/gcc/testsuite/gcc.target/nvptx/decl-shared.c b/gcc/testsuite/gcc.target/nvptx/decl-shared.c new file mode 100644 index 00000000000..367075cebe2 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/decl-shared.c @@ -0,0 +1,14 @@ +static int v_internal __attribute__((shared,used)); +int v_common __attribute__((shared)); +int v_extdef __attribute__((shared,nocommon)); +extern int v_extdecl __attribute__((shared)); + +int use() +{ + return v_extdecl; +} + +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.shared \[^,\r\n\]*v_internal" } } */ +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.weak .shared \[^,\r\n\]*v_common" } } */ +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.visible .shared \[^,\r\n\]*v_extdef" } } */ +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.extern .shared \[^,\r\n\]*v_extdecl" } } */ diff --git a/gcc/testsuite/gcc.target/nvptx/softstack.c b/gcc/testsuite/gcc.target/nvptx/softstack.c new file mode 100644 index 00000000000..73e60f282a7 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/nvptx/softstack.c @@ -0,0 +1,23 @@ +/* { dg-options "-O2 -msoft-stack" } */ +/* { dg-do run } */ + +static __attribute__((noinline,noclone)) int f(int *p) +{ + return __sync_lock_test_and_set(p, 1); +} + +static __attribute__((noinline,noclone)) int g(int n) +{ + /* Check that variable-length stack allocation works. */ + int v[n]; + v[0] = 0; + /* Check that atomic operations can be applied to auto data. */ + return f(v) == 0 && v[0] == 1; +} + +int main() +{ + if (!g(1)) + __builtin_abort(); + return 0; +} diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index b683c09c1db..ce60cce9cae 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -763,7 +763,10 @@ proc check_effective_target_untyped_assembly {} { proc check_effective_target_alloca {} { if { [istarget nvptx-*-*] } { - return 0 + return [check_no_compiler_messages alloca assembly { + void f (void*); + void g (int n) { f (__builtin_alloca (n)); } + }] } return 1 } diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index da9ba1374b4..8befebe17b1 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -222,6 +222,7 @@ protected: of math functions; the current choices have been optimized. 
*/ +#define PROP_gimple_lomp_dev (1 << 16) /* done omp_device_lower */ #define PROP_trees \ (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp) @@ -417,6 +418,7 @@ extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt); extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt); extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt); extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt); extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt); extern gimple_opt_pass *make_pass_strlen (gcc::context *ctxt); extern gimple_opt_pass *make_pass_fold_builtins (gcc::context *ctxt); diff --git a/libgcc/ChangeLog.gomp-nvptx b/libgcc/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..f2cb0d88743 --- /dev/null +++ b/libgcc/ChangeLog.gomp-nvptx @@ -0,0 +1,27 @@ +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/stacks.c (__nvptx_stacks): Use 'nocommon' attribute to + emit a strong definition. + (__nvptx_uni): Ditto. + +2016-03-10 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/crt0.c (__nvptx_stacks): Define in C. Use it... + (__nvptx_uni): Ditto. + (__main): ...here instead of inline asm. + * config/nvptx/stacks.c (__nvptx_stacks): Define in C. + (__nvptx_uni): Ditto. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/crt0.c: New, rewritten in C from ... + * config/nvptx/crt0.s: ...this. Delete. + * config/nvptx/free.c: New, rewritten in C from ... + * config/nvptx/free.asm: ...this. Delete. + * config/nvptx/malloc.c: New, rewritten in C from ... + * config/nvptx/malloc.asm: ...this. Delete. + * config/nvptx/realloc.c: Handle out-of-memory condition. + * config/nvptx/nvptx-malloc.h (__nvptx_real_free, + __nvptx_real_malloc): Declare. + * config/nvptx/stacks.c: New. + * config/nvptx/t-nvptx: Adjust. 
diff --git a/libgcc/config/nvptx/crt0.c b/libgcc/config/nvptx/crt0.c index 3b7382d43af..5a7dbf6f22f 100644 --- a/libgcc/config/nvptx/crt0.c +++ b/libgcc/config/nvptx/crt0.c @@ -24,6 +24,14 @@ int *__exitval_ptr; extern void __attribute__((noreturn)) exit (int status); extern int main (int, void **); +/* Always setup soft stacks to allow testing with -msoft-stack but without + -mgomp. 32 is the maximum number of warps in a CTA: the definition here + must match the external declaration emitted by the compiler. */ +void *__nvptx_stacks[32] __attribute__((shared,nocommon)); + +/* Likewise for -muniform-simt. */ +unsigned __nvptx_uni[32] __attribute__((shared,nocommon)); + void __attribute__((kernel)) __main (int *rval_ptr, int argc, void **argv) { @@ -33,5 +41,9 @@ __main (int *rval_ptr, int argc, void **argv) if (rval_ptr) *rval_ptr = 255; + static char stack[131072] __attribute__((aligned(8))); + __nvptx_stacks[0] = stack + sizeof stack; + __nvptx_uni[0] = 0; + exit (main (argc, argv)); } diff --git a/libgcc/config/nvptx/nvptx-malloc.h b/libgcc/config/nvptx/mgomp.c index 9cbd84689b7..d8ca5818314 100644 --- a/libgcc/config/nvptx/nvptx-malloc.h +++ b/libgcc/config/nvptx/mgomp.c @@ -1,6 +1,6 @@ -/* Declarations for the malloc wrappers. +/* Define shared memory arrays for -msoft-stack and -muniform-simt. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2016 Free Software Foundation, Inc. This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -21,6 +21,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -extern void __nvptx_free (void *); -extern void *__nvptx_malloc (size_t); -extern void *__nvptx_realloc (void *, size_t); +/* OpenACC offloading does not use these symbols; thus, they are exposed + only for the -mgomp multilib. 
The same definitions are also provided + in crt0.c for the case of non-offloading compilation. 32 is the maximum + number of warps in a CTA. */ + +#if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__) +void *__nvptx_stacks[32] __attribute__((shared,nocommon)); +unsigned __nvptx_uni[32] __attribute__((shared,nocommon)); +#endif diff --git a/libgcc/config/nvptx/t-nvptx b/libgcc/config/nvptx/t-nvptx index daf252f2472..c4d20c94cbb 100644 --- a/libgcc/config/nvptx/t-nvptx +++ b/libgcc/config/nvptx/t-nvptx @@ -1,4 +1,5 @@ -LIB2ADD=$(srcdir)/config/nvptx/reduction.c +LIB2ADD=$(srcdir)/config/nvptx/reduction.c \ + $(srcdir)/config/nvptx/mgomp.c LIB2ADDEH= LIB2FUNCS_EXCLUDE=__main diff --git a/libgomp/ChangeLog.gomp-nvptx b/libgomp/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..65f64a2174c --- /dev/null +++ b/libgomp/ChangeLog.gomp-nvptx @@ -0,0 +1,296 @@ +2016-11-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c: Adjust comments. + +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c (nvptx_thrs): Use 'nocommon' attribute to emit a + strong definition. + +2016-03-24 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c (gomp_thread_start): Work around NVIDIA driver + bug by adding an exit edge to the loop, + +2016-03-24 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/target.c (GOMP_teams): Do not call 'free'. + * config/nvptx/team.c (gomp_nvptx_main): Use 'alloca' instead of + 'malloc' to obtain storage. Do not call 'free'. + * team.c (gomp_free_thread) [__nvptx__]: Do not call 'free'. + +2016-03-11 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (map_fini): Make cuMemFreeHost error non-fatal. + +2016-03-04 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/bar.c: Remove wrong invocation of + gomp_barrier_wait_end from gomp_team_barrier_wait_end. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_stacks_size): New. 
+ (nvptx_stacks_alloc): New. + (nvptx_stacks_free): New. + (GOMP_OFFLOAD_run): Allocate soft-stacks storage from the host using + the above new functions. Use kernel launch interface that allows + checking for mismatched total size of entry function arguments. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c: (gomp_nvptx_main_1): Rename back to... + (gomp_nvptx_main): ...this; delete the wrapper. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + Revert + 2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_open_device): Adjust heap size. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_adjust_launch_bounds): Adjust types. + (GOMP_OFFLOAD_run): Ditto. + +2016-01-21 Alexander Monakov <amonakov@ispras.ru> + + * config/posix/affinity.c: Move to... + * affinity.c: ...here (new file). Guard use of PThreads-specific + interface by LIBGOMP_USE_PTHREADS. + * config/nvptx/affinity.c: Delete (identical to affinity.c). + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (struct targ_fn_descriptor): Add new fields. + (struct ptx_device): Ditto. Set them... + (nvptx_open_device): ...here. + (GOMP_OFFLOAD_load_image): Set new targ_fn_descriptor fields. + (nvptx_adjust_launch_bounds): New. Use it... + (GOMP_OFFLOAD_run): ...here. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/icv-device.c (omp_get_num_teams): Update. + (omp_get_team_num): Ditto. + * config/nvptx/target.c (GOMP_teams): Update. + * config/nvptx/team.c (nvptx_thrs): Place in shared memory. + * icv.c (gomp_num_teams_var): Define. + * libgomp.h (gomp_num_teams_var): Declare. + (nvptx_thrs): Place in shared memory. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * testsuite/libgomp.fortran/fortran.exp (lang_link_flags): Pass + -foffload=-lgfortran in addition to -lgfortran. + * testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Ditto. 
+ +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/fortran.c: Delete. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/linux/lock.c (gomp_init_lock_30): Move to generic lock.c. + (gomp_destroy_lock_30): Ditto. + (gomp_set_lock_30): Ditto. + (gomp_unset_lock_30): Ditto. + (gomp_test_lock_30): Ditto. + (gomp_init_nest_lock_30): Ditto. + (gomp_destroy_nest_lock_30): Ditto. + (gomp_set_nest_lock_30): Ditto. + (gomp_unset_nest_lock_30): Ditto. + (gomp_test_nest_lock_30): Ditto. + * lock.c: New. + * config/nvptx/lock.c: New. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (struct ptx_device): New field (clock_khz). + (nvptx_open_device): Set it. + (nvptx_set_clocktick): New. Use it... + (GOMP_OFFLOAD_load_image): ...here. + +2016-01-20 Dmitry Melnik <dm@ispras.ru> + + * config/nvptx/time.c: New. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/section.c: Delete. + * config/nvptx/splay-tree.c: Delete. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/task.c: Guard with #ifdef __nvptx_softstack__. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + Revert + 2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * task.c (GOMP_task): Use a fixed-size on-stack buffer or a heap + allocation instead of a variable-size on-stack allocation. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + Revert + 2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * taskloop.c (GOMP_taskloop): Avoid alloca on NVPTX. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/priority_queue.c: Delete. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_async_run): New (stub). + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/task.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c (gomp_nvptx_main): Rename to... + (gomp_nvptx_main_1): ... 
this and mark noinline. + (gomp_nvptx_main): Wrap the above, set up __nvptx_uni and + __nvptx_stacks. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_open_device): Adjust heap size. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/pool.h: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/doacross.h: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * taskloop.c (GOMP_taskloop): Avoid alloca on NVPTX. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_dev2dev): New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/affinity.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (link_ptx): Adjust log sizes. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_run): Start 8 warps. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/alloc.c: Delete. + * config/nvptx/barrier.c: Ditto. + * config/nvptx/iter.c: Ditto. + * config/nvptx/iter_ull.c: Ditto. + * config/nvptx/loop.c: Ditto. + * config/nvptx/loop_ull.c: Ditto. + * config/nvptx/ordered.c: Ditto. + * config/nvptx/parallel.c: Ditto. + * config/nvptx/single.c: Ditto. + * config/nvptx/task.c: Ditto. + * config/nvptx/work.c: Ditto. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * Makefile.am (libgomp_la_SOURCES): Add atomic.c. + * Makefile.in: Regenerate. + * critical.c: Split out GOMP_atomic_{start,end} into... + * atomic.c: ...here (new file). + * config/nvptx/critical.c: Delete. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/target.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/error.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * libgomp.h [__nvptx__] (gomp_thread): New implementation. + * config/nvptx/team.c (gomp_nvptx_main): New. 
+ (gomp_thread_start): New (NVPTX-specific implementation). + (gomp_team_start): Ditto. + * team.c: Guard uses of PThreads-specific interfaces by + LIBGOMP_USE_PTHREADS. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/simple-bar.h: New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/posix/simple-bar.h: New file. Use it ... + * libgomp.h: ...here: new include. + (struct gomp_thread_pool): Change threads_dock member to + gomp_simple_barrier_t. + * team.c: Adjust all uses. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/bar.c: New file. + * config/nvptx/bar.h: Ditto. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/proc.c: New. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (link_ptx): Do not set CU_JIT_TARGET. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/icv-device.c: New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * Makefile.am (libgomp_la_SOURCES): Add icv.c and icv-device.c. + * Makefile.in: Regenerate. + * env.c: Split out ICV definitions into... + * icv.c: ...here (new file) and... + * icv-device.c: ...here. New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * configure.ac [nvptx*-*-*] (libgomp_use_pthreads): Set and use it... + (LIBGOMP_USE_PTHREADS): ...here; new define. + * configure: Regenerate. + * config.h.in: Likewise. + * libgomp.h: Guard pthread.h inclusion. + (gomp_thread_attr): Guard by LIBGOMP_USE_PTHREADS. + (gomp_init_thread_affinity): Ditto. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/bar.h: New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/mutex.h: New file. + * config/nvptx/ptrlock.h: New file. + * config/nvptx/sem.h: New file. + +2015-12-08 Jakub Jelinek <jakub@redhat.com> + + * plugin/plugin-nvptx.c (nvptx_host2dev): Allow NULL 'nvthd'. + (nvptx_dev2host): Ditto. 
+ (GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400. + (GOMP_OFFLOAD_run): New. + diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am index a3e1c2b2c0f..4090336a773 100644 --- a/libgomp/Makefile.am +++ b/libgomp/Makefile.am @@ -58,12 +58,12 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \ libgomp_la_DEPENDENCIES = $(libgomp_version_dep) libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS) -libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ - iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ - task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ - time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \ - oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \ - oacc-plugin.c oacc-cuda.c priority_queue.c +libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \ + icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c ordered.c \ + parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \ + proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \ + splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \ + oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c include $(top_srcdir)/plugin/Makefrag.am diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index 88c8517fddd..a630f2b6438 100644 --- a/libgomp/Makefile.in +++ b/libgomp/Makefile.in @@ -173,14 +173,14 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL) --tag=CC \ @PLUGIN_NVPTX_TRUE@ $(toolexeclibdir) libgomp_la_LIBADD = @USE_FORTRAN_TRUE@am__objects_1 = openacc.lo -am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \ - error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \ - parallel.lo sections.lo single.lo task.lo team.lo work.lo \ - lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \ - fortran.lo affinity.lo target.lo splay-tree.lo \ - libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \ - 
oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \ - priority_queue.lo $(am__objects_1) +am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \ + env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \ + loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \ + single.lo task.lo team.lo work.lo lock.lo mutex.lo proc.lo \ + sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \ + target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \ + oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \ + oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1) libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/../depcomp @@ -429,13 +429,14 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \ libgomp_la_DEPENDENCIES = $(libgomp_version_dep) libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS) -libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ - iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c \ - single.c task.c team.c work.c lock.c mutex.c proc.c sem.c \ - bar.c ptrlock.c time.c fortran.c affinity.c target.c \ - splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \ - oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \ - priority_queue.c $(am__append_3) +libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \ + error.c icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c \ + ordered.c parallel.c sections.c single.c task.c team.c work.c \ + lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c \ + affinity.c target.c splay-tree.c libgomp-plugin.c \ + oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \ + oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ + $(am__append_3) # Nvidia PTX OpenACC plugin. 
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) @@ -598,12 +599,15 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bar.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/barrier.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/critical.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv-device.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@ diff --git a/libgomp/config/posix/affinity.c b/libgomp/affinity.c index 32986fa8281..3549e54b67c 100644 --- a/libgomp/config/posix/affinity.c +++ b/libgomp/affinity.c @@ -32,12 +32,14 @@ gomp_init_affinity (void) { } +#ifdef LIBGOMP_USE_PTHREADS void gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place) { (void) attr; (void) place; } +#endif void ** gomp_affinity_alloc (unsigned long count, bool quiet) diff --git a/libgomp/atomic.c b/libgomp/atomic.c new file mode 100644 index 00000000000..bad0736e21e --- /dev/null +++ b/libgomp/atomic.c @@ -0,0 +1,57 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains helpers for the ATOMIC construct. */ + +#include "libgomp.h" + +/* This mutex is used when atomic operations don't exist for the target + in the mode requested. The result is not globally atomic, but works so + long as all parallel references are within #pragma omp atomic directives. + According to responses received from omp@openmp.org, appears to be within + spec. Which makes sense, since that's how several other compilers + handle this situation as well. */ + +static gomp_mutex_t atomic_lock; + +void +GOMP_atomic_start (void) +{ + gomp_mutex_lock (&atomic_lock); +} + +void +GOMP_atomic_end (void) +{ + gomp_mutex_unlock (&atomic_lock); +} + +#if !GOMP_MUTEX_INIT_0 +static void __attribute__((constructor)) +initialize_atomic (void) +{ + gomp_mutex_init (&atomic_lock); +} +#endif diff --git a/libgomp/config.h.in b/libgomp/config.h.in index 226ac5358a7..e5526f6be82 100644 --- a/libgomp/config.h.in +++ b/libgomp/config.h.in @@ -94,6 +94,9 @@ /* Define to 1 if GNU symbol versioning is used for libgomp. 
*/ #undef LIBGOMP_GNU_SYMBOL_VERSIONING +/* Define to 1 if libgomp should use POSIX threads. */ +#undef LIBGOMP_USE_PTHREADS + /* Define to the sub-directory in which libtool stores uninstalled libraries. */ #undef LT_OBJDIR diff --git a/libgomp/config/linux/lock.c b/libgomp/config/linux/lock.c index a671b7e0a89..07845c98ad1 100644 --- a/libgomp/config/linux/lock.c +++ b/libgomp/config/linux/lock.c @@ -32,98 +32,8 @@ #include <sys/syscall.h> #include "wait.h" - -/* The internal gomp_mutex_t and the external non-recursive omp_lock_t - have the same form. Re-use it. */ - -void -gomp_init_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_init (lock); -} - -void -gomp_destroy_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_destroy (lock); -} - -void -gomp_set_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_lock (lock); -} - -void -gomp_unset_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_unlock (lock); -} - -int -gomp_test_lock_30 (omp_lock_t *lock) -{ - int oldval = 0; - - return __atomic_compare_exchange_n (lock, &oldval, 1, false, - MEMMODEL_ACQUIRE, MEMMODEL_RELAXED); -} - -void -gomp_init_nest_lock_30 (omp_nest_lock_t *lock) -{ - memset (lock, '\0', sizeof (*lock)); -} - -void -gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock) -{ -} - -void -gomp_set_nest_lock_30 (omp_nest_lock_t *lock) -{ - void *me = gomp_icv (true); - - if (lock->owner != me) - { - gomp_mutex_lock (&lock->lock); - lock->owner = me; - } - - lock->count++; -} - -void -gomp_unset_nest_lock_30 (omp_nest_lock_t *lock) -{ - if (--lock->count == 0) - { - lock->owner = NULL; - gomp_mutex_unlock (&lock->lock); - } -} - -int -gomp_test_nest_lock_30 (omp_nest_lock_t *lock) -{ - void *me = gomp_icv (true); - int oldval; - - if (lock->owner == me) - return ++lock->count; - - oldval = 0; - if (__atomic_compare_exchange_n (&lock->lock, &oldval, 1, false, - MEMMODEL_ACQUIRE, MEMMODEL_RELAXED)) - { - lock->owner = me; - lock->count = 1; - return 1; - } - - return 0; -} +/* Reuse the generic implementation in terms of gomp_mutex_t. 
*/ +#include "../../lock.c" #ifdef LIBGOMP_GNU_SYMBOL_VERSIONING /* gomp_mutex_* can be safely locked in one thread and diff --git a/libgomp/config/nvptx/affinity.c b/libgomp/config/nvptx/affinity.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/affinity.c +++ /dev/null diff --git a/libgomp/config/nvptx/alloc.c b/libgomp/config/nvptx/alloc.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/alloc.c +++ /dev/null diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c index e69de29bb2d..a0d8a44dacf 100644 --- a/libgomp/config/nvptx/bar.c +++ b/libgomp/config/nvptx/bar.c @@ -0,0 +1,209 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a barrier synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and bar.sync instruction. 
*/ + +#include <limits.h> +#include "libgomp.h" + + +void +gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +{ + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. */ + bar->awaited = bar->total; + __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, + MEMMODEL_RELEASE); + } + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); +} + +void +gomp_barrier_wait (gomp_barrier_t *bar) +{ + gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +} + +/* Like gomp_barrier_wait, except that if the encountering thread + is not the last one to hit the barrier, it returns immediately. + The intended usage is that a thread which intends to gomp_barrier_destroy + this barrier calls gomp_barrier_wait, while all other threads + call gomp_barrier_wait_last. When gomp_barrier_wait returns, + the barrier can be safely destroyed. */ + +void +gomp_barrier_wait_last (gomp_barrier_t *bar) +{ +#if 0 + gomp_barrier_state_t state = gomp_barrier_wait_start (bar); + if (state & BAR_WAS_LAST) + gomp_barrier_wait_end (bar, state); +#else + gomp_barrier_wait (bar); +#endif +} + +void +gomp_team_barrier_wake (gomp_barrier_t *bar, int count) +{ + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); +} + +void +gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +{ + unsigned int generation, gen; + + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. 
*/ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + + bar->awaited = bar->total; + team->work_share_cancelled = 0; + if (__builtin_expect (team->task_count, 0)) + { + gomp_barrier_handle_tasks (state); + state &= ~BAR_WAS_LAST; + } + else + { + state &= ~BAR_CANCELLED; + state += BAR_INCR - BAR_WAS_LAST; + __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + return; + } + } + + generation = state; + state &= ~BAR_CANCELLED; + do + { + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) + { + gomp_barrier_handle_tasks (state); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + } + generation |= gen & BAR_WAITING_FOR_TASK; + } + while (gen != state + BAR_INCR); +} + +void +gomp_team_barrier_wait (gomp_barrier_t *bar) +{ + gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +} + +void +gomp_team_barrier_wait_final (gomp_barrier_t *bar) +{ + gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar); + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + bar->awaited_final = bar->total; + gomp_team_barrier_wait_end (bar, state); +} + +bool +gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, + gomp_barrier_state_t state) +{ + unsigned int generation, gen; + + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. */ + /* BAR_CANCELLED should never be set in state here, because + cancellation means that at least one of the threads has been + cancelled, thus on a cancellable barrier we should never see + all threads to arrive. 
*/ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + + bar->awaited = bar->total; + team->work_share_cancelled = 0; + if (__builtin_expect (team->task_count, 0)) + { + gomp_barrier_handle_tasks (state); + state &= ~BAR_WAS_LAST; + } + else + { + state += BAR_INCR - BAR_WAS_LAST; + __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + return false; + } + } + + if (__builtin_expect (state & BAR_CANCELLED, 0)) + return true; + + generation = state; + do + { + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + if (__builtin_expect (gen & BAR_CANCELLED, 0)) + return true; + if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) + { + gomp_barrier_handle_tasks (state); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + } + generation |= gen & BAR_WAITING_FOR_TASK; + } + while (gen != state + BAR_INCR); + + return false; +} + +bool +gomp_team_barrier_wait_cancel (gomp_barrier_t *bar) +{ + return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar)); +} + +void +gomp_team_barrier_cancel (struct gomp_team *team) +{ + gomp_mutex_lock (&team->task_lock); + if (team->barrier.generation & BAR_CANCELLED) + { + gomp_mutex_unlock (&team->task_lock); + return; + } + team->barrier.generation |= BAR_CANCELLED; + gomp_mutex_unlock (&team->task_lock); + gomp_team_barrier_wake (&team->barrier, INT_MAX); +} diff --git a/libgomp/config/nvptx/bar.h b/libgomp/config/nvptx/bar.h new file mode 100644 index 00000000000..757edf1d7eb --- /dev/null +++ b/libgomp/config/nvptx/bar.h @@ -0,0 +1,166 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a barrier synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and bar.sync instruction. */ + +#ifndef GOMP_BARRIER_H +#define GOMP_BARRIER_H 1 + +#include "mutex.h" + +typedef struct +{ + unsigned total; + unsigned generation; + unsigned awaited; + unsigned awaited_final; +} gomp_barrier_t; + +typedef unsigned int gomp_barrier_state_t; + +/* The generation field contains a counter in the high bits, with a few + low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can + share space because WAS_LAST is never stored back to generation. 
*/ +#define BAR_TASK_PENDING 1 +#define BAR_WAS_LAST 1 +#define BAR_WAITING_FOR_TASK 2 +#define BAR_CANCELLED 4 +#define BAR_INCR 8 + +static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count) +{ + bar->total = count; + bar->awaited = count; + bar->awaited_final = count; + bar->generation = 0; +} + +static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count) +{ + __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL); + bar->total = count; +} + +static inline void gomp_barrier_destroy (gomp_barrier_t *bar) +{ +} + +extern void gomp_barrier_wait (gomp_barrier_t *); +extern void gomp_barrier_wait_last (gomp_barrier_t *); +extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t); +extern void gomp_team_barrier_wait (gomp_barrier_t *); +extern void gomp_team_barrier_wait_final (gomp_barrier_t *); +extern void gomp_team_barrier_wait_end (gomp_barrier_t *, + gomp_barrier_state_t); +extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *); +extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *, + gomp_barrier_state_t); +extern void gomp_team_barrier_wake (gomp_barrier_t *, int); +struct gomp_team; +extern void gomp_team_barrier_cancel (struct gomp_team *); + +static inline gomp_barrier_state_t +gomp_barrier_wait_start (gomp_barrier_t *bar) +{ + unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + ret &= -BAR_INCR | BAR_CANCELLED; + /* A memory barrier is needed before exiting from the various forms + of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section + 2.8.6 flush Construct, which says there is an implicit flush during + a barrier region. This is a convenient place to add the barrier, + so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE. 
*/ + if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0) + ret |= BAR_WAS_LAST; + return ret; +} + +static inline gomp_barrier_state_t +gomp_barrier_wait_cancel_start (gomp_barrier_t *bar) +{ + return gomp_barrier_wait_start (bar); +} + +/* This is like gomp_barrier_wait_start, except it decrements + bar->awaited_final rather than bar->awaited and should be used + for the gomp_team_end barrier only. */ +static inline gomp_barrier_state_t +gomp_barrier_wait_final_start (gomp_barrier_t *bar) +{ + unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + ret &= -BAR_INCR | BAR_CANCELLED; + /* See above gomp_barrier_wait_start comment. */ + if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0) + ret |= BAR_WAS_LAST; + return ret; +} + +static inline bool +gomp_barrier_last_thread (gomp_barrier_state_t state) +{ + return state & BAR_WAS_LAST; +} + +/* All the inlines below must be called with team->task_lock + held. */ + +static inline void +gomp_team_barrier_set_task_pending (gomp_barrier_t *bar) +{ + bar->generation |= BAR_TASK_PENDING; +} + +static inline void +gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar) +{ + bar->generation &= ~BAR_TASK_PENDING; +} + +static inline void +gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar) +{ + bar->generation |= BAR_WAITING_FOR_TASK; +} + +static inline bool +gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar) +{ + return (bar->generation & BAR_WAITING_FOR_TASK) != 0; +} + +static inline bool +gomp_team_barrier_cancelled (gomp_barrier_t *bar) +{ + return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0); +} + +static inline void +gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state) +{ + bar->generation = (state & -BAR_INCR) + BAR_INCR; +} + +#endif /* GOMP_BARRIER_H */ diff --git a/libgomp/config/nvptx/barrier.c b/libgomp/config/nvptx/barrier.c deleted file mode 100644 index e69de29bb2d..00000000000 --- 
a/libgomp/config/nvptx/barrier.c +++ /dev/null diff --git a/libgomp/config/nvptx/critical.c b/libgomp/config/nvptx/critical.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/critical.c +++ /dev/null diff --git a/libgomp/config/nvptx/doacross.h b/libgomp/config/nvptx/doacross.h new file mode 100644 index 00000000000..fd011d492bd --- /dev/null +++ b/libgomp/config/nvptx/doacross.h @@ -0,0 +1,60 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is the NVPTX implementation of doacross spinning. */ + +#ifndef GOMP_DOACROSS_H +#define GOMP_DOACROSS_H 1 + +#include "libgomp.h" + +static int zero; + +static inline int +cpu_relax (void) +{ + int r; + /* Here we need a long-latency operation to make the current warp yield. + We could use ld.cv, uncached load from system (host) memory, but that + would require allocating locked memory in the plugin. 
Alternatively, + we can use ld.cg, which evicts from L1 and caches in L2. */ + asm volatile ("ld.cg.s32 %0, [%1];" : "=r" (r) : "i" (&zero) : "memory"); + return r; +} + +static inline void doacross_spin (unsigned long *addr, unsigned long expected, + unsigned long cur) +{ + /* Prevent compiler from optimizing based on bounds of containing object. */ + asm ("" : "+r" (addr)); + do + { + int i = cpu_relax (); + cur = addr[i]; + } + while (cur <= expected); +} + +#endif /* GOMP_DOACROSS_H */ diff --git a/libgomp/config/nvptx/error.c b/libgomp/config/nvptx/error.c index e69de29bb2d..f3e74cd9449 100644 --- a/libgomp/config/nvptx/error.c +++ b/libgomp/config/nvptx/error.c @@ -0,0 +1,42 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains routines used to signal errors. On NVPTX, we have + one default output stream (stdout), so redirect everything there. 
*/ + +#include "libgomp.h" +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +#undef vfprintf +#undef fputs +#undef fputc + +#define vfprintf(stream, fmt, list) vprintf (fmt, list) +#define fputs(s, stream) printf ("%s", s) +#define fputc(c, stream) printf ("%c", c) + +#include "../../error.c" diff --git a/libgomp/config/nvptx/icv-device.c b/libgomp/config/nvptx/icv-device.c new file mode 100644 index 00000000000..831ba110e95 --- /dev/null +++ b/libgomp/config/nvptx/icv-device.c @@ -0,0 +1,74 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file defines OpenMP API entry points that accelerator targets are + expected to replace. 
*/ + +#include "libgomp.h" + +void +omp_set_default_device (int device_num __attribute__((unused))) +{ +} + +int +omp_get_default_device (void) +{ + return 0; +} + +int +omp_get_num_devices (void) +{ + return 0; +} + +int +omp_get_num_teams (void) +{ + return gomp_num_teams_var + 1; +} + +int +omp_get_team_num (void) +{ + int ctaid; + asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); + return ctaid; +} + +int +omp_is_initial_device (void) +{ + /* NVPTX is an accelerator-only target. */ + return 0; +} + +ialias (omp_set_default_device) +ialias (omp_get_default_device) +ialias (omp_get_num_devices) +ialias (omp_get_num_teams) +ialias (omp_get_team_num) +ialias (omp_is_initial_device) diff --git a/libgomp/config/nvptx/iter.c b/libgomp/config/nvptx/iter.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/iter.c +++ /dev/null diff --git a/libgomp/config/nvptx/iter_ull.c b/libgomp/config/nvptx/iter_ull.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/iter_ull.c +++ /dev/null diff --git a/libgomp/config/nvptx/lock.c b/libgomp/config/nvptx/lock.c index e69de29bb2d..7731704c6fb 100644 --- a/libgomp/config/nvptx/lock.c +++ b/libgomp/config/nvptx/lock.c @@ -0,0 +1,41 @@ +/* Copyright (C) 2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is a NVPTX specific implementation of the public OpenMP locking + primitives. */ + +/* Reuse the generic implementation in terms of gomp_mutex_t. */ +#include "../../lock.c" + +ialias (omp_init_lock) +ialias (omp_init_nest_lock) +ialias (omp_destroy_lock) +ialias (omp_destroy_nest_lock) +ialias (omp_set_lock) +ialias (omp_set_nest_lock) +ialias (omp_unset_lock) +ialias (omp_unset_nest_lock) +ialias (omp_test_lock) +ialias (omp_test_nest_lock) diff --git a/libgomp/config/nvptx/loop.c b/libgomp/config/nvptx/loop.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/loop.c +++ /dev/null diff --git a/libgomp/config/nvptx/loop_ull.c b/libgomp/config/nvptx/loop_ull.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/loop_ull.c +++ /dev/null diff --git a/libgomp/config/nvptx/mutex.h b/libgomp/config/nvptx/mutex.h new file mode 100644 index 00000000000..e408ca72933 --- /dev/null +++ b/libgomp/config/nvptx/mutex.h @@ -0,0 +1,60 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a mutex synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and busy waiting. */ + +#ifndef GOMP_MUTEX_H +#define GOMP_MUTEX_H 1 + +typedef int gomp_mutex_t; + +#define GOMP_MUTEX_INIT_0 1 + +static inline void +gomp_mutex_init (gomp_mutex_t *mutex) +{ + *mutex = 0; +} + +static inline void +gomp_mutex_destroy (gomp_mutex_t *mutex) +{ +} + +static inline void +gomp_mutex_lock (gomp_mutex_t *mutex) +{ + while (__sync_lock_test_and_set (mutex, 1)) + /* spin */ ; +} + +static inline void +gomp_mutex_unlock (gomp_mutex_t *mutex) +{ + __sync_lock_release (mutex); +} +#endif /* GOMP_MUTEX_H */ diff --git a/libgomp/config/nvptx/ordered.c b/libgomp/config/nvptx/ordered.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/ordered.c +++ /dev/null diff --git a/libgomp/config/nvptx/parallel.c b/libgomp/config/nvptx/parallel.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/parallel.c +++ /dev/null diff --git a/libgomp/config/nvptx/fortran.c b/libgomp/config/nvptx/pool.h index 71ba6ed73f5..70e233c7caa 100644 --- a/libgomp/config/nvptx/fortran.c +++ b/libgomp/config/nvptx/pool.h @@ -1,8 +1,5 @@ -/* OpenACC Runtime Fortran 
wrapper routines - - Copyright (C) 2014-2016 Free Software Foundation, Inc. - - Contributed by Mentor Embedded. +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> This file is part of the GNU Offloading and Multi Processing Library (libgomp). @@ -26,15 +23,27 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* Temporary hack; this will be provided by libgfortran. */ +/* This is the NVPTX implementation of the thread pool management + for libgomp. This type is private to the library. */ + +#ifndef GOMP_POOL_H +#define GOMP_POOL_H 1 + +#include "libgomp.h" + +/* Get the thread pool. */ + +static inline struct gomp_thread_pool * +gomp_get_thread_pool (struct gomp_thread *thr, unsigned nthreads) +{ + /* NVPTX is running with a fixed pool of pre-started threads. */ + return thr->thread_pool; +} -extern void _gfortran_abort (void); +static inline void +gomp_release_thread_pool (struct gomp_thread_pool *pool) +{ + /* Do nothing. */ +} -__asm__ ("// BEGIN GLOBAL FUNCTION DECL: _gfortran_abort\n" - ".visible .func _gfortran_abort;\n" - "// BEGIN GLOBAL FUNCTION DEF: _gfortran_abort\n" - ".visible .func _gfortran_abort\n" - "{\n" - "trap;\n" - "ret;\n" - "}\n"); +#endif /* GOMP_POOL_H */ diff --git a/libgomp/config/nvptx/priority_queue.c b/libgomp/config/nvptx/priority_queue.c deleted file mode 100644 index 63aecd249fb..00000000000 --- a/libgomp/config/nvptx/priority_queue.c +++ /dev/null @@ -1 +0,0 @@ -/* Empty stub for omp task priority support. */ diff --git a/libgomp/config/nvptx/proc.c b/libgomp/config/nvptx/proc.c index e69de29bb2d..8c1c3664660 100644 --- a/libgomp/config/nvptx/proc.c +++ b/libgomp/config/nvptx/proc.c @@ -0,0 +1,41 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains system specific routines related to counting + online processors and dynamic load balancing. */ + +#include "libgomp.h" + +unsigned +gomp_dynamic_max_threads (void) +{ + return gomp_icv (false)->nthreads_var; +} + +int +omp_get_num_procs (void) +{ + return gomp_icv (false)->nthreads_var; +} diff --git a/libgomp/config/nvptx/ptrlock.h b/libgomp/config/nvptx/ptrlock.h new file mode 100644 index 00000000000..c2eae75720d --- /dev/null +++ b/libgomp/config/nvptx/ptrlock.h @@ -0,0 +1,73 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a mutex synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and busy waiting. + + A ptrlock has four states: + 0/NULL Initial + 1 Owned by me, I get to write a pointer to ptrlock. + 2 Some thread is waiting on the ptrlock. + >2 Ptrlock contains a valid pointer. + It is not valid to gain the ptrlock and then write a NULL to it. 
*/ + +#ifndef GOMP_PTRLOCK_H +#define GOMP_PTRLOCK_H 1 + +typedef void *gomp_ptrlock_t; + +static inline void gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr) +{ + *ptrlock = ptr; +} + +static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock) +{ + uintptr_t v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE); + if (v > 2) + return (void *) v; + + if (v == 0 + && __atomic_compare_exchange_n (ptrlock, &v, 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_ACQUIRE)) + return NULL; + + while (v == 1) + v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE); + + return (void *) v; +} + +static inline void gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr) +{ + __atomic_store_n (ptrlock, ptr, MEMMODEL_RELEASE); +} + +static inline void gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock) +{ +} + +#endif /* GOMP_PTRLOCK_H */ diff --git a/libgomp/config/nvptx/sections.c b/libgomp/config/nvptx/sections.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/sections.c +++ /dev/null diff --git a/libgomp/config/nvptx/sem.h b/libgomp/config/nvptx/sem.h new file mode 100644 index 00000000000..82c0dfbf7e0 --- /dev/null +++ b/libgomp/config/nvptx/sem.h @@ -0,0 +1,65 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a semaphore synchronization + mechanism for libgomp. This type is private to the library. This + semaphore implementation uses atomic instructions and busy waiting. */ + +#ifndef GOMP_SEM_H +#define GOMP_SEM_H 1 + +typedef int gomp_sem_t; + +static inline void +gomp_sem_init (gomp_sem_t *sem, int value) +{ + *sem = value; +} + +static inline void +gomp_sem_destroy (gomp_sem_t *sem) +{ +} + +static inline void +gomp_sem_wait (gomp_sem_t *sem) +{ + int count = __atomic_load_n (sem, MEMMODEL_ACQUIRE); + for (;;) + { + while (count == 0) + count = __atomic_load_n (sem, MEMMODEL_ACQUIRE); + if (__atomic_compare_exchange_n (sem, &count, count - 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_RELAXED)) + return; + } +} + +static inline void +gomp_sem_post (gomp_sem_t *sem) +{ + (void) __atomic_add_fetch (sem, 1, MEMMODEL_RELEASE); +} +#endif /* GOMP_SEM_H */ diff --git a/libgomp/config/nvptx/simple-bar.h b/libgomp/config/nvptx/simple-bar.h new file mode 100644 index 00000000000..e7b56d98826 --- /dev/null +++ b/libgomp/config/nvptx/simple-bar.h @@ -0,0 +1,70 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simplified barrier that is suitable for thread pool
+   synchronization.  Only a subset of full barrier API (bar.h) is exposed.
+   Here in the NVPTX-specific implementation, we expect that thread pool
+   corresponds to a PTX CTA (thread block).  */
+
+#ifndef GOMP_SIMPLE_BARRIER_H
+#define GOMP_SIMPLE_BARRIER_H 1
+
+typedef struct
+{
+  unsigned count;
+} gomp_simple_barrier_t;
+
+static inline void
+gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
+{
+  bar->count = count * 32;
+}
+
+/* Unused on NVPTX.
+static inline void +gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count) +{ + bar->count = count * 32; +} +*/ + +static inline void +gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar) +{ +} + +static inline void +gomp_simple_barrier_wait (gomp_simple_barrier_t *bar) +{ + asm volatile ("bar.sync 0, %0;" : : "r" (bar->count) : "memory"); +} + +static inline void +gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar) +{ + asm volatile ("bar.arrive 0, %0;" : : "r" (bar->count) : "memory"); +} + +#endif /* GOMP_SIMPLE_BARRIER_H */ diff --git a/libgomp/config/nvptx/single.c b/libgomp/config/nvptx/single.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/single.c +++ /dev/null diff --git a/libgomp/config/nvptx/splay-tree.c b/libgomp/config/nvptx/splay-tree.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/splay-tree.c +++ /dev/null diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c index e69de29bb2d..38ea7f7aa68 100644 --- a/libgomp/config/nvptx/target.c +++ b/libgomp/config/nvptx/target.c @@ -0,0 +1,49 @@ +/* Copyright (C) 2013-2016 Free Software Foundation, Inc. + Contributed by Jakub Jelinek <jakub@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#include "libgomp.h" +#include <limits.h> + +void +GOMP_teams (unsigned int num_teams, unsigned int thread_limit) +{ + if (thread_limit) + { + struct gomp_task_icv *icv = gomp_icv (true); + icv->thread_limit_var + = thread_limit > INT_MAX ? UINT_MAX : thread_limit; + } + unsigned int num_blocks, block_id; + asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks)); + asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id)); + if (!num_teams || num_teams >= num_blocks) + num_teams = num_blocks; + else if (block_id >= num_teams) + { + gomp_free_thread (nvptx_thrs); + asm ("exit;"); + } + gomp_num_teams_var = num_teams - 1; +} diff --git a/libgomp/config/nvptx/task.c b/libgomp/config/nvptx/task.c index e69de29bb2d..c183716c94e 100644 --- a/libgomp/config/nvptx/task.c +++ b/libgomp/config/nvptx/task.c @@ -0,0 +1,43 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles the maintenance of tasks in response to task
+   creation and termination.  */
+
+#ifdef __nvptx_softstack__
+
+#include "libgomp.h"
+
+/* NVPTX is an accelerator-only target, so this should never be called.  */
+
+bool
+gomp_target_task_fn (void *data)
+{
+  __builtin_unreachable ();
+}
+
+#include "../../task.c"
+
+#endif
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index e69de29bb2d..f7b5e3e81b5 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -0,0 +1,178 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles maintenance of threads on NVPTX.
*/ + +#if defined __nvptx_softstack__ && defined __nvptx_unisimt__ + +#include "libgomp.h" +#include <stdlib.h> +#include <string.h> + +struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon)); + +static void gomp_thread_start (struct gomp_thread_pool *); + + +/* This externally visible function handles target region entry. It + sets up a per-team thread pool and transfers control by calling FN (FN_DATA) + in the master thread or gomp_thread_start in other threads. + + The name of this function is part of the interface with the compiler: for + each target region, GCC emits a PTX .kernel function that sets up soft-stack + and uniform-simt state and calls this function, passing in FN the original + function outlined for the target region. */ + +void +gomp_nvptx_main (void (*fn) (void *), void *fn_data) +{ + int tid, ntids; + asm ("mov.u32 %0, %%tid.y;" : "=r" (tid)); + asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids)); + if (tid == 0) + { + gomp_global_icv.nthreads_var = ntids; + /* Starting additional threads is not supported. */ + gomp_global_icv.dyn_var = true; + + nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs)); + memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs)); + + struct gomp_thread_pool *pool = alloca (sizeof (*pool)); + pool->threads = alloca (ntids * sizeof (*pool->threads)); + for (tid = 0; tid < ntids; tid++) + pool->threads[tid] = nvptx_thrs + tid; + pool->threads_size = ntids; + pool->threads_used = ntids; + pool->threads_busy = 1; + pool->last_team = NULL; + gomp_simple_barrier_init (&pool->threads_dock, ntids); + + nvptx_thrs[0].thread_pool = pool; + asm ("bar.sync 0;"); + fn (fn_data); + + gomp_free_thread (nvptx_thrs); + } + else + { + asm ("bar.sync 0;"); + gomp_thread_start (nvptx_thrs[0].thread_pool); + } +} + +/* This function contains the idle loop in which a thread waits + to be called up to become part of a team. 
*/ + +static void +gomp_thread_start (struct gomp_thread_pool *pool) +{ + struct gomp_thread *thr = gomp_thread (); + + gomp_sem_init (&thr->release, 0); + thr->thread_pool = pool; + + do + { + gomp_simple_barrier_wait (&pool->threads_dock); + if (!thr->fn) + continue; + thr->fn (thr->data); + thr->fn = NULL; + + struct gomp_task *task = thr->task; + gomp_team_barrier_wait_final (&thr->ts.team->barrier); + gomp_finish_task (task); + } + /* Work around an NVIDIA driver bug: when generating sm_50 machine code, + it can trash stack pointer R1 in loops lacking exit edges. Add a cheap + artificial exit that the driver would not be able to optimize out. */ + while (nvptx_thrs); +} + +/* Launch a team. */ + +void +gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, + unsigned flags, struct gomp_team *team) +{ + struct gomp_thread *thr, *nthr; + struct gomp_task *task; + struct gomp_task_icv *icv; + struct gomp_thread_pool *pool; + unsigned long nthreads_var; + + thr = gomp_thread (); + pool = thr->thread_pool; + task = thr->task; + icv = task ? &task->icv : &gomp_global_icv; + + /* Always save the previous state, even if this isn't a nested team. + In particular, we should save any work share state from an outer + orphaned work share construct. */ + team->prev_ts = thr->ts; + + thr->ts.team = team; + thr->ts.team_id = 0; + ++thr->ts.level; + if (nthreads > 1) + ++thr->ts.active_level; + thr->ts.work_share = &team->work_shares[0]; + thr->ts.last_work_share = NULL; + thr->ts.single_count = 0; + thr->ts.static_trip = 0; + thr->task = &team->implicit_task[0]; + nthreads_var = icv->nthreads_var; + gomp_init_task (thr->task, task, icv); + team->implicit_task[0].icv.nthreads_var = nthreads_var; + + if (nthreads == 1) + return; + + /* Release existing idle threads. 
*/ + for (unsigned i = 1; i < nthreads; ++i) + { + nthr = pool->threads[i]; + nthr->ts.team = team; + nthr->ts.work_share = &team->work_shares[0]; + nthr->ts.last_work_share = NULL; + nthr->ts.team_id = i; + nthr->ts.level = team->prev_ts.level + 1; + nthr->ts.active_level = thr->ts.active_level; + nthr->ts.single_count = 0; + nthr->ts.static_trip = 0; + nthr->task = &team->implicit_task[i]; + gomp_init_task (nthr->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; + nthr->fn = fn; + nthr->data = data; + team->ordered_release[i] = &nthr->release; + } + + gomp_simple_barrier_wait (&pool->threads_dock); +} + +#include "../../team.c" +#endif diff --git a/libgomp/config/nvptx/time.c b/libgomp/config/nvptx/time.c index e69de29bb2d..88fb13050c0 100644 --- a/libgomp/config/nvptx/time.c +++ b/libgomp/config/nvptx/time.c @@ -0,0 +1,49 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Dmitry Melnik <dm@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +/* This file implements timer routines for NVPTX. It uses the %clock64 cycle + counter. */ + +#include "libgomp.h" + +/* This is set from host in plugin-nvptx.c. */ +double __nvptx_clocktick = 0; + +double +omp_get_wtime (void) +{ + uint64_t clock; + asm ("mov.u64 %0, %%clock64;" : "=r" (clock)); + return clock * __nvptx_clocktick; +} + +double +omp_get_wtick (void) +{ + return __nvptx_clocktick; +} + +ialias (omp_get_wtime) +ialias (omp_get_wtick) diff --git a/libgomp/config/nvptx/work.c b/libgomp/config/nvptx/work.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/work.c +++ /dev/null diff --git a/libgomp/config/posix/simple-bar.h b/libgomp/config/posix/simple-bar.h new file mode 100644 index 00000000000..b77491c19f0 --- /dev/null +++ b/libgomp/config/posix/simple-bar.h @@ -0,0 +1,69 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +/* This is a simplified barrier that is suitable for thread pool + synchronizaton. Only a subset of full barrier API (bar.h) is exposed. */ + +#ifndef GOMP_SIMPLE_BARRIER_H +#define GOMP_SIMPLE_BARRIER_H 1 + +#include "bar.h" + +typedef struct +{ + gomp_barrier_t bar; +} gomp_simple_barrier_t; + +static inline void +gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count) +{ + gomp_barrier_init (&bar->bar, count); +} + +static inline void +gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count) +{ + gomp_barrier_reinit (&bar->bar, count); +} + +static inline void +gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar) +{ + gomp_barrier_destroy (&bar->bar); +} + +static inline void +gomp_simple_barrier_wait (gomp_simple_barrier_t *bar) +{ + gomp_barrier_wait (&bar->bar); +} + +static inline void +gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar) +{ + gomp_barrier_wait_last (&bar->bar); +} + +#endif /* GOMP_SIMPLE_BARRIER_H */ diff --git a/libgomp/configure b/libgomp/configure index 8d03eb6cddc..84d5876cce2 100755 --- a/libgomp/configure +++ b/libgomp/configure @@ -15067,6 +15067,7 @@ case "$host" in ;; nvptx*-*-*) # NVPTX does not support Pthreads, has its own code replacement. + libgomp_use_pthreads=no ;; *) # Check to see if -pthread or -lpthread is needed. Prefer the former. @@ -15112,6 +15113,12 @@ rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext esac +if test x$libgomp_use_pthreads != xno; then + +$as_echo "#define LIBGOMP_USE_PTHREADS 1" >>confdefs.h + +fi + # Plugins for offload execution, configure.ac fragment. -*- mode: autoconf -*- # # Copyright (C) 2014-2016 Free Software Foundation, Inc. diff --git a/libgomp/configure.ac b/libgomp/configure.ac index 2e41ca8aee5..5f1db7e1e0e 100644 --- a/libgomp/configure.ac +++ b/libgomp/configure.ac @@ -181,6 +181,7 @@ case "$host" in ;; nvptx*-*-*) # NVPTX does not support Pthreads, has its own code replacement. 
+ libgomp_use_pthreads=no ;; *) # Check to see if -pthread or -lpthread is needed. Prefer the former. @@ -202,6 +203,11 @@ case "$host" in [AC_MSG_ERROR([Pthreads are required to build libgomp])])]) esac +if test x$libgomp_use_pthreads != xno; then + AC_DEFINE(LIBGOMP_USE_PTHREADS, 1, + [Define to 1 if libgomp should use POSIX threads.]) +fi + m4_include([plugin/configfrag.ac]) # Check for functions needed. diff --git a/libgomp/critical.c b/libgomp/critical.c index 2b1f7f25013..cd520dbe5b3 100644 --- a/libgomp/critical.c +++ b/libgomp/critical.c @@ -115,33 +115,11 @@ GOMP_critical_name_end (void **pptr) gomp_mutex_unlock (plock); } -/* This mutex is used when atomic operations don't exist for the target - in the mode requested. The result is not globally atomic, but works so - long as all parallel references are within #pragma omp atomic directives. - According to responses received from omp@openmp.org, appears to be within - spec. Which makes sense, since that's how several other compilers - handle this situation as well. */ - -static gomp_mutex_t atomic_lock; - -void -GOMP_atomic_start (void) -{ - gomp_mutex_lock (&atomic_lock); -} - -void -GOMP_atomic_end (void) -{ - gomp_mutex_unlock (&atomic_lock); -} - #if !GOMP_MUTEX_INIT_0 static void __attribute__((constructor)) initialize_critical (void) { gomp_mutex_init (&default_lock); - gomp_mutex_init (&atomic_lock); #ifndef HAVE_SYNC_BUILTINS gomp_mutex_init (&create_lock_lock); #endif diff --git a/libgomp/env.c b/libgomp/env.c index ac05c3b7b75..7ba7663da9a 100644 --- a/libgomp/env.c +++ b/libgomp/env.c @@ -23,8 +23,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* This file defines the OpenMP internal control variables, and arranges - for them to be initialized from environment variables at startup. */ +/* This file arranges for OpenMP internal control variables to be initialized + from environment variables at startup. 
*/ #include "libgomp.h" #include "libgomp_f.h" @@ -55,35 +55,6 @@ # define strtoull(ptr, eptr, base) strtoul (ptr, eptr, base) #endif -struct gomp_task_icv gomp_global_icv = { - .nthreads_var = 1, - .thread_limit_var = UINT_MAX, - .run_sched_var = GFS_DYNAMIC, - .run_sched_chunk_size = 1, - .default_device_var = 0, - .dyn_var = false, - .nest_var = false, - .bind_var = omp_proc_bind_false, - .target_data = NULL -}; - -unsigned long gomp_max_active_levels_var = INT_MAX; -bool gomp_cancel_var = false; -int gomp_max_task_priority_var = 0; -#ifndef HAVE_SYNC_BUILTINS -gomp_mutex_t gomp_managed_threads_lock; -#endif -unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1; -unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; -unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; -char *gomp_bind_var_list; -unsigned long gomp_bind_var_list_len; -void **gomp_places_list; -unsigned long gomp_places_list_len; -int gomp_debug_var; -char *goacc_device_type; -int goacc_device_num; - /* Parse the OMP_SCHEDULE environment variable. */ static void @@ -1302,240 +1273,3 @@ initialize_env (void) goacc_runtime_initialize (); } - - -/* The public OpenMP API routines that access these variables. */ - -void -omp_set_num_threads (int n) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->nthreads_var = (n > 0 ? 
n : 1); -} - -void -omp_set_dynamic (int val) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->dyn_var = val; -} - -int -omp_get_dynamic (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->dyn_var; -} - -void -omp_set_nested (int val) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->nest_var = val; -} - -int -omp_get_nested (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->nest_var; -} - -void -omp_set_schedule (omp_sched_t kind, int chunk_size) -{ - struct gomp_task_icv *icv = gomp_icv (true); - switch (kind) - { - case omp_sched_static: - if (chunk_size < 1) - chunk_size = 0; - icv->run_sched_chunk_size = chunk_size; - break; - case omp_sched_dynamic: - case omp_sched_guided: - if (chunk_size < 1) - chunk_size = 1; - icv->run_sched_chunk_size = chunk_size; - break; - case omp_sched_auto: - break; - default: - return; - } - icv->run_sched_var = kind; -} - -void -omp_get_schedule (omp_sched_t *kind, int *chunk_size) -{ - struct gomp_task_icv *icv = gomp_icv (false); - *kind = icv->run_sched_var; - *chunk_size = icv->run_sched_chunk_size; -} - -int -omp_get_max_threads (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->nthreads_var; -} - -int -omp_get_thread_limit (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->thread_limit_var > INT_MAX ? 
INT_MAX : icv->thread_limit_var; -} - -void -omp_set_max_active_levels (int max_levels) -{ - if (max_levels >= 0) - gomp_max_active_levels_var = max_levels; -} - -int -omp_get_max_active_levels (void) -{ - return gomp_max_active_levels_var; -} - -int -omp_get_cancellation (void) -{ - return gomp_cancel_var; -} - -int -omp_get_max_task_priority (void) -{ - return gomp_max_task_priority_var; -} - -omp_proc_bind_t -omp_get_proc_bind (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->bind_var; -} - -void -omp_set_default_device (int device_num) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->default_device_var = device_num >= 0 ? device_num : 0; -} - -int -omp_get_default_device (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->default_device_var; -} - -int -omp_get_num_devices (void) -{ - return gomp_get_num_devices (); -} - -int -omp_get_num_teams (void) -{ - /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ - return 1; -} - -int -omp_get_team_num (void) -{ - /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ - return 0; -} - -int -omp_is_initial_device (void) -{ - /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. 
*/ - return 1; -} - -int -omp_get_initial_device (void) -{ - return GOMP_DEVICE_HOST_FALLBACK; -} - -int -omp_get_num_places (void) -{ - return gomp_places_list_len; -} - -int -omp_get_place_num (void) -{ - if (gomp_places_list == NULL) - return -1; - - struct gomp_thread *thr = gomp_thread (); - if (thr->place == 0) - gomp_init_affinity (); - - return (int) thr->place - 1; -} - -int -omp_get_partition_num_places (void) -{ - if (gomp_places_list == NULL) - return 0; - - struct gomp_thread *thr = gomp_thread (); - if (thr->place == 0) - gomp_init_affinity (); - - return thr->ts.place_partition_len; -} - -void -omp_get_partition_place_nums (int *place_nums) -{ - if (gomp_places_list == NULL) - return; - - struct gomp_thread *thr = gomp_thread (); - if (thr->place == 0) - gomp_init_affinity (); - - unsigned int i; - for (i = 0; i < thr->ts.place_partition_len; i++) - *place_nums++ = thr->ts.place_partition_off + i; -} - -ialias (omp_set_dynamic) -ialias (omp_set_nested) -ialias (omp_set_num_threads) -ialias (omp_get_dynamic) -ialias (omp_get_nested) -ialias (omp_set_schedule) -ialias (omp_get_schedule) -ialias (omp_get_max_threads) -ialias (omp_get_thread_limit) -ialias (omp_set_max_active_levels) -ialias (omp_get_max_active_levels) -ialias (omp_get_cancellation) -ialias (omp_get_proc_bind) -ialias (omp_set_default_device) -ialias (omp_get_default_device) -ialias (omp_get_num_devices) -ialias (omp_get_num_teams) -ialias (omp_get_team_num) -ialias (omp_is_initial_device) -ialias (omp_get_initial_device) -ialias (omp_get_max_task_priority) -ialias (omp_get_num_places) -ialias (omp_get_place_num) -ialias (omp_get_partition_num_places) -ialias (omp_get_partition_place_nums) diff --git a/libgomp/icv-device.c b/libgomp/icv-device.c new file mode 100644 index 00000000000..9bbc0b9b767 --- /dev/null +++ b/libgomp/icv-device.c @@ -0,0 +1,77 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. 
+ + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file defines OpenMP API entry points that accelerator targets are + expected to replace. */ + +#include "libgomp.h" + +void +omp_set_default_device (int device_num) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->default_device_var = device_num >= 0 ? device_num : 0; +} + +int +omp_get_default_device (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->default_device_var; +} + +int +omp_get_num_devices (void) +{ + return gomp_get_num_devices (); +} + +int +omp_get_num_teams (void) +{ + /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ + return 1; +} + +int +omp_get_team_num (void) +{ + /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ + return 0; +} + +int +omp_is_initial_device (void) +{ + /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. 
*/ + return 1; +} + +ialias (omp_set_default_device) +ialias (omp_get_default_device) +ialias (omp_get_num_devices) +ialias (omp_get_num_teams) +ialias (omp_get_team_num) +ialias (omp_is_initial_device) diff --git a/libgomp/icv.c b/libgomp/icv.c new file mode 100644 index 00000000000..e58b961558e --- /dev/null +++ b/libgomp/icv.c @@ -0,0 +1,248 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file defines the OpenMP internal control variables and associated + OpenMP API entry points. 
*/ + +#include "libgomp.h" +#include "gomp-constants.h" +#include <limits.h> + +struct gomp_task_icv gomp_global_icv = { + .nthreads_var = 1, + .thread_limit_var = UINT_MAX, + .run_sched_var = GFS_DYNAMIC, + .run_sched_chunk_size = 1, + .default_device_var = 0, + .dyn_var = false, + .nest_var = false, + .bind_var = omp_proc_bind_false, + .target_data = NULL +}; + +unsigned long gomp_max_active_levels_var = INT_MAX; +bool gomp_cancel_var = false; +int gomp_max_task_priority_var = 0; +#ifndef HAVE_SYNC_BUILTINS +gomp_mutex_t gomp_managed_threads_lock; +#endif +unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1; +unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; +unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; +char *gomp_bind_var_list; +unsigned long gomp_bind_var_list_len; +void **gomp_places_list; +unsigned long gomp_places_list_len; +int gomp_debug_var; +unsigned int gomp_num_teams_var; +char *goacc_device_type; +int goacc_device_num; + +void +omp_set_num_threads (int n) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->nthreads_var = (n > 0 ? 
n : 1); +} + +void +omp_set_dynamic (int val) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->dyn_var = val; +} + +int +omp_get_dynamic (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->dyn_var; +} + +void +omp_set_nested (int val) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->nest_var = val; +} + +int +omp_get_nested (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->nest_var; +} + +void +omp_set_schedule (omp_sched_t kind, int chunk_size) +{ + struct gomp_task_icv *icv = gomp_icv (true); + switch (kind) + { + case omp_sched_static: + if (chunk_size < 1) + chunk_size = 0; + icv->run_sched_chunk_size = chunk_size; + break; + case omp_sched_dynamic: + case omp_sched_guided: + if (chunk_size < 1) + chunk_size = 1; + icv->run_sched_chunk_size = chunk_size; + break; + case omp_sched_auto: + break; + default: + return; + } + icv->run_sched_var = kind; +} + +void +omp_get_schedule (omp_sched_t *kind, int *chunk_size) +{ + struct gomp_task_icv *icv = gomp_icv (false); + *kind = icv->run_sched_var; + *chunk_size = icv->run_sched_chunk_size; +} + +int +omp_get_max_threads (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->nthreads_var; +} + +int +omp_get_thread_limit (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->thread_limit_var > INT_MAX ? 
INT_MAX : icv->thread_limit_var; +} + +void +omp_set_max_active_levels (int max_levels) +{ + if (max_levels >= 0) + gomp_max_active_levels_var = max_levels; +} + +int +omp_get_max_active_levels (void) +{ + return gomp_max_active_levels_var; +} + +int +omp_get_cancellation (void) +{ + return gomp_cancel_var; +} + +int +omp_get_max_task_priority (void) +{ + return gomp_max_task_priority_var; +} + +omp_proc_bind_t +omp_get_proc_bind (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->bind_var; +} + +int +omp_get_initial_device (void) +{ + return GOMP_DEVICE_HOST_FALLBACK; +} + +int +omp_get_num_places (void) +{ + return gomp_places_list_len; +} + +int +omp_get_place_num (void) +{ + if (gomp_places_list == NULL) + return -1; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + return (int) thr->place - 1; +} + +int +omp_get_partition_num_places (void) +{ + if (gomp_places_list == NULL) + return 0; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + return thr->ts.place_partition_len; +} + +void +omp_get_partition_place_nums (int *place_nums) +{ + if (gomp_places_list == NULL) + return; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + unsigned int i; + for (i = 0; i < thr->ts.place_partition_len; i++) + *place_nums++ = thr->ts.place_partition_off + i; +} + +ialias (omp_set_dynamic) +ialias (omp_set_nested) +ialias (omp_set_num_threads) +ialias (omp_get_dynamic) +ialias (omp_get_nested) +ialias (omp_set_schedule) +ialias (omp_get_schedule) +ialias (omp_get_max_threads) +ialias (omp_get_thread_limit) +ialias (omp_set_max_active_levels) +ialias (omp_get_max_active_levels) +ialias (omp_get_cancellation) +ialias (omp_get_proc_bind) +ialias (omp_get_initial_device) +ialias (omp_get_max_task_priority) +ialias (omp_get_num_places) +ialias (omp_get_place_num) +ialias (omp_get_partition_num_places) +ialias 
(omp_get_partition_place_nums) diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 7b2671ba49d..c86ac3d8778 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -45,7 +45,9 @@ #include "gstdint.h" #include "libgomp-plugin.h" +#ifdef HAVE_PTHREAD_H #include <pthread.h> +#endif #include <stdbool.h> #include <stdlib.h> #include <stdarg.h> @@ -122,6 +124,7 @@ struct htab; #include "sem.h" #include "mutex.h" #include "bar.h" +#include "simple-bar.h" #include "ptrlock.h" @@ -360,6 +363,7 @@ extern char *gomp_bind_var_list; extern unsigned long gomp_bind_var_list_len; extern void **gomp_places_list; extern unsigned long gomp_places_list_len; +extern unsigned int gomp_num_teams_var; extern int gomp_debug_var; extern int goacc_device_num; extern char *goacc_device_type; @@ -626,8 +630,8 @@ struct gomp_thread_pool /* Number of threads running in this contention group. */ unsigned long threads_busy; - /* This barrier holds and releases threads waiting in threads. */ - gomp_barrier_t threads_dock; + /* This barrier holds and releases threads waiting in thread pools. */ + gomp_simple_barrier_t threads_dock; }; enum gomp_cancel_kind @@ -642,7 +646,15 @@ enum gomp_cancel_kind /* ... and here is that TLS data. */ -#if defined HAVE_TLS || defined USE_EMUTLS +#if defined __nvptx__ +extern struct gomp_thread *nvptx_thrs __attribute__((shared)); +static inline struct gomp_thread *gomp_thread (void) +{ + int tid; + asm ("mov.u32 %0, %%tid.y;" : "=r" (tid)); + return nvptx_thrs + tid; +} +#elif defined HAVE_TLS || defined USE_EMUTLS extern __thread struct gomp_thread gomp_tls_data; static inline struct gomp_thread *gomp_thread (void) { @@ -671,17 +683,21 @@ static inline struct gomp_task_icv *gomp_icv (bool write) return &gomp_global_icv; } +#ifdef LIBGOMP_USE_PTHREADS /* The attributes to be used during thread creation. */ extern pthread_attr_t gomp_thread_attr; extern pthread_key_t gomp_thread_destructor; +#endif /* Function prototypes. 
*/ /* affinity.c */ extern void gomp_init_affinity (void); +#ifdef LIBGOMP_USE_PTHREADS extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int); +#endif extern void **gomp_affinity_alloc (unsigned long, bool); extern void gomp_affinity_init_place (void *); extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long, diff --git a/libgomp/lock.c b/libgomp/lock.c new file mode 100644 index 00000000000..783bd77dd90 --- /dev/null +++ b/libgomp/lock.c @@ -0,0 +1,123 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is a generic implementation of the public OpenMP locking primitives in + terms of internal gomp_mutex_t. It is not meant to be compiled on its own. + It is #include'd from config/{linux,nvptx}/lock.c. */ + +#include <string.h> +#include "libgomp.h" + +/* The internal gomp_mutex_t and the external non-recursive omp_lock_t + have the same form. Re-use it. 
*/ + +void +gomp_init_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_init (lock); +} + +void +gomp_destroy_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_destroy (lock); +} + +void +gomp_set_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_lock (lock); +} + +void +gomp_unset_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_unlock (lock); +} + +int +gomp_test_lock_30 (omp_lock_t *lock) +{ + int oldval = 0; + + return __atomic_compare_exchange_n (lock, &oldval, 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_RELAXED); +} + +void +gomp_init_nest_lock_30 (omp_nest_lock_t *lock) +{ + memset (lock, '\0', sizeof (*lock)); +} + +void +gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock) +{ +} + +void +gomp_set_nest_lock_30 (omp_nest_lock_t *lock) +{ + void *me = gomp_icv (true); + + if (lock->owner != me) + { + gomp_mutex_lock (&lock->lock); + lock->owner = me; + } + + lock->count++; +} + +void +gomp_unset_nest_lock_30 (omp_nest_lock_t *lock) +{ + if (--lock->count == 0) + { + lock->owner = NULL; + gomp_mutex_unlock (&lock->lock); + } +} + +int +gomp_test_nest_lock_30 (omp_nest_lock_t *lock) +{ + void *me = gomp_icv (true); + int oldval; + + if (lock->owner == me) + return ++lock->count; + + oldval = 0; + if (__atomic_compare_exchange_n (&lock->lock, &oldval, 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_RELAXED)) + { + lock->owner = me; + lock->count = 1; + return 1; + } + + return 0; +} diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 5ee350d4c1d..ca33c51db5a 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -41,6 +41,7 @@ #include <cuda.h> #include <stdbool.h> #include <stdint.h> +#include <limits.h> #include <string.h> #include <stdio.h> #include <unistd.h> @@ -274,6 +275,8 @@ struct targ_fn_descriptor { CUfunction fn; const struct targ_fn_launch *launch; + int regs_per_thread; + int max_threads_per_block; }; /* A loaded PTX image. 
*/ @@ -307,8 +310,12 @@ struct ptx_device bool overlap; bool map; bool concur; - int mode; bool mkern; + int mode; + int clock_khz; + int num_sms; + int regs_per_block; + int regs_per_sm; struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. */ @@ -658,6 +665,39 @@ nvptx_open_device (int n) &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); ptx_dev->mkern = pi; + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); + ptx_dev->clock_khz = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); + ptx_dev->num_sms = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); + ptx_dev->regs_per_block = pi; + + /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only + in CUDA 6.0 and newer. */ + r = cuDeviceGetAttribute (&pi, 82, dev); + /* Fallback: use limit of registers per block, which is usually equal. 
*/ + if (r == CUDA_ERROR_INVALID_VALUE) + pi = ptx_dev->regs_per_block; + else if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r)); + return NULL; + } + ptx_dev->regs_per_sm = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); + if (pi != 32) + { + GOMP_PLUGIN_error ("Only warp size 32 is supported"); + return NULL; + } + r = cuDeviceGetAttribute (&async_engines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); if (r != CUDA_SUCCESS) @@ -725,10 +765,8 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, CUjit_option opts[6]; void *optvals[6]; float elapsed = 0.0; -#define LOGSIZE 8192 - char elog[LOGSIZE]; - char ilog[LOGSIZE]; - unsigned long logsize = LOGSIZE; + char elog[1024]; + char ilog[16384]; CUlinkState linkstate; CUresult r; void *linkout; @@ -741,13 +779,13 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, optvals[1] = &ilog[0]; opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optvals[2] = (void *) logsize; + optvals[2] = (void *) sizeof ilog; opts[3] = CU_JIT_ERROR_LOG_BUFFER; optvals[3] = &elog[0]; opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optvals[4] = (void *) logsize; + optvals[4] = (void *) sizeof elog; opts[5] = CU_JIT_LOG_VERBOSE; optvals[5] = (void *) 1; @@ -1164,7 +1202,7 @@ nvptx_host2dev (void *d, const void *h, size_t s) } #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); @@ -1220,7 +1258,7 @@ nvptx_dev2host (void *h, const void *d, size_t s) } #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, 
CU_EVENT_DISABLE_TIMING); @@ -1518,7 +1556,7 @@ GOMP_OFFLOAD_get_name (void) unsigned int GOMP_OFFLOAD_get_caps (void) { - return GOMP_OFFLOAD_CAP_OPENACC_200; + return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400; } int @@ -1588,6 +1626,23 @@ GOMP_OFFLOAD_version (void) return GOMP_VERSION; } +/* Initialize __nvptx_clocktick, if present in MODULE. */ + +static void +nvptx_set_clocktick (CUmodule module, struct ptx_device *dev) +{ + CUdeviceptr dptr; + CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick"); + if (r == CUDA_ERROR_NOT_FOUND) + return; + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); + double __nvptx_clocktick = 1e-3 / dev->clock_khz; + r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick)); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); +} + /* Load the (partial) program described by TARGET_DATA to device number ORD. Allocate and return TARGET_TABLE. 
*/ @@ -1648,12 +1703,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++) { CUfunction function; + int nregs, mthrs; CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module, fn_descs[i].fn); + CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs, + CU_FUNC_ATTRIBUTE_NUM_REGS, function); + CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function); targ_fns->fn = function; targ_fns->launch = &fn_descs[i]; + targ_fns->regs_per_thread = nregs; + targ_fns->max_threads_per_block = mthrs; targ_tbl->start = (uintptr_t) targ_fns; targ_tbl->end = targ_tbl->start + 1; @@ -1671,6 +1733,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, targ_tbl->end = targ_tbl->start + bytes; } + nvptx_set_clocktick (module, dev); + return fn_entries + var_entries; } @@ -1736,6 +1800,15 @@ GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) && nvptx_host2dev (dst, src, n)); } +bool +GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) +{ + struct ptx_device *ptx_dev = ptx_devices[ord]; + CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, + ptx_dev->null_stream->stream); + return true; +} + void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; void @@ -1857,3 +1930,123 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream) { return nvptx_set_cuda_stream (async, stream); } + +/* Adjust launch dimensions: pick good values for number of blocks and warps + and ensure that number of warps does not exceed CUDA limits as well as GCC's + own limits. 
*/ + +static void +nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn, + struct ptx_device *ptx_dev, + int *teams_p, int *threads_p) +{ + int max_warps_block = fn->max_threads_per_block / 32; + /* Maximum 32 warps per block is an implementation limit in NVPTX backend + and libgcc, which matches documented limit of all GPUs as of 2015. */ + if (max_warps_block > 32) + max_warps_block = 32; + if (*threads_p <= 0) + *threads_p = 8; + if (*threads_p > max_warps_block) + *threads_p = max_warps_block; + + int regs_per_block = fn->regs_per_thread * 32 * *threads_p; + /* This is an estimate of how many blocks the device can host simultaneously. + Actual limit, which may be lower, can be queried with "occupancy control" + driver interface (since CUDA 6.0). */ + int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms; + if (*teams_p <= 0 || *teams_p > max_blocks) + *teams_p = max_blocks; +} + +/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP + target regions. */ + +static size_t +nvptx_stacks_size () +{ + return 128 * 1024; +} + +/* Return contiguous storage for NUM stacks, each SIZE bytes. */ + +static void * +nvptx_stacks_alloc (size_t size, int num) +{ + CUdeviceptr stacks; + CUresult r = cuMemAlloc (&stacks, size * num); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r)); + return (void *) stacks; +} + +/* Release storage previously allocated by nvptx_stacks_alloc. 
*/ + +static void +nvptx_stacks_free (void *p, int num) +{ + CUresult r = cuMemFree ((CUdeviceptr) p); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r)); +} + +void +GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) +{ + CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn; + CUresult r; + struct ptx_device *ptx_dev = ptx_devices[ord]; + const char *maybe_abort_msg = "(perhaps abort was called)"; + int teams = 0, threads = 0; + + if (!args) + GOMP_PLUGIN_fatal ("No target arguments provided"); + while (*args) + { + intptr_t id = (intptr_t) *args++, val; + if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM) + val = (intptr_t) *args++; + else + val = id >> GOMP_TARGET_ARG_VALUE_SHIFT; + if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL) + continue; + val = val > INT_MAX ? INT_MAX : val; + id &= GOMP_TARGET_ARG_ID_MASK; + if (id == GOMP_TARGET_ARG_NUM_TEAMS) + teams = val; + else if (id == GOMP_TARGET_ARG_THREAD_LIMIT) + threads = val; + } + nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads); + + size_t stack_size = nvptx_stacks_size (); + void *stacks = nvptx_stacks_alloc (stack_size, teams * threads); + void *fn_args[] = {tgt_vars, stacks, (void *) stack_size}; + size_t fn_args_size = sizeof fn_args; + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args, + CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size, + CU_LAUNCH_PARAM_END + }; + r = cuLaunchKernel (function, + teams, 1, 1, + 32, threads, 1, + 0, ptx_dev->null_stream->stream, NULL, config); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); + + r = cuCtxSynchronize (); + if (r == CUDA_ERROR_LAUNCH_FAILED) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), + maybe_abort_msg); + else if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); + nvptx_stacks_free (stacks, teams * threads); +} + +void +GOMP_OFFLOAD_async_run (int ord, 
void *tgt_fn, void *tgt_vars, void **args, + void *async_data) +{ + GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented"); +} diff --git a/libgomp/team.c b/libgomp/team.c index de5b915f78b..e3013451662 100644 --- a/libgomp/team.c +++ b/libgomp/team.c @@ -31,6 +31,7 @@ #include <stdlib.h> #include <string.h> +#ifdef LIBGOMP_USE_PTHREADS /* This attribute contains PTHREAD_CREATE_DETACHED. */ pthread_attr_t gomp_thread_attr; @@ -110,7 +111,7 @@ gomp_thread_start (void *xdata) { pool->threads[thr->ts.team_id] = thr; - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); do { struct gomp_team *team = thr->ts.team; @@ -120,7 +121,7 @@ gomp_thread_start (void *xdata) gomp_team_barrier_wait_final (&team->barrier); gomp_finish_task (task); - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); local_fn = thr->fn; local_data = thr->data; @@ -134,6 +135,7 @@ gomp_thread_start (void *xdata) thr->task = NULL; return NULL; } +#endif static inline struct gomp_team * get_last_team (unsigned nthreads) @@ -224,11 +226,17 @@ gomp_free_pool_helper (void *thread_pool) struct gomp_thread *thr = gomp_thread (); struct gomp_thread_pool *pool = (struct gomp_thread_pool *) thread_pool; - gomp_barrier_wait_last (&pool->threads_dock); + gomp_simple_barrier_wait_last (&pool->threads_dock); gomp_sem_destroy (&thr->release); thr->thread_pool = NULL; thr->task = NULL; +#ifdef LIBGOMP_USE_PTHREADS pthread_exit (NULL); +#elif defined(__nvptx__) + asm ("exit;"); +#else +#error gomp_free_pool_helper must terminate the thread +#endif } /* Free a thread pool and release its threads. */ @@ -250,12 +258,12 @@ gomp_free_thread (void *arg __attribute__((unused))) nthr->data = pool; } /* This barrier undocks threads docked on pool->threads_dock. 
*/ - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); /* And this waits till all threads have called gomp_barrier_wait_last in gomp_free_pool_helper. */ - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); /* Now it is safe to destroy the barrier and free the pool. */ - gomp_barrier_destroy (&pool->threads_dock); + gomp_simple_barrier_destroy (&pool->threads_dock); #ifdef HAVE_SYNC_BUILTINS __sync_fetch_and_add (&gomp_managed_threads, @@ -266,10 +274,12 @@ gomp_free_thread (void *arg __attribute__((unused))) gomp_mutex_unlock (&gomp_managed_threads_lock); #endif } - free (pool->threads); if (pool->last_team) free_team (pool->last_team); +#ifndef __nvptx__ + free (pool->threads); free (pool); +#endif thr->thread_pool = NULL; } if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0)) @@ -284,6 +294,7 @@ gomp_free_thread (void *arg __attribute__((unused))) /* Launch a team. */ +#ifdef LIBGOMP_USE_PTHREADS void gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, unsigned flags, struct gomp_team *team) @@ -429,7 +440,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, else if (old_threads_used == 0) { n = 0; - gomp_barrier_init (&pool->threads_dock, nthreads); + gomp_simple_barrier_init (&pool->threads_dock, nthreads); } else { @@ -437,7 +448,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, /* Increase the barrier threshold to make sure all new threads arrive before the team is released. */ - gomp_barrier_reinit (&pool->threads_dock, nthreads); + gomp_simple_barrier_reinit (&pool->threads_dock, nthreads); } /* Not true yet, but soon will be. We're going to release all @@ -670,8 +681,8 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, threads and all the threads we're going to let die arrive before the team is released. 
*/ if (affinity_count) - gomp_barrier_reinit (&pool->threads_dock, - nthreads + affinity_count); + gomp_simple_barrier_reinit (&pool->threads_dock, + nthreads + affinity_count); } } @@ -812,7 +823,10 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, pthread_attr_destroy (&thread_attr); do_release: - gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock); + if (nested) + gomp_barrier_wait (&team->barrier); + else + gomp_simple_barrier_wait (&pool->threads_dock); /* Decrease the barrier threshold to match the number of threads that should arrive back at the end of this team. The extra @@ -830,7 +844,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, if (affinity_count) diff = -affinity_count; - gomp_barrier_reinit (&pool->threads_dock, nthreads); + gomp_simple_barrier_reinit (&pool->threads_dock, nthreads); #ifdef HAVE_SYNC_BUILTINS __sync_fetch_and_add (&gomp_managed_threads, diff); @@ -844,6 +858,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, && team->prev_ts.place_partition_len > 64) free (affinity_thr); } +#endif /* Terminate the current team. This is only to be called by the master @@ -919,6 +934,7 @@ gomp_team_end (void) } } +#ifdef LIBGOMP_USE_PTHREADS /* Constructors for this file. */ @@ -943,6 +959,7 @@ team_destructor (void) crashes. 
*/ pthread_key_delete (gomp_thread_destructor); } +#endif struct gomp_task_icv * gomp_new_icv (void) @@ -951,6 +968,8 @@ gomp_new_icv (void) struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task)); gomp_init_task (task, NULL, &gomp_global_icv); thr->task = task; +#ifdef LIBGOMP_USE_PTHREADS pthread_setspecific (gomp_thread_destructor, thr); +#endif return &task->icv; } diff --git a/libgomp/testsuite/libgomp.fortran/fortran.exp b/libgomp/testsuite/libgomp.fortran/fortran.exp index 9e6b6437751..d848ed4d47f 100644 --- a/libgomp/testsuite/libgomp.fortran/fortran.exp +++ b/libgomp/testsuite/libgomp.fortran/fortran.exp @@ -7,7 +7,7 @@ global ALWAYS_CFLAGS set shlib_ext [get_shlib_extension] set lang_library_path "../libgfortran/.libs" -set lang_link_flags "-lgfortran" +set lang_link_flags "-lgfortran -foffload=-lgfortran" if [info exists lang_include_flags] then { unset lang_include_flags } diff --git a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp index 2d6b647af22..663c9323b72 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp +++ b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp @@ -9,7 +9,7 @@ global ALWAYS_CFLAGS set shlib_ext [get_shlib_extension] set lang_library_path "../libgfortran/.libs" -set lang_link_flags "-lgfortran" +set lang_link_flags "-lgfortran -foffload=-lgfortran" if [info exists lang_include_flags] then { unset lang_include_flags } |