summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/ChangeLog.gomp-nvptx269
-rw-r--r--gcc/config/nvptx/mkoffload.c11
-rw-r--r--gcc/config/nvptx/nvptx-protos.h12
-rw-r--r--gcc/config/nvptx/nvptx.c426
-rw-r--r--gcc/config/nvptx/nvptx.h19
-rw-r--r--gcc/config/nvptx/nvptx.md191
-rw-r--r--gcc/config/nvptx/nvptx.opt12
-rw-r--r--gcc/config/nvptx/t-nvptx2
-rw-r--r--gcc/doc/extend.texi15
-rw-r--r--gcc/doc/invoke.texi31
-rw-r--r--gcc/doc/tm.texi4
-rw-r--r--gcc/doc/tm.texi.in2
-rw-r--r--gcc/internal-fn.c126
-rw-r--r--gcc/internal-fn.def7
-rw-r--r--gcc/omp-low.c444
-rw-r--r--gcc/passes.def1
-rw-r--r--gcc/target-insns.def6
-rw-r--r--gcc/target.def12
-rw-r--r--gcc/testsuite/ChangeLog.gomp-nvptx13
-rw-r--r--gcc/testsuite/gcc.target/nvptx/decl-shared-init.c1
-rw-r--r--gcc/testsuite/gcc.target/nvptx/decl-shared.c14
-rw-r--r--gcc/testsuite/gcc.target/nvptx/softstack.c23
-rw-r--r--gcc/testsuite/lib/target-supports.exp5
-rw-r--r--gcc/tree-pass.h2
-rw-r--r--libgcc/ChangeLog.gomp-nvptx27
-rw-r--r--libgcc/config/nvptx/crt0.c12
-rw-r--r--libgcc/config/nvptx/mgomp.c (renamed from libgcc/config/nvptx/nvptx-malloc.h)16
-rw-r--r--libgcc/config/nvptx/t-nvptx3
-rw-r--r--libgomp/ChangeLog.gomp-nvptx296
-rw-r--r--libgomp/Makefile.am12
-rw-r--r--libgomp/Makefile.in34
-rw-r--r--libgomp/affinity.c (renamed from libgomp/config/posix/affinity.c)2
-rw-r--r--libgomp/atomic.c57
-rw-r--r--libgomp/config.h.in3
-rw-r--r--libgomp/config/linux/lock.c94
-rw-r--r--libgomp/config/nvptx/affinity.c0
-rw-r--r--libgomp/config/nvptx/alloc.c0
-rw-r--r--libgomp/config/nvptx/bar.c209
-rw-r--r--libgomp/config/nvptx/bar.h166
-rw-r--r--libgomp/config/nvptx/barrier.c0
-rw-r--r--libgomp/config/nvptx/critical.c0
-rw-r--r--libgomp/config/nvptx/doacross.h60
-rw-r--r--libgomp/config/nvptx/error.c42
-rw-r--r--libgomp/config/nvptx/icv-device.c74
-rw-r--r--libgomp/config/nvptx/iter.c0
-rw-r--r--libgomp/config/nvptx/iter_ull.c0
-rw-r--r--libgomp/config/nvptx/lock.c41
-rw-r--r--libgomp/config/nvptx/loop.c0
-rw-r--r--libgomp/config/nvptx/loop_ull.c0
-rw-r--r--libgomp/config/nvptx/mutex.h60
-rw-r--r--libgomp/config/nvptx/ordered.c0
-rw-r--r--libgomp/config/nvptx/parallel.c0
-rw-r--r--libgomp/config/nvptx/pool.h (renamed from libgomp/config/nvptx/fortran.c)39
-rw-r--r--libgomp/config/nvptx/priority_queue.c1
-rw-r--r--libgomp/config/nvptx/proc.c41
-rw-r--r--libgomp/config/nvptx/ptrlock.h73
-rw-r--r--libgomp/config/nvptx/sections.c0
-rw-r--r--libgomp/config/nvptx/sem.h65
-rw-r--r--libgomp/config/nvptx/simple-bar.h70
-rw-r--r--libgomp/config/nvptx/single.c0
-rw-r--r--libgomp/config/nvptx/splay-tree.c0
-rw-r--r--libgomp/config/nvptx/target.c49
-rw-r--r--libgomp/config/nvptx/task.c43
-rw-r--r--libgomp/config/nvptx/team.c178
-rw-r--r--libgomp/config/nvptx/time.c49
-rw-r--r--libgomp/config/nvptx/work.c0
-rw-r--r--libgomp/config/posix/simple-bar.h69
-rwxr-xr-xlibgomp/configure7
-rw-r--r--libgomp/configure.ac6
-rw-r--r--libgomp/critical.c22
-rw-r--r--libgomp/env.c270
-rw-r--r--libgomp/icv-device.c77
-rw-r--r--libgomp/icv.c248
-rw-r--r--libgomp/libgomp.h22
-rw-r--r--libgomp/lock.c123
-rw-r--r--libgomp/plugin/plugin-nvptx.c213
-rw-r--r--libgomp/team.c45
-rw-r--r--libgomp/testsuite/libgomp.fortran/fortran.exp2
-rw-r--r--libgomp/testsuite/libgomp.oacc-fortran/fortran.exp2
79 files changed, 3983 insertions, 587 deletions
diff --git a/gcc/ChangeLog.gomp-nvptx b/gcc/ChangeLog.gomp-nvptx
new file mode 100644
index 00000000000..8b555f25414
--- /dev/null
+++ b/gcc/ChangeLog.gomp-nvptx
@@ -0,0 +1,269 @@
+2016-11-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * omp-low.c (lower_lastprivate_clauses): Add missing call to
+ unshare_expr.
+
+2016-07-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_file_end): Do emit element count in
+ external declaration of __nvptx_stacks.
+
+2016-06-02 Alexander Monakov <amonakov@ispras.ru>
+
+ * doc/invoke.texi (msoft-stack): Reword, fix errors.
+
+2016-05-19 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_declare_function_name): Expand comments.
+ (nvptx_file_end): Do not emit element count in external declaration of
+ __nvptx_stacks.
+
+2016-05-19 Alexander Monakov <amonakov@ispras.ru>
+
+ * doc/invoke.texi (msoft-stack): Rewrite.
+
+2016-05-06 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_encode_section_info): Diagnose explicit
+ static initialization of variables in .shared memory.
+ (nvptx_handle_shared_attribute): Reword diagnostic message.
+
+2016-04-19 Alexander Monakov <amonakov@ispras.ru>
+
+ * doc/extend.texi (Nvidia PTX Variable Attributes): New section.
+
+2016-04-18 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (write_omp_entry): Adjust.
+ (nvptx_declare_function_name): Adjust.
+
+2016-03-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.h (STACK_SIZE_MODE): Define.
+
+2016-03-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Emit
+ 'mul.lo.u32' instead of 'mul.u32' for 32-bit ABI target.
+ (nvptx_declare_function_name): Ditto.
+
+2016-03-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use
+ POINTER_SIZE instead of BITS_PER_WORD.
+ (nvptx_declare_function_name): Ditto.
+ (nvptx_output_return): Ditto.
+ (nvptx_file_end): Ditto.
+
+2016-03-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_option_override): Remove custom handling
+ of debug info options.
+
+2016-02-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (write_omp_entry): Expand entry code to
+ initialize __nvptx_uni and __nvptx_stacks (based on pointer to storage
+ allocated by the libgomp plugin).
+
+2016-01-17 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_encode_section_info): Handle "shared"
+ attribute.
+ (nvptx_handle_shared_attribute): New. Use it...
+ (nvptx_attribute_table): ... here (new entry).
+
+2016-01-17 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_declare_function_name): Fix warning.
+
+2016-01-17 Alexander Monakov <amonakov@ispras.ru>
+
+ * internal-fn.c (expand_GOMP_SIMT_LANE): Update.
+ (expand_GOMP_SIMT_LAST_LANE): New.
+ (expand_GOMP_SIMT_ORDERED_PRED): New.
+ (expand_GOMP_SIMT_VOTE_ANY): New.
+ (expand_GOMP_SIMT_XCHG_BFLY): New.
+ (expand_GOMP_SIMT_XCHG_IDX): New.
+ * internal-fn.def (GOMP_SIMT_LAST_LANE): New.
+ (GOMP_SIMT_ORDERED_PRED): New.
+ (GOMP_SIMT_VOTE_ANY): New.
+ (GOMP_SIMT_XCHG_BFLY): New.
+ (GOMP_SIMT_XCHG_IDX): New.
+ * omp-low.c (omp_maybe_offloaded_ctx): New, outlined from...
+ (create_omp_child_function): ...here. Simplify.
+ (omp_max_simt_vf): New. Use it...
+ (omp_max_vf): ...here.
+ (lower_rec_input_clauses): Add reduction lowering for SIMT execution.
+ (lower_lastprivate_clauses): Likewise, for lastprivate lowering.
+ (lower_omp_ordered): Likewise, for "ordered" lowering.
+ (expand_omp_simd): Update SIMT transforms.
+ (execute_omp_device_lower): Update. Fold SIMD ifns on SIMT targets.
+
+2016-01-17 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx-protos.h (nvptx_shuffle_kind): Move enum
+ declaration from nvptx.c.
+ (nvptx_gen_shuffle): Declare.
+ * config/nvptx/nvptx.c (nvptx_shuffle_kind): Moved to nvptx-protos.h.
+ (nvptx_gen_shuffle): No longer static.
+ * config/nvptx/nvptx.md (UNSPEC_VOTE_BALLOT): New unspec.
+ (UNSPEC_LANEID): Ditto.
+ (UNSPECV_NOUNROLL): Ditto.
+ (nvptx_vote_ballot): New pattern.
+ (omp_simt_lane): Ditto.
+ (nvptx_nounroll): Ditto.
+ (omp_simt_last_lane): Ditto.
+ (omp_simt_ordered): Ditto.
+ (omp_simt_vote_any): Ditto.
+ (omp_simt_xchg_bfly): Ditto.
+ (omp_simt_xchg_idx): Ditto.
+ * target-insns.def (omp_simt_lane): New.
+ (omp_simt_last_lane): New.
+ (omp_simt_ordered): New.
+ (omp_simt_vote_any): New.
+ (omp_simt_xchg_bfly): New.
+ (omp_simt_xchg_idx): New.
+
+2015-12-16 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (write_omp_entry): Work around failure on sm_5x
+ devices by loading ...$impl function address outside of call sequence.
+
+2015-12-14 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_declare_function_name): Emit %outargs
+ using .local %outargs_ar only if not TARGET_SOFT_STACK. Emit %outargs
+ under TARGET_SOFT_STACK by offsetting from %frame.
+ (nvptx_get_drap_rtx): Return %argp as the DRAP if needed.
+ * config/nvptx/nvptx.md (nvptx_register_operand): Allow %outargs under
+ TARGET_SOFT_STACK.
+ (nvptx_nonimmediate_operand): Ditto.
+ (allocate_stack): Implement for TARGET_SOFT_STACK. Remove unused code.
+ (allocate_stack_<mode>): Remove unused pattern.
+ (set_softstack_insn): New pattern.
+ (restore_stack_block): Handle for TARGET_SOFT_STACK.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_record_offload_symbol): Allow NULL attrs
+ for OpenMP offloading.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * internal-fn.c (expand_GOMP_SIMT_LANE): New.
+ (expand_GOMP_SIMT_VF): New.
+ * internal-fn.def (GOMP_SIMT_LANE): New.
+ (GOMP_SIMT_VF): New.
+ * omp-low.c (expand_omp_simd): Do SIMT transforms.
+ (pass_data_lower_omp): Add PROP_gimple_lomp_dev.
+ (execute_omp_device_lower): New.
+ (pass_data_omp_device_lower): New.
+ (pass_omp_device_lower): New pass.
+ (make_pass_omp_device_lower): New.
+ * passes.def (pass_omp_device_lower): Position new pass.
+ * tree-pass.h (PROP_gimple_lomp_dev): Define.
+ (make_pass_omp_device_lower): Declare.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/mkoffload.c (main): Check that either OpenACC or OpenMP
+ is selected. Pass -mgomp to offload compiler in OpenMP case.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_simt_vf): New.
+ (TARGET_SIMT_VF): Define.
+ * doc/tm.texi: Regenerate.
+ * doc/tm.texi.in: (TARGET_SIMT_VF): New hook.
+ * target.def: Define it.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_option_override): Handle TARGET_GOMP.
+ * config/nvptx/nvptx.opt (mgomp): New option.
+ * config/nvptx/t-nvptx (MULTILIB_OPTIONS): New.
+ * doc/invoke.texi (mgomp): Document.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_name_replacement): Rewrite. Add
+ __nvptx_real_malloc -> malloc and __nvptx_real_free -> free
+ replacements.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (need_unisimt_decl): New variable. Set it...
+ (nvptx_init_unisimt_predicate): ...here (new function) and use it...
+ (nvptx_file_end): ...here to emit declaration of __nvptx_uni array.
+ (nvptx_declare_function_name): Call nvptx_init_unisimt_predicate.
+ (nvptx_get_unisimt_master): New helper function.
+ (nvptx_get_unisimt_predicate): Ditto.
+ (nvptx_call_insn_is_syscall_p): Ditto.
+ (nvptx_unisimt_handle_set): Ditto.
+ (nvptx_reorg_uniform_simt): New. Transform code for -muniform-simt.
+ (nvptx_get_axis_predicate): New helper function, factored out from...
+ (nvptx_single): ...here.
+ (nvptx_reorg): Call nvptx_reorg_uniform_simt.
+ * config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define
+ __nvptx_unisimt__ when -muniform-simt option is active.
+ (struct machine_function): Add unisimt_master, unisimt_predicate
+ rtx fields.
+ * config/nvptx/nvptx.md (divergent): New attribute.
+ (atomic_compare_and_swap<mode>_1): Mark as divergent.
+ (atomic_exchange<mode>): Ditto.
+ (atomic_fetch_add<mode>): Ditto.
+ (atomic_fetch_addsf): Ditto.
+ (atomic_fetch_<logic><mode>): Ditto.
+ * config/nvptx/nvptx.opt (muniform-simt): New option.
+ * doc/invoke.texi (-muniform-simt): Document.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c (nvptx_output_call_insn): Handle COND_EXEC
+ patterns. Emit instruction predicate.
+ (nvptx_print_operand): Unbreak handling of instruction predicates.
+ * config/nvptx/nvptx.md (predicable): New attribute. Generate
+ predicated forms via define_cond_exec.
+ (br_true): Mark as not predicable.
+ (br_false): Ditto.
+ (br_true_uni): Ditto.
+ (br_false_uni): Ditto.
+ (return): Ditto.
+ (trap_if_true): Ditto.
+ (trap_if_false): Ditto.
+ (nvptx_fork): Ditto.
+ (nvptx_forked): Ditto.
+ (nvptx_joining): Ditto.
+ (nvptx_join): Ditto.
+ (nvptx_barsync): Ditto.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c: (need_softstack_decl): New variable.
+ (nvptx_declare_function_name): Handle TARGET_SOFT_STACK.
+ (nvptx_output_return): Emit stack restore if needed.
+ (nvptx_file_end): Handle need_softstack_decl.
+ * config/nvptx/nvptx.h: (TARGET_CPU_CPP_BUILTINS): Define
+ __nvptx_softstack__ when -msoft-stack is active.
+ (struct machine_function): New bool field using_softstack.
+ * config/nvptx/nvptx.opt: (msoft-stack): New option.
+ * doc/invoke.texi (msoft-stack): Document.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/nvptx.c: (write_omp_entry): New. Use it...
+ (nvptx_declare_function_name): ...here to emit pointers for libgomp.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * omp-low.c (create_omp_child_function): Set "omp target entrypoint",
+ "omp acc target entrypoint" or "omp declare target" attribute based on
+ is_gimple_omp_offloaded and is_gimple_omp_oacc.
+ * config/nvptx/nvptx.c (write_as_kernel): Test OpenACC-specific
+ attribute "omp acc target entrypoint". Add a comment about the OpenMP
+ attribute handling.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/mkoffload.c (main): Allow -fopenmp.
diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c
index c8eed451078..e99ef373770 100644
--- a/gcc/config/nvptx/mkoffload.c
+++ b/gcc/config/nvptx/mkoffload.c
@@ -460,6 +460,7 @@ main (int argc, char **argv)
/* Scan the argument vector. */
bool fopenmp = false;
+ bool fopenacc = false;
for (int i = 1; i < argc; i++)
{
#define STR "-foffload-abi="
@@ -476,11 +477,15 @@ main (int argc, char **argv)
#undef STR
else if (strcmp (argv[i], "-fopenmp") == 0)
fopenmp = true;
+ else if (strcmp (argv[i], "-fopenacc") == 0)
+ fopenacc = true;
else if (strcmp (argv[i], "-save-temps") == 0)
save_temps = true;
else if (strcmp (argv[i], "-v") == 0)
verbose = true;
}
+ if (!(fopenacc ^ fopenmp))
+ fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
struct obstack argv_obstack;
obstack_init (&argv_obstack);
@@ -501,6 +506,8 @@ main (int argc, char **argv)
default:
gcc_unreachable ();
}
+ if (fopenmp)
+ obstack_ptr_grow (&argv_obstack, "-mgomp");
for (int ix = 1; ix != argc; ix++)
{
@@ -517,8 +524,8 @@ main (int argc, char **argv)
fatal_error (input_location, "cannot open '%s'", ptx_cfile_name);
/* PR libgomp/65099: Currently, we only support offloading in 64-bit
- configurations. PR target/67822: OpenMP offloading to nvptx fails. */
- if (offload_abi == OFFLOAD_ABI_LP64 && !fopenmp)
+ configurations. */
+ if (offload_abi == OFFLOAD_ABI_LP64)
{
ptx_name = make_temp_file (".mkoffload");
obstack_ptr_grow (&argv_obstack, "-o");
diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index ec4588e6dc0..331ec0af6bf 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -21,6 +21,16 @@
#ifndef GCC_NVPTX_PROTOS_H
#define GCC_NVPTX_PROTOS_H
+/* The kind of shuffe instruction. */
+enum nvptx_shuffle_kind
+{
+ SHUFFLE_UP,
+ SHUFFLE_DOWN,
+ SHUFFLE_BFLY,
+ SHUFFLE_IDX,
+ SHUFFLE_MAX
+};
+
extern void nvptx_declare_function_name (FILE *, const char *, const_tree decl);
extern void nvptx_declare_object_name (FILE *file, const char *name,
const_tree decl);
@@ -36,10 +46,12 @@ extern void nvptx_register_pragmas (void);
extern void nvptx_expand_oacc_fork (unsigned);
extern void nvptx_expand_oacc_join (unsigned);
extern void nvptx_expand_call (rtx, rtx);
+extern rtx nvptx_gen_shuffle (rtx, rtx, rtx, nvptx_shuffle_kind);
extern rtx nvptx_expand_compare (rtx);
extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
extern const char *nvptx_output_mov_insn (rtx, rtx);
extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx);
extern const char *nvptx_output_return (void);
+extern const char *nvptx_output_set_softstack (unsigned);
#endif
#endif
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 782bbdecb37..68bf0a89a81 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -72,16 +72,6 @@
/* This file should be included last. */
#include "target-def.h"
-/* The kind of shuffe instruction. */
-enum nvptx_shuffle_kind
-{
- SHUFFLE_UP,
- SHUFFLE_DOWN,
- SHUFFLE_BFLY,
- SHUFFLE_IDX,
- SHUFFLE_MAX
-};
-
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
@@ -141,6 +131,12 @@ static GTY(()) rtx worker_red_sym;
/* Global lock variable, needed for 128bit worker & gang reductions. */
static GTY(()) tree global_lock_var;
+/* True if any function references __nvptx_stacks. */
+static bool need_softstack_decl;
+
+/* True if any function references __nvptx_uni. */
+static bool need_unisimt_decl;
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
@@ -151,6 +147,16 @@ nvptx_init_machine_status (void)
return p;
}
+/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
+ and -fopenacc is also enabled. */
+
+static void
+diagnose_openacc_conflict (bool optval, const char *optname)
+{
+ if (flag_openacc && optval)
+ error ("option %s is not supported together with -fopenacc", optname);
+}
+
/* Implement TARGET_OPTION_OVERRIDE. */
static void
@@ -188,6 +194,13 @@ nvptx_option_override (void)
worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
+ diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
+ diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
+ diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
+
+ if (TARGET_GOMP)
+ target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
@@ -238,9 +251,17 @@ nvptx_encode_section_info (tree decl, rtx rtl, int first)
if (TREE_CONSTANT (decl))
area = DATA_AREA_CONST;
else if (TREE_CODE (decl) == VAR_DECL)
- /* TODO: This would be a good place to check for a .shared or
- other section name. */
- area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
+ {
+ if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
+ {
+ area = DATA_AREA_SHARED;
+ if (DECL_INITIAL (decl))
+ error ("static initialization of variable %q+D in %<.shared%>"
+ " memory is not supported", decl);
+ }
+ else
+ area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
+ }
SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
}
@@ -718,7 +739,10 @@ static bool
write_as_kernel (tree attrs)
{
return (lookup_attribute ("kernel", attrs) != NULL_TREE
- || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
+ || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
+ && lookup_attribute ("oacc function", attrs) != NULL_TREE));
+ /* For OpenMP target regions, the corresponding kernel entry is emitted from
+ write_omp_entry as a separate function. */
}
/* Emit a linker marker for a function decl or defn. */
@@ -973,6 +997,67 @@ init_frame (FILE *file, int regno, unsigned align, unsigned size)
POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
+/* Emit soft stack frame setup sequence. */
+
+static void
+init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
+{
+ /* Maintain 64-bit stack alignment. */
+ unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+ size = ROUND_UP (size, keep_align);
+ int bits = POINTER_SIZE;
+ const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
+ const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
+ const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
+ const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
+ fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
+ fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
+ fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
+ fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
+ bits == 64 ? ".wide" : ".lo", bits / 8);
+ fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
+
+ /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
+ fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
+
+ /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
+ fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
+ bits, reg_sspprev, reg_sspslot);
+
+ /* Initialize %frame = %sspprev - size. */
+ fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+ bits, reg_frame, reg_sspprev, size);
+
+ /* Apply alignment, if larger than 64. */
+ if (alignment > keep_align)
+ fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
+ bits, reg_frame, reg_frame, -alignment);
+
+ size = crtl->outgoing_args_size;
+ gcc_assert (size % keep_align == 0);
+
+ /* Initialize %stack. */
+ fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+ bits, reg_stack, reg_frame, size);
+
+ /* Usually 'crtl->is_leaf' is computed during register allocator
+ initialization, which is not done on NVPTX. Compute it now. */
+ gcc_assert (!crtl->is_leaf);
+ crtl->is_leaf = leaf_function_p ();
+ if (!crtl->is_leaf)
+ fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
+ bits, reg_sspslot, reg_stack);
+ fprintf (file, "\t}\n");
+ cfun->machine->has_softstack = true;
+ need_softstack_decl = true;
+}
+
/* Emit code to initialize the REGNO predicate register to indicate
whether we are not lane zero on the NAME axis. */
@@ -986,6 +1071,97 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
fprintf (file, "\t}\n");
}
+/* Emit code to initialize predicate and master lane index registers for
+ -muniform-simt code generation variant. */
+
+static void
+nvptx_init_unisimt_predicate (FILE *file)
+{
+ int bits = POINTER_SIZE;
+ int master = REGNO (cfun->machine->unisimt_master);
+ int pred = REGNO (cfun->machine->unisimt_predicate);
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
+ fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
+ fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
+ fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
+ bits == 64 ? ".wide" : ".lo");
+ fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
+ fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
+ fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
+ /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */
+ fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
+ /* Compute predicate as 'tid.x == master'. */
+ fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+ fprintf (file, "\t}\n");
+ need_unisimt_decl = true;
+}
+
+/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
+
+ extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
+ void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
+ {
+ __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
+ __nvptx_uni[tid.y] = 0;
+ gomp_nvptx_main (ORIG, arg);
+ }
+ ORIG itself should not be emitted as a PTX .entry function. */
+
+static void
+write_omp_entry (FILE *file, const char *name, const char *orig)
+{
+ static bool gomp_nvptx_main_declared;
+ if (!gomp_nvptx_main_declared)
+ {
+ gomp_nvptx_main_declared = true;
+ write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
+ func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
+ << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
+ }
+#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
+ (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
+{\n\
+ .reg.u32 %r<3>;\n\
+ .reg.u" PS " %R<4>;\n\
+ mov.u32 %r0, %tid.y;\n\
+ mov.u32 %r1, %ntid.y;\n\
+ mov.u32 %r2, %ctaid.x;\n\
+ cvt.u" PS ".u32 %R1, %r0;\n\
+ " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
+ mov.u" PS " %R0, __nvptx_stacks;\n\
+ " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
+ ld.param.u" PS " %R2, [%stack];\n\
+ ld.param.u" PS " %R3, [%sz];\n\
+ add.u" PS " %R2, %R2, %R3;\n\
+ mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
+ st.shared.u" PS " [%R0], %R2;\n\
+ mov.u" PS " %R0, __nvptx_uni;\n\
+ " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
+ mov.u32 %r0, 0;\n\
+ st.shared.u32 [%R0], %r0;\n\
+ mov.u" PS " %R0, \0;\n\
+ ld.param.u" PS " %R1, [%arg];\n\
+ {\n\
+ .param.u" PS " %P<2>;\n\
+ st.param.u" PS " [%P0], %R0;\n\
+ st.param.u" PS " [%P1], %R1;\n\
+ call.uni gomp_nvptx_main, (%P0, %P1);\n\
+ }\n\
+ ret.uni;\n\
+}\n"
+ static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
+ static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
+#undef ENTRY_TEMPLATE
+ const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
+ /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
+ const char *entry_2 = entry_1 + strlen (entry64) + 1;
+ fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
+ need_softstack_decl = need_unisimt_decl = true;
+}
+
/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
function, including local var decls and copies from the arguments to
local regs. */
@@ -997,6 +1173,14 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
tree result_type = TREE_TYPE (fntype);
int argno = 0;
+ if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
+ && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
+ {
+ char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
+ sprintf (buf, "%s$impl", name);
+ write_omp_entry (file, name, buf);
+ name = buf;
+ }
/* We construct the initial part of the function into a string
stream, in order to share the prototype writing code. */
std::stringstream s;
@@ -1034,19 +1218,24 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
fprintf (file, "%s", s.str().c_str());
- /* Declare a local var for outgoing varargs. */
- if (cfun->machine->has_varadic)
- init_frame (file, STACK_POINTER_REGNUM,
- UNITS_PER_WORD, crtl->outgoing_args_size);
-
- /* Declare a local variable for the frame. Force its size to be
- DImode-compatible. */
HOST_WIDE_INT sz = get_frame_size ();
- if (sz || cfun->machine->has_chain)
- init_frame (file, FRAME_POINTER_REGNUM,
- crtl->stack_alignment_needed / BITS_PER_UNIT,
- (sz + GET_MODE_SIZE (DImode) - 1)
- & ~(HOST_WIDE_INT)(GET_MODE_SIZE (DImode) - 1));
+ bool need_frameptr = sz || cfun->machine->has_chain;
+ int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
+ if (!TARGET_SOFT_STACK)
+ {
+ /* Declare a local var for outgoing varargs. */
+ if (cfun->machine->has_varadic)
+ init_frame (file, STACK_POINTER_REGNUM,
+ UNITS_PER_WORD, crtl->outgoing_args_size);
+
+ /* Declare a local variable for the frame. Force its size to be
+ DImode-compatible. */
+ if (need_frameptr)
+ init_frame (file, FRAME_POINTER_REGNUM, alignment,
+ ROUND_UP (sz, GET_MODE_SIZE (DImode)));
+ }
+ else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+ init_softstack_frame (file, alignment, sz);
/* Declare the pseudos we have as ptx registers. */
int maxregs = max_reg_num ();
@@ -1072,8 +1261,25 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
if (cfun->machine->axis_predicate[1])
nvptx_init_axis_predicate (file,
REGNO (cfun->machine->axis_predicate[1]), "x");
+ if (cfun->machine->unisimt_predicate)
+ nvptx_init_unisimt_predicate (file);
}
+/* Output instruction that sets soft stack pointer in shared memory to the
+ value in register given by SRC_REGNO. */
+
+const char *
+nvptx_output_set_softstack (unsigned src_regno)
+{
+ if (cfun->machine->has_softstack && !crtl->is_leaf)
+ {
+ fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
+ POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
+ output_reg (asm_out_file, src_regno, VOIDmode);
+ fprintf (asm_out_file, ";\n");
+ }
+ return "";
+}
/* Output a return instruction. Also copy the return value to its outgoing
location. */
@@ -1113,6 +1319,8 @@ nvptx_function_ok_for_sibcall (tree, tree)
static rtx
nvptx_get_drap_rtx (void)
{
+ if (TARGET_SOFT_STACK && stack_realign_drap)
+ return arg_pointer_rtx;
return NULL_RTX;
}
@@ -1311,7 +1519,7 @@ nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
/* Generate an instruction or sequence to broadcast register REG
across the vectors of a single warp. */
-static rtx
+rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
rtx res;
@@ -1833,6 +2041,8 @@ nvptx_output_mov_insn (rtx dst, rtx src)
return "%.\tcvt%t0%t1\t%0, %1;";
}
+static void nvptx_print_operand (FILE *, rtx, int);
+
/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
involves writing .param declarations and in/out copies into them. For
indirect calls, also write the .callprototype. */
@@ -1844,6 +2054,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
static int labelno;
bool needs_tgt = register_operand (callee, Pmode);
rtx pat = PATTERN (insn);
+ if (GET_CODE (pat) == COND_EXEC)
+ pat = COND_EXEC_CODE (pat);
int arg_end = XVECLEN (pat, 0);
tree decl = NULL_TREE;
@@ -1888,6 +2100,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
fprintf (asm_out_file, ";\n");
}
+ /* The '.' stands for the call's predicate, if any. */
+ nvptx_print_operand (asm_out_file, NULL_RTX, '.');
fprintf (asm_out_file, "\t\tcall ");
if (result != NULL_RTX)
fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
@@ -1951,8 +2165,6 @@ nvptx_print_operand_punct_valid_p (unsigned char c)
return c == '.' || c== '#';
}
-static void nvptx_print_operand (FILE *, rtx, int);
-
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
static void
@@ -2013,12 +2225,10 @@ nvptx_print_operand (FILE *file, rtx x, int code)
x = current_insn_predicate;
if (x)
{
- unsigned int regno = REGNO (XEXP (x, 0));
- fputs ("[", file);
+ fputs ("@", file);
if (GET_CODE (x) == EQ)
fputs ("!", file);
- fputs (reg_names [regno], file);
- fputs ("]", file);
+ output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
}
return;
}
@@ -2313,6 +2523,89 @@ nvptx_reorg_subreg (void)
}
}
+/* Return a SImode "master lane index" register for uniform-simt, allocating on
+ first use. */
+
+static rtx
+nvptx_get_unisimt_master ()
+{
+ rtx &master = cfun->machine->unisimt_master;
+ return master ? master : master = gen_reg_rtx (SImode);
+}
+
+/* Return a BImode "predicate" register for uniform-simt, similar to above. */
+
+static rtx
+nvptx_get_unisimt_predicate ()
+{
+ rtx &pred = cfun->machine->unisimt_predicate;
+ return pred ? pred : pred = gen_reg_rtx (BImode);
+}
+
+/* Return true if given call insn references one of the functions provided by
+ the CUDA runtime: malloc, free, vprintf. */
+
+static bool
+nvptx_call_insn_is_syscall_p (rtx_insn *insn)
+{
+ rtx pat = PATTERN (insn);
+ gcc_checking_assert (GET_CODE (pat) == PARALLEL);
+ pat = XVECEXP (pat, 0, 0);
+ if (GET_CODE (pat) == SET)
+ pat = SET_SRC (pat);
+ gcc_checking_assert (GET_CODE (pat) == CALL
+ && GET_CODE (XEXP (pat, 0)) == MEM);
+ rtx addr = XEXP (XEXP (pat, 0), 0);
+ if (GET_CODE (addr) != SYMBOL_REF)
+ return false;
+ const char *name = XSTR (addr, 0);
+ /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
+ references with forced assembler name refer to PTX syscalls. For vprintf,
+ accept both normal and forced-assembler-name references. */
+ return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
+ || !strcmp (name, "*malloc")
+ || !strcmp (name, "*free"));
+}
+
+/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
+ propagate its value from lane MASTER to current lane. */
+
+static void
+nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
+{
+ rtx reg;
+ if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
+ emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
+}
+
+/* Adjust code for uniform-simt code generation variant by making atomics and
+ "syscalls" conditionally executed, and inserting shuffle-based propagation
+ for registers being set. */
+
+static void
+nvptx_reorg_uniform_simt ()
+{
+ rtx_insn *insn, *next;
+
+ for (insn = get_insns (); insn; insn = next)
+ {
+ next = NEXT_INSN (insn);
+ if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
+ && !(NONJUMP_INSN_P (insn)
+ && GET_CODE (PATTERN (insn)) == PARALLEL
+ && get_attr_atomic (insn)))
+ continue;
+ rtx pat = PATTERN (insn);
+ rtx master = nvptx_get_unisimt_master ();
+ for (int i = 0; i < XVECLEN (pat, 0); i++)
+ nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
+ rtx pred = nvptx_get_unisimt_predicate ();
+ pred = gen_rtx_NE (BImode, pred, const0_rtx);
+ pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
+ validate_change (insn, &PATTERN (insn), pat, false);
+ }
+}
+
/* Loop structure of the function. The entire function is described as
a NULL loop. */
@@ -3091,11 +3384,17 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
int ix;
/* First clear each BB of the whole function. */
- FOR_ALL_BB_FN (block, cfun)
+ FOR_EACH_BB_FN (block, cfun)
{
block->flags &= ~BB_VISITED;
BB_SET_SESE (block, 0);
}
+ block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+ block->flags &= ~BB_VISITED;
+ BB_SET_SESE (block, 0);
+ block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+ block->flags &= ~BB_VISITED;
+ BB_SET_SESE (block, 0);
/* Mark blocks in the function that are in this graph. */
for (ix = 0; blocks.iterate (ix, &block); ix++)
@@ -3829,6 +4128,9 @@ nvptx_reorg (void)
/* Replace subregs. */
nvptx_reorg_subreg ();
+ if (TARGET_UNIFORM_SIMT)
+ nvptx_reorg_uniform_simt ();
+
regstat_free_n_sets_and_refs ();
df_finish_pass (true);
@@ -3857,12 +4159,36 @@ nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
return NULL_TREE;
}
+/* Handle a "shared" attribute; arguments as in
+ struct attribute_spec.handler. */
+
+static tree
+nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
+ int ARG_UNUSED (flags), bool *no_add_attrs)
+{
+ tree decl = *node;
+
+ if (TREE_CODE (decl) != VAR_DECL)
+ {
+ error ("%qE attribute only applies to variables", name);
+ *no_add_attrs = true;
+ }
+ else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
+ {
+ error ("%qE attribute not allowed with auto storage class", name);
+ *no_add_attrs = true;
+ }
+
+ return NULL_TREE;
+}
+
/* Table of valid machine attributes. */
static const struct attribute_spec nvptx_attribute_table[] =
{
/* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
affects_type_identity } */
{ "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
+ { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
{ NULL, 0, 0, false, false, false, NULL, false }
};
@@ -3924,13 +4250,13 @@ nvptx_record_offload_symbol (tree decl)
case FUNCTION_DECL:
{
tree attr = get_oacc_fn_attrib (decl);
- tree dims = TREE_VALUE (attr);
- unsigned ix;
+ /* OpenMP offloading does not set this attribute. */
+ tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
- for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+ for (; dims; dims = TREE_CHAIN (dims))
{
int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
@@ -3991,6 +4317,21 @@ nvptx_file_end (void)
if (worker_red_size)
write_worker_buffer (asm_out_file, worker_red_sym,
worker_red_align, worker_red_size);
+
+ if (need_softstack_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
+ /* 32 is the maximum number of warps in a block. Even though it's an
+ external declaration, emit the array size explicitly; otherwise, it
+ may fail at PTX JIT time if the definition is later in link order. */
+ fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
+ POINTER_SIZE);
+ }
+ if (need_unisimt_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_uni");
+ fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
+ }
}
/* Expander for the shuffle builtins. */
@@ -4176,6 +4517,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
+/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
+
+static int
+nvptx_simt_vf ()
+{
+ return PTX_VECTOR_LENGTH;
+}
+
/* Validate compute dimensions of an OpenACC offload or routine, fill
in non-unity defaults. FN_LEVEL indicates the level at which a
routine might spawn a loop. It is negative for non-routines. If
@@ -4944,6 +5293,9 @@ nvptx_goacc_reduction (gcall *call)
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl
+#undef TARGET_SIMT_VF
+#define TARGET_SIMT_VF nvptx_simt_vf
+
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 381269e3bcc..1702178eeb9 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -31,6 +31,10 @@
builtin_assert ("machine=nvptx"); \
builtin_assert ("cpu=nvptx"); \
builtin_define ("__nvptx__"); \
+ if (TARGET_SOFT_STACK) \
+ builtin_define ("__nvptx_softstack__"); \
+ if (TARGET_UNIFORM_SIMT) \
+ builtin_define ("__nvptx_unisimt__"); \
} while (0)
/* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
@@ -79,13 +83,14 @@
#define POINTER_SIZE (TARGET_ABI64 ? 64 : 32)
#define Pmode (TARGET_ABI64 ? DImode : SImode)
+#define STACK_SIZE_MODE Pmode
/* Registers. Since ptx is a virtual target, we just define a few
hard registers for special purposes and leave pseudos unallocated.
We have to have some available hard registers, to keep gcc setup
happy. */
#define FIRST_PSEUDO_REGISTER 16
-#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
#define CALL_USED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
#define HARD_REGNO_NREGS(REG, MODE) \
@@ -133,10 +138,17 @@ enum reg_class { NO_REGS, ALL_REGS, LIM_REG_CLASSES };
#define FRAME_POINTER_REGNUM 2
#define ARG_POINTER_REGNUM 3
#define STATIC_CHAIN_REGNUM 4
+/* This register points to the shared memory location with the current warp's
+ soft stack pointer (__nvptx_stacks[tid.y]). */
+#define SOFTSTACK_SLOT_REGNUM 5
+/* This register is used to save the previous value of the soft stack pointer
+ in the prologue and restore it when returning. */
+#define SOFTSTACK_PREV_REGNUM 6
#define REGISTER_NAMES \
{ \
- "%value", "%stack", "%frame", "%args", "%chain", "%hr5", "%hr6", "%hr7", \
+ "%value", "%stack", "%frame", "%args", \
+ "%chain", "%sspslot", "%sspprev", "%hr7", \
"%hr8", "%hr9", "%hr10", "%hr11", "%hr12", "%hr13", "%hr14", "%hr15" \
}
@@ -200,10 +212,13 @@ struct GTY(()) machine_function
bool is_varadic; /* This call is varadic */
bool has_varadic; /* Current function has a varadic call. */
bool has_chain; /* Current function has outgoing static chain. */
+ bool has_softstack; /* Current function has a soft stack frame. */
int num_args; /* Number of args of current call. */
int return_mode; /* Return mode of current fn.
(machine_mode not defined yet.) */
rtx axis_predicate[2]; /* Neutering predicates. */
+ rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */
+ rtx unisimt_predicate; /* Predicate for -muniform-simt. */
};
#endif
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index d117343c531..91d11290860 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -36,10 +36,16 @@
UNSPEC_ALLOCA
+ UNSPEC_SET_SOFTSTACK
+
UNSPEC_DIM_SIZE
UNSPEC_BIT_CONV
+ UNSPEC_VOTE_BALLOT
+
+ UNSPEC_LANEID
+
UNSPEC_SHUFFLE
UNSPEC_BR_UNIFIED
])
@@ -55,11 +61,16 @@
UNSPECV_FORKED
UNSPECV_JOINING
UNSPECV_JOIN
+
+ UNSPECV_NOUNROLL
])
(define_attr "subregs_ok" "false,true"
(const_string "false"))
+(define_attr "atomic" "false,true"
+ (const_string "false"))
+
;; The nvptx operand predicates, in general, don't permit subregs and
;; only literal constants, which differ from the generic ones, which
;; permit subregs and symbolc constants (as appropriate)
@@ -124,6 +135,17 @@
return true;
})
+(define_attr "predicable" "false,true"
+ (const_string "true"))
+
+(define_cond_exec
+ [(match_operator 0 "predicate_operator"
+ [(match_operand:BI 1 "nvptx_register_operand" "")
+ (match_operand:BI 2 "const0_operand" "")])]
+ ""
+ ""
+ )
+
(define_constraint "P0"
"An integer with the value 0."
(and (match_code "const_int")
@@ -509,7 +531,8 @@
(label_ref (match_operand 1 "" ""))
(pc)))]
""
- "%j0\\tbra\\t%l1;")
+ "%j0\\tbra\\t%l1;"
+ [(set_attr "predicable" "false")])
(define_insn "br_false"
[(set (pc)
@@ -518,7 +541,8 @@
(label_ref (match_operand 1 "" ""))
(pc)))]
""
- "%J0\\tbra\\t%l1;")
+ "%J0\\tbra\\t%l1;"
+ [(set_attr "predicable" "false")])
;; unified conditional branch
(define_insn "br_true_uni"
@@ -527,7 +551,8 @@
UNSPEC_BR_UNIFIED) (const_int 0))
(label_ref (match_operand 1 "" "")) (pc)))]
""
- "%j0\\tbra.uni\\t%l1;")
+ "%j0\\tbra.uni\\t%l1;"
+ [(set_attr "predicable" "false")])
(define_insn "br_false_uni"
[(set (pc) (if_then_else
@@ -535,7 +560,8 @@
UNSPEC_BR_UNIFIED) (const_int 0))
(label_ref (match_operand 1 "" "")) (pc)))]
""
- "%J0\\tbra.uni\\t%l1;")
+ "%J0\\tbra.uni\\t%l1;"
+ [(set_attr "predicable" "false")])
(define_expand "cbranch<mode>4"
[(set (pc)
@@ -938,12 +964,16 @@
""
{
return nvptx_output_return ();
-})
+}
+ [(set_attr "predicable" "false")])
(define_expand "epilogue"
[(clobber (const_int 0))]
""
{
+ if (TARGET_SOFT_STACK)
+ emit_insn (gen_set_softstack_insn (gen_rtx_REG (Pmode,
+ SOFTSTACK_PREV_REGNUM)));
emit_jump_insn (gen_return ());
DONE;
})
@@ -972,31 +1002,40 @@
(match_operand 1 "nvptx_register_operand")]
""
{
+ if (TARGET_SOFT_STACK)
+ {
+ emit_move_insn (stack_pointer_rtx,
+ gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
+ emit_insn (gen_set_softstack_insn (stack_pointer_rtx));
+ emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
+ DONE;
+ }
/* The ptx documentation specifies an alloca intrinsic (for 32 bit
only) but notes it is not implemented. The assembler emits a
confused error message. Issue a blunt one now instead. */
sorry ("target cannot support alloca.");
emit_insn (gen_nop ());
DONE;
- if (TARGET_ABI64)
- emit_insn (gen_allocate_stack_di (operands[0], operands[1]));
- else
- emit_insn (gen_allocate_stack_si (operands[0], operands[1]));
- DONE;
})
-(define_insn "allocate_stack_<mode>"
- [(set (match_operand:P 0 "nvptx_register_operand" "=R")
- (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
- UNSPEC_ALLOCA))]
- ""
- "%.\\tcall (%0), %%alloca, (%1);")
+(define_insn "set_softstack_insn"
+ [(unspec [(match_operand 0 "nvptx_register_operand" "R")]
+ UNSPEC_SET_SOFTSTACK)]
+ "TARGET_SOFT_STACK"
+{
+ return nvptx_output_set_softstack (REGNO (operands[0]));
+})
(define_expand "restore_stack_block"
[(match_operand 0 "register_operand" "")
(match_operand 1 "register_operand" "")]
""
{
+ if (TARGET_SOFT_STACK)
+ {
+ emit_move_insn (operands[0], operands[1]);
+ emit_insn (gen_set_softstack_insn (operands[0]));
+ }
DONE;
})
@@ -1018,14 +1057,16 @@
(const_int 0))
(const_int 0))]
""
- "%j0 trap;")
+ "%j0 trap;"
+ [(set_attr "predicable" "false")])
(define_insn "trap_if_false"
[(trap_if (eq (match_operand:BI 0 "nvptx_register_operand" "R")
(const_int 0))
(const_int 0))]
""
- "%J0 trap;")
+ "%J0 trap;"
+ [(set_attr "predicable" "false")])
(define_expand "ctrap<mode>4"
[(trap_if (match_operator 0 "nvptx_comparison_operator"
@@ -1074,28 +1115,28 @@
UNSPECV_FORK)]
""
"// fork %0;"
-)
+ [(set_attr "predicable" "false")])
(define_insn "nvptx_forked"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_FORKED)]
""
"// forked %0;"
-)
+ [(set_attr "predicable" "false")])
(define_insn "nvptx_joining"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_JOINING)]
""
"// joining %0;"
-)
+ [(set_attr "predicable" "false")])
(define_insn "nvptx_join"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_JOIN)]
""
"// join %0;"
-)
+ [(set_attr "predicable" "false")])
(define_expand "oacc_fork"
[(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
@@ -1134,6 +1175,88 @@
""
"%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
+(define_insn "nvptx_vote_ballot"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+ (unspec:SI [(match_operand:BI 1 "nvptx_register_operand" "R")]
+ UNSPEC_VOTE_BALLOT))]
+ ""
+ "%.\\tvote.ballot.b32\\t%0, %1;")
+
+;; Patterns for OpenMP SIMD-via-SIMT lowering
+
+;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index
+(define_insn "omp_simt_lane"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "")
+ (unspec:SI [(const_int 0)] UNSPEC_LANEID))]
+ ""
+ "%.\\tmov.u32\\t%0, %%laneid;")
+
+;; Implement IFN_GOMP_SIMT_ORDERED: copy operand 1 to operand 0 and
+;; place a compiler barrier to disallow unrolling/peeling the containing loop
+(define_expand "omp_simt_ordered"
+ [(match_operand:SI 0 "nvptx_register_operand" "=R")
+ (match_operand:SI 1 "nvptx_register_operand" "R")]
+ ""
+{
+ emit_move_insn (operands[0], operands[1]);
+ emit_insn (gen_nvptx_nounroll ());
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_XCHG_BFLY: perform a "butterfly" exchange
+;; across lanes
+(define_expand "omp_simt_xchg_bfly"
+ [(match_operand 0 "nvptx_register_operand" "=R")
+ (match_operand 1 "nvptx_register_operand" "R")
+ (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")]
+ ""
+{
+ emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2],
+ SHUFFLE_BFLY));
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_XCHG_IDX: broadcast value in operand 1
+;; from lane given by index in operand 2 to operand 0 in all lanes
+(define_expand "omp_simt_xchg_idx"
+ [(match_operand 0 "nvptx_register_operand" "=R")
+ (match_operand 1 "nvptx_register_operand" "R")
+ (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")]
+ ""
+{
+ emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2],
+ SHUFFLE_IDX));
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_VOTE_ANY:
+;; set operand 0 to zero iff all lanes supply zero in operand 1
+(define_expand "omp_simt_vote_any"
+ [(match_operand:SI 0 "nvptx_register_operand" "=R")
+ (match_operand:SI 1 "nvptx_register_operand" "R")]
+ ""
+{
+ rtx pred = gen_reg_rtx (BImode);
+ emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx));
+ emit_insn (gen_nvptx_vote_ballot (operands[0], pred));
+ DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_LAST_LANE:
+;; set operand 0 to the lowest lane index that passed non-zero in operand 1
+(define_expand "omp_simt_last_lane"
+ [(match_operand:SI 0 "nvptx_register_operand" "=R")
+ (match_operand:SI 1 "nvptx_register_operand" "R")]
+ ""
+{
+ rtx pred = gen_reg_rtx (BImode);
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx));
+ emit_insn (gen_nvptx_vote_ballot (tmp, pred));
+ emit_insn (gen_ctzsi2 (operands[0], tmp));
+ DONE;
+})
+
;; extract parts of a 64 bit object into 2 32-bit ints
(define_insn "unpack<mode>si2"
[(set (match_operand:SI 0 "nvptx_register_operand" "=R")
@@ -1186,7 +1309,8 @@
(set (match_dup 1)
(unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))]
""
- "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;")
+ "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_exchange<mode>"
[(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output
@@ -1197,7 +1321,8 @@
(set (match_dup 1)
(match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input
""
- "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;")
+ "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_fetch_add<mode>"
[(set (match_operand:SDIM 1 "memory_operand" "+m")
@@ -1209,7 +1334,8 @@
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
(match_dup 1))]
""
- "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+ "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_fetch_addsf"
[(set (match_operand:SF 1 "memory_operand" "+m")
@@ -1221,7 +1347,8 @@
(set (match_operand:SF 0 "nvptx_register_operand" "=R")
(match_dup 1))]
""
- "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+ "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_code_iterator any_logic [and ior xor])
(define_code_attr logic [(and "and") (ior "or") (xor "xor")])
@@ -1237,10 +1364,18 @@
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
(match_dup 1))]
"0"
- "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
+ "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "nvptx_barsync"
[(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_BARSYNC)]
""
- "\\tbar.sync\\t%0;")
+ "\\tbar.sync\\t%0;"
+ [(set_attr "predicable" "false")])
+
+(define_insn "nvptx_nounroll"
+ [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)]
+ ""
+ "\\t.pragma \\\"nounroll\\\";"
+ [(set_attr "predicable" "false")])
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 601cf124e36..cb6194da9dc 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -32,3 +32,15 @@ Link in code for a __main kernel.
moptimize
Target Report Var(nvptx_optimize) Init(-1)
Optimize partition neutering.
+
+msoft-stack
+Target Report Mask(SOFT_STACK)
+Use custom stacks instead of local memory for automatic storage.
+
+muniform-simt
+Target Report Mask(UNIFORM_SIMT)
+Generate code that can keep local state uniform across all lanes.
+
+mgomp
+Target Report Mask(GOMP)
+Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt.
diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx
index e2580c956e0..6c1010ddd66 100644
--- a/gcc/config/nvptx/t-nvptx
+++ b/gcc/config/nvptx/t-nvptx
@@ -8,3 +8,5 @@ ALL_HOST_OBJS += mkoffload.o
mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBDEPS)
+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \
mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS)
+
+MULTILIB_OPTIONS = mgomp
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 0669f7999be..4dcc7f6b555 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -5576,6 +5576,7 @@ attributes.
* MeP Variable Attributes::
* Microsoft Windows Variable Attributes::
* MSP430 Variable Attributes::
+* Nvidia PTX Variable Attributes::
* PowerPC Variable Attributes::
* RL78 Variable Attributes::
* SPU Variable Attributes::
@@ -6257,6 +6258,20 @@ same name (@pxref{MSP430 Function Attributes}).
These attributes can be applied to both functions and variables.
@end table
+@node Nvidia PTX Variable Attributes
+@subsection Nvidia PTX Variable Attributes
+
+These variable attributes are supported by the Nvidia PTX back end:
+
+@table @code
+@item shared
+@cindex @code{shared} attribute, Nvidia PTX
+Use this attribute to place a variable in the @code{.shared} memory space.
+This memory space is private to each cooperative thread array; only threads
+within one thread block refer to the same instance of the variable.
+The runtime does not initialize variables in this memory space.
+@end table
+
@node PowerPC Variable Attributes
@subsection PowerPC Variable Attributes
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 17c5c225d0c..f06e2b8206d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -20462,6 +20462,37 @@ offloading execution.
Apply partitioned execution optimizations. This is the default when any
level of optimization is selected.
+@item -msoft-stack
+@opindex msoft-stack
+Generate code that does not use @code{.local} memory
+directly for stack storage. Instead, a per-warp stack pointer is
+maintained explicitly. This enables variable-length stack allocation (with
+variable-length arrays or @code{alloca}), and when global memory is used for
+underlying storage, makes it possible to access automatic variables from other
+threads, or with atomic instructions. This code generation variant is used
+for OpenMP offloading, but the option is exposed on its own for the purpose
+of testing the compiler; to generate code suitable for linking into programs
+using OpenMP offloading, use option @option{-mgomp}.
+
+@item -muniform-simt
+@opindex muniform-simt
+Switch to a code generation variant that allows executing all threads in each
+warp, while maintaining memory state and side effects as if only one thread
+in each warp was active outside of OpenMP SIMD regions. All atomic operations
+and calls to runtime (malloc, free, vprintf) are conditionally executed (iff
+current lane index equals the master lane index), and the register being
+assigned is copied via a shuffle instruction from the master lane. Outside of
+SIMD regions lane 0 is the master; inside, each thread sees itself as the
+master. Shared memory array @code{int __nvptx_uni[]} stores all-zeros or
+all-ones bitmasks for each warp, indicating current mode (0 outside of SIMD
+regions). Each thread can bitwise-and the bitmask at position @code{tid.y}
+with current lane index to compute the master lane index.
+
+@item -mgomp
+@opindex mgomp
+Generate code for use in OpenMP offloading: enables @option{-msoft-stack} and
+@option{-muniform-simt} options, and selects corresponding multilib variant.
+
@end table
@node PDP-11 Options
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 81c63b71bff..10122439c30 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5858,6 +5858,10 @@ usable. In that case, the smaller the number is, the more desirable it is
to use it.
@end deftypefn
+@deftypefn {Target Hook} int TARGET_SIMT_VF (void)
+Return number of threads in SIMT thread group on the target.
+@end deftypefn
+
@deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level})
This hook should check the launch dimensions provided for an OpenACC
compute region, or routine. Defaulted values are represented as -1
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 74e9200f749..0a7eca32639 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4293,6 +4293,8 @@ address; but often a machine-dependent strategy can generate better code.
@hook TARGET_SIMD_CLONE_USABLE
+@hook TARGET_SIMT_VF
+
@hook TARGET_GOACC_VALIDATE_DIMS
@hook TARGET_GOACC_DIM_LIMIT
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index cbee97ea82c..fd1cd8b6445 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -157,6 +157,132 @@ expand_ANNOTATE (internal_fn, gcall *)
gcc_unreachable ();
}
+/* Lane index on SIMT targets: thread index in the warp on NVPTX. On targets
+ without SIMT execution this should be expanded in omp_device_lower pass. */
+
+static void
+expand_GOMP_SIMT_LANE (internal_fn, gcall *stmt)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ if (!lhs)
+ return;
+
+ rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ gcc_assert (targetm.have_omp_simt_lane ());
+ emit_insn (targetm.gen_omp_simt_lane (target));
+}
+
+/* This should get expanded in omp_device_lower pass. */
+
+static void
+expand_GOMP_SIMT_VF (internal_fn, gcall *)
+{
+ gcc_unreachable ();
+}
+
+/* Lane index of the first SIMT lane that supplies a non-zero argument.
+ This is a SIMT counterpart to GOMP_SIMD_LAST_LANE, used to represent the
+ lane that executed the last iteration for handling OpenMP lastprivate. */
+
+static void
+expand_GOMP_SIMT_LAST_LANE (internal_fn, gcall *stmt)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ if (!lhs)
+ return;
+
+ rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ rtx cond = expand_normal (gimple_call_arg (stmt, 0));
+ machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+ struct expand_operand ops[2];
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], cond, mode);
+ gcc_assert (targetm.have_omp_simt_last_lane ());
+ expand_insn (targetm.code_for_omp_simt_last_lane, 2, ops);
+}
+
+/* Non-transparent predicate used in SIMT lowering of OpenMP "ordered". */
+
+static void
+expand_GOMP_SIMT_ORDERED_PRED (internal_fn, gcall *stmt)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ if (!lhs)
+ return;
+
+ rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ rtx ctr = expand_normal (gimple_call_arg (stmt, 0));
+ machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+ struct expand_operand ops[2];
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], ctr, mode);
+ gcc_assert (targetm.have_omp_simt_ordered ());
+ expand_insn (targetm.code_for_omp_simt_ordered, 2, ops);
+}
+
+/* "Or" boolean reduction across SIMT lanes: return non-zero in all lanes if
+ any lane supplies a non-zero argument. */
+
+static void
+expand_GOMP_SIMT_VOTE_ANY (internal_fn, gcall *stmt)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ if (!lhs)
+ return;
+
+ rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ rtx cond = expand_normal (gimple_call_arg (stmt, 0));
+ machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+ struct expand_operand ops[2];
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], cond, mode);
+ gcc_assert (targetm.have_omp_simt_vote_any ());
+ expand_insn (targetm.code_for_omp_simt_vote_any, 2, ops);
+}
+
+/* Exchange between SIMT lanes with a "butterfly" pattern: source lane index
+ is destination lane index XOR given offset. */
+
+static void
+expand_GOMP_SIMT_XCHG_BFLY (internal_fn, gcall *stmt)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ if (!lhs)
+ return;
+
+ rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ rtx src = expand_normal (gimple_call_arg (stmt, 0));
+ rtx idx = expand_normal (gimple_call_arg (stmt, 1));
+ machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+ struct expand_operand ops[3];
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], src, mode);
+ create_input_operand (&ops[2], idx, SImode);
+ gcc_assert (targetm.have_omp_simt_xchg_bfly ());
+ expand_insn (targetm.code_for_omp_simt_xchg_bfly, 3, ops);
+}
+
+/* Exchange between SIMT lanes according to given source lane index. */
+
+static void
+expand_GOMP_SIMT_XCHG_IDX (internal_fn, gcall *stmt)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ if (!lhs)
+ return;
+
+ rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ rtx src = expand_normal (gimple_call_arg (stmt, 0));
+ rtx idx = expand_normal (gimple_call_arg (stmt, 1));
+ machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+ struct expand_operand ops[3];
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], src, mode);
+ create_input_operand (&ops[2], idx, SImode);
+ gcc_assert (targetm.have_omp_simt_xchg_idx ());
+ expand_insn (targetm.code_for_omp_simt_xchg_idx, 3, ops);
+}
+
/* This should get expanded in adjust_simduid_builtins. */
static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 0869b2fd52d..77ce63a6408 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -141,6 +141,13 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST, ffs, unary)
DEF_INTERNAL_INT_FN (PARITY, ECF_CONST, parity, unary)
DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST, popcount, unary)
+DEF_INTERNAL_FN (GOMP_SIMT_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_LAST_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_ORDERED_PRED, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_VOTE_ANY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_XCHG_BFLY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_XCHG_IDX, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index e5b9e4c1091..da5476bc99f 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -2427,6 +2427,20 @@ cilk_for_check_loop_diff_type (tree type)
}
}
+/* Return true if CTX may belong to offloaded code: either if current function
+ is offloaded, or any enclosing context corresponds to a target region. */
+
+static bool
+omp_maybe_offloaded_ctx (omp_context *ctx)
+{
+ if (cgraph_node::get (current_function_decl)->offloadable)
+ return true;
+ for (; ctx; ctx = ctx->outer)
+ if (is_gimple_omp_offloaded (ctx->stmt))
+ return true;
+ return false;
+}
+
/* Build a decl for the omp child function. It'll not contain a body
yet, just the bare decl. */
@@ -2475,28 +2489,24 @@ create_omp_child_function (omp_context *ctx, bool task_copy)
DECL_CONTEXT (decl) = NULL_TREE;
DECL_INITIAL (decl) = make_node (BLOCK);
BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;
- if (cgraph_node::get (current_function_decl)->offloadable)
- cgraph_node::get_create (decl)->offloadable = 1;
- else
+ if (omp_maybe_offloaded_ctx (ctx))
{
- omp_context *octx;
- for (octx = ctx; octx; octx = octx->outer)
- if (is_gimple_omp_offloaded (octx->stmt))
- {
- cgraph_node::get_create (decl)->offloadable = 1;
- if (ENABLE_OFFLOADING)
- g->have_offload = true;
-
- break;
- }
+ cgraph_node::get_create (decl)->offloadable = 1;
+ if (ENABLE_OFFLOADING)
+ g->have_offload = true;
}
if (cgraph_node::get_create (decl)->offloadable
&& !lookup_attribute ("omp declare target",
DECL_ATTRIBUTES (current_function_decl)))
- DECL_ATTRIBUTES (decl)
- = tree_cons (get_identifier ("omp target entrypoint"),
- NULL_TREE, DECL_ATTRIBUTES (decl));
+ {
+ const char *target_attr = (is_gimple_omp_offloaded (ctx->stmt)
+ ? "omp target entrypoint"
+ : "omp declare target");
+ DECL_ATTRIBUTES (decl)
+ = tree_cons (get_identifier (target_attr),
+ NULL_TREE, DECL_ATTRIBUTES (decl));
+ }
t = build_decl (DECL_SOURCE_LOCATION (decl),
RESULT_DECL, NULL_TREE, void_type_node);
@@ -4264,6 +4274,25 @@ omp_clause_aligned_alignment (tree clause)
return build_int_cst (integer_type_node, al);
}
+
+/* Return maximum SIMT width if offloading may target SIMT hardware. */
+
+static int
+omp_max_simt_vf (void)
+{
+ if (!optimize)
+ return 0;
+ if (ENABLE_OFFLOADING)
+ for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c; )
+ {
+ if (!strncmp (c, "nvptx", strlen ("nvptx")))
+ return 32;
+ else if ((c = strchr (c, ',')))
+ c++;
+ }
+ return 0;
+}
+
/* Return maximum possible vectorization factor for the target. */
static int
@@ -4277,16 +4306,18 @@ omp_max_vf (void)
|| global_options_set.x_flag_tree_vectorize)))
return 1;
+ int vf = 1;
int vs = targetm.vectorize.autovectorize_vector_sizes ();
if (vs)
+ vf = 1 << floor_log2 (vs);
+ else
{
- vs = 1 << floor_log2 (vs);
- return vs;
+ machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode);
+ if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT)
+ vf = GET_MODE_NUNITS (vqimode);
}
- machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode);
- if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT)
- return GET_MODE_NUNITS (vqimode);
- return 1;
+ int svf = omp_max_simt_vf ();
+ return MAX (vf, svf);
}
/* Helper function of lower_rec_input_clauses, used for #pragma omp simd
@@ -4374,10 +4405,13 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist,
int pass;
bool is_simd = (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
&& gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD);
+ bool maybe_simt
+ = is_simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1;
int max_vf = 0;
tree lane = NULL_TREE, idx = NULL_TREE;
+ tree simt_lane = NULL_TREE;
tree ivar = NULL_TREE, lvar = NULL_TREE;
- gimple_seq llist[2] = { NULL, NULL };
+ gimple_seq llist[3] = { };
copyin_seq = NULL;
@@ -5251,6 +5285,16 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist,
gimplify_assign (unshare_expr (ivar), x, &llist[0]);
+ if (maybe_simt)
+ {
+ if (!simt_lane)
+ simt_lane = create_tmp_var (unsigned_type_node);
+ x = build_call_expr_internal_loc
+ (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_BFLY,
+ TREE_TYPE (ivar), 2, ivar, simt_lane);
+ x = build2 (code, TREE_TYPE (ivar), ivar, x);
+ gimplify_assign (ivar, x, &llist[2]);
+ }
x = build2 (code, TREE_TYPE (ref), ref, ivar);
ref = build_outer_var_ref (var, ctx);
gimplify_assign (ref, x, &llist[1]);
@@ -5303,6 +5347,39 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist,
g = gimple_build_assign (lane, INTEGER_CST,
build_int_cst (unsigned_type_node, 0));
gimple_seq_add_stmt (ilist, g);
+ /* Emit reductions across SIMT lanes in log_2(simt_vf) steps. */
+ if (llist[2])
+ {
+ tree simt_vf = create_tmp_var (unsigned_type_node);
+ g = gimple_build_call_internal (IFN_GOMP_SIMT_VF, 0);
+ gimple_call_set_lhs (g, simt_vf);
+ gimple_seq_add_stmt (dlist, g);
+
+ tree t = build_int_cst (unsigned_type_node, 1);
+ g = gimple_build_assign (simt_lane, INTEGER_CST, t);
+ gimple_seq_add_stmt (dlist, g);
+
+ t = build_int_cst (unsigned_type_node, 0);
+ g = gimple_build_assign (idx, INTEGER_CST, t);
+ gimple_seq_add_stmt (dlist, g);
+
+ tree body = create_artificial_label (UNKNOWN_LOCATION);
+ tree header = create_artificial_label (UNKNOWN_LOCATION);
+ tree end = create_artificial_label (UNKNOWN_LOCATION);
+ gimple_seq_add_stmt (dlist, gimple_build_goto (header));
+ gimple_seq_add_stmt (dlist, gimple_build_label (body));
+
+ gimple_seq_add_seq (dlist, llist[2]);
+
+ g = gimple_build_assign (simt_lane, LSHIFT_EXPR, simt_lane, integer_one_node);
+ gimple_seq_add_stmt (dlist, g);
+
+ gimple_seq_add_stmt (dlist, gimple_build_label (header));
+ g = gimple_build_cond (LT_EXPR, simt_lane, simt_vf, body, end);
+ gimple_seq_add_stmt (dlist, g);
+
+ gimple_seq_add_stmt (dlist, gimple_build_label (end));
+ }
for (int i = 0; i < 2; i++)
if (llist[i])
{
@@ -5389,7 +5466,7 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
{
tree x, c, label = NULL, orig_clauses = clauses;
bool par_clauses = false;
- tree simduid = NULL, lastlane = NULL;
+ tree simduid = NULL, lastlane = NULL, simtcond = NULL, simtlast = NULL;
/* Early exit if there are no lastprivate or linear clauses. */
for (; clauses ; clauses = OMP_CLAUSE_CHAIN (clauses))
@@ -5416,6 +5493,16 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
par_clauses = true;
}
+ bool maybe_simt = false;
+ if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
+ && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD)
+ {
+ maybe_simt = omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1;
+ simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_);
+ if (simduid)
+ simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
+ }
+
if (predicate)
{
gcond *stmt;
@@ -5427,20 +5514,27 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
arm2 = TREE_OPERAND (predicate, 1);
gimplify_expr (&arm1, stmt_list, NULL, is_gimple_val, fb_rvalue);
gimplify_expr (&arm2, stmt_list, NULL, is_gimple_val, fb_rvalue);
- stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2,
- label_true, label);
+ if (maybe_simt)
+ {
+ c = build2 (TREE_CODE (predicate), boolean_type_node, arm1, arm2);
+ c = fold_convert (integer_type_node, c);
+ simtcond = create_tmp_var (integer_type_node);
+ gimplify_assign (simtcond, c, stmt_list);
+ gcall *g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY,
+ 1, simtcond);
+ c = create_tmp_var (integer_type_node);
+ gimple_call_set_lhs (g, c);
+ gimple_seq_add_stmt (stmt_list, g);
+ stmt = gimple_build_cond (NE_EXPR, c, integer_zero_node,
+ label_true, label);
+ }
+ else
+ stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2,
+ label_true, label);
gimple_seq_add_stmt (stmt_list, stmt);
gimple_seq_add_stmt (stmt_list, gimple_build_label (label_true));
}
- if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
- && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD)
- {
- simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_);
- if (simduid)
- simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
- }
-
for (c = clauses; c ;)
{
tree var, new_var;
@@ -5491,6 +5585,24 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
new_var = build4 (ARRAY_REF, TREE_TYPE (val),
TREE_OPERAND (val, 0), lastlane,
NULL_TREE, NULL_TREE);
+ if (maybe_simt)
+ {
+ gcall *g;
+ if (simtlast == NULL)
+ {
+ simtlast = create_tmp_var (unsigned_type_node);
+ g = gimple_build_call_internal
+ (IFN_GOMP_SIMT_LAST_LANE, 1, simtcond);
+ gimple_call_set_lhs (g, simtlast);
+ gimple_seq_add_stmt (stmt_list, g);
+ }
+ x = build_call_expr_internal_loc
+ (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_IDX,
+ TREE_TYPE (new_var), 2, new_var, simtlast);
+ new_var = unshare_expr (new_var);
+ gimplify_assign (new_var, x, stmt_list);
+ new_var = unshare_expr (new_var);
+ }
}
}
@@ -10498,12 +10610,23 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
edge e, ne;
tree *counts = NULL;
int i;
+ int safelen_int = INT_MAX;
tree safelen = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
OMP_CLAUSE_SAFELEN);
tree simduid = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
OMP_CLAUSE__SIMDUID_);
tree n1, n2;
+ if (safelen)
+ {
+ safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen);
+ if (TREE_CODE (safelen) != INTEGER_CST)
+ safelen_int = 0;
+ else if (tree_fits_uhwi_p (safelen) && tree_to_uhwi (safelen) < INT_MAX)
+ safelen_int = tree_to_uhwi (safelen);
+ if (safelen_int == 1)
+ safelen_int = 0;
+ }
type = TREE_TYPE (fd->loop.v);
entry_bb = region->entry;
cont_bb = region->cont;
@@ -10557,20 +10680,53 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
OMP_CLAUSE__LOOPTEMP_);
gcc_assert (innerc);
n2 = OMP_CLAUSE_DECL (innerc);
- expand_omp_build_assign (&gsi, fd->loop.v,
- fold_convert (type, n1));
+ }
+ tree step = fd->loop.step;
+
+ bool offloaded = cgraph_node::get (current_function_decl)->offloadable;
+ for (struct omp_region *rgn = region; !offloaded && rgn; rgn = rgn->outer)
+ offloaded = rgn->type == GIMPLE_OMP_TARGET;
+ bool is_simt = offloaded && omp_max_simt_vf () > 1 && safelen_int > 1;
+ tree simt_lane = NULL_TREE, simt_maxlane = NULL_TREE;
+ if (is_simt)
+ {
+ cfun->curr_properties &= ~PROP_gimple_lomp_dev;
+ simt_lane = create_tmp_var (unsigned_type_node);
+ gimple *g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0);
+ gimple_call_set_lhs (g, simt_lane);
+ gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+ tree offset = fold_build2 (MULT_EXPR, TREE_TYPE (step), step,
+ fold_convert (TREE_TYPE (step), simt_lane));
+ n1 = fold_convert (type, n1);
+ if (POINTER_TYPE_P (type))
+ n1 = fold_build_pointer_plus (n1, offset);
+ else
+ n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, offset));
+
+ /* Collapsed loops not handled for SIMT yet: limit to one lane only. */
if (fd->collapse > 1)
+ simt_maxlane = build_one_cst (unsigned_type_node);
+ else if (safelen_int < omp_max_simt_vf ())
+ simt_maxlane = build_int_cst (unsigned_type_node, safelen_int);
+ tree vf
+ = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_VF,
+ unsigned_type_node, 0);
+ if (simt_maxlane)
+ vf = fold_build2 (MIN_EXPR, unsigned_type_node, vf, simt_maxlane);
+ vf = fold_convert (TREE_TYPE (step), vf);
+ step = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, vf);
+ }
+
+ expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
+ if (fd->collapse > 1)
+ {
+ if (gimple_omp_for_combined_into_p (fd->for_stmt))
{
gsi_prev (&gsi);
expand_omp_for_init_vars (fd, &gsi, counts, NULL, n1);
gsi_next (&gsi);
}
- }
- else
- {
- expand_omp_build_assign (&gsi, fd->loop.v,
- fold_convert (type, fd->loop.n1));
- if (fd->collapse > 1)
+ else
for (i = 0; i < fd->collapse; i++)
{
tree itype = TREE_TYPE (fd->loops[i].v);
@@ -10579,7 +10735,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
t = fold_convert (TREE_TYPE (fd->loops[i].v), fd->loops[i].n1);
expand_omp_build_assign (&gsi, fd->loops[i].v, t);
}
- }
+ }
/* Remove the GIMPLE_OMP_FOR statement. */
gsi_remove (&gsi, true);
@@ -10592,9 +10748,9 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE);
if (POINTER_TYPE_P (type))
- t = fold_build_pointer_plus (fd->loop.v, fd->loop.step);
+ t = fold_build_pointer_plus (fd->loop.v, step);
else
- t = fold_build2 (PLUS_EXPR, type, fd->loop.v, fd->loop.step);
+ t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step);
expand_omp_build_assign (&gsi, fd->loop.v, t);
if (fd->collapse > 1)
@@ -10668,6 +10824,18 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
gimple_regimplify_operands (cond_stmt, &gsi);
}
+ /* Add 'V -= STEP * (SIMT_VF - 1)' after the loop. */
+ if (is_simt)
+ {
+ gsi = gsi_start_bb (l2_bb);
+ step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), fd->loop.step, step);
+ if (POINTER_TYPE_P (type))
+ t = fold_build_pointer_plus (fd->loop.v, step);
+ else
+ t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step);
+ expand_omp_build_assign (&gsi, fd->loop.v, t);
+ }
+
/* Remove GIMPLE_OMP_RETURN. */
gsi = gsi_last_bb (exit_bb);
gsi_remove (&gsi, true);
@@ -10697,30 +10865,29 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
ne->probability = REG_BR_PROB_BASE / 8;
set_immediate_dominator (CDI_DOMINATORS, l1_bb, entry_bb);
- set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb);
set_immediate_dominator (CDI_DOMINATORS, l0_bb, l1_bb);
+ if (simt_maxlane)
+ {
+ cond_stmt = gimple_build_cond (LT_EXPR, simt_lane, simt_maxlane,
+ NULL_TREE, NULL_TREE);
+ gsi = gsi_last_bb (entry_bb);
+ gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
+ make_edge (entry_bb, l2_bb, EDGE_FALSE_VALUE);
+ FALLTHRU_EDGE (entry_bb)->flags = EDGE_TRUE_VALUE;
+ FALLTHRU_EDGE (entry_bb)->probability = REG_BR_PROB_BASE * 7 / 8;
+ BRANCH_EDGE (entry_bb)->probability = REG_BR_PROB_BASE / 8;
+ l2_dom_bb = entry_bb;
+ }
+ set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb);
+
if (!broken_loop)
{
struct loop *loop = alloc_loop ();
loop->header = l1_bb;
loop->latch = cont_bb;
add_loop (loop, l1_bb->loop_father);
- if (safelen == NULL_TREE)
- loop->safelen = INT_MAX;
- else
- {
- safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen);
- if (TREE_CODE (safelen) != INTEGER_CST)
- loop->safelen = 0;
- else if (!tree_fits_uhwi_p (safelen)
- || tree_to_uhwi (safelen) > INT_MAX)
- loop->safelen = INT_MAX;
- else
- loop->safelen = tree_to_uhwi (safelen);
- if (loop->safelen == 1)
- loop->safelen = 0;
- }
+ loop->safelen = safelen_int;
if (simduid)
{
loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
@@ -13885,7 +14052,6 @@ expand_omp (struct omp_region *region)
}
}
-
/* Helper for build_omp_regions. Scan the dominator tree starting at
block BB. PARENT is the region that contains BB. If SINGLE_TREE is
true, the function ends once a single tree is built (otherwise, whole
@@ -14768,12 +14934,14 @@ static void
lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx)
{
tree block;
- gimple *stmt = gsi_stmt (*gsi_p);
+ gimple *stmt = gsi_stmt (*gsi_p), *g;
gomp_ordered *ord_stmt = as_a <gomp_ordered *> (stmt);
gcall *x;
gbind *bind;
bool simd = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
OMP_CLAUSE_SIMD);
+ bool maybe_simt
+ = simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1;
bool threads = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
OMP_CLAUSE_THREADS);
@@ -14807,11 +14975,56 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx)
0);
gimple_bind_add_stmt (bind, x);
+ tree counter = NULL_TREE, test = NULL_TREE, body = NULL_TREE;
+ if (maybe_simt)
+ {
+ counter = create_tmp_var (integer_type_node);
+ g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0);
+ gimple_call_set_lhs (g, counter);
+ gimple_bind_add_stmt (bind, g);
+
+ body = create_artificial_label (UNKNOWN_LOCATION);
+ test = create_artificial_label (UNKNOWN_LOCATION);
+ gimple_bind_add_stmt (bind, gimple_build_label (body));
+
+ tree simt_pred = create_tmp_var (integer_type_node);
+ g = gimple_build_call_internal (IFN_GOMP_SIMT_ORDERED_PRED, 1, counter);
+ gimple_call_set_lhs (g, simt_pred);
+ gimple_bind_add_stmt (bind, g);
+
+ tree t = create_artificial_label (UNKNOWN_LOCATION);
+ g = gimple_build_cond (EQ_EXPR, simt_pred, integer_zero_node, t, test);
+ gimple_bind_add_stmt (bind, g);
+
+ gimple_bind_add_stmt (bind, gimple_build_label (t));
+ }
lower_omp (gimple_omp_body_ptr (stmt), ctx);
gimple_omp_set_body (stmt, maybe_catch_exception (gimple_omp_body (stmt)));
gimple_bind_add_seq (bind, gimple_omp_body (stmt));
gimple_omp_set_body (stmt, NULL);
+ if (maybe_simt)
+ {
+ gimple_bind_add_stmt (bind, gimple_build_label (test));
+ g = gimple_build_assign (counter, MINUS_EXPR, counter, integer_one_node);
+ gimple_bind_add_stmt (bind, g);
+
+ tree c = build2 (GE_EXPR, boolean_type_node, counter, integer_zero_node);
+ tree nonneg = create_tmp_var (integer_type_node);
+ gimple_seq tseq = NULL;
+ gimplify_assign (nonneg, fold_convert (integer_type_node, c), &tseq);
+ gimple_bind_add_seq (bind, tseq);
+
+ g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY, 1, nonneg);
+ gimple_call_set_lhs (g, nonneg);
+ gimple_bind_add_stmt (bind, g);
+
+ tree end = create_artificial_label (UNKNOWN_LOCATION);
+ g = gimple_build_cond (NE_EXPR, nonneg, integer_zero_node, body, end);
+ gimple_bind_add_stmt (bind, g);
+
+ gimple_bind_add_stmt (bind, gimple_build_label (end));
+ }
if (simd)
x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 1,
build_int_cst (NULL_TREE, threads));
@@ -17932,7 +18145,7 @@ const pass_data pass_data_lower_omp =
OPTGROUP_NONE, /* optinfo_flags */
TV_NONE, /* tv_id */
PROP_gimple_any, /* properties_required */
- PROP_gimple_lomp, /* properties_provided */
+ PROP_gimple_lomp | PROP_gimple_lomp_dev, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
0, /* todo_flags_finish */
@@ -19864,6 +20077,109 @@ make_pass_oacc_device_lower (gcc::context *ctxt)
{
return new pass_oacc_device_lower (ctxt);
}
+
+
+/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
+ VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
+ LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
+ internal functions on non-SIMT targets, and likewise some SIMD internal
+ functions on SIMT targets. */
+
+static unsigned int
+execute_omp_device_lower ()
+{
+ int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
+ basic_block bb;
+ gimple_stmt_iterator gsi;
+ FOR_EACH_BB_FN (bb, cfun)
+ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
+ continue;
+ tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
+ tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
+ switch (gimple_call_internal_fn (stmt))
+ {
+ case IFN_GOMP_SIMT_LANE:
+ case IFN_GOMP_SIMT_LAST_LANE:
+ rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
+ break;
+ case IFN_GOMP_SIMT_VF:
+ rhs = build_int_cst (type, vf);
+ break;
+ case IFN_GOMP_SIMT_ORDERED_PRED:
+ rhs = vf == 1 ? integer_zero_node : NULL_TREE;
+ if (rhs || !lhs)
+ unlink_stmt_vdef (stmt);
+ break;
+ case IFN_GOMP_SIMT_VOTE_ANY:
+ case IFN_GOMP_SIMT_XCHG_BFLY:
+ case IFN_GOMP_SIMT_XCHG_IDX:
+ rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
+ break;
+ case IFN_GOMP_SIMD_LANE:
+ case IFN_GOMP_SIMD_LAST_LANE:
+ rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
+ break;
+ case IFN_GOMP_SIMD_VF:
+ rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
+ break;
+ default:
+ continue;
+ }
+ if (lhs && !rhs)
+ continue;
+ stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
+ gsi_replace (&gsi, stmt, false);
+ }
+ if (vf != 1)
+ cfun->has_force_vectorize_loops = false;
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_omp_device_lower =
+{
+ GIMPLE_PASS, /* type */
+ "ompdevlow", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ PROP_cfg, /* properties_required */
+ PROP_gimple_lomp_dev, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_omp_device_lower : public gimple_opt_pass
+{
+public:
+ pass_omp_device_lower (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *fun)
+ {
+ /* FIXME: inlining does not propagate the lomp_dev property. */
+ return 1 || !(fun->curr_properties & PROP_gimple_lomp_dev);
+ }
+ virtual unsigned int execute (function *)
+ {
+ return execute_omp_device_lower ();
+ }
+
+}; // class pass_omp_device_lower
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_omp_device_lower (gcc::context *ctxt)
+{
+ return new pass_omp_device_lower (ctxt);
+}
/* "omp declare target link" handling pass. */
diff --git a/gcc/passes.def b/gcc/passes.def
index 85a5af088f3..2a470a7de07 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -183,6 +183,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_fixup_cfg);
NEXT_PASS (pass_lower_eh_dispatch);
NEXT_PASS (pass_oacc_device_lower);
+ NEXT_PASS (pass_omp_device_lower);
NEXT_PASS (pass_omp_target_link);
NEXT_PASS (pass_all_optimizations);
PUSH_INSERT_PASSES_WITHIN (pass_all_optimizations)
diff --git a/gcc/target-insns.def b/gcc/target-insns.def
index a6a040eb173..e011a5a7f26 100644
--- a/gcc/target-insns.def
+++ b/gcc/target-insns.def
@@ -68,6 +68,12 @@ DEF_TARGET_INSN (oacc_dim_pos, (rtx x0, rtx x1))
DEF_TARGET_INSN (oacc_dim_size, (rtx x0, rtx x1))
DEF_TARGET_INSN (oacc_fork, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (oacc_join, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (omp_simt_lane, (rtx x0))
+DEF_TARGET_INSN (omp_simt_last_lane, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_ordered, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (probe_stack, (rtx x0))
DEF_TARGET_INSN (probe_stack_address, (rtx x0))
diff --git a/gcc/target.def b/gcc/target.def
index bcdbc0e2452..2b357494ede 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1648,6 +1648,18 @@ int, (struct cgraph_node *), NULL)
HOOK_VECTOR_END (simd_clone)
+/* Functions relating to OpenMP SIMT vectorization transform. */
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_SIMT_"
+HOOK_VECTOR (TARGET_SIMT, simt)
+
+DEFHOOK
+(vf,
+"Return number of threads in SIMT thread group on the target.",
+int, (void), NULL)
+
+HOOK_VECTOR_END (simt)
+
/* Functions relating to openacc. */
#undef HOOK_PREFIX
#define HOOK_PREFIX "TARGET_GOACC_"
diff --git a/gcc/testsuite/ChangeLog.gomp-nvptx b/gcc/testsuite/ChangeLog.gomp-nvptx
new file mode 100644
index 00000000000..24d26edeb64
--- /dev/null
+++ b/gcc/testsuite/ChangeLog.gomp-nvptx
@@ -0,0 +1,13 @@
+2016-05-19 Alexander Monakov <amonakov@ispras.ru>
+
+ * lib/target-supports.exp (check_effective_target_alloca): Use a
+ compile test.
+
+2016-05-19 Alexander Monakov <amonakov@ispras.ru>
+
+ * gcc.target/nvptx/softstack.c: New test.
+
+2016-05-06 Alexander Monakov <amonakov@ispras.ru>
+
+ * gcc.target/nvptx/decl-shared.c: New test.
+ * gcc.target/nvptx/decl-shared-init.c: New test.
diff --git a/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c b/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c
new file mode 100644
index 00000000000..6a99b1c338a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c
@@ -0,0 +1 @@
+int var __attribute__((shared)) = 0; /* { dg-error "static initialization .* not supported" } */
diff --git a/gcc/testsuite/gcc.target/nvptx/decl-shared.c b/gcc/testsuite/gcc.target/nvptx/decl-shared.c
new file mode 100644
index 00000000000..367075cebe2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/decl-shared.c
@@ -0,0 +1,14 @@
+static int v_internal __attribute__((shared,used));
+int v_common __attribute__((shared));
+int v_extdef __attribute__((shared,nocommon));
+extern int v_extdecl __attribute__((shared));
+
+int use()
+{
+ return v_extdecl;
+}
+
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.shared \[^,\r\n\]*v_internal" } } */
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.weak .shared \[^,\r\n\]*v_common" } } */
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.visible .shared \[^,\r\n\]*v_extdef" } } */
+/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.extern .shared \[^,\r\n\]*v_extdecl" } } */
diff --git a/gcc/testsuite/gcc.target/nvptx/softstack.c b/gcc/testsuite/gcc.target/nvptx/softstack.c
new file mode 100644
index 00000000000..73e60f282a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/softstack.c
@@ -0,0 +1,23 @@
+/* { dg-options "-O2 -msoft-stack" } */
+/* { dg-do run } */
+
+static __attribute__((noinline,noclone)) int f(int *p)
+{
+ return __sync_lock_test_and_set(p, 1);
+}
+
+static __attribute__((noinline,noclone)) int g(int n)
+{
+ /* Check that variable-length stack allocation works. */
+ int v[n];
+ v[0] = 0;
+ /* Check that atomic operations can be applied to auto data. */
+ return f(v) == 0 && v[0] == 1;
+}
+
+int main()
+{
+ if (!g(1))
+ __builtin_abort();
+ return 0;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index b683c09c1db..ce60cce9cae 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -763,7 +763,10 @@ proc check_effective_target_untyped_assembly {} {
proc check_effective_target_alloca {} {
if { [istarget nvptx-*-*] } {
- return 0
+ return [check_no_compiler_messages alloca assembly {
+ void f (void*);
+ void g (int n) { f (__builtin_alloca (n)); }
+ }]
}
return 1
}
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index da9ba1374b4..8befebe17b1 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -222,6 +222,7 @@ protected:
of math functions; the
current choices have
been optimized. */
+#define PROP_gimple_lomp_dev (1 << 16) /* done omp_device_lower */
#define PROP_trees \
(PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
@@ -417,6 +418,7 @@ extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_strlen (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_fold_builtins (gcc::context *ctxt);
diff --git a/libgcc/ChangeLog.gomp-nvptx b/libgcc/ChangeLog.gomp-nvptx
new file mode 100644
index 00000000000..f2cb0d88743
--- /dev/null
+++ b/libgcc/ChangeLog.gomp-nvptx
@@ -0,0 +1,27 @@
+2016-05-06 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/stacks.c (__nvptx_stacks): Use 'nocommon' attribute to
+ emit a strong definition.
+ (__nvptx_uni): Ditto.
+
+2016-03-10 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/crt0.c (__nvptx_stacks): Define in C. Use it...
+ (__nvptx_uni): Ditto.
+ (__main): ...here instead of inline asm.
+ * config/nvptx/stacks.c (__nvptx_stacks): Define in C.
+ (__nvptx_uni): Ditto.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/crt0.c: New, rewritten in C from ...
+ * config/nvptx/crt0.s: ...this. Delete.
+ * config/nvptx/free.c: New, rewritten in C from ...
+ * config/nvptx/free.asm: ...this. Delete.
+ * config/nvptx/malloc.c: New, rewritten in C from ...
+ * config/nvptx/malloc.asm: ...this. Delete.
+ * config/nvptx/realloc.c: Handle out-of-memory condition.
+ * config/nvptx/nvptx-malloc.h (__nvptx_real_free,
+ __nvptx_real_malloc): Declare.
+ * config/nvptx/stacks.c: New.
+ * config/nvptx/t-nvptx: Adjust.
diff --git a/libgcc/config/nvptx/crt0.c b/libgcc/config/nvptx/crt0.c
index 3b7382d43af..5a7dbf6f22f 100644
--- a/libgcc/config/nvptx/crt0.c
+++ b/libgcc/config/nvptx/crt0.c
@@ -24,6 +24,14 @@ int *__exitval_ptr;
extern void __attribute__((noreturn)) exit (int status);
extern int main (int, void **);
+/* Always setup soft stacks to allow testing with -msoft-stack but without
+ -mgomp. 32 is the maximum number of warps in a CTA: the definition here
+ must match the external declaration emitted by the compiler. */
+void *__nvptx_stacks[32] __attribute__((shared,nocommon));
+
+/* Likewise for -muniform-simt. */
+unsigned __nvptx_uni[32] __attribute__((shared,nocommon));
+
void __attribute__((kernel))
__main (int *rval_ptr, int argc, void **argv)
{
@@ -33,5 +41,9 @@ __main (int *rval_ptr, int argc, void **argv)
if (rval_ptr)
*rval_ptr = 255;
+ static char stack[131072] __attribute__((aligned(8)));
+ __nvptx_stacks[0] = stack + sizeof stack;
+ __nvptx_uni[0] = 0;
+
exit (main (argc, argv));
}
diff --git a/libgcc/config/nvptx/nvptx-malloc.h b/libgcc/config/nvptx/mgomp.c
index 9cbd84689b7..d8ca5818314 100644
--- a/libgcc/config/nvptx/nvptx-malloc.h
+++ b/libgcc/config/nvptx/mgomp.c
@@ -1,6 +1,6 @@
-/* Declarations for the malloc wrappers.
+/* Define shared memory arrays for -msoft-stack and -muniform-simt.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -21,6 +21,12 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-extern void __nvptx_free (void *);
-extern void *__nvptx_malloc (size_t);
-extern void *__nvptx_realloc (void *, size_t);
+/* OpenACC offloading does not use these symbols; thus, they are exposed
+ only for the -mgomp multilib. The same definitions are also provided
+ in crt0.c for the case of non-offloading compilation. 32 is the maximum
+ number of warps in a CTA. */
+
+#if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+void *__nvptx_stacks[32] __attribute__((shared,nocommon));
+unsigned __nvptx_uni[32] __attribute__((shared,nocommon));
+#endif
diff --git a/libgcc/config/nvptx/t-nvptx b/libgcc/config/nvptx/t-nvptx
index daf252f2472..c4d20c94cbb 100644
--- a/libgcc/config/nvptx/t-nvptx
+++ b/libgcc/config/nvptx/t-nvptx
@@ -1,4 +1,5 @@
-LIB2ADD=$(srcdir)/config/nvptx/reduction.c
+LIB2ADD=$(srcdir)/config/nvptx/reduction.c \
+ $(srcdir)/config/nvptx/mgomp.c
LIB2ADDEH=
LIB2FUNCS_EXCLUDE=__main
diff --git a/libgomp/ChangeLog.gomp-nvptx b/libgomp/ChangeLog.gomp-nvptx
new file mode 100644
index 00000000000..65f64a2174c
--- /dev/null
+++ b/libgomp/ChangeLog.gomp-nvptx
@@ -0,0 +1,296 @@
+2016-11-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/team.c: Adjust comments.
+
+2016-05-06 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/team.c (nvptx_thrs): Use 'nocommon' attribute to emit a
+ strong definition.
+
+2016-03-24 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/team.c (gomp_thread_start): Work around NVIDIA driver
+ bug by adding an exit edge to the loop,
+
+2016-03-24 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/target.c (GOMP_teams): Do not call 'free'.
+ * config/nvptx/team.c (gomp_nvptx_main): Use 'alloca' instead of
+ 'malloc' to obtain storage. Do not call 'free'.
+ * team.c (gomp_free_thread) [__nvptx__]: Do not call 'free'.
+
+2016-03-11 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (map_fini): Make cuMemFreeHost error non-fatal.
+
+2016-03-04 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/bar.c: Remove wrong invocation of
+ gomp_barrier_wait_end from gomp_team_barrier_wait_end.
+
+2016-02-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (nvptx_stacks_size): New.
+ (nvptx_stacks_alloc): New.
+ (nvptx_stacks_free): New.
+ (GOMP_OFFLOAD_run): Allocate soft-stacks storage from the host using
+ the above new functions. Use kernel launch interface that allows
+ checking for mismatched total size of entry function arguments.
+
+2016-02-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/team.c (gomp_nvptx_main_1): Rename back to...
+ (gomp_nvptx_main): ...this; delete the wrapper.
+
+2016-02-15 Alexander Monakov <amonakov@ispras.ru>
+
+ Revert
+ 2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (nvptx_open_device): Adjust heap size.
+
+2016-02-15 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (nvptx_adjust_launch_bounds): Adjust types.
+ (GOMP_OFFLOAD_run): Ditto.
+
+2016-01-21 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/posix/affinity.c: Move to...
+ * affinity.c: ...here (new file). Guard use of PThreads-specific
+ interface by LIBGOMP_USE_PTHREADS.
+ * config/nvptx/affinity.c: Delete (identical to affinity.c).
+
+2016-01-20 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (struct targ_fn_descriptor): Add new fields.
+ (struct ptx_device): Ditto. Set them...
+ (nvptx_open_device): ...here.
+ (GOMP_OFFLOAD_load_image): Set new targ_fn_descriptor fields.
+ (nvptx_adjust_launch_bounds): New. Use it...
+ (GOMP_OFFLOAD_run): ...here.
+
+2016-01-20 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/icv-device.c (omp_get_num_teams): Update.
+ (omp_get_team_num): Ditto.
+ * config/nvptx/target.c (GOMP_teams): Update.
+ * config/nvptx/team.c (nvptx_thrs): Place in shared memory.
+ * icv.c (gomp_num_teams_var): Define.
+ * libgomp.h (gomp_num_teams_var): Declare.
+ (nvptx_thrs): Place in shared memory.
+
+2016-01-20 Alexander Monakov <amonakov@ispras.ru>
+
+ * testsuite/libgomp.fortran/fortran.exp (lang_link_flags): Pass
+ -foffload=-lgfortran in addition to -lgfortran.
+ * testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Ditto.
+
+2016-01-20 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/fortran.c: Delete.
+
+2016-01-20 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/linux/lock.c (gomp_init_lock_30): Move to generic lock.c.
+ (gomp_destroy_lock_30): Ditto.
+ (gomp_set_lock_30): Ditto.
+ (gomp_unset_lock_30): Ditto.
+ (gomp_test_lock_30): Ditto.
+ (gomp_init_nest_lock_30): Ditto.
+ (gomp_destroy_nest_lock_30): Ditto.
+ (gomp_set_nest_lock_30): Ditto.
+ (gomp_unset_nest_lock_30): Ditto.
+ (gomp_test_nest_lock_30): Ditto.
+ * lock.c: New.
+ * config/nvptx/lock.c: New.
+
+2016-01-20 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (struct ptx_device): New field (clock_khz).
+ (nvptx_open_device): Set it.
+ (nvptx_set_clocktick): New. Use it...
+ (GOMP_OFFLOAD_load_image): ...here.
+
+2016-01-20 Dmitry Melnik <dm@ispras.ru>
+
+ * config/nvptx/time.c: New.
+
+2016-01-20 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/section.c: Delete.
+ * config/nvptx/splay-tree.c: Delete.
+
+2015-12-14 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/task.c: Guard with #ifdef __nvptx_softstack__.
+
+2015-12-14 Alexander Monakov <amonakov@ispras.ru>
+
+ Revert
+ 2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * task.c (GOMP_task): Use a fixed-size on-stack buffer or a heap
+ allocation instead of a variable-size on-stack allocation.
+
+2015-12-14 Alexander Monakov <amonakov@ispras.ru>
+
+ Revert
+ 2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * taskloop.c (GOMP_taskloop): Avoid alloca on NVPTX.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/priority_queue.c: Delete.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (GOMP_OFFLOAD_async_run): New (stub).
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/task.c: New.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/team.c (gomp_nvptx_main): Rename to...
+ (gomp_nvptx_main_1): ... this and mark noinline.
+ (gomp_nvptx_main): Wrap the above, set up __nvptx_uni and
+ __nvptx_stacks.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (nvptx_open_device): Adjust heap size.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/pool.h: New.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/doacross.h: New.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * taskloop.c (GOMP_taskloop): Avoid alloca on NVPTX.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (GOMP_OFFLOAD_dev2dev): New.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/affinity.c: New.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (link_ptx): Adjust log sizes.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (GOMP_OFFLOAD_run): Start 8 warps.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/alloc.c: Delete.
+ * config/nvptx/barrier.c: Ditto.
+ * config/nvptx/iter.c: Ditto.
+ * config/nvptx/iter_ull.c: Ditto.
+ * config/nvptx/loop.c: Ditto.
+ * config/nvptx/loop_ull.c: Ditto.
+ * config/nvptx/ordered.c: Ditto.
+ * config/nvptx/parallel.c: Ditto.
+ * config/nvptx/single.c: Ditto.
+ * config/nvptx/task.c: Ditto.
+ * config/nvptx/work.c: Ditto.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * Makefile.am (libgomp_la_SOURCES): Add atomic.c.
+ * Makefile.in: Regenerate.
+ * critical.c: Split out GOMP_atomic_{start,end} into...
+ * atomic.c: ...here (new file).
+ * config/nvptx/critical.c: Delete.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/target.c: New.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/error.c: New.
+
+2015-12-09 Alexander Monakov <amonakov@ispras.ru>
+
+ * libgomp.h [__nvptx__] (gomp_thread): New implementation.
+ * config/nvptx/team.c (gomp_nvptx_main): New.
+ (gomp_thread_start): New (NVPTX-specific implementation).
+ (gomp_team_start): Ditto.
+ * team.c: Guard uses of PThreads-specific interfaces by
+ LIBGOMP_USE_PTHREADS.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/simple-bar.h: New file.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/posix/simple-bar.h: New file. Use it ...
+ * libgomp.h: ...here: new include.
+ (struct gomp_thread_pool): Change threads_dock member to
+ gomp_simple_barrier_t.
+ * team.c: Adjust all uses.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/bar.c: New file.
+ * config/nvptx/bar.h: Ditto.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/proc.c: New.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * plugin/plugin-nvptx.c (link_ptx): Do not set CU_JIT_TARGET.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/icv-device.c: New file.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * Makefile.am (libgomp_la_SOURCES): Add icv.c and icv-device.c.
+ * Makefile.in: Regenerate.
+ * env.c: Split out ICV definitions into...
+ * icv.c: ...here (new file) and...
+ * icv-device.c: ...here. New file.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * configure.ac [nvptx*-*-*] (libgomp_use_pthreads): Set and use it...
+ (LIBGOMP_USE_PTHREADS): ...here; new define.
+ * configure: Regenerate.
+ * config.h.in: Likewise.
+ * libgomp.h: Guard pthread.h inclusion.
+ (gomp_thread_attr): Guard by LIBGOMP_USE_PTHREADS.
+ (gomp_init_thread_affinity): Ditto.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/bar.h: New file.
+
+2015-12-08 Alexander Monakov <amonakov@ispras.ru>
+
+ * config/nvptx/mutex.h: New file.
+ * config/nvptx/ptrlock.h: New file.
+ * config/nvptx/sem.h: New file.
+
+2015-12-08 Jakub Jelinek <jakub@redhat.com>
+
+ * plugin/plugin-nvptx.c (nvptx_host2dev): Allow NULL 'nvthd'.
+ (nvptx_dev2host): Ditto.
+ (GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400.
+ (GOMP_OFFLOAD_run): New.
+
diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am
index a3e1c2b2c0f..4090336a773 100644
--- a/libgomp/Makefile.am
+++ b/libgomp/Makefile.am
@@ -58,12 +58,12 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \
libgomp_la_DEPENDENCIES = $(libgomp_version_dep)
libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
-libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
- iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
- task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
- time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \
- oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \
- oacc-plugin.c oacc-cuda.c priority_queue.c
+libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
+ icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c ordered.c \
+ parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \
+ proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
+ splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
+ oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c
include $(top_srcdir)/plugin/Makefrag.am
diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
index 88c8517fddd..a630f2b6438 100644
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -173,14 +173,14 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL) --tag=CC \
@PLUGIN_NVPTX_TRUE@ $(toolexeclibdir)
libgomp_la_LIBADD =
@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo
-am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \
- error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \
- parallel.lo sections.lo single.lo task.lo team.lo work.lo \
- lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \
- fortran.lo affinity.lo target.lo splay-tree.lo \
- libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \
- oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \
- priority_queue.lo $(am__objects_1)
+am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
+ env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \
+ loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \
+ single.lo task.lo team.lo work.lo lock.lo mutex.lo proc.lo \
+ sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \
+ target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
+ oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
+ oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1)
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@
depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -429,13 +429,14 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \
libgomp_la_DEPENDENCIES = $(libgomp_version_dep)
libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
-libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
- iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c \
- single.c task.c team.c work.c lock.c mutex.c proc.c sem.c \
- bar.c ptrlock.c time.c fortran.c affinity.c target.c \
- splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \
- oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \
- priority_queue.c $(am__append_3)
+libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
+ error.c icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c \
+ ordered.c parallel.c sections.c single.c task.c team.c work.c \
+ lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c \
+ affinity.c target.c splay-tree.c libgomp-plugin.c \
+ oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
+ oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
+ $(am__append_3)
# Nvidia PTX OpenACC plugin.
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -598,12 +599,15 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bar.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/barrier.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/critical.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv-device.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
diff --git a/libgomp/config/posix/affinity.c b/libgomp/affinity.c
index 32986fa8281..3549e54b67c 100644
--- a/libgomp/config/posix/affinity.c
+++ b/libgomp/affinity.c
@@ -32,12 +32,14 @@ gomp_init_affinity (void)
{
}
+#ifdef LIBGOMP_USE_PTHREADS
void
gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place)
{
(void) attr;
(void) place;
}
+#endif
void **
gomp_affinity_alloc (unsigned long count, bool quiet)
diff --git a/libgomp/atomic.c b/libgomp/atomic.c
new file mode 100644
index 00000000000..bad0736e21e
--- /dev/null
+++ b/libgomp/atomic.c
@@ -0,0 +1,57 @@
+/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains helpers for the ATOMIC construct. */
+
+#include "libgomp.h"
+
+/* This mutex is used when atomic operations don't exist for the target
+ in the mode requested. The result is not globally atomic, but works so
+ long as all parallel references are within #pragma omp atomic directives.
+ According to responses received from omp@openmp.org, appears to be within
+ spec. Which makes sense, since that's how several other compilers
+ handle this situation as well. */
+
+static gomp_mutex_t atomic_lock;
+
+void
+GOMP_atomic_start (void)
+{
+ gomp_mutex_lock (&atomic_lock);
+}
+
+void
+GOMP_atomic_end (void)
+{
+ gomp_mutex_unlock (&atomic_lock);
+}
+
+#if !GOMP_MUTEX_INIT_0
+static void __attribute__((constructor))
+initialize_atomic (void)
+{
+ gomp_mutex_init (&atomic_lock);
+}
+#endif
diff --git a/libgomp/config.h.in b/libgomp/config.h.in
index 226ac5358a7..e5526f6be82 100644
--- a/libgomp/config.h.in
+++ b/libgomp/config.h.in
@@ -94,6 +94,9 @@
/* Define to 1 if GNU symbol versioning is used for libgomp. */
#undef LIBGOMP_GNU_SYMBOL_VERSIONING
+/* Define to 1 if libgomp should use POSIX threads. */
+#undef LIBGOMP_USE_PTHREADS
+
/* Define to the sub-directory in which libtool stores uninstalled libraries.
*/
#undef LT_OBJDIR
diff --git a/libgomp/config/linux/lock.c b/libgomp/config/linux/lock.c
index a671b7e0a89..07845c98ad1 100644
--- a/libgomp/config/linux/lock.c
+++ b/libgomp/config/linux/lock.c
@@ -32,98 +32,8 @@
#include <sys/syscall.h>
#include "wait.h"
-
-/* The internal gomp_mutex_t and the external non-recursive omp_lock_t
- have the same form. Re-use it. */
-
-void
-gomp_init_lock_30 (omp_lock_t *lock)
-{
- gomp_mutex_init (lock);
-}
-
-void
-gomp_destroy_lock_30 (omp_lock_t *lock)
-{
- gomp_mutex_destroy (lock);
-}
-
-void
-gomp_set_lock_30 (omp_lock_t *lock)
-{
- gomp_mutex_lock (lock);
-}
-
-void
-gomp_unset_lock_30 (omp_lock_t *lock)
-{
- gomp_mutex_unlock (lock);
-}
-
-int
-gomp_test_lock_30 (omp_lock_t *lock)
-{
- int oldval = 0;
-
- return __atomic_compare_exchange_n (lock, &oldval, 1, false,
- MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
-}
-
-void
-gomp_init_nest_lock_30 (omp_nest_lock_t *lock)
-{
- memset (lock, '\0', sizeof (*lock));
-}
-
-void
-gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock)
-{
-}
-
-void
-gomp_set_nest_lock_30 (omp_nest_lock_t *lock)
-{
- void *me = gomp_icv (true);
-
- if (lock->owner != me)
- {
- gomp_mutex_lock (&lock->lock);
- lock->owner = me;
- }
-
- lock->count++;
-}
-
-void
-gomp_unset_nest_lock_30 (omp_nest_lock_t *lock)
-{
- if (--lock->count == 0)
- {
- lock->owner = NULL;
- gomp_mutex_unlock (&lock->lock);
- }
-}
-
-int
-gomp_test_nest_lock_30 (omp_nest_lock_t *lock)
-{
- void *me = gomp_icv (true);
- int oldval;
-
- if (lock->owner == me)
- return ++lock->count;
-
- oldval = 0;
- if (__atomic_compare_exchange_n (&lock->lock, &oldval, 1, false,
- MEMMODEL_ACQUIRE, MEMMODEL_RELAXED))
- {
- lock->owner = me;
- lock->count = 1;
- return 1;
- }
-
- return 0;
-}
+/* Reuse the generic implementation in terms of gomp_mutex_t. */
+#include "../../lock.c"
#ifdef LIBGOMP_GNU_SYMBOL_VERSIONING
/* gomp_mutex_* can be safely locked in one thread and
diff --git a/libgomp/config/nvptx/affinity.c b/libgomp/config/nvptx/affinity.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/affinity.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/alloc.c b/libgomp/config/nvptx/alloc.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/alloc.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index e69de29bb2d..a0d8a44dacf 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -0,0 +1,209 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is an NVPTX specific implementation of a barrier synchronization
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and bar.sync instruction. */
+
+#include <limits.h>
+#include "libgomp.h"
+
+
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ {
+ /* Next time we'll be awaiting TOTAL threads again. */
+ bar->awaited = bar->total;
+ __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+ MEMMODEL_RELEASE);
+ }
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+ gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that if the encountering thread
+ is not the last one to hit the barrier, it returns immediately.
+ The intended usage is that a thread which intends to gomp_barrier_destroy
+ this barrier calls gomp_barrier_wait, while all other threads
+ call gomp_barrier_wait_last. When gomp_barrier_wait returns,
+ the barrier can be safely destroyed. */
+
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+#if 0
+ gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
+ if (state & BAR_WAS_LAST)
+ gomp_barrier_wait_end (bar, state);
+#else
+ gomp_barrier_wait (bar);
+#endif
+}
+
+void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+ unsigned int generation, gen;
+
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ {
+ /* Next time we'll be awaiting TOTAL threads again. */
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+
+ bar->awaited = bar->total;
+ team->work_share_cancelled = 0;
+ if (__builtin_expect (team->task_count, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ state &= ~BAR_WAS_LAST;
+ }
+ else
+ {
+ state &= ~BAR_CANCELLED;
+ state += BAR_INCR - BAR_WAS_LAST;
+ __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ return;
+ }
+ }
+
+ generation = state;
+ state &= ~BAR_CANCELLED;
+ do
+ {
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ }
+ generation |= gen & BAR_WAITING_FOR_TASK;
+ }
+ while (gen != state + BAR_INCR);
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+ gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+ gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ bar->awaited_final = bar->total;
+ gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+ gomp_barrier_state_t state)
+{
+ unsigned int generation, gen;
+
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ {
+ /* Next time we'll be awaiting TOTAL threads again. */
+ /* BAR_CANCELLED should never be set in state here, because
+ cancellation means that at least one of the threads has been
+ cancelled, thus on a cancellable barrier we should never see
+ all threads to arrive. */
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+
+ bar->awaited = bar->total;
+ team->work_share_cancelled = 0;
+ if (__builtin_expect (team->task_count, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ state &= ~BAR_WAS_LAST;
+ }
+ else
+ {
+ state += BAR_INCR - BAR_WAS_LAST;
+ __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ return false;
+ }
+ }
+
+ if (__builtin_expect (state & BAR_CANCELLED, 0))
+ return true;
+
+ generation = state;
+ do
+ {
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ if (__builtin_expect (gen & BAR_CANCELLED, 0))
+ return true;
+ if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+ {
+ gomp_barrier_handle_tasks (state);
+ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ }
+ generation |= gen & BAR_WAITING_FOR_TASK;
+ }
+ while (gen != state + BAR_INCR);
+
+ return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+ return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+ gomp_mutex_lock (&team->task_lock);
+ if (team->barrier.generation & BAR_CANCELLED)
+ {
+ gomp_mutex_unlock (&team->task_lock);
+ return;
+ }
+ team->barrier.generation |= BAR_CANCELLED;
+ gomp_mutex_unlock (&team->task_lock);
+ gomp_team_barrier_wake (&team->barrier, INT_MAX);
+}
diff --git a/libgomp/config/nvptx/bar.h b/libgomp/config/nvptx/bar.h
new file mode 100644
index 00000000000..757edf1d7eb
--- /dev/null
+++ b/libgomp/config/nvptx/bar.h
@@ -0,0 +1,166 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is an NVPTX specific implementation of a barrier synchronization
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and bar.sync instruction. */
+
+#ifndef GOMP_BARRIER_H
+#define GOMP_BARRIER_H 1
+
+#include "mutex.h"
+
+typedef struct
+{
+ unsigned total;
+ unsigned generation;
+ unsigned awaited;
+ unsigned awaited_final;
+} gomp_barrier_t;
+
+typedef unsigned int gomp_barrier_state_t;
+
+/* The generation field contains a counter in the high bits, with a few
+ low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can
+ share space because WAS_LAST is never stored back to generation. */
+#define BAR_TASK_PENDING 1
+#define BAR_WAS_LAST 1
+#define BAR_WAITING_FOR_TASK 2
+#define BAR_CANCELLED 4
+#define BAR_INCR 8
+
+static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+{
+ bar->total = count;
+ bar->awaited = count;
+ bar->awaited_final = count;
+ bar->generation = 0;
+}
+
+static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
+{
+ __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL);
+ bar->total = count;
+}
+
+static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
+{
+}
+
+extern void gomp_barrier_wait (gomp_barrier_t *);
+extern void gomp_barrier_wait_last (gomp_barrier_t *);
+extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+extern void gomp_team_barrier_wait (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
+ gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+ gomp_barrier_state_t);
+extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_start (gomp_barrier_t *bar)
+{
+ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ ret &= -BAR_INCR | BAR_CANCELLED;
+ /* A memory barrier is needed before exiting from the various forms
+ of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
+ 2.8.6 flush Construct, which says there is an implicit flush during
+ a barrier region. This is a convenient place to add the barrier,
+ so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE. */
+ if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0)
+ ret |= BAR_WAS_LAST;
+ return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+ return gomp_barrier_wait_start (bar);
+}
+
+/* This is like gomp_barrier_wait_start, except it decrements
+ bar->awaited_final rather than bar->awaited and should be used
+ for the gomp_team_end barrier only. */
+static inline gomp_barrier_state_t
+gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+{
+ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ ret &= -BAR_INCR | BAR_CANCELLED;
+ /* See above gomp_barrier_wait_start comment. */
+ if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0)
+ ret |= BAR_WAS_LAST;
+ return ret;
+}
+
+static inline bool
+gomp_barrier_last_thread (gomp_barrier_state_t state)
+{
+ return state & BAR_WAS_LAST;
+}
+
+/* All the inlines below must be called with team->task_lock
+ held. */
+
+static inline void
+gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
+{
+ bar->generation |= BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
+{
+ bar->generation &= ~BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
+{
+ bar->generation |= BAR_WAITING_FOR_TASK;
+}
+
+static inline bool
+gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
+{
+ return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+ return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+}
+
+static inline void
+gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+ bar->generation = (state & -BAR_INCR) + BAR_INCR;
+}
+
+#endif /* GOMP_BARRIER_H */
diff --git a/libgomp/config/nvptx/barrier.c b/libgomp/config/nvptx/barrier.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/barrier.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/critical.c b/libgomp/config/nvptx/critical.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/critical.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/doacross.h b/libgomp/config/nvptx/doacross.h
new file mode 100644
index 00000000000..fd011d492bd
--- /dev/null
+++ b/libgomp/config/nvptx/doacross.h
@@ -0,0 +1,60 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is the NVPTX implementation of doacross spinning. */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+
+static int zero;
+
+static inline int
+cpu_relax (void)
+{
+ int r;
+ /* Here we need a long-latency operation to make the current warp yield.
+ We could use ld.cv, uncached load from system (host) memory, but that
+ would require allocating locked memory in the plugin. Alternatively,
+ we can use ld.cg, which evicts from L1 and caches in L2. */
+ asm volatile ("ld.cg.s32 %0, [%1];" : "=r" (r) : "i" (&zero) : "memory");
+ return r;
+}
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+ unsigned long cur)
+{
+ /* Prevent compiler from optimizing based on bounds of containing object. */
+ asm ("" : "+r" (addr));
+ do
+ {
+ int i = cpu_relax ();
+ cur = addr[i];
+ }
+ while (cur <= expected);
+}
+
+#endif /* GOMP_DOACROSS_H */
diff --git a/libgomp/config/nvptx/error.c b/libgomp/config/nvptx/error.c
index e69de29bb2d..f3e74cd9449 100644
--- a/libgomp/config/nvptx/error.c
+++ b/libgomp/config/nvptx/error.c
@@ -0,0 +1,42 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains routines used to signal errors. On NVPTX, we have
+ one default output stream (stdout), so redirect everything there. */
+
+#include "libgomp.h"
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#undef vfprintf
+#undef fputs
+#undef fputc
+
+#define vfprintf(stream, fmt, list) vprintf (fmt, list)
+#define fputs(s, stream) printf ("%s", s)
+#define fputc(c, stream) printf ("%c", c)
+
+#include "../../error.c"
diff --git a/libgomp/config/nvptx/icv-device.c b/libgomp/config/nvptx/icv-device.c
new file mode 100644
index 00000000000..831ba110e95
--- /dev/null
+++ b/libgomp/config/nvptx/icv-device.c
@@ -0,0 +1,74 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file defines OpenMP API entry points that accelerator targets are
+ expected to replace. */
+
+#include "libgomp.h"
+
+void
+omp_set_default_device (int device_num __attribute__((unused)))
+{
+}
+
+int
+omp_get_default_device (void)
+{
+ return 0;
+}
+
+int
+omp_get_num_devices (void)
+{
+ return 0;
+}
+
+int
+omp_get_num_teams (void)
+{
+ return gomp_num_teams_var + 1;
+}
+
+int
+omp_get_team_num (void)
+{
+ int ctaid;
+ asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
+ return ctaid;
+}
+
+int
+omp_is_initial_device (void)
+{
+ /* NVPTX is an accelerator-only target. */
+ return 0;
+}
+
+ialias (omp_set_default_device)
+ialias (omp_get_default_device)
+ialias (omp_get_num_devices)
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
+ialias (omp_is_initial_device)
diff --git a/libgomp/config/nvptx/iter.c b/libgomp/config/nvptx/iter.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/iter.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/iter_ull.c b/libgomp/config/nvptx/iter_ull.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/iter_ull.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/lock.c b/libgomp/config/nvptx/lock.c
index e69de29bb2d..7731704c6fb 100644
--- a/libgomp/config/nvptx/lock.c
+++ b/libgomp/config/nvptx/lock.c
@@ -0,0 +1,41 @@
+/* Copyright (C) 2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a NVPTX specific implementation of the public OpenMP locking
+ primitives. */
+
+/* Reuse the generic implementation in terms of gomp_mutex_t. */
+#include "../../lock.c"
+
+ialias (omp_init_lock)
+ialias (omp_init_nest_lock)
+ialias (omp_destroy_lock)
+ialias (omp_destroy_nest_lock)
+ialias (omp_set_lock)
+ialias (omp_set_nest_lock)
+ialias (omp_unset_lock)
+ialias (omp_unset_nest_lock)
+ialias (omp_test_lock)
+ialias (omp_test_nest_lock)
diff --git a/libgomp/config/nvptx/loop.c b/libgomp/config/nvptx/loop.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/loop.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/loop_ull.c b/libgomp/config/nvptx/loop_ull.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/loop_ull.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/mutex.h b/libgomp/config/nvptx/mutex.h
new file mode 100644
index 00000000000..e408ca72933
--- /dev/null
+++ b/libgomp/config/nvptx/mutex.h
@@ -0,0 +1,60 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is an NVPTX specific implementation of a mutex synchronization
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and busy waiting. */
+
+#ifndef GOMP_MUTEX_H
+#define GOMP_MUTEX_H 1
+
+typedef int gomp_mutex_t;
+
+#define GOMP_MUTEX_INIT_0 1
+
+static inline void
+gomp_mutex_init (gomp_mutex_t *mutex)
+{
+ *mutex = 0;
+}
+
+static inline void
+gomp_mutex_destroy (gomp_mutex_t *mutex)
+{
+}
+
+static inline void
+gomp_mutex_lock (gomp_mutex_t *mutex)
+{
+ while (__sync_lock_test_and_set (mutex, 1))
+ /* spin */ ;
+}
+
+static inline void
+gomp_mutex_unlock (gomp_mutex_t *mutex)
+{
+ __sync_lock_release (mutex);
+}
+#endif /* GOMP_MUTEX_H */
diff --git a/libgomp/config/nvptx/ordered.c b/libgomp/config/nvptx/ordered.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/ordered.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/parallel.c b/libgomp/config/nvptx/parallel.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/parallel.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/fortran.c b/libgomp/config/nvptx/pool.h
index 71ba6ed73f5..70e233c7caa 100644
--- a/libgomp/config/nvptx/fortran.c
+++ b/libgomp/config/nvptx/pool.h
@@ -1,8 +1,5 @@
-/* OpenACC Runtime Fortran wrapper routines
-
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
-
- Contributed by Mentor Embedded.
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
@@ -26,15 +23,27 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-/* Temporary hack; this will be provided by libgfortran. */
+/* This is the NVPTX implementation of the thread pool management
+ for libgomp. This type is private to the library. */
+
+#ifndef GOMP_POOL_H
+#define GOMP_POOL_H 1
+
+#include "libgomp.h"
+
+/* Get the thread pool. */
+
+static inline struct gomp_thread_pool *
+gomp_get_thread_pool (struct gomp_thread *thr, unsigned nthreads)
+{
+ /* NVPTX is running with a fixed pool of pre-started threads. */
+ return thr->thread_pool;
+}
-extern void _gfortran_abort (void);
+static inline void
+gomp_release_thread_pool (struct gomp_thread_pool *pool)
+{
+ /* Do nothing. */
+}
-__asm__ ("// BEGIN GLOBAL FUNCTION DECL: _gfortran_abort\n"
- ".visible .func _gfortran_abort;\n"
- "// BEGIN GLOBAL FUNCTION DEF: _gfortran_abort\n"
- ".visible .func _gfortran_abort\n"
- "{\n"
- "trap;\n"
- "ret;\n"
- "}\n");
+#endif /* GOMP_POOL_H */
diff --git a/libgomp/config/nvptx/priority_queue.c b/libgomp/config/nvptx/priority_queue.c
deleted file mode 100644
index 63aecd249fb..00000000000
--- a/libgomp/config/nvptx/priority_queue.c
+++ /dev/null
@@ -1 +0,0 @@
-/* Empty stub for omp task priority support. */
diff --git a/libgomp/config/nvptx/proc.c b/libgomp/config/nvptx/proc.c
index e69de29bb2d..8c1c3664660 100644
--- a/libgomp/config/nvptx/proc.c
+++ b/libgomp/config/nvptx/proc.c
@@ -0,0 +1,41 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains system specific routines related to counting
+ online processors and dynamic load balancing. */
+
+#include "libgomp.h"
+
+unsigned
+gomp_dynamic_max_threads (void)
+{
+ return gomp_icv (false)->nthreads_var;
+}
+
+int
+omp_get_num_procs (void)
+{
+ return gomp_icv (false)->nthreads_var;
+}
diff --git a/libgomp/config/nvptx/ptrlock.h b/libgomp/config/nvptx/ptrlock.h
new file mode 100644
index 00000000000..c2eae75720d
--- /dev/null
+++ b/libgomp/config/nvptx/ptrlock.h
@@ -0,0 +1,73 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is an NVPTX specific implementation of a ptrlock synchronization
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and busy waiting.
+
+ A ptrlock has four states:
+ 0/NULL Initial
+ 1 Owned by me, I get to write a pointer to ptrlock.
+ 2 Some thread is waiting on the ptrlock.
+ >2 Ptrlock contains a valid pointer.
+ It is not valid to gain the ptrlock and then write a NULL to it. */
+
+#ifndef GOMP_PTRLOCK_H
+#define GOMP_PTRLOCK_H 1
+
+typedef void *gomp_ptrlock_t;
+
+static inline void gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr)
+{
+ *ptrlock = ptr;
+}
+
+static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock)
+{
+ uintptr_t v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE);
+ if (v > 2)
+ return (void *) v;
+
+ if (v == 0
+ && __atomic_compare_exchange_n (ptrlock, &v, 1, false,
+ MEMMODEL_ACQUIRE, MEMMODEL_ACQUIRE))
+ return NULL;
+
+ while (v == 1)
+ v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE);
+
+ return (void *) v;
+}
+
+static inline void gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr)
+{
+ __atomic_store_n (ptrlock, ptr, MEMMODEL_RELEASE);
+}
+
+static inline void gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock)
+{
+}
+
+#endif /* GOMP_PTRLOCK_H */
diff --git a/libgomp/config/nvptx/sections.c b/libgomp/config/nvptx/sections.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/sections.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/sem.h b/libgomp/config/nvptx/sem.h
new file mode 100644
index 00000000000..82c0dfbf7e0
--- /dev/null
+++ b/libgomp/config/nvptx/sem.h
@@ -0,0 +1,65 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is an NVPTX specific implementation of a semaphore synchronization
+ mechanism for libgomp. This type is private to the library. This
+ semaphore implementation uses atomic instructions and busy waiting. */
+
+#ifndef GOMP_SEM_H
+#define GOMP_SEM_H 1
+
+typedef int gomp_sem_t;
+
+static inline void
+gomp_sem_init (gomp_sem_t *sem, int value)
+{
+ *sem = value;
+}
+
+static inline void
+gomp_sem_destroy (gomp_sem_t *sem)
+{
+}
+
+static inline void
+gomp_sem_wait (gomp_sem_t *sem)
+{
+ int count = __atomic_load_n (sem, MEMMODEL_ACQUIRE);
+ for (;;)
+ {
+ while (count == 0)
+ count = __atomic_load_n (sem, MEMMODEL_ACQUIRE);
+ if (__atomic_compare_exchange_n (sem, &count, count - 1, false,
+ MEMMODEL_ACQUIRE, MEMMODEL_RELAXED))
+ return;
+ }
+}
+
+static inline void
+gomp_sem_post (gomp_sem_t *sem)
+{
+ (void) __atomic_add_fetch (sem, 1, MEMMODEL_RELEASE);
+}
+#endif /* GOMP_SEM_H */
diff --git a/libgomp/config/nvptx/simple-bar.h b/libgomp/config/nvptx/simple-bar.h
new file mode 100644
index 00000000000..e7b56d98826
--- /dev/null
+++ b/libgomp/config/nvptx/simple-bar.h
@@ -0,0 +1,70 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a simplified barrier that is suitable for thread pool
+ synchronizaton. Only a subset of full barrier API (bar.h) is exposed.
+ Here in the NVPTX-specific implementation, we expect that thread pool
+ corresponds to a PTX CTA (thread block). */
+
+#ifndef GOMP_SIMPLE_BARRIER_H
+#define GOMP_SIMPLE_BARRIER_H 1
+
+typedef struct
+{
+ unsigned count;
+} gomp_simple_barrier_t;
+
+static inline void
+gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
+{
+ bar->count = count * 32;
+}
+
+/* Unused on NVPTX.
+static inline void
+gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count)
+{
+ bar->count = count * 32;
+}
+*/
+
+static inline void
+gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar)
+{
+}
+
+static inline void
+gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
+{
+ asm volatile ("bar.sync 0, %0;" : : "r" (bar->count) : "memory");
+}
+
+static inline void
+gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
+{
+ asm volatile ("bar.arrive 0, %0;" : : "r" (bar->count) : "memory");
+}
+
+#endif /* GOMP_SIMPLE_BARRIER_H */
diff --git a/libgomp/config/nvptx/single.c b/libgomp/config/nvptx/single.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/single.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/splay-tree.c b/libgomp/config/nvptx/splay-tree.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/splay-tree.c
+++ /dev/null
diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c
index e69de29bb2d..38ea7f7aa68 100644
--- a/libgomp/config/nvptx/target.c
+++ b/libgomp/config/nvptx/target.c
@@ -0,0 +1,49 @@
+/* Copyright (C) 2013-2016 Free Software Foundation, Inc.
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "libgomp.h"
+#include <limits.h>
+
+void
+GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
+{
+ if (thread_limit)
+ {
+ struct gomp_task_icv *icv = gomp_icv (true);
+ icv->thread_limit_var
+ = thread_limit > INT_MAX ? UINT_MAX : thread_limit;
+ }
+ unsigned int num_blocks, block_id;
+ asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks));
+ asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
+ if (!num_teams || num_teams >= num_blocks)
+ num_teams = num_blocks;
+ else if (block_id >= num_teams)
+ {
+ gomp_free_thread (nvptx_thrs);
+ asm ("exit;");
+ }
+ gomp_num_teams_var = num_teams - 1;
+}
diff --git a/libgomp/config/nvptx/task.c b/libgomp/config/nvptx/task.c
index e69de29bb2d..c183716c94e 100644
--- a/libgomp/config/nvptx/task.c
+++ b/libgomp/config/nvptx/task.c
@@ -0,0 +1,43 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file handles the maintenance of tasks in response to task
+ creation and termination. */
+
+#ifdef __nvptx_softstack__
+
+#include "libgomp.h"
+
+/* NVPTX is an accelerator-only target, so this should never be called. */
+
+bool
+gomp_target_task_fn (void *data)
+{
+ __builtin_unreachable ();
+}
+
+#include "../../task.c"
+
+#endif
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index e69de29bb2d..f7b5e3e81b5 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -0,0 +1,178 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file handles maintenance of threads on NVPTX. */
+
+#if defined __nvptx_softstack__ && defined __nvptx_unisimt__
+
+#include "libgomp.h"
+#include <stdlib.h>
+#include <string.h>
+
+struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
+
+static void gomp_thread_start (struct gomp_thread_pool *);
+
+
+/* This externally visible function handles target region entry. It
+ sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
+ in the master thread or gomp_thread_start in other threads.
+
+ The name of this function is part of the interface with the compiler: for
+ each target region, GCC emits a PTX .kernel function that sets up soft-stack
+ and uniform-simt state and calls this function, passing in FN the original
+ function outlined for the target region. */
+
+void
+gomp_nvptx_main (void (*fn) (void *), void *fn_data)
+{
+ int tid, ntids;
+ asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+ asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids));
+ if (tid == 0)
+ {
+ gomp_global_icv.nthreads_var = ntids;
+ /* Starting additional threads is not supported. */
+ gomp_global_icv.dyn_var = true;
+
+ nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
+ memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
+
+ struct gomp_thread_pool *pool = alloca (sizeof (*pool));
+ pool->threads = alloca (ntids * sizeof (*pool->threads));
+ for (tid = 0; tid < ntids; tid++)
+ pool->threads[tid] = nvptx_thrs + tid;
+ pool->threads_size = ntids;
+ pool->threads_used = ntids;
+ pool->threads_busy = 1;
+ pool->last_team = NULL;
+ gomp_simple_barrier_init (&pool->threads_dock, ntids);
+
+ nvptx_thrs[0].thread_pool = pool;
+ asm ("bar.sync 0;");
+ fn (fn_data);
+
+ gomp_free_thread (nvptx_thrs);
+ }
+ else
+ {
+ asm ("bar.sync 0;");
+ gomp_thread_start (nvptx_thrs[0].thread_pool);
+ }
+}
+
+/* This function contains the idle loop in which a thread waits
+ to be called up to become part of a team. */
+
+static void
+gomp_thread_start (struct gomp_thread_pool *pool)
+{
+ struct gomp_thread *thr = gomp_thread ();
+
+ gomp_sem_init (&thr->release, 0);
+ thr->thread_pool = pool;
+
+ do
+ {
+ gomp_simple_barrier_wait (&pool->threads_dock);
+ if (!thr->fn)
+ continue;
+ thr->fn (thr->data);
+ thr->fn = NULL;
+
+ struct gomp_task *task = thr->task;
+ gomp_team_barrier_wait_final (&thr->ts.team->barrier);
+ gomp_finish_task (task);
+ }
+ /* Work around an NVIDIA driver bug: when generating sm_50 machine code,
+ it can trash stack pointer R1 in loops lacking exit edges. Add a cheap
+ artificial exit that the driver would not be able to optimize out. */
+ while (nvptx_thrs);
+}
+
+/* Launch a team. */
+
+void
+gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
+ unsigned flags, struct gomp_team *team)
+{
+ struct gomp_thread *thr, *nthr;
+ struct gomp_task *task;
+ struct gomp_task_icv *icv;
+ struct gomp_thread_pool *pool;
+ unsigned long nthreads_var;
+
+ thr = gomp_thread ();
+ pool = thr->thread_pool;
+ task = thr->task;
+ icv = task ? &task->icv : &gomp_global_icv;
+
+ /* Always save the previous state, even if this isn't a nested team.
+ In particular, we should save any work share state from an outer
+ orphaned work share construct. */
+ team->prev_ts = thr->ts;
+
+ thr->ts.team = team;
+ thr->ts.team_id = 0;
+ ++thr->ts.level;
+ if (nthreads > 1)
+ ++thr->ts.active_level;
+ thr->ts.work_share = &team->work_shares[0];
+ thr->ts.last_work_share = NULL;
+ thr->ts.single_count = 0;
+ thr->ts.static_trip = 0;
+ thr->task = &team->implicit_task[0];
+ nthreads_var = icv->nthreads_var;
+ gomp_init_task (thr->task, task, icv);
+ team->implicit_task[0].icv.nthreads_var = nthreads_var;
+
+ if (nthreads == 1)
+ return;
+
+ /* Release existing idle threads. */
+ for (unsigned i = 1; i < nthreads; ++i)
+ {
+ nthr = pool->threads[i];
+ nthr->ts.team = team;
+ nthr->ts.work_share = &team->work_shares[0];
+ nthr->ts.last_work_share = NULL;
+ nthr->ts.team_id = i;
+ nthr->ts.level = team->prev_ts.level + 1;
+ nthr->ts.active_level = thr->ts.active_level;
+ nthr->ts.single_count = 0;
+ nthr->ts.static_trip = 0;
+ nthr->task = &team->implicit_task[i];
+ gomp_init_task (nthr->task, task, icv);
+ team->implicit_task[i].icv.nthreads_var = nthreads_var;
+ nthr->fn = fn;
+ nthr->data = data;
+ team->ordered_release[i] = &nthr->release;
+ }
+
+ gomp_simple_barrier_wait (&pool->threads_dock);
+}
+
+#include "../../team.c"
+#endif
diff --git a/libgomp/config/nvptx/time.c b/libgomp/config/nvptx/time.c
index e69de29bb2d..88fb13050c0 100644
--- a/libgomp/config/nvptx/time.c
+++ b/libgomp/config/nvptx/time.c
@@ -0,0 +1,49 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Dmitry Melnik <dm@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file implements timer routines for NVPTX. It uses the %clock64 cycle
+ counter. */
+
+#include "libgomp.h"
+
+/* This is set from host in plugin-nvptx.c. */
+double __nvptx_clocktick = 0;
+
+double
+omp_get_wtime (void)
+{
+ uint64_t clock;
+ asm ("mov.u64 %0, %%clock64;" : "=r" (clock));
+ return clock * __nvptx_clocktick;
+}
+
+double
+omp_get_wtick (void)
+{
+ return __nvptx_clocktick;
+}
+
+ialias (omp_get_wtime)
+ialias (omp_get_wtick)
diff --git a/libgomp/config/nvptx/work.c b/libgomp/config/nvptx/work.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/libgomp/config/nvptx/work.c
+++ /dev/null
diff --git a/libgomp/config/posix/simple-bar.h b/libgomp/config/posix/simple-bar.h
new file mode 100644
index 00000000000..b77491c19f0
--- /dev/null
+++ b/libgomp/config/posix/simple-bar.h
@@ -0,0 +1,69 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a simplified barrier that is suitable for thread pool
+ synchronizaton. Only a subset of full barrier API (bar.h) is exposed. */
+
+#ifndef GOMP_SIMPLE_BARRIER_H
+#define GOMP_SIMPLE_BARRIER_H 1
+
+#include "bar.h"
+
+typedef struct
+{
+ gomp_barrier_t bar;
+} gomp_simple_barrier_t;
+
+static inline void
+gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
+{
+ gomp_barrier_init (&bar->bar, count);
+}
+
+static inline void
+gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count)
+{
+ gomp_barrier_reinit (&bar->bar, count);
+}
+
+static inline void
+gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar)
+{
+ gomp_barrier_destroy (&bar->bar);
+}
+
+static inline void
+gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
+{
+ gomp_barrier_wait (&bar->bar);
+}
+
+static inline void
+gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
+{
+ gomp_barrier_wait_last (&bar->bar);
+}
+
+#endif /* GOMP_SIMPLE_BARRIER_H */
diff --git a/libgomp/configure b/libgomp/configure
index 8d03eb6cddc..84d5876cce2 100755
--- a/libgomp/configure
+++ b/libgomp/configure
@@ -15067,6 +15067,7 @@ case "$host" in
;;
nvptx*-*-*)
# NVPTX does not support Pthreads, has its own code replacement.
+ libgomp_use_pthreads=no
;;
*)
# Check to see if -pthread or -lpthread is needed. Prefer the former.
@@ -15112,6 +15113,12 @@ rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
esac
+if test x$libgomp_use_pthreads != xno; then
+
+$as_echo "#define LIBGOMP_USE_PTHREADS 1" >>confdefs.h
+
+fi
+
# Plugins for offload execution, configure.ac fragment. -*- mode: autoconf -*-
#
# Copyright (C) 2014-2016 Free Software Foundation, Inc.
diff --git a/libgomp/configure.ac b/libgomp/configure.ac
index 2e41ca8aee5..5f1db7e1e0e 100644
--- a/libgomp/configure.ac
+++ b/libgomp/configure.ac
@@ -181,6 +181,7 @@ case "$host" in
;;
nvptx*-*-*)
# NVPTX does not support Pthreads, has its own code replacement.
+ libgomp_use_pthreads=no
;;
*)
# Check to see if -pthread or -lpthread is needed. Prefer the former.
@@ -202,6 +203,11 @@ case "$host" in
[AC_MSG_ERROR([Pthreads are required to build libgomp])])])
esac
+if test x$libgomp_use_pthreads != xno; then
+ AC_DEFINE(LIBGOMP_USE_PTHREADS, 1,
+ [Define to 1 if libgomp should use POSIX threads.])
+fi
+
m4_include([plugin/configfrag.ac])
# Check for functions needed.
diff --git a/libgomp/critical.c b/libgomp/critical.c
index 2b1f7f25013..cd520dbe5b3 100644
--- a/libgomp/critical.c
+++ b/libgomp/critical.c
@@ -115,33 +115,11 @@ GOMP_critical_name_end (void **pptr)
gomp_mutex_unlock (plock);
}
-/* This mutex is used when atomic operations don't exist for the target
- in the mode requested. The result is not globally atomic, but works so
- long as all parallel references are within #pragma omp atomic directives.
- According to responses received from omp@openmp.org, appears to be within
- spec. Which makes sense, since that's how several other compilers
- handle this situation as well. */
-
-static gomp_mutex_t atomic_lock;
-
-void
-GOMP_atomic_start (void)
-{
- gomp_mutex_lock (&atomic_lock);
-}
-
-void
-GOMP_atomic_end (void)
-{
- gomp_mutex_unlock (&atomic_lock);
-}
-
#if !GOMP_MUTEX_INIT_0
static void __attribute__((constructor))
initialize_critical (void)
{
gomp_mutex_init (&default_lock);
- gomp_mutex_init (&atomic_lock);
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_init (&create_lock_lock);
#endif
diff --git a/libgomp/env.c b/libgomp/env.c
index ac05c3b7b75..7ba7663da9a 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -23,8 +23,8 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-/* This file defines the OpenMP internal control variables, and arranges
- for them to be initialized from environment variables at startup. */
+/* This file arranges for OpenMP internal control variables to be initialized
+ from environment variables at startup. */
#include "libgomp.h"
#include "libgomp_f.h"
@@ -55,35 +55,6 @@
# define strtoull(ptr, eptr, base) strtoul (ptr, eptr, base)
#endif
-struct gomp_task_icv gomp_global_icv = {
- .nthreads_var = 1,
- .thread_limit_var = UINT_MAX,
- .run_sched_var = GFS_DYNAMIC,
- .run_sched_chunk_size = 1,
- .default_device_var = 0,
- .dyn_var = false,
- .nest_var = false,
- .bind_var = omp_proc_bind_false,
- .target_data = NULL
-};
-
-unsigned long gomp_max_active_levels_var = INT_MAX;
-bool gomp_cancel_var = false;
-int gomp_max_task_priority_var = 0;
-#ifndef HAVE_SYNC_BUILTINS
-gomp_mutex_t gomp_managed_threads_lock;
-#endif
-unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
-unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
-unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
-char *gomp_bind_var_list;
-unsigned long gomp_bind_var_list_len;
-void **gomp_places_list;
-unsigned long gomp_places_list_len;
-int gomp_debug_var;
-char *goacc_device_type;
-int goacc_device_num;
-
/* Parse the OMP_SCHEDULE environment variable. */
static void
@@ -1302,240 +1273,3 @@ initialize_env (void)
goacc_runtime_initialize ();
}
-
-
-/* The public OpenMP API routines that access these variables. */
-
-void
-omp_set_num_threads (int n)
-{
- struct gomp_task_icv *icv = gomp_icv (true);
- icv->nthreads_var = (n > 0 ? n : 1);
-}
-
-void
-omp_set_dynamic (int val)
-{
- struct gomp_task_icv *icv = gomp_icv (true);
- icv->dyn_var = val;
-}
-
-int
-omp_get_dynamic (void)
-{
- struct gomp_task_icv *icv = gomp_icv (false);
- return icv->dyn_var;
-}
-
-void
-omp_set_nested (int val)
-{
- struct gomp_task_icv *icv = gomp_icv (true);
- icv->nest_var = val;
-}
-
-int
-omp_get_nested (void)
-{
- struct gomp_task_icv *icv = gomp_icv (false);
- return icv->nest_var;
-}
-
-void
-omp_set_schedule (omp_sched_t kind, int chunk_size)
-{
- struct gomp_task_icv *icv = gomp_icv (true);
- switch (kind)
- {
- case omp_sched_static:
- if (chunk_size < 1)
- chunk_size = 0;
- icv->run_sched_chunk_size = chunk_size;
- break;
- case omp_sched_dynamic:
- case omp_sched_guided:
- if (chunk_size < 1)
- chunk_size = 1;
- icv->run_sched_chunk_size = chunk_size;
- break;
- case omp_sched_auto:
- break;
- default:
- return;
- }
- icv->run_sched_var = kind;
-}
-
-void
-omp_get_schedule (omp_sched_t *kind, int *chunk_size)
-{
- struct gomp_task_icv *icv = gomp_icv (false);
- *kind = icv->run_sched_var;
- *chunk_size = icv->run_sched_chunk_size;
-}
-
-int
-omp_get_max_threads (void)
-{
- struct gomp_task_icv *icv = gomp_icv (false);
- return icv->nthreads_var;
-}
-
-int
-omp_get_thread_limit (void)
-{
- struct gomp_task_icv *icv = gomp_icv (false);
- return icv->thread_limit_var > INT_MAX ? INT_MAX : icv->thread_limit_var;
-}
-
-void
-omp_set_max_active_levels (int max_levels)
-{
- if (max_levels >= 0)
- gomp_max_active_levels_var = max_levels;
-}
-
-int
-omp_get_max_active_levels (void)
-{
- return gomp_max_active_levels_var;
-}
-
-int
-omp_get_cancellation (void)
-{
- return gomp_cancel_var;
-}
-
-int
-omp_get_max_task_priority (void)
-{
- return gomp_max_task_priority_var;
-}
-
-omp_proc_bind_t
-omp_get_proc_bind (void)
-{
- struct gomp_task_icv *icv = gomp_icv (false);
- return icv->bind_var;
-}
-
-void
-omp_set_default_device (int device_num)
-{
- struct gomp_task_icv *icv = gomp_icv (true);
- icv->default_device_var = device_num >= 0 ? device_num : 0;
-}
-
-int
-omp_get_default_device (void)
-{
- struct gomp_task_icv *icv = gomp_icv (false);
- return icv->default_device_var;
-}
-
-int
-omp_get_num_devices (void)
-{
- return gomp_get_num_devices ();
-}
-
-int
-omp_get_num_teams (void)
-{
- /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */
- return 1;
-}
-
-int
-omp_get_team_num (void)
-{
- /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */
- return 0;
-}
-
-int
-omp_is_initial_device (void)
-{
- /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. */
- return 1;
-}
-
-int
-omp_get_initial_device (void)
-{
- return GOMP_DEVICE_HOST_FALLBACK;
-}
-
-int
-omp_get_num_places (void)
-{
- return gomp_places_list_len;
-}
-
-int
-omp_get_place_num (void)
-{
- if (gomp_places_list == NULL)
- return -1;
-
- struct gomp_thread *thr = gomp_thread ();
- if (thr->place == 0)
- gomp_init_affinity ();
-
- return (int) thr->place - 1;
-}
-
-int
-omp_get_partition_num_places (void)
-{
- if (gomp_places_list == NULL)
- return 0;
-
- struct gomp_thread *thr = gomp_thread ();
- if (thr->place == 0)
- gomp_init_affinity ();
-
- return thr->ts.place_partition_len;
-}
-
-void
-omp_get_partition_place_nums (int *place_nums)
-{
- if (gomp_places_list == NULL)
- return;
-
- struct gomp_thread *thr = gomp_thread ();
- if (thr->place == 0)
- gomp_init_affinity ();
-
- unsigned int i;
- for (i = 0; i < thr->ts.place_partition_len; i++)
- *place_nums++ = thr->ts.place_partition_off + i;
-}
-
-ialias (omp_set_dynamic)
-ialias (omp_set_nested)
-ialias (omp_set_num_threads)
-ialias (omp_get_dynamic)
-ialias (omp_get_nested)
-ialias (omp_set_schedule)
-ialias (omp_get_schedule)
-ialias (omp_get_max_threads)
-ialias (omp_get_thread_limit)
-ialias (omp_set_max_active_levels)
-ialias (omp_get_max_active_levels)
-ialias (omp_get_cancellation)
-ialias (omp_get_proc_bind)
-ialias (omp_set_default_device)
-ialias (omp_get_default_device)
-ialias (omp_get_num_devices)
-ialias (omp_get_num_teams)
-ialias (omp_get_team_num)
-ialias (omp_is_initial_device)
-ialias (omp_get_initial_device)
-ialias (omp_get_max_task_priority)
-ialias (omp_get_num_places)
-ialias (omp_get_place_num)
-ialias (omp_get_partition_num_places)
-ialias (omp_get_partition_place_nums)
diff --git a/libgomp/icv-device.c b/libgomp/icv-device.c
new file mode 100644
index 00000000000..9bbc0b9b767
--- /dev/null
+++ b/libgomp/icv-device.c
@@ -0,0 +1,77 @@
+/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file defines OpenMP API entry points that accelerator targets are
+ expected to replace. */
+
+#include "libgomp.h"
+
+void
+omp_set_default_device (int device_num)
+{
+ struct gomp_task_icv *icv = gomp_icv (true);
+ icv->default_device_var = device_num >= 0 ? device_num : 0;
+}
+
+int
+omp_get_default_device (void)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ return icv->default_device_var;
+}
+
+int
+omp_get_num_devices (void)
+{
+ return gomp_get_num_devices ();
+}
+
+int
+omp_get_num_teams (void)
+{
+ /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */
+ return 1;
+}
+
+int
+omp_get_team_num (void)
+{
+ /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */
+ return 0;
+}
+
+int
+omp_is_initial_device (void)
+{
+ /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. */
+ return 1;
+}
+
+ialias (omp_set_default_device)
+ialias (omp_get_default_device)
+ialias (omp_get_num_devices)
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
+ialias (omp_is_initial_device)
diff --git a/libgomp/icv.c b/libgomp/icv.c
new file mode 100644
index 00000000000..e58b961558e
--- /dev/null
+++ b/libgomp/icv.c
@@ -0,0 +1,248 @@
+/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file defines the OpenMP internal control variables and associated
+ OpenMP API entry points. */
+
+#include "libgomp.h"
+#include "gomp-constants.h"
+#include <limits.h>
+
+struct gomp_task_icv gomp_global_icv = {
+ .nthreads_var = 1,
+ .thread_limit_var = UINT_MAX,
+ .run_sched_var = GFS_DYNAMIC,
+ .run_sched_chunk_size = 1,
+ .default_device_var = 0,
+ .dyn_var = false,
+ .nest_var = false,
+ .bind_var = omp_proc_bind_false,
+ .target_data = NULL
+};
+
+unsigned long gomp_max_active_levels_var = INT_MAX;
+bool gomp_cancel_var = false;
+int gomp_max_task_priority_var = 0;
+#ifndef HAVE_SYNC_BUILTINS
+gomp_mutex_t gomp_managed_threads_lock;
+#endif
+unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
+char *gomp_bind_var_list;
+unsigned long gomp_bind_var_list_len;
+void **gomp_places_list;
+unsigned long gomp_places_list_len;
+int gomp_debug_var;
+unsigned int gomp_num_teams_var;
+char *goacc_device_type;
+int goacc_device_num;
+
+void
+omp_set_num_threads (int n)
+{
+ struct gomp_task_icv *icv = gomp_icv (true);
+ icv->nthreads_var = (n > 0 ? n : 1);
+}
+
+void
+omp_set_dynamic (int val)
+{
+ struct gomp_task_icv *icv = gomp_icv (true);
+ icv->dyn_var = val;
+}
+
+int
+omp_get_dynamic (void)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ return icv->dyn_var;
+}
+
+void
+omp_set_nested (int val)
+{
+ struct gomp_task_icv *icv = gomp_icv (true);
+ icv->nest_var = val;
+}
+
+int
+omp_get_nested (void)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ return icv->nest_var;
+}
+
+void
+omp_set_schedule (omp_sched_t kind, int chunk_size)
+{
+ struct gomp_task_icv *icv = gomp_icv (true);
+ switch (kind)
+ {
+ case omp_sched_static:
+ if (chunk_size < 1)
+ chunk_size = 0;
+ icv->run_sched_chunk_size = chunk_size;
+ break;
+ case omp_sched_dynamic:
+ case omp_sched_guided:
+ if (chunk_size < 1)
+ chunk_size = 1;
+ icv->run_sched_chunk_size = chunk_size;
+ break;
+ case omp_sched_auto:
+ break;
+ default:
+ return;
+ }
+ icv->run_sched_var = kind;
+}
+
+void
+omp_get_schedule (omp_sched_t *kind, int *chunk_size)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ *kind = icv->run_sched_var;
+ *chunk_size = icv->run_sched_chunk_size;
+}
+
+int
+omp_get_max_threads (void)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ return icv->nthreads_var;
+}
+
+int
+omp_get_thread_limit (void)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ return icv->thread_limit_var > INT_MAX ? INT_MAX : icv->thread_limit_var;
+}
+
+void
+omp_set_max_active_levels (int max_levels)
+{
+ if (max_levels >= 0)
+ gomp_max_active_levels_var = max_levels;
+}
+
+int
+omp_get_max_active_levels (void)
+{
+ return gomp_max_active_levels_var;
+}
+
+int
+omp_get_cancellation (void)
+{
+ return gomp_cancel_var;
+}
+
+int
+omp_get_max_task_priority (void)
+{
+ return gomp_max_task_priority_var;
+}
+
+omp_proc_bind_t
+omp_get_proc_bind (void)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ return icv->bind_var;
+}
+
+int
+omp_get_initial_device (void)
+{
+ return GOMP_DEVICE_HOST_FALLBACK;
+}
+
+int
+omp_get_num_places (void)
+{
+ return gomp_places_list_len;
+}
+
+int
+omp_get_place_num (void)
+{
+ if (gomp_places_list == NULL)
+ return -1;
+
+ struct gomp_thread *thr = gomp_thread ();
+ if (thr->place == 0)
+ gomp_init_affinity ();
+
+ return (int) thr->place - 1;
+}
+
+int
+omp_get_partition_num_places (void)
+{
+ if (gomp_places_list == NULL)
+ return 0;
+
+ struct gomp_thread *thr = gomp_thread ();
+ if (thr->place == 0)
+ gomp_init_affinity ();
+
+ return thr->ts.place_partition_len;
+}
+
+void
+omp_get_partition_place_nums (int *place_nums)
+{
+ if (gomp_places_list == NULL)
+ return;
+
+ struct gomp_thread *thr = gomp_thread ();
+ if (thr->place == 0)
+ gomp_init_affinity ();
+
+ unsigned int i;
+ for (i = 0; i < thr->ts.place_partition_len; i++)
+ *place_nums++ = thr->ts.place_partition_off + i;
+}
+
+ialias (omp_set_dynamic)
+ialias (omp_set_nested)
+ialias (omp_set_num_threads)
+ialias (omp_get_dynamic)
+ialias (omp_get_nested)
+ialias (omp_set_schedule)
+ialias (omp_get_schedule)
+ialias (omp_get_max_threads)
+ialias (omp_get_thread_limit)
+ialias (omp_set_max_active_levels)
+ialias (omp_get_max_active_levels)
+ialias (omp_get_cancellation)
+ialias (omp_get_proc_bind)
+ialias (omp_get_initial_device)
+ialias (omp_get_max_task_priority)
+ialias (omp_get_num_places)
+ialias (omp_get_place_num)
+ialias (omp_get_partition_num_places)
+ialias (omp_get_partition_place_nums)
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 7b2671ba49d..c86ac3d8778 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -45,7 +45,9 @@
#include "gstdint.h"
#include "libgomp-plugin.h"
+#ifdef HAVE_PTHREAD_H
#include <pthread.h>
+#endif
#include <stdbool.h>
#include <stdlib.h>
#include <stdarg.h>
@@ -122,6 +124,7 @@ struct htab;
#include "sem.h"
#include "mutex.h"
#include "bar.h"
+#include "simple-bar.h"
#include "ptrlock.h"
@@ -360,6 +363,7 @@ extern char *gomp_bind_var_list;
extern unsigned long gomp_bind_var_list_len;
extern void **gomp_places_list;
extern unsigned long gomp_places_list_len;
+extern unsigned int gomp_num_teams_var;
extern int gomp_debug_var;
extern int goacc_device_num;
extern char *goacc_device_type;
@@ -626,8 +630,8 @@ struct gomp_thread_pool
/* Number of threads running in this contention group. */
unsigned long threads_busy;
- /* This barrier holds and releases threads waiting in threads. */
- gomp_barrier_t threads_dock;
+ /* This barrier holds and releases threads waiting in thread pools. */
+ gomp_simple_barrier_t threads_dock;
};
enum gomp_cancel_kind
@@ -642,7 +646,15 @@ enum gomp_cancel_kind
/* ... and here is that TLS data. */
-#if defined HAVE_TLS || defined USE_EMUTLS
+#if defined __nvptx__
+extern struct gomp_thread *nvptx_thrs __attribute__((shared));
+static inline struct gomp_thread *gomp_thread (void)
+{
+ int tid;
+ asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+ return nvptx_thrs + tid;
+}
+#elif defined HAVE_TLS || defined USE_EMUTLS
extern __thread struct gomp_thread gomp_tls_data;
static inline struct gomp_thread *gomp_thread (void)
{
@@ -671,17 +683,21 @@ static inline struct gomp_task_icv *gomp_icv (bool write)
return &gomp_global_icv;
}
+#ifdef LIBGOMP_USE_PTHREADS
/* The attributes to be used during thread creation. */
extern pthread_attr_t gomp_thread_attr;
extern pthread_key_t gomp_thread_destructor;
+#endif
/* Function prototypes. */
/* affinity.c */
extern void gomp_init_affinity (void);
+#ifdef LIBGOMP_USE_PTHREADS
extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int);
+#endif
extern void **gomp_affinity_alloc (unsigned long, bool);
extern void gomp_affinity_init_place (void *);
extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long,
diff --git a/libgomp/lock.c b/libgomp/lock.c
new file mode 100644
index 00000000000..783bd77dd90
--- /dev/null
+++ b/libgomp/lock.c
@@ -0,0 +1,123 @@
+/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a generic implementation of the public OpenMP locking primitives in
+ terms of internal gomp_mutex_t. It is not meant to be compiled on its own.
+ It is #include'd from config/{linux,nvptx}/lock.c. */
+
+#include <string.h>
+#include "libgomp.h"
+
+/* The internal gomp_mutex_t and the external non-recursive omp_lock_t
+ have the same form. Re-use it. */
+
+void
+gomp_init_lock_30 (omp_lock_t *lock)
+{
+ gomp_mutex_init (lock);
+}
+
+void
+gomp_destroy_lock_30 (omp_lock_t *lock)
+{
+ gomp_mutex_destroy (lock);
+}
+
+void
+gomp_set_lock_30 (omp_lock_t *lock)
+{
+ gomp_mutex_lock (lock);
+}
+
+void
+gomp_unset_lock_30 (omp_lock_t *lock)
+{
+ gomp_mutex_unlock (lock);
+}
+
+int
+gomp_test_lock_30 (omp_lock_t *lock)
+{
+ int oldval = 0;
+
+ return __atomic_compare_exchange_n (lock, &oldval, 1, false,
+ MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
+}
+
+void
+gomp_init_nest_lock_30 (omp_nest_lock_t *lock)
+{
+ memset (lock, '\0', sizeof (*lock));
+}
+
+void
+gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock)
+{
+}
+
+void
+gomp_set_nest_lock_30 (omp_nest_lock_t *lock)
+{
+ void *me = gomp_icv (true);
+
+ if (lock->owner != me)
+ {
+ gomp_mutex_lock (&lock->lock);
+ lock->owner = me;
+ }
+
+ lock->count++;
+}
+
+void
+gomp_unset_nest_lock_30 (omp_nest_lock_t *lock)
+{
+ if (--lock->count == 0)
+ {
+ lock->owner = NULL;
+ gomp_mutex_unlock (&lock->lock);
+ }
+}
+
+int
+gomp_test_nest_lock_30 (omp_nest_lock_t *lock)
+{
+ void *me = gomp_icv (true);
+ int oldval;
+
+ if (lock->owner == me)
+ return ++lock->count;
+
+ oldval = 0;
+ if (__atomic_compare_exchange_n (&lock->lock, &oldval, 1, false,
+ MEMMODEL_ACQUIRE, MEMMODEL_RELAXED))
+ {
+ lock->owner = me;
+ lock->count = 1;
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 5ee350d4c1d..ca33c51db5a 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -41,6 +41,7 @@
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
+#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
@@ -274,6 +275,8 @@ struct targ_fn_descriptor
{
CUfunction fn;
const struct targ_fn_launch *launch;
+ int regs_per_thread;
+ int max_threads_per_block;
};
/* A loaded PTX image. */
@@ -307,8 +310,12 @@ struct ptx_device
bool overlap;
bool map;
bool concur;
- int mode;
bool mkern;
+ int mode;
+ int clock_khz;
+ int num_sms;
+ int regs_per_block;
+ int regs_per_sm;
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
@@ -658,6 +665,39 @@ nvptx_open_device (int n)
&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
ptx_dev->mkern = pi;
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
+ ptx_dev->clock_khz = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+ ptx_dev->num_sms = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
+ ptx_dev->regs_per_block = pi;
+
+ /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
+ in CUDA 6.0 and newer. */
+ r = cuDeviceGetAttribute (&pi, 82, dev);
+ /* Fallback: use limit of registers per block, which is usually equal. */
+ if (r == CUDA_ERROR_INVALID_VALUE)
+ pi = ptx_dev->regs_per_block;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
+ return NULL;
+ }
+ ptx_dev->regs_per_sm = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
+ if (pi != 32)
+ {
+ GOMP_PLUGIN_error ("Only warp size 32 is supported");
+ return NULL;
+ }
+
r = cuDeviceGetAttribute (&async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
if (r != CUDA_SUCCESS)
@@ -725,10 +765,8 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
CUjit_option opts[6];
void *optvals[6];
float elapsed = 0.0;
-#define LOGSIZE 8192
- char elog[LOGSIZE];
- char ilog[LOGSIZE];
- unsigned long logsize = LOGSIZE;
+ char elog[1024];
+ char ilog[16384];
CUlinkState linkstate;
CUresult r;
void *linkout;
@@ -741,13 +779,13 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
optvals[1] = &ilog[0];
opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
- optvals[2] = (void *) logsize;
+ optvals[2] = (void *) sizeof ilog;
opts[3] = CU_JIT_ERROR_LOG_BUFFER;
optvals[3] = &elog[0];
opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
- optvals[4] = (void *) logsize;
+ optvals[4] = (void *) sizeof elog;
opts[5] = CU_JIT_LOG_VERBOSE;
optvals[5] = (void *) 1;
@@ -1164,7 +1202,7 @@ nvptx_host2dev (void *d, const void *h, size_t s)
}
#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
@@ -1220,7 +1258,7 @@ nvptx_dev2host (void *h, const void *d, size_t s)
}
#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
@@ -1518,7 +1556,7 @@ GOMP_OFFLOAD_get_name (void)
unsigned int
GOMP_OFFLOAD_get_caps (void)
{
- return GOMP_OFFLOAD_CAP_OPENACC_200;
+ return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}
int
@@ -1588,6 +1626,23 @@ GOMP_OFFLOAD_version (void)
return GOMP_VERSION;
}
+/* Initialize __nvptx_clocktick, if present in MODULE. */
+
+static void
+nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
+{
+ CUdeviceptr dptr;
+ CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
+ if (r == CUDA_ERROR_NOT_FOUND)
+ return;
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+ double __nvptx_clocktick = 1e-3 / dev->clock_khz;
+ r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+}
+
/* Load the (partial) program described by TARGET_DATA to device
number ORD. Allocate and return TARGET_TABLE. */
@@ -1648,12 +1703,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
{
CUfunction function;
+ int nregs, mthrs;
CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
fn_descs[i].fn);
+ CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
+ CU_FUNC_ATTRIBUTE_NUM_REGS, function);
+ CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
targ_fns->fn = function;
targ_fns->launch = &fn_descs[i];
+ targ_fns->regs_per_thread = nregs;
+ targ_fns->max_threads_per_block = mthrs;
targ_tbl->start = (uintptr_t) targ_fns;
targ_tbl->end = targ_tbl->start + 1;
@@ -1671,6 +1733,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
targ_tbl->end = targ_tbl->start + bytes;
}
+ nvptx_set_clocktick (module, dev);
+
return fn_entries + var_entries;
}
@@ -1736,6 +1800,15 @@ GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
&& nvptx_host2dev (dst, src, n));
}
+bool
+GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
+{
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+ CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+ ptx_dev->null_stream->stream);
+ return true;
+}
+
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
@@ -1857,3 +1930,123 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
return nvptx_set_cuda_stream (async, stream);
}
+
+/* Adjust launch dimensions: pick good values for number of blocks and warps
+ and ensure that number of warps does not exceed CUDA limits as well as GCC's
+ own limits. */
+
+static void
+nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
+ struct ptx_device *ptx_dev,
+ int *teams_p, int *threads_p)
+{
+ int max_warps_block = fn->max_threads_per_block / 32;
+ /* Maximum 32 warps per block is an implementation limit in NVPTX backend
+ and libgcc, which matches documented limit of all GPUs as of 2015. */
+ if (max_warps_block > 32)
+ max_warps_block = 32;
+ if (*threads_p <= 0)
+ *threads_p = 8;
+ if (*threads_p > max_warps_block)
+ *threads_p = max_warps_block;
+
+ int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
+ /* This is an estimate of how many blocks the device can host simultaneously.
+ Actual limit, which may be lower, can be queried with "occupancy control"
+ driver interface (since CUDA 6.0). */
+ int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
+ if (*teams_p <= 0 || *teams_p > max_blocks)
+ *teams_p = max_blocks;
+}
+
+/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
+ target regions. */
+
+static size_t
+nvptx_stacks_size ()
+{
+ return 128 * 1024;
+}
+
+/* Return contiguous storage for NUM stacks, each SIZE bytes. */
+
+static void *
+nvptx_stacks_alloc (size_t size, int num)
+{
+ CUdeviceptr stacks;
+ CUresult r = cuMemAlloc (&stacks, size * num);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
+ return (void *) stacks;
+}
+
+/* Release storage previously allocated by nvptx_stacks_alloc. */
+
+static void
+nvptx_stacks_free (void *p, int num)
+{
+ CUresult r = cuMemFree ((CUdeviceptr) p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+}
+
+void
+GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
+{
+ CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
+ CUresult r;
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+ const char *maybe_abort_msg = "(perhaps abort was called)";
+ int teams = 0, threads = 0;
+
+ if (!args)
+ GOMP_PLUGIN_fatal ("No target arguments provided");
+ while (*args)
+ {
+ intptr_t id = (intptr_t) *args++, val;
+ if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+ val = (intptr_t) *args++;
+ else
+ val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+ if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
+ continue;
+ val = val > INT_MAX ? INT_MAX : val;
+ id &= GOMP_TARGET_ARG_ID_MASK;
+ if (id == GOMP_TARGET_ARG_NUM_TEAMS)
+ teams = val;
+ else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
+ threads = val;
+ }
+ nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
+
+ size_t stack_size = nvptx_stacks_size ();
+ void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+ void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
+ size_t fn_args_size = sizeof fn_args;
+ void *config[] = {
+ CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
+ CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
+ CU_LAUNCH_PARAM_END
+ };
+ r = cuLaunchKernel (function,
+ teams, 1, 1,
+ 32, threads, 1,
+ 0, ptx_dev->null_stream->stream, NULL, config);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+
+ r = cuCtxSynchronize ();
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+ nvptx_stacks_free (stacks, teams * threads);
+}
+
+void
+GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
+ void *async_data)
+{
+ GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
+}
diff --git a/libgomp/team.c b/libgomp/team.c
index de5b915f78b..e3013451662 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -31,6 +31,7 @@
#include <stdlib.h>
#include <string.h>
+#ifdef LIBGOMP_USE_PTHREADS
/* This attribute contains PTHREAD_CREATE_DETACHED. */
pthread_attr_t gomp_thread_attr;
@@ -110,7 +111,7 @@ gomp_thread_start (void *xdata)
{
pool->threads[thr->ts.team_id] = thr;
- gomp_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock);
do
{
struct gomp_team *team = thr->ts.team;
@@ -120,7 +121,7 @@ gomp_thread_start (void *xdata)
gomp_team_barrier_wait_final (&team->barrier);
gomp_finish_task (task);
- gomp_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock);
local_fn = thr->fn;
local_data = thr->data;
@@ -134,6 +135,7 @@ gomp_thread_start (void *xdata)
thr->task = NULL;
return NULL;
}
+#endif
static inline struct gomp_team *
get_last_team (unsigned nthreads)
@@ -224,11 +226,17 @@ gomp_free_pool_helper (void *thread_pool)
struct gomp_thread *thr = gomp_thread ();
struct gomp_thread_pool *pool
= (struct gomp_thread_pool *) thread_pool;
- gomp_barrier_wait_last (&pool->threads_dock);
+ gomp_simple_barrier_wait_last (&pool->threads_dock);
gomp_sem_destroy (&thr->release);
thr->thread_pool = NULL;
thr->task = NULL;
+#ifdef LIBGOMP_USE_PTHREADS
pthread_exit (NULL);
+#elif defined(__nvptx__)
+ asm ("exit;");
+#else
+#error gomp_free_pool_helper must terminate the thread
+#endif
}
/* Free a thread pool and release its threads. */
@@ -250,12 +258,12 @@ gomp_free_thread (void *arg __attribute__((unused)))
nthr->data = pool;
}
/* This barrier undocks threads docked on pool->threads_dock. */
- gomp_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock);
/* And this waits till all threads have called gomp_barrier_wait_last
in gomp_free_pool_helper. */
- gomp_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock);
/* Now it is safe to destroy the barrier and free the pool. */
- gomp_barrier_destroy (&pool->threads_dock);
+ gomp_simple_barrier_destroy (&pool->threads_dock);
#ifdef HAVE_SYNC_BUILTINS
__sync_fetch_and_add (&gomp_managed_threads,
@@ -266,10 +274,12 @@ gomp_free_thread (void *arg __attribute__((unused)))
gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
}
- free (pool->threads);
if (pool->last_team)
free_team (pool->last_team);
+#ifndef __nvptx__
+ free (pool->threads);
free (pool);
+#endif
thr->thread_pool = NULL;
}
if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
@@ -284,6 +294,7 @@ gomp_free_thread (void *arg __attribute__((unused)))
/* Launch a team. */
+#ifdef LIBGOMP_USE_PTHREADS
void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
unsigned flags, struct gomp_team *team)
@@ -429,7 +440,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
else if (old_threads_used == 0)
{
n = 0;
- gomp_barrier_init (&pool->threads_dock, nthreads);
+ gomp_simple_barrier_init (&pool->threads_dock, nthreads);
}
else
{
@@ -437,7 +448,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
/* Increase the barrier threshold to make sure all new
threads arrive before the team is released. */
- gomp_barrier_reinit (&pool->threads_dock, nthreads);
+ gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
}
/* Not true yet, but soon will be. We're going to release all
@@ -670,8 +681,8 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
threads and all the threads we're going to let die
arrive before the team is released. */
if (affinity_count)
- gomp_barrier_reinit (&pool->threads_dock,
- nthreads + affinity_count);
+ gomp_simple_barrier_reinit (&pool->threads_dock,
+ nthreads + affinity_count);
}
}
@@ -812,7 +823,10 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
pthread_attr_destroy (&thread_attr);
do_release:
- gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock);
+ if (nested)
+ gomp_barrier_wait (&team->barrier);
+ else
+ gomp_simple_barrier_wait (&pool->threads_dock);
/* Decrease the barrier threshold to match the number of threads
that should arrive back at the end of this team. The extra
@@ -830,7 +844,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
if (affinity_count)
diff = -affinity_count;
- gomp_barrier_reinit (&pool->threads_dock, nthreads);
+ gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
#ifdef HAVE_SYNC_BUILTINS
__sync_fetch_and_add (&gomp_managed_threads, diff);
@@ -844,6 +858,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
&& team->prev_ts.place_partition_len > 64)
free (affinity_thr);
}
+#endif
/* Terminate the current team. This is only to be called by the master
@@ -919,6 +934,7 @@ gomp_team_end (void)
}
}
+#ifdef LIBGOMP_USE_PTHREADS
/* Constructors for this file. */
@@ -943,6 +959,7 @@ team_destructor (void)
crashes. */
pthread_key_delete (gomp_thread_destructor);
}
+#endif
struct gomp_task_icv *
gomp_new_icv (void)
@@ -951,6 +968,8 @@ gomp_new_icv (void)
struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
gomp_init_task (task, NULL, &gomp_global_icv);
thr->task = task;
+#ifdef LIBGOMP_USE_PTHREADS
pthread_setspecific (gomp_thread_destructor, thr);
+#endif
return &task->icv;
}
diff --git a/libgomp/testsuite/libgomp.fortran/fortran.exp b/libgomp/testsuite/libgomp.fortran/fortran.exp
index 9e6b6437751..d848ed4d47f 100644
--- a/libgomp/testsuite/libgomp.fortran/fortran.exp
+++ b/libgomp/testsuite/libgomp.fortran/fortran.exp
@@ -7,7 +7,7 @@ global ALWAYS_CFLAGS
set shlib_ext [get_shlib_extension]
set lang_library_path "../libgfortran/.libs"
-set lang_link_flags "-lgfortran"
+set lang_link_flags "-lgfortran -foffload=-lgfortran"
if [info exists lang_include_flags] then {
unset lang_include_flags
}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
index 2d6b647af22..663c9323b72 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
+++ b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
@@ -9,7 +9,7 @@ global ALWAYS_CFLAGS
set shlib_ext [get_shlib_extension]
set lang_library_path "../libgfortran/.libs"
-set lang_link_flags "-lgfortran"
+set lang_link_flags "-lgfortran -foffload=-lgfortran"
if [info exists lang_include_flags] then {
unset lang_include_flags
}