diff options
79 files changed, 3983 insertions, 587 deletions
diff --git a/gcc/ChangeLog.gomp-nvptx b/gcc/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..8b555f25414 --- /dev/null +++ b/gcc/ChangeLog.gomp-nvptx @@ -0,0 +1,269 @@ +2016-11-08 Alexander Monakov <amonakov@ispras.ru> + + * omp-low.c (lower_lastprivate_clauses): Add missing call to + unshare_expr. + +2016-07-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_file_end): Do emit element count in + external declaration of __nvptx_stacks. + +2016-06-02 Alexander Monakov <amonakov@ispras.ru> + + * doc/invoke.texi (msoft-stack): Reword, fix errors. + +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_declare_function_name): Expand comments. + (nvptx_file_end): Do not emit element count in external declaration of + __nvptx_stacks. + +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * doc/invoke.texi (msoft-stack): Rewrite. + +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_encode_section_info): Diagnose explicit + static initialization of variables in .shared memory. + (nvptx_handle_shared_attribute): Reword diagnostic message. + +2016-04-19 Alexander Monakov <amonakov@ispras.ru> + + * doc/extend.texi (Nvidia PTX Variable Attributes): New section. + +2016-04-18 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (write_omp_entry): Adjust. + (nvptx_declare_function_name): Adjust. + +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.h (STACK_SIZE_MODE): Define. + +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Emit + 'mul.lo.u32' instead of 'mul.u32' for 32-bit ABI target. + (nvptx_declare_function_name): Ditto. + +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use + POINTER_SIZE instead of BITS_PER_WORD. + (nvptx_declare_function_name): Ditto. + (nvptx_output_return): Ditto. + (nvptx_file_end): Ditto. 
+ +2016-03-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_option_override): Remove custom handling + of debug info options. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (write_omp_entry): Expand entry code to + initialize __nvptx_uni and __nvptx_stacks (based on pointer to storage + allocated by the libgomp plugin). + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_encode_section_info): Handle "shared" + attribute. + (nvptx_handle_shared_attribute): New. Use it... + (nvptx_attribute_table): ... here (new entry). + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_declare_function_name): Fix warning. + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * internal-fn.c (expand_GOMP_SIMT_LANE): Update. + (expand_GOMP_SIMT_LAST_LANE): New. + (expand_GOMP_SIMT_ORDERED_PRED): New. + (expand_GOMP_SIMT_VOTE_ANY): New. + (expand_GOMP_SIMT_XCHG_BFLY): New. + (expand_GOMP_SIMT_XCHG_IDX): New. + * internal-fn.def (GOMP_SIMT_LAST_LANE): New. + (GOMP_SIMT_ORDERED_PRED): New. + (GOMP_SIMT_VOTE_ANY): New. + (GOMP_SIMT_XCHG_BFLY): New. + (GOMP_SIMT_XCHG_IDX): New. + * omp-low.c (omp_maybe_offloaded_ctx): New, outlined from... + (create_omp_child_function): ...here. Simplify. + (omp_max_simt_vf): New. Use it... + (omp_max_vf): ...here. + (lower_rec_input_clauses): Add reduction lowering for SIMT execution. + (lower_lastprivate_clauses): Likewise, for lastprivate lowering. + (lower_omp_ordered): Likewise, for "ordered" lowering. + (expand_omp_simd): Update SIMT transforms. + (execute_omp_device_lower): Update. Fold SIMD ifns on SIMT targets. + +2016-01-17 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx-protos.h (nvptx_shuffle_kind): Move enum + declaration from nvptx.c. + (nvptx_gen_shuffle): Declare. + * config/nvptx/nvptx.c (nvptx_shuffle_kind): Moved to nvptx-protos.h. + (nvptx_gen_shuffle): No longer static. 
+ * config/nvptx/nvptx.md (UNSPEC_VOTE_BALLOT): New unspec. + (UNSPEC_LANEID): Ditto. + (UNSPECV_NOUNROLL): Ditto. + (nvptx_vote_ballot): New pattern. + (omp_simt_lane): Ditto. + (nvptx_nounroll): Ditto. + (omp_simt_last_lane): Ditto. + (omp_simt_ordered): Ditto. + (omp_simt_vote_any): Ditto. + (omp_simt_xchg_bfly): Ditto. + (omp_simt_xchg_idx): Ditto. + * target-insns.def (omp_simt_lane): New. + (omp_simt_last_lane): New. + (omp_simt_ordered): New. + (omp_simt_vote_any): New. + (omp_simt_xchg_bfly): New. + (omp_simt_xchg_idx): New. + +2015-12-16 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (write_omp_entry): Work around failure on sm_5x + devices by loading ...$impl function address outside of call sequence. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_declare_function_name): Emit %outargs + using .local %outargs_ar only if not TARGET_SOFT_STACK. Emit %outargs + under TARGET_SOFT_STACK by offsetting from %frame. + (nvptx_get_drap_rtx): Return %argp as the DRAP if needed. + * config/nvptx/nvptx.md (nvptx_register_operand): Allow %outargs under + TARGET_SOFT_STACK. + (nvptx_nonimmediate_operand): Ditto. + (allocate_stack): Implement for TARGET_SOFT_STACK. Remove unused code. + (allocate_stack_<mode>): Remove unused pattern. + (set_softstack_insn): New pattern. + (restore_stack_block): Handle for TARGET_SOFT_STACK. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_record_offload_symbol): Allow NULL attrs + for OpenMP offloading. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * internal-fn.c (expand_GOMP_SIMT_LANE): New. + (expand_GOMP_SIMT_VF): New. + * internal-fn.def (GOMP_SIMT_LANE): New. + (GOMP_SIMT_VF): New. + * omp-low.c (expand_omp_simd): Do SIMT transforms. + (pass_data_lower_omp): Add PROP_gimple_lomp_dev. + (execute_omp_device_lower): New. + (pass_data_omp_device_lower): New. + (pass_omp_device_lower): New pass. + (make_pass_omp_device_lower): New. 
+ * passes.def (pass_omp_device_lower): Position new pass. + * tree-pass.h (PROP_gimple_lomp_dev): Define. + (make_pass_omp_device_lower): Declare. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/mkoffload.c (main): Check that either OpenACC or OpenMP + is selected. Pass -mgomp to offload compiler in OpenMP case. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_simt_vf): New. + (TARGET_SIMT_VF): Define. + * doc/tm.texi: Regenerate. + * doc/tm.texi.in: (TARGET_SIMT_VF): New hook. + * target.def: Define it. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_option_override): Handle TARGET_GOMP. + * config/nvptx/nvptx.opt (mgomp): New option. + * config/nvptx/t-nvptx (MULTILIB_OPTIONS): New. + * doc/invoke.texi (mgomp): Document. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_name_replacement): Rewrite. Add + __nvptx_real_malloc -> malloc and __nvptx_real_free -> free + replacements. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (need_unisimt_decl): New variable. Set it... + (nvptx_init_unisimt_predicate): ...here (new function) and use it... + (nvptx_file_end): ...here to emit declaration of __nvptx_uni array. + (nvptx_declare_function_name): Call nvptx_init_unisimt_predicate. + (nvptx_get_unisimt_master): New helper function. + (nvptx_get_unisimt_predicate): Ditto. + (nvptx_call_insn_is_syscall_p): Ditto. + (nvptx_unisimt_handle_set): Ditto. + (nvptx_reorg_uniform_simt): New. Transform code for -muniform-simt. + (nvptx_get_axis_predicate): New helper function, factored out from... + (nvptx_single): ...here. + (nvptx_reorg): Call nvptx_reorg_uniform_simt. + * config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define + __nvptx_unisimt__ when -muniform-simt option is active. + (struct machine_function): Add unisimt_master, unisimt_predicate + rtx fields. 
+ * config/nvptx/nvptx.md (divergent): New attribute. + (atomic_compare_and_swap<mode>_1): Mark as divergent. + (atomic_exchange<mode>): Ditto. + (atomic_fetch_add<mode>): Ditto. + (atomic_fetch_addsf): Ditto. + (atomic_fetch_<logic><mode>): Ditto. + * config/nvptx/nvptx.opt (muniform-simt): New option. + * doc/invoke.texi (-muniform-simt): Document. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c (nvptx_output_call_insn): Handle COND_EXEC + patterns. Emit instruction predicate. + (nvptx_print_operand): Unbreak handling of instruction predicates. + * config/nvptx/nvptx.md (predicable): New attribute. Generate + predicated forms via define_cond_exec. + (br_true): Mark as not predicable. + (br_false): Ditto. + (br_true_uni): Ditto. + (br_false_uni): Ditto. + (return): Ditto. + (trap_if_true): Ditto. + (trap_if_false): Ditto. + (nvptx_fork): Ditto. + (nvptx_forked): Ditto. + (nvptx_joining): Ditto. + (nvptx_join): Ditto. + (nvptx_barsync): Ditto. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c: (need_softstack_decl): New variable. + (nvptx_declare_function_name): Handle TARGET_SOFT_STACK. + (nvptx_output_return): Emit stack restore if needed. + (nvptx_file_end): Handle need_softstack_decl. + * config/nvptx/nvptx.h: (TARGET_CPU_CPP_BUILTINS): Define + __nvptx_softstack__ when -msoft-stack is active. + (struct machine_function): New bool field using_softstack. + * config/nvptx/nvptx.opt: (msoft-stack): New option. + * doc/invoke.texi (msoft-stack): Document. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/nvptx.c: (write_omp_entry): New. Use it... + (nvptx_declare_function_name): ...here to emit pointers for libgomp. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * omp-low.c (create_omp_child_function): Set "omp target entrypoint", + "omp acc target entrypoint" or "omp declare target" attribute based on + is_gimple_omp_offloaded and is_gimple_omp_oacc. 
+ * config/nvptx/nvptx.c (write_as_kernel): Test OpenACC-specific + attribute "omp acc target entrypoint". Add a comment about the OpenMP + attribute handling. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/mkoffload.c (main): Allow -fopenmp. diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c index c8eed451078..e99ef373770 100644 --- a/gcc/config/nvptx/mkoffload.c +++ b/gcc/config/nvptx/mkoffload.c @@ -460,6 +460,7 @@ main (int argc, char **argv) /* Scan the argument vector. */ bool fopenmp = false; + bool fopenacc = false; for (int i = 1; i < argc; i++) { #define STR "-foffload-abi=" @@ -476,11 +477,15 @@ main (int argc, char **argv) #undef STR else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; + else if (strcmp (argv[i], "-fopenacc") == 0) + fopenacc = true; else if (strcmp (argv[i], "-save-temps") == 0) save_temps = true; else if (strcmp (argv[i], "-v") == 0) verbose = true; } + if (!(fopenacc ^ fopenmp)) + fatal_error (input_location, "either -fopenacc or -fopenmp must be set"); struct obstack argv_obstack; obstack_init (&argv_obstack); @@ -501,6 +506,8 @@ main (int argc, char **argv) default: gcc_unreachable (); } + if (fopenmp) + obstack_ptr_grow (&argv_obstack, "-mgomp"); for (int ix = 1; ix != argc; ix++) { @@ -517,8 +524,8 @@ main (int argc, char **argv) fatal_error (input_location, "cannot open '%s'", ptx_cfile_name); /* PR libgomp/65099: Currently, we only support offloading in 64-bit - configurations. PR target/67822: OpenMP offloading to nvptx fails. */ - if (offload_abi == OFFLOAD_ABI_LP64 && !fopenmp) + configurations. 
*/ + if (offload_abi == OFFLOAD_ABI_LP64) { ptx_name = make_temp_file (".mkoffload"); obstack_ptr_grow (&argv_obstack, "-o"); diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index ec4588e6dc0..331ec0af6bf 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -21,6 +21,16 @@ #ifndef GCC_NVPTX_PROTOS_H #define GCC_NVPTX_PROTOS_H +/* The kind of shuffe instruction. */ +enum nvptx_shuffle_kind +{ + SHUFFLE_UP, + SHUFFLE_DOWN, + SHUFFLE_BFLY, + SHUFFLE_IDX, + SHUFFLE_MAX +}; + extern void nvptx_declare_function_name (FILE *, const char *, const_tree decl); extern void nvptx_declare_object_name (FILE *file, const char *name, const_tree decl); @@ -36,10 +46,12 @@ extern void nvptx_register_pragmas (void); extern void nvptx_expand_oacc_fork (unsigned); extern void nvptx_expand_oacc_join (unsigned); extern void nvptx_expand_call (rtx, rtx); +extern rtx nvptx_gen_shuffle (rtx, rtx, rtx, nvptx_shuffle_kind); extern rtx nvptx_expand_compare (rtx); extern const char *nvptx_ptx_type_from_mode (machine_mode, bool); extern const char *nvptx_output_mov_insn (rtx, rtx); extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx); extern const char *nvptx_output_return (void); +extern const char *nvptx_output_set_softstack (unsigned); #endif #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 782bbdecb37..68bf0a89a81 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -72,16 +72,6 @@ /* This file should be included last. */ #include "target-def.h" -/* The kind of shuffe instruction. */ -enum nvptx_shuffle_kind -{ - SHUFFLE_UP, - SHUFFLE_DOWN, - SHUFFLE_BFLY, - SHUFFLE_IDX, - SHUFFLE_MAX -}; - /* The various PTX memory areas an object might reside in. */ enum nvptx_data_area { @@ -141,6 +131,12 @@ static GTY(()) rtx worker_red_sym; /* Global lock variable, needed for 128bit worker & gang reductions. 
*/ static GTY(()) tree global_lock_var; +/* True if any function references __nvptx_stacks. */ +static bool need_softstack_decl; + +/* True if any function references __nvptx_uni. */ +static bool need_unisimt_decl; + /* Allocate a new, cleared machine_function structure. */ static struct machine_function * @@ -151,6 +147,16 @@ nvptx_init_machine_status (void) return p; } +/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL) + and -fopenacc is also enabled. */ + +static void +diagnose_openacc_conflict (bool optval, const char *optname) +{ + if (flag_openacc && optval) + error ("option %s is not supported together with -fopenacc", optname); +} + /* Implement TARGET_OPTION_OVERRIDE. */ static void @@ -188,6 +194,13 @@ nvptx_option_override (void) worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + + diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); + diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); + diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); + + if (TARGET_GOMP) + target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT; } /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to @@ -238,9 +251,17 @@ nvptx_encode_section_info (tree decl, rtx rtl, int first) if (TREE_CONSTANT (decl)) area = DATA_AREA_CONST; else if (TREE_CODE (decl) == VAR_DECL) - /* TODO: This would be a good place to check for a .shared or - other section name. */ - area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL; + { + if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl))) + { + area = DATA_AREA_SHARED; + if (DECL_INITIAL (decl)) + error ("static initialization of variable %q+D in %<.shared%>" + " memory is not supported", decl); + } + else + area = TREE_READONLY (decl) ? 
DATA_AREA_CONST : DATA_AREA_GLOBAL; + } SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area); } @@ -718,7 +739,10 @@ static bool write_as_kernel (tree attrs) { return (lookup_attribute ("kernel", attrs) != NULL_TREE - || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE); + || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE + && lookup_attribute ("oacc function", attrs) != NULL_TREE)); + /* For OpenMP target regions, the corresponding kernel entry is emitted from + write_omp_entry as a separate function. */ } /* Emit a linker marker for a function decl or defn. */ @@ -973,6 +997,67 @@ init_frame (FILE *file, int regno, unsigned align, unsigned size) POINTER_SIZE, reg_names[regno], reg_names[regno]); } +/* Emit soft stack frame setup sequence. */ + +static void +init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size) +{ + /* Maintain 64-bit stack alignment. */ + unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT; + size = ROUND_UP (size, keep_align); + int bits = POINTER_SIZE; + const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; + const char *reg_frame = reg_names[FRAME_POINTER_REGNUM]; + const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM]; + const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM]; + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack); + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame); + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot); + fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev); + fprintf (file, "\t{\n"); + fprintf (file, "\t\t.reg.u32 %%fstmp0;\n"); + fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits); + fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits); + fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n"); + fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n", + bits == 64 ? ".wide" : ".lo", bits / 8); + fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits); + + /* Initialize %sspslot = &__nvptx_stacks[tid.y]. 
*/ + fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot); + + /* Initialize %sspprev = __nvptx_stacks[tid.y]. */ + fprintf (file, "\t\tld.shared.u%d %s, [%s];\n", + bits, reg_sspprev, reg_sspslot); + + /* Initialize %frame = %sspprev - size. */ + fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", + bits, reg_frame, reg_sspprev, size); + + /* Apply alignment, if larger than 64. */ + if (alignment > keep_align) + fprintf (file, "\t\tand.b%d %s, %s, %d;\n", + bits, reg_frame, reg_frame, -alignment); + + size = crtl->outgoing_args_size; + gcc_assert (size % keep_align == 0); + + /* Initialize %stack. */ + fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", + bits, reg_stack, reg_frame, size); + + /* Usually 'crtl->is_leaf' is computed during register allocator + initialization, which is not done on NVPTX. Compute it now. */ + gcc_assert (!crtl->is_leaf); + crtl->is_leaf = leaf_function_p (); + if (!crtl->is_leaf) + fprintf (file, "\t\tst.shared.u%d [%s], %s;\n", + bits, reg_sspslot, reg_stack); + fprintf (file, "\t}\n"); + cfun->machine->has_softstack = true; + need_softstack_decl = true; +} + /* Emit code to initialize the REGNO predicate register to indicate whether we are not lane zero on the NAME axis. */ @@ -986,6 +1071,97 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name) fprintf (file, "\t}\n"); } +/* Emit code to initialize predicate and master lane index registers for + -muniform-simt code generation variant. 
*/ + +static void +nvptx_init_unisimt_predicate (FILE *file) +{ + int bits = POINTER_SIZE; + int master = REGNO (cfun->machine->unisimt_master); + int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t{\n"); + fprintf (file, "\t\t.reg.u32 %%ustmp0;\n"); + fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits); + fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits); + fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); + fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", + bits == 64 ? ".wide" : ".lo"); + fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits); + fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits); + fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master); + fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n"); + /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */ + fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); + /* Compute predicate as 'tid.x == master'. */ + fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); + fprintf (file, "\t}\n"); + need_unisimt_decl = true; +} + +/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region: + + extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg); + void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize) + { + __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1); + __nvptx_uni[tid.y] = 0; + gomp_nvptx_main (ORIG, arg); + } + ORIG itself should not be emitted as a PTX .entry function. 
*/ + +static void +write_omp_entry (FILE *file, const char *name, const char *orig) +{ + static bool gomp_nvptx_main_declared; + if (!gomp_nvptx_main_declared) + { + gomp_nvptx_main_declared = true; + write_fn_marker (func_decls, false, true, "gomp_nvptx_main"); + func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE + << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n"; + } +#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\ + (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\ +{\n\ + .reg.u32 %r<3>;\n\ + .reg.u" PS " %R<4>;\n\ + mov.u32 %r0, %tid.y;\n\ + mov.u32 %r1, %ntid.y;\n\ + mov.u32 %r2, %ctaid.x;\n\ + cvt.u" PS ".u32 %R1, %r0;\n\ + " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\ + mov.u" PS " %R0, __nvptx_stacks;\n\ + " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\ + ld.param.u" PS " %R2, [%stack];\n\ + ld.param.u" PS " %R3, [%sz];\n\ + add.u" PS " %R2, %R2, %R3;\n\ + mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\ + st.shared.u" PS " [%R0], %R2;\n\ + mov.u" PS " %R0, __nvptx_uni;\n\ + " MAD_PS_32 " %R0, %r0, 4, %R0;\n\ + mov.u32 %r0, 0;\n\ + st.shared.u32 [%R0], %r0;\n\ + mov.u" PS " %R0, \0;\n\ + ld.param.u" PS " %R1, [%arg];\n\ + {\n\ + .param.u" PS " %P<2>;\n\ + st.param.u" PS " [%P0], %R0;\n\ + st.param.u" PS " [%P1], %R1;\n\ + call.uni gomp_nvptx_main, (%P0, %P1);\n\ + }\n\ + ret.uni;\n\ +}\n" + static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32"); + static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 "); +#undef ENTRY_TEMPLATE + const char *entry_1 = TARGET_ABI64 ? entry64 : entry32; + /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */ + const char *entry_2 = entry_1 + strlen (entry64) + 1; + fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2); + need_softstack_decl = need_unisimt_decl = true; +} + /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx function, including local var decls and copies from the arguments to local regs. 
*/ @@ -997,6 +1173,14 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) tree result_type = TREE_TYPE (fntype); int argno = 0; + if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl)) + && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl))) + { + char *buf = (char *) alloca (strlen (name) + sizeof ("$impl")); + sprintf (buf, "%s$impl", name); + write_omp_entry (file, name, buf); + name = buf; + } /* We construct the initial part of the function into a string stream, in order to share the prototype writing code. */ std::stringstream s; @@ -1034,19 +1218,24 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) fprintf (file, "%s", s.str().c_str()); - /* Declare a local var for outgoing varargs. */ - if (cfun->machine->has_varadic) - init_frame (file, STACK_POINTER_REGNUM, - UNITS_PER_WORD, crtl->outgoing_args_size); - - /* Declare a local variable for the frame. Force its size to be - DImode-compatible. */ HOST_WIDE_INT sz = get_frame_size (); - if (sz || cfun->machine->has_chain) - init_frame (file, FRAME_POINTER_REGNUM, - crtl->stack_alignment_needed / BITS_PER_UNIT, - (sz + GET_MODE_SIZE (DImode) - 1) - & ~(HOST_WIDE_INT)(GET_MODE_SIZE (DImode) - 1)); + bool need_frameptr = sz || cfun->machine->has_chain; + int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; + if (!TARGET_SOFT_STACK) + { + /* Declare a local var for outgoing varargs. */ + if (cfun->machine->has_varadic) + init_frame (file, STACK_POINTER_REGNUM, + UNITS_PER_WORD, crtl->outgoing_args_size); + + /* Declare a local variable for the frame. Force its size to be + DImode-compatible. */ + if (need_frameptr) + init_frame (file, FRAME_POINTER_REGNUM, alignment, + ROUND_UP (sz, GET_MODE_SIZE (DImode))); + } + else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca) + init_softstack_frame (file, alignment, sz); /* Declare the pseudos we have as ptx registers. 
*/ int maxregs = max_reg_num (); @@ -1072,8 +1261,25 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) if (cfun->machine->axis_predicate[1]) nvptx_init_axis_predicate (file, REGNO (cfun->machine->axis_predicate[1]), "x"); + if (cfun->machine->unisimt_predicate) + nvptx_init_unisimt_predicate (file); } +/* Output instruction that sets soft stack pointer in shared memory to the + value in register given by SRC_REGNO. */ + +const char * +nvptx_output_set_softstack (unsigned src_regno) +{ + if (cfun->machine->has_softstack && !crtl->is_leaf) + { + fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ", + POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]); + output_reg (asm_out_file, src_regno, VOIDmode); + fprintf (asm_out_file, ";\n"); + } + return ""; +} /* Output a return instruction. Also copy the return value to its outgoing location. */ @@ -1113,6 +1319,8 @@ nvptx_function_ok_for_sibcall (tree, tree) static rtx nvptx_get_drap_rtx (void) { + if (TARGET_SOFT_STACK && stack_realign_drap) + return arg_pointer_rtx; return NULL_RTX; } @@ -1311,7 +1519,7 @@ nvptx_gen_pack (rtx dst, rtx src0, rtx src1) /* Generate an instruction or sequence to broadcast register REG across the vectors of a single warp. */ -static rtx +rtx nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) { rtx res; @@ -1833,6 +2041,8 @@ nvptx_output_mov_insn (rtx dst, rtx src) return "%.\tcvt%t0%t1\t%0, %1;"; } +static void nvptx_print_operand (FILE *, rtx, int); + /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this involves writing .param declarations and in/out copies into them. For indirect calls, also write the .callprototype. 
*/ @@ -1844,6 +2054,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) static int labelno; bool needs_tgt = register_operand (callee, Pmode); rtx pat = PATTERN (insn); + if (GET_CODE (pat) == COND_EXEC) + pat = COND_EXEC_CODE (pat); int arg_end = XVECLEN (pat, 0); tree decl = NULL_TREE; @@ -1888,6 +2100,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) fprintf (asm_out_file, ";\n"); } + /* The '.' stands for the call's predicate, if any. */ + nvptx_print_operand (asm_out_file, NULL_RTX, '.'); fprintf (asm_out_file, "\t\tcall "); if (result != NULL_RTX) fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]); @@ -1951,8 +2165,6 @@ nvptx_print_operand_punct_valid_p (unsigned char c) return c == '.' || c== '#'; } -static void nvptx_print_operand (FILE *, rtx, int); - /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */ static void @@ -2013,12 +2225,10 @@ nvptx_print_operand (FILE *file, rtx x, int code) x = current_insn_predicate; if (x) { - unsigned int regno = REGNO (XEXP (x, 0)); - fputs ("[", file); + fputs ("@", file); if (GET_CODE (x) == EQ) fputs ("!", file); - fputs (reg_names [regno], file); - fputs ("]", file); + output_reg (file, REGNO (XEXP (x, 0)), VOIDmode); } return; } @@ -2313,6 +2523,89 @@ nvptx_reorg_subreg (void) } } +/* Return a SImode "master lane index" register for uniform-simt, allocating on + first use. */ + +static rtx +nvptx_get_unisimt_master () +{ + rtx &master = cfun->machine->unisimt_master; + return master ? master : master = gen_reg_rtx (SImode); +} + +/* Return a BImode "predicate" register for uniform-simt, similar to above. */ + +static rtx +nvptx_get_unisimt_predicate () +{ + rtx &pred = cfun->machine->unisimt_predicate; + return pred ? pred : pred = gen_reg_rtx (BImode); +} + +/* Return true if given call insn references one of the functions provided by + the CUDA runtime: malloc, free, vprintf. 
*/ + +static bool +nvptx_call_insn_is_syscall_p (rtx_insn *insn) +{ + rtx pat = PATTERN (insn); + gcc_checking_assert (GET_CODE (pat) == PARALLEL); + pat = XVECEXP (pat, 0, 0); + if (GET_CODE (pat) == SET) + pat = SET_SRC (pat); + gcc_checking_assert (GET_CODE (pat) == CALL + && GET_CODE (XEXP (pat, 0)) == MEM); + rtx addr = XEXP (XEXP (pat, 0), 0); + if (GET_CODE (addr) != SYMBOL_REF) + return false; + const char *name = XSTR (addr, 0); + /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the + references with forced assembler name refer to PTX syscalls. For vprintf, + accept both normal and forced-assembler-name references. */ + return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf") + || !strcmp (name, "*malloc") + || !strcmp (name, "*free")); +} + +/* If SET subexpression of INSN sets a register, emit a shuffle instruction to + propagate its value from lane MASTER to current lane. */ + +static void +nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master) +{ + rtx reg; + if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set))) + emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn); +} + +/* Adjust code for uniform-simt code generation variant by making atomics and + "syscalls" conditionally executed, and inserting shuffle-based propagation + for registers being set. 
*/ + +static void +nvptx_reorg_uniform_simt () +{ + rtx_insn *insn, *next; + + for (insn = get_insns (); insn; insn = next) + { + next = NEXT_INSN (insn); + if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) + && !(NONJUMP_INSN_P (insn) + && GET_CODE (PATTERN (insn)) == PARALLEL + && get_attr_atomic (insn))) + continue; + rtx pat = PATTERN (insn); + rtx master = nvptx_get_unisimt_master (); + for (int i = 0; i < XVECLEN (pat, 0); i++) + nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + rtx pred = nvptx_get_unisimt_predicate (); + pred = gen_rtx_NE (BImode, pred, const0_rtx); + pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); + validate_change (insn, &PATTERN (insn), pat, false); + } +} + /* Loop structure of the function. The entire function is described as a NULL loop. */ @@ -3091,11 +3384,17 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) int ix; /* First clear each BB of the whole function. */ - FOR_ALL_BB_FN (block, cfun) + FOR_EACH_BB_FN (block, cfun) { block->flags &= ~BB_VISITED; BB_SET_SESE (block, 0); } + block = EXIT_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + BB_SET_SESE (block, 0); + block = ENTRY_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + BB_SET_SESE (block, 0); /* Mark blocks in the function that are in this graph. */ for (ix = 0; blocks.iterate (ix, &block); ix++) @@ -3829,6 +4128,9 @@ nvptx_reorg (void) /* Replace subregs. */ nvptx_reorg_subreg (); + if (TARGET_UNIFORM_SIMT) + nvptx_reorg_uniform_simt (); + regstat_free_n_sets_and_refs (); df_finish_pass (true); @@ -3857,12 +4159,36 @@ nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args), return NULL_TREE; } +/* Handle a "shared" attribute; arguments as in + struct attribute_spec.handler. 
*/ + +static tree +nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args), + int ARG_UNUSED (flags), bool *no_add_attrs) +{ + tree decl = *node; + + if (TREE_CODE (decl) != VAR_DECL) + { + error ("%qE attribute only applies to variables", name); + *no_add_attrs = true; + } + else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl))) + { + error ("%qE attribute not allowed with auto storage class", name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + /* Table of valid machine attributes. */ static const struct attribute_spec nvptx_attribute_table[] = { /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler, affects_type_identity } */ { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false }, + { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false }, { NULL, 0, 0, false, false, false, NULL, false } }; @@ -3924,13 +4250,13 @@ nvptx_record_offload_symbol (tree decl) case FUNCTION_DECL: { tree attr = get_oacc_fn_attrib (decl); - tree dims = TREE_VALUE (attr); - unsigned ix; + /* OpenMP offloading does not set this attribute. */ + tree dims = attr ? TREE_VALUE (attr) : NULL_TREE; fprintf (asm_out_file, "//:FUNC_MAP \"%s\"", IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl))); - for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) + for (; dims; dims = TREE_CHAIN (dims)) { int size = TREE_INT_CST_LOW (TREE_VALUE (dims)); @@ -3991,6 +4317,21 @@ nvptx_file_end (void) if (worker_red_size) write_worker_buffer (asm_out_file, worker_red_sym, worker_red_align, worker_red_size); + + if (need_softstack_decl) + { + write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); + /* 32 is the maximum number of warps in a block. Even though it's an + external declaration, emit the array size explicitly; otherwise, it + may fail at PTX JIT time if the definition is later in link order. 
*/ + fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n", + POINTER_SIZE); + } + if (need_unisimt_decl) + { + write_var_marker (asm_out_file, false, true, "__nvptx_uni"); + fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n"); + } } /* Expander for the shuffle builtins. */ @@ -4176,6 +4517,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), #define PTX_WORKER_LENGTH 32 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */ +/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ + +static int +nvptx_simt_vf () +{ + return PTX_VECTOR_LENGTH; +} + /* Validate compute dimensions of an OpenACC offload or routine, fill in non-unity defaults. FN_LEVEL indicates the level at which a routine might spawn a loop. It is negative for non-routines. If @@ -4944,6 +5293,9 @@ nvptx_goacc_reduction (gcall *call) #undef TARGET_BUILTIN_DECL #define TARGET_BUILTIN_DECL nvptx_builtin_decl +#undef TARGET_SIMT_VF +#define TARGET_SIMT_VF nvptx_simt_vf + #undef TARGET_GOACC_VALIDATE_DIMS #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 381269e3bcc..1702178eeb9 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -31,6 +31,10 @@ builtin_assert ("machine=nvptx"); \ builtin_assert ("cpu=nvptx"); \ builtin_define ("__nvptx__"); \ + if (TARGET_SOFT_STACK) \ + builtin_define ("__nvptx_softstack__"); \ + if (TARGET_UNIFORM_SIMT) \ + builtin_define ("__nvptx_unisimt__"); \ } while (0) /* Avoid the default in ../../gcc.c, which adds "-pthread", which is not @@ -79,13 +83,14 @@ #define POINTER_SIZE (TARGET_ABI64 ? 64 : 32) #define Pmode (TARGET_ABI64 ? DImode : SImode) +#define STACK_SIZE_MODE Pmode /* Registers. Since ptx is a virtual target, we just define a few hard registers for special purposes and leave pseudos unallocated. We have to have some available hard registers, to keep gcc setup happy. 
*/ #define FIRST_PSEUDO_REGISTER 16 -#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } +#define FIXED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 } #define CALL_USED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } #define HARD_REGNO_NREGS(REG, MODE) \ @@ -133,10 +138,17 @@ enum reg_class { NO_REGS, ALL_REGS, LIM_REG_CLASSES }; #define FRAME_POINTER_REGNUM 2 #define ARG_POINTER_REGNUM 3 #define STATIC_CHAIN_REGNUM 4 +/* This register points to the shared memory location with the current warp's + soft stack pointer (__nvptx_stacks[tid.y]). */ +#define SOFTSTACK_SLOT_REGNUM 5 +/* This register is used to save the previous value of the soft stack pointer + in the prologue and restore it when returning. */ +#define SOFTSTACK_PREV_REGNUM 6 #define REGISTER_NAMES \ { \ - "%value", "%stack", "%frame", "%args", "%chain", "%hr5", "%hr6", "%hr7", \ + "%value", "%stack", "%frame", "%args", \ + "%chain", "%sspslot", "%sspprev", "%hr7", \ "%hr8", "%hr9", "%hr10", "%hr11", "%hr12", "%hr13", "%hr14", "%hr15" \ } @@ -200,10 +212,13 @@ struct GTY(()) machine_function bool is_varadic; /* This call is varadic */ bool has_varadic; /* Current function has a varadic call. */ bool has_chain; /* Current function has outgoing static chain. */ + bool has_softstack; /* Current function has a soft stack frame. */ int num_args; /* Number of args of current call. */ int return_mode; /* Return mode of current fn. (machine_mode not defined yet.) */ rtx axis_predicate[2]; /* Neutering predicates. */ + rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */ + rtx unisimt_predicate; /* Predicate for -muniform-simt. 
*/ }; #endif diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index d117343c531..91d11290860 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -36,10 +36,16 @@ UNSPEC_ALLOCA + UNSPEC_SET_SOFTSTACK + UNSPEC_DIM_SIZE UNSPEC_BIT_CONV + UNSPEC_VOTE_BALLOT + + UNSPEC_LANEID + UNSPEC_SHUFFLE UNSPEC_BR_UNIFIED ]) @@ -55,11 +61,16 @@ UNSPECV_FORKED UNSPECV_JOINING UNSPECV_JOIN + + UNSPECV_NOUNROLL ]) (define_attr "subregs_ok" "false,true" (const_string "false")) +(define_attr "atomic" "false,true" + (const_string "false")) + ;; The nvptx operand predicates, in general, don't permit subregs and ;; only literal constants, which differ from the generic ones, which ;; permit subregs and symbolc constants (as appropriate) @@ -124,6 +135,17 @@ return true; }) +(define_attr "predicable" "false,true" + (const_string "true")) + +(define_cond_exec + [(match_operator 0 "predicate_operator" + [(match_operand:BI 1 "nvptx_register_operand" "") + (match_operand:BI 2 "const0_operand" "")])] + "" + "" + ) + (define_constraint "P0" "An integer with the value 0." 
(and (match_code "const_int") @@ -509,7 +531,8 @@ (label_ref (match_operand 1 "" "")) (pc)))] "" - "%j0\\tbra\\t%l1;") + "%j0\\tbra\\t%l1;" + [(set_attr "predicable" "false")]) (define_insn "br_false" [(set (pc) @@ -518,7 +541,8 @@ (label_ref (match_operand 1 "" "")) (pc)))] "" - "%J0\\tbra\\t%l1;") + "%J0\\tbra\\t%l1;" + [(set_attr "predicable" "false")]) ;; unified conditional branch (define_insn "br_true_uni" @@ -527,7 +551,8 @@ UNSPEC_BR_UNIFIED) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" - "%j0\\tbra.uni\\t%l1;") + "%j0\\tbra.uni\\t%l1;" + [(set_attr "predicable" "false")]) (define_insn "br_false_uni" [(set (pc) (if_then_else @@ -535,7 +560,8 @@ UNSPEC_BR_UNIFIED) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" - "%J0\\tbra.uni\\t%l1;") + "%J0\\tbra.uni\\t%l1;" + [(set_attr "predicable" "false")]) (define_expand "cbranch<mode>4" [(set (pc) @@ -938,12 +964,16 @@ "" { return nvptx_output_return (); -}) +} + [(set_attr "predicable" "false")]) (define_expand "epilogue" [(clobber (const_int 0))] "" { + if (TARGET_SOFT_STACK) + emit_insn (gen_set_softstack_insn (gen_rtx_REG (Pmode, + SOFTSTACK_PREV_REGNUM))); emit_jump_insn (gen_return ()); DONE; }) @@ -972,31 +1002,40 @@ (match_operand 1 "nvptx_register_operand")] "" { + if (TARGET_SOFT_STACK) + { + emit_move_insn (stack_pointer_rtx, + gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1])); + emit_insn (gen_set_softstack_insn (stack_pointer_rtx)); + emit_move_insn (operands[0], virtual_stack_dynamic_rtx); + DONE; + } /* The ptx documentation specifies an alloca intrinsic (for 32 bit only) but notes it is not implemented. The assembler emits a confused error message. Issue a blunt one now instead. 
*/ sorry ("target cannot support alloca."); emit_insn (gen_nop ()); DONE; - if (TARGET_ABI64) - emit_insn (gen_allocate_stack_di (operands[0], operands[1])); - else - emit_insn (gen_allocate_stack_si (operands[0], operands[1])); - DONE; }) -(define_insn "allocate_stack_<mode>" - [(set (match_operand:P 0 "nvptx_register_operand" "=R") - (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")] - UNSPEC_ALLOCA))] - "" - "%.\\tcall (%0), %%alloca, (%1);") +(define_insn "set_softstack_insn" + [(unspec [(match_operand 0 "nvptx_register_operand" "R")] + UNSPEC_SET_SOFTSTACK)] + "TARGET_SOFT_STACK" +{ + return nvptx_output_set_softstack (REGNO (operands[0])); +}) (define_expand "restore_stack_block" [(match_operand 0 "register_operand" "") (match_operand 1 "register_operand" "")] "" { + if (TARGET_SOFT_STACK) + { + emit_move_insn (operands[0], operands[1]); + emit_insn (gen_set_softstack_insn (operands[0])); + } DONE; }) @@ -1018,14 +1057,16 @@ (const_int 0)) (const_int 0))] "" - "%j0 trap;") + "%j0 trap;" + [(set_attr "predicable" "false")]) (define_insn "trap_if_false" [(trap_if (eq (match_operand:BI 0 "nvptx_register_operand" "R") (const_int 0)) (const_int 0))] "" - "%J0 trap;") + "%J0 trap;" + [(set_attr "predicable" "false")]) (define_expand "ctrap<mode>4" [(trap_if (match_operator 0 "nvptx_comparison_operator" @@ -1074,28 +1115,28 @@ UNSPECV_FORK)] "" "// fork %0;" -) + [(set_attr "predicable" "false")]) (define_insn "nvptx_forked" [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_FORKED)] "" "// forked %0;" -) + [(set_attr "predicable" "false")]) (define_insn "nvptx_joining" [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_JOINING)] "" "// joining %0;" -) + [(set_attr "predicable" "false")]) (define_insn "nvptx_join" [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_JOIN)] "" "// join %0;" -) + [(set_attr "predicable" "false")]) (define_expand "oacc_fork" [(set (match_operand:SI 0 
"nvptx_nonmemory_operand" "") @@ -1134,6 +1175,88 @@ "" "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;") +(define_insn "nvptx_vote_ballot" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (unspec:SI [(match_operand:BI 1 "nvptx_register_operand" "R")] + UNSPEC_VOTE_BALLOT))] + "" + "%.\\tvote.ballot.b32\\t%0, %1;") + +;; Patterns for OpenMP SIMD-via-SIMT lowering + +;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index +(define_insn "omp_simt_lane" + [(set (match_operand:SI 0 "nvptx_register_operand" "") + (unspec:SI [(const_int 0)] UNSPEC_LANEID))] + "" + "%.\\tmov.u32\\t%0, %%laneid;") + +;; Implement IFN_GOMP_SIMT_ORDERED: copy operand 1 to operand 0 and +;; place a compiler barrier to disallow unrolling/peeling the containing loop +(define_expand "omp_simt_ordered" + [(match_operand:SI 0 "nvptx_register_operand" "=R") + (match_operand:SI 1 "nvptx_register_operand" "R")] + "" +{ + emit_move_insn (operands[0], operands[1]); + emit_insn (gen_nvptx_nounroll ()); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_XCHG_BFLY: perform a "butterfly" exchange +;; across lanes +(define_expand "omp_simt_xchg_bfly" + [(match_operand 0 "nvptx_register_operand" "=R") + (match_operand 1 "nvptx_register_operand" "R") + (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] + "" +{ + emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2], + SHUFFLE_BFLY)); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_XCHG_IDX: broadcast value in operand 1 +;; from lane given by index in operand 2 to operand 0 in all lanes +(define_expand "omp_simt_xchg_idx" + [(match_operand 0 "nvptx_register_operand" "=R") + (match_operand 1 "nvptx_register_operand" "R") + (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] + "" +{ + emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2], + SHUFFLE_IDX)); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_VOTE_ANY: +;; set operand 0 to zero iff all lanes supply zero in operand 1 +(define_expand "omp_simt_vote_any" + [(match_operand:SI 0 
"nvptx_register_operand" "=R") + (match_operand:SI 1 "nvptx_register_operand" "R")] + "" +{ + rtx pred = gen_reg_rtx (BImode); + emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx)); + emit_insn (gen_nvptx_vote_ballot (operands[0], pred)); + DONE; +}) + +;; Implement IFN_GOMP_SIMT_LAST_LANE: +;; set operand 0 to the lowest lane index that passed non-zero in operand 1 +(define_expand "omp_simt_last_lane" + [(match_operand:SI 0 "nvptx_register_operand" "=R") + (match_operand:SI 1 "nvptx_register_operand" "R")] + "" +{ + rtx pred = gen_reg_rtx (BImode); + rtx tmp = gen_reg_rtx (SImode); + emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx)); + emit_insn (gen_nvptx_vote_ballot (tmp, pred)); + emit_insn (gen_ctzsi2 (operands[0], tmp)); + DONE; +}) + ;; extract parts of a 64 bit object into 2 32-bit ints (define_insn "unpack<mode>si2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") @@ -1186,7 +1309,8 @@ (set (match_dup 1) (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] "" - "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;") + "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;" + [(set_attr "atomic" "true")]) (define_insn "atomic_exchange<mode>" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output @@ -1197,7 +1321,8 @@ (set (match_dup 1) (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input "" - "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;") + "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_insn "atomic_fetch_add<mode>" [(set (match_operand:SDIM 1 "memory_operand" "+m") @@ -1209,7 +1334,8 @@ (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] "" - "%.\\tatom%A1.add%t0\\t%0, %1, %2;") + "%.\\tatom%A1.add%t0\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_insn "atomic_fetch_addsf" [(set (match_operand:SF 1 "memory_operand" "+m") @@ -1221,7 +1347,8 @@ (set (match_operand:SF 0 "nvptx_register_operand" "=R") (match_dup 1))] "" - "%.\\tatom%A1.add%t0\\t%0, %1, %2;") + 
"%.\\tatom%A1.add%t0\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_code_iterator any_logic [and ior xor]) (define_code_attr logic [(and "and") (ior "or") (xor "xor")]) @@ -1237,10 +1364,18 @@ (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] "0" - "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;") + "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;" + [(set_attr "atomic" "true")]) (define_insn "nvptx_barsync" [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")] UNSPECV_BARSYNC)] "" - "\\tbar.sync\\t%0;") + "\\tbar.sync\\t%0;" + [(set_attr "predicable" "false")]) + +(define_insn "nvptx_nounroll" + [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)] + "" + "\\t.pragma \\\"nounroll\\\";" + [(set_attr "predicable" "false")]) diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 601cf124e36..cb6194da9dc 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -32,3 +32,15 @@ Link in code for a __main kernel. moptimize Target Report Var(nvptx_optimize) Init(-1) Optimize partition neutering. + +msoft-stack +Target Report Mask(SOFT_STACK) +Use custom stacks instead of local memory for automatic storage. + +muniform-simt +Target Report Mask(UNIFORM_SIMT) +Generate code that can keep local state uniform across all lanes. + +mgomp +Target Report Mask(GOMP) +Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt. 
diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index e2580c956e0..6c1010ddd66 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -8,3 +8,5 @@ ALL_HOST_OBJS += mkoffload.o mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBDEPS) +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS) + +MULTILIB_OPTIONS = mgomp diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 0669f7999be..4dcc7f6b555 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -5576,6 +5576,7 @@ attributes. * MeP Variable Attributes:: * Microsoft Windows Variable Attributes:: * MSP430 Variable Attributes:: +* Nvidia PTX Variable Attributes:: * PowerPC Variable Attributes:: * RL78 Variable Attributes:: * SPU Variable Attributes:: @@ -6257,6 +6258,20 @@ same name (@pxref{MSP430 Function Attributes}). These attributes can be applied to both functions and variables. @end table +@node Nvidia PTX Variable Attributes +@subsection Nvidia PTX Variable Attributes + +These variable attributes are supported by the Nvidia PTX back end: + +@table @code +@item shared +@cindex @code{shared} attribute, Nvidia PTX +Use this attribute to place a variable in the @code{.shared} memory space. +This memory space is private to each cooperative thread array; only threads +within one thread block refer to the same instance of the variable. +The runtime does not initialize variables in this memory space. +@end table + @node PowerPC Variable Attributes @subsection PowerPC Variable Attributes diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 17c5c225d0c..f06e2b8206d 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -20462,6 +20462,37 @@ offloading execution. Apply partitioned execution optimizations. This is the default when any level of optimization is selected. 
+@item -msoft-stack +@opindex msoft-stack +Generate code that does not use @code{.local} memory +directly for stack storage. Instead, a per-warp stack pointer is +maintained explicitly. This enables variable-length stack allocation (with +variable-length arrays or @code{alloca}), and when global memory is used for +underlying storage, makes it possible to access automatic variables from other +threads, or with atomic instructions. This code generation variant is used +for OpenMP offloading, but the option is exposed on its own for the purpose +of testing the compiler; to generate code suitable for linking into programs +using OpenMP offloading, use option @option{-mgomp}. + +@item -muniform-simt +@opindex muniform-simt +Switch to code generation variant that allows to execute all threads in each +warp, while maintaining memory state and side effects as if only one thread +in each warp was active outside of OpenMP SIMD regions. All atomic operations +and calls to runtime (malloc, free, vprintf) are conditionally executed (iff +current lane index equals the master lane index), and the register being +assigned is copied via a shuffle instruction from the master lane. Outside of +SIMD regions lane 0 is the master; inside, each thread sees itself as the +master. Shared memory array @code{int __nvptx_uni[]} stores all-zeros or +all-ones bitmasks for each warp, indicating current mode (0 outside of SIMD +regions). Each thread can bitwise-and the bitmask at position @code{tid.y} +with current lane index to compute the master lane index. + +@item -mgomp +@opindex mgomp +Generate code for use in OpenMP offloading: enables @option{-msoft-stack} and +@option{-muniform-simt} options, and selects corresponding multilib variant. + @end table @node PDP-11 Options diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 81c63b71bff..10122439c30 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5858,6 +5858,10 @@ usable. 
In that case, the smaller the number is, the more desirable it is to use it. @end deftypefn +@deftypefn {Target Hook} int TARGET_SIMT_VF (void) +Return number of threads in SIMT thread group on the target. +@end deftypefn + @deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level}) This hook should check the launch dimensions provided for an OpenACC compute region, or routine. Defaulted values are represented as -1 diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 74e9200f749..0a7eca32639 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4293,6 +4293,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_SIMD_CLONE_USABLE +@hook TARGET_SIMT_VF + @hook TARGET_GOACC_VALIDATE_DIMS @hook TARGET_GOACC_DIM_LIMIT diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index cbee97ea82c..fd1cd8b6445 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -157,6 +157,132 @@ expand_ANNOTATE (internal_fn, gcall *) gcc_unreachable (); } +/* Lane index on SIMT targets: thread index in the warp on NVPTX. On targets + without SIMT execution this should be expanded in omp_device_lower pass. */ + +static void +expand_GOMP_SIMT_LANE (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + gcc_assert (targetm.have_omp_simt_lane ()); + emit_insn (targetm.gen_omp_simt_lane (target)); +} + +/* This should get expanded in omp_device_lower pass. */ + +static void +expand_GOMP_SIMT_VF (internal_fn, gcall *) +{ + gcc_unreachable (); +} + +/* Lane index of the first SIMT lane that supplies a non-zero argument. + This is a SIMT counterpart to GOMP_SIMD_LAST_LANE, used to represent the + lane that executed the last iteration for handling OpenMP lastprivate. 
*/ + +static void +expand_GOMP_SIMT_LAST_LANE (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx cond = expand_normal (gimple_call_arg (stmt, 0)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], cond, mode); + gcc_assert (targetm.have_omp_simt_last_lane ()); + expand_insn (targetm.code_for_omp_simt_last_lane, 2, ops); +} + +/* Non-transparent predicate used in SIMT lowering of OpenMP "ordered". */ + +static void +expand_GOMP_SIMT_ORDERED_PRED (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx ctr = expand_normal (gimple_call_arg (stmt, 0)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], ctr, mode); + gcc_assert (targetm.have_omp_simt_ordered ()); + expand_insn (targetm.code_for_omp_simt_ordered, 2, ops); +} + +/* "Or" boolean reduction across SIMT lanes: return non-zero in all lanes if + any lane supplies a non-zero argument. */ + +static void +expand_GOMP_SIMT_VOTE_ANY (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx cond = expand_normal (gimple_call_arg (stmt, 0)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], cond, mode); + gcc_assert (targetm.have_omp_simt_vote_any ()); + expand_insn (targetm.code_for_omp_simt_vote_any, 2, ops); +} + +/* Exchange between SIMT lanes with a "butterfly" pattern: source lane index + is destination lane index XOR given offset. 
*/ + +static void +expand_GOMP_SIMT_XCHG_BFLY (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_normal (gimple_call_arg (stmt, 0)); + rtx idx = expand_normal (gimple_call_arg (stmt, 1)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[3]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], src, mode); + create_input_operand (&ops[2], idx, SImode); + gcc_assert (targetm.have_omp_simt_xchg_bfly ()); + expand_insn (targetm.code_for_omp_simt_xchg_bfly, 3, ops); +} + +/* Exchange between SIMT lanes according to given source lane index. */ + +static void +expand_GOMP_SIMT_XCHG_IDX (internal_fn, gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_normal (gimple_call_arg (stmt, 0)); + rtx idx = expand_normal (gimple_call_arg (stmt, 1)); + machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + struct expand_operand ops[3]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], src, mode); + create_input_operand (&ops[2], idx, SImode); + gcc_assert (targetm.have_omp_simt_xchg_idx ()); + expand_insn (targetm.code_for_omp_simt_xchg_idx, 3, ops); +} + /* This should get expanded in adjust_simduid_builtins. 
*/ static void diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 0869b2fd52d..77ce63a6408 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -141,6 +141,13 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST, ffs, unary) DEF_INTERNAL_INT_FN (PARITY, ECF_CONST, parity, unary) DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST, popcount, unary) +DEF_INTERNAL_FN (GOMP_SIMT_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_LAST_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_ORDERED_PRED, ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_VOTE_ANY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_XCHG_BFLY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_XCHG_IDX, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index e5b9e4c1091..da5476bc99f 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -2427,6 +2427,20 @@ cilk_for_check_loop_diff_type (tree type) } } +/* Return true if CTX may belong to offloaded code: either if current function + is offloaded, or any enclosing context corresponds to a target region. */ + +static bool +omp_maybe_offloaded_ctx (omp_context *ctx) +{ + if (cgraph_node::get (current_function_decl)->offloadable) + return true; + for (; ctx; ctx = ctx->outer) + if (is_gimple_omp_offloaded (ctx->stmt)) + return true; + return false; +} + /* Build a decl for the omp child function. It'll not contain a body yet, just the bare decl. 
*/ @@ -2475,28 +2489,24 @@ create_omp_child_function (omp_context *ctx, bool task_copy) DECL_CONTEXT (decl) = NULL_TREE; DECL_INITIAL (decl) = make_node (BLOCK); BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl; - if (cgraph_node::get (current_function_decl)->offloadable) - cgraph_node::get_create (decl)->offloadable = 1; - else + if (omp_maybe_offloaded_ctx (ctx)) { - omp_context *octx; - for (octx = ctx; octx; octx = octx->outer) - if (is_gimple_omp_offloaded (octx->stmt)) - { - cgraph_node::get_create (decl)->offloadable = 1; - if (ENABLE_OFFLOADING) - g->have_offload = true; - - break; - } + cgraph_node::get_create (decl)->offloadable = 1; + if (ENABLE_OFFLOADING) + g->have_offload = true; } if (cgraph_node::get_create (decl)->offloadable && !lookup_attribute ("omp declare target", DECL_ATTRIBUTES (current_function_decl))) - DECL_ATTRIBUTES (decl) - = tree_cons (get_identifier ("omp target entrypoint"), - NULL_TREE, DECL_ATTRIBUTES (decl)); + { + const char *target_attr = (is_gimple_omp_offloaded (ctx->stmt) + ? "omp target entrypoint" + : "omp declare target"); + DECL_ATTRIBUTES (decl) + = tree_cons (get_identifier (target_attr), + NULL_TREE, DECL_ATTRIBUTES (decl)); + } t = build_decl (DECL_SOURCE_LOCATION (decl), RESULT_DECL, NULL_TREE, void_type_node); @@ -4264,6 +4274,25 @@ omp_clause_aligned_alignment (tree clause) return build_int_cst (integer_type_node, al); } + +/* Return maximum SIMT width if offloading may target SIMT hardware. */ + +static int +omp_max_simt_vf (void) +{ + if (!optimize) + return 0; + if (ENABLE_OFFLOADING) + for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c; ) + { + if (!strncmp (c, "nvptx", strlen ("nvptx"))) + return 32; + else if ((c = strchr (c, ','))) + c++; + } + return 0; +} + /* Return maximum possible vectorization factor for the target. 
*/ static int @@ -4277,16 +4306,18 @@ omp_max_vf (void) || global_options_set.x_flag_tree_vectorize))) return 1; + int vf = 1; int vs = targetm.vectorize.autovectorize_vector_sizes (); if (vs) + vf = 1 << floor_log2 (vs); + else { - vs = 1 << floor_log2 (vs); - return vs; + machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode); + if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT) + vf = GET_MODE_NUNITS (vqimode); } - machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode); - if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT) - return GET_MODE_NUNITS (vqimode); - return 1; + int svf = omp_max_simt_vf (); + return MAX (vf, svf); } /* Helper function of lower_rec_input_clauses, used for #pragma omp simd @@ -4374,10 +4405,13 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, int pass; bool is_simd = (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD); + bool maybe_simt + = is_simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1; int max_vf = 0; tree lane = NULL_TREE, idx = NULL_TREE; + tree simt_lane = NULL_TREE; tree ivar = NULL_TREE, lvar = NULL_TREE; - gimple_seq llist[2] = { NULL, NULL }; + gimple_seq llist[3] = { }; copyin_seq = NULL; @@ -5251,6 +5285,16 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, gimplify_assign (unshare_expr (ivar), x, &llist[0]); + if (maybe_simt) + { + if (!simt_lane) + simt_lane = create_tmp_var (unsigned_type_node); + x = build_call_expr_internal_loc + (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_BFLY, + TREE_TYPE (ivar), 2, ivar, simt_lane); + x = build2 (code, TREE_TYPE (ivar), ivar, x); + gimplify_assign (ivar, x, &llist[2]); + } x = build2 (code, TREE_TYPE (ref), ref, ivar); ref = build_outer_var_ref (var, ctx); gimplify_assign (ref, x, &llist[1]); @@ -5303,6 +5347,39 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, g = gimple_build_assign (lane, INTEGER_CST, 
build_int_cst (unsigned_type_node, 0)); gimple_seq_add_stmt (ilist, g); + /* Emit reductions across SIMT lanes in log_2(simt_vf) steps. */ + if (llist[2]) + { + tree simt_vf = create_tmp_var (unsigned_type_node); + g = gimple_build_call_internal (IFN_GOMP_SIMT_VF, 0); + gimple_call_set_lhs (g, simt_vf); + gimple_seq_add_stmt (dlist, g); + + tree t = build_int_cst (unsigned_type_node, 1); + g = gimple_build_assign (simt_lane, INTEGER_CST, t); + gimple_seq_add_stmt (dlist, g); + + t = build_int_cst (unsigned_type_node, 0); + g = gimple_build_assign (idx, INTEGER_CST, t); + gimple_seq_add_stmt (dlist, g); + + tree body = create_artificial_label (UNKNOWN_LOCATION); + tree header = create_artificial_label (UNKNOWN_LOCATION); + tree end = create_artificial_label (UNKNOWN_LOCATION); + gimple_seq_add_stmt (dlist, gimple_build_goto (header)); + gimple_seq_add_stmt (dlist, gimple_build_label (body)); + + gimple_seq_add_seq (dlist, llist[2]); + + g = gimple_build_assign (simt_lane, LSHIFT_EXPR, simt_lane, integer_one_node); + gimple_seq_add_stmt (dlist, g); + + gimple_seq_add_stmt (dlist, gimple_build_label (header)); + g = gimple_build_cond (LT_EXPR, simt_lane, simt_vf, body, end); + gimple_seq_add_stmt (dlist, g); + + gimple_seq_add_stmt (dlist, gimple_build_label (end)); + } for (int i = 0; i < 2; i++) if (llist[i]) { @@ -5389,7 +5466,7 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, { tree x, c, label = NULL, orig_clauses = clauses; bool par_clauses = false; - tree simduid = NULL, lastlane = NULL; + tree simduid = NULL, lastlane = NULL, simtcond = NULL, simtlast = NULL; /* Early exit if there are no lastprivate or linear clauses. 
*/ for (; clauses ; clauses = OMP_CLAUSE_CHAIN (clauses)) @@ -5416,6 +5493,16 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, par_clauses = true; } + bool maybe_simt = false; + if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR + && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD) + { + maybe_simt = omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1; + simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_); + if (simduid) + simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); + } + if (predicate) { gcond *stmt; @@ -5427,20 +5514,27 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, arm2 = TREE_OPERAND (predicate, 1); gimplify_expr (&arm1, stmt_list, NULL, is_gimple_val, fb_rvalue); gimplify_expr (&arm2, stmt_list, NULL, is_gimple_val, fb_rvalue); - stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2, - label_true, label); + if (maybe_simt) + { + c = build2 (TREE_CODE (predicate), boolean_type_node, arm1, arm2); + c = fold_convert (integer_type_node, c); + simtcond = create_tmp_var (integer_type_node); + gimplify_assign (simtcond, c, stmt_list); + gcall *g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY, + 1, simtcond); + c = create_tmp_var (integer_type_node); + gimple_call_set_lhs (g, c); + gimple_seq_add_stmt (stmt_list, g); + stmt = gimple_build_cond (NE_EXPR, c, integer_zero_node, + label_true, label); + } + else + stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2, + label_true, label); gimple_seq_add_stmt (stmt_list, stmt); gimple_seq_add_stmt (stmt_list, gimple_build_label (label_true)); } - if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR - && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD) - { - simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_); - if (simduid) - simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); - } - for (c = clauses; c ;) { tree var, new_var; @@ -5491,6 +5585,24 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq 
*stmt_list, new_var = build4 (ARRAY_REF, TREE_TYPE (val), TREE_OPERAND (val, 0), lastlane, NULL_TREE, NULL_TREE); + if (maybe_simt) + { + gcall *g; + if (simtlast == NULL) + { + simtlast = create_tmp_var (unsigned_type_node); + g = gimple_build_call_internal + (IFN_GOMP_SIMT_LAST_LANE, 1, simtcond); + gimple_call_set_lhs (g, simtlast); + gimple_seq_add_stmt (stmt_list, g); + } + x = build_call_expr_internal_loc + (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_IDX, + TREE_TYPE (new_var), 2, new_var, simtlast); + new_var = unshare_expr (new_var); + gimplify_assign (new_var, x, stmt_list); + new_var = unshare_expr (new_var); + } } } @@ -10498,12 +10610,23 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) edge e, ne; tree *counts = NULL; int i; + int safelen_int = INT_MAX; tree safelen = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt), OMP_CLAUSE_SAFELEN); tree simduid = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt), OMP_CLAUSE__SIMDUID_); tree n1, n2; + if (safelen) + { + safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen); + if (TREE_CODE (safelen) != INTEGER_CST) + safelen_int = 0; + else if (tree_fits_uhwi_p (safelen) && tree_to_uhwi (safelen) < INT_MAX) + safelen_int = tree_to_uhwi (safelen); + if (safelen_int == 1) + safelen_int = 0; + } type = TREE_TYPE (fd->loop.v); entry_bb = region->entry; cont_bb = region->cont; @@ -10557,20 +10680,53 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) OMP_CLAUSE__LOOPTEMP_); gcc_assert (innerc); n2 = OMP_CLAUSE_DECL (innerc); - expand_omp_build_assign (&gsi, fd->loop.v, - fold_convert (type, n1)); + } + tree step = fd->loop.step; + + bool offloaded = cgraph_node::get (current_function_decl)->offloadable; + for (struct omp_region *rgn = region; !offloaded && rgn; rgn = rgn->outer) + offloaded = rgn->type == GIMPLE_OMP_TARGET; + bool is_simt = offloaded && omp_max_simt_vf () > 1 && safelen_int > 1; + tree simt_lane = NULL_TREE, simt_maxlane = NULL_TREE; + if (is_simt) + { + 
cfun->curr_properties &= ~PROP_gimple_lomp_dev; + simt_lane = create_tmp_var (unsigned_type_node); + gimple *g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0); + gimple_call_set_lhs (g, simt_lane); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + tree offset = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, + fold_convert (TREE_TYPE (step), simt_lane)); + n1 = fold_convert (type, n1); + if (POINTER_TYPE_P (type)) + n1 = fold_build_pointer_plus (n1, offset); + else + n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, offset)); + + /* Collapsed loops not handled for SIMT yet: limit to one lane only. */ if (fd->collapse > 1) + simt_maxlane = build_one_cst (unsigned_type_node); + else if (safelen_int < omp_max_simt_vf ()) + simt_maxlane = build_int_cst (unsigned_type_node, safelen_int); + tree vf + = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_VF, + unsigned_type_node, 0); + if (simt_maxlane) + vf = fold_build2 (MIN_EXPR, unsigned_type_node, vf, simt_maxlane); + vf = fold_convert (TREE_TYPE (step), vf); + step = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, vf); + } + + expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1)); + if (fd->collapse > 1) + { + if (gimple_omp_for_combined_into_p (fd->for_stmt)) { gsi_prev (&gsi); expand_omp_for_init_vars (fd, &gsi, counts, NULL, n1); gsi_next (&gsi); } - } - else - { - expand_omp_build_assign (&gsi, fd->loop.v, - fold_convert (type, fd->loop.n1)); - if (fd->collapse > 1) + else for (i = 0; i < fd->collapse; i++) { tree itype = TREE_TYPE (fd->loops[i].v); @@ -10579,7 +10735,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) t = fold_convert (TREE_TYPE (fd->loops[i].v), fd->loops[i].n1); expand_omp_build_assign (&gsi, fd->loops[i].v, t); } - } + } /* Remove the GIMPLE_OMP_FOR statement. 
*/ gsi_remove (&gsi, true); @@ -10592,9 +10748,9 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE); if (POINTER_TYPE_P (type)) - t = fold_build_pointer_plus (fd->loop.v, fd->loop.step); + t = fold_build_pointer_plus (fd->loop.v, step); else - t = fold_build2 (PLUS_EXPR, type, fd->loop.v, fd->loop.step); + t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step); expand_omp_build_assign (&gsi, fd->loop.v, t); if (fd->collapse > 1) @@ -10668,6 +10824,18 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) gimple_regimplify_operands (cond_stmt, &gsi); } + /* Add 'V -= STEP * (SIMT_VF - 1)' after the loop. */ + if (is_simt) + { + gsi = gsi_start_bb (l2_bb); + step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), fd->loop.step, step); + if (POINTER_TYPE_P (type)) + t = fold_build_pointer_plus (fd->loop.v, step); + else + t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step); + expand_omp_build_assign (&gsi, fd->loop.v, t); + } + /* Remove GIMPLE_OMP_RETURN. 
*/ gsi = gsi_last_bb (exit_bb); gsi_remove (&gsi, true); @@ -10697,30 +10865,29 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) ne->probability = REG_BR_PROB_BASE / 8; set_immediate_dominator (CDI_DOMINATORS, l1_bb, entry_bb); - set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb); set_immediate_dominator (CDI_DOMINATORS, l0_bb, l1_bb); + if (simt_maxlane) + { + cond_stmt = gimple_build_cond (LT_EXPR, simt_lane, simt_maxlane, + NULL_TREE, NULL_TREE); + gsi = gsi_last_bb (entry_bb); + gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT); + make_edge (entry_bb, l2_bb, EDGE_FALSE_VALUE); + FALLTHRU_EDGE (entry_bb)->flags = EDGE_TRUE_VALUE; + FALLTHRU_EDGE (entry_bb)->probability = REG_BR_PROB_BASE * 7 / 8; + BRANCH_EDGE (entry_bb)->probability = REG_BR_PROB_BASE / 8; + l2_dom_bb = entry_bb; + } + set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb); + if (!broken_loop) { struct loop *loop = alloc_loop (); loop->header = l1_bb; loop->latch = cont_bb; add_loop (loop, l1_bb->loop_father); - if (safelen == NULL_TREE) - loop->safelen = INT_MAX; - else - { - safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen); - if (TREE_CODE (safelen) != INTEGER_CST) - loop->safelen = 0; - else if (!tree_fits_uhwi_p (safelen) - || tree_to_uhwi (safelen) > INT_MAX) - loop->safelen = INT_MAX; - else - loop->safelen = tree_to_uhwi (safelen); - if (loop->safelen == 1) - loop->safelen = 0; - } + loop->safelen = safelen_int; if (simduid) { loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); @@ -13885,7 +14052,6 @@ expand_omp (struct omp_region *region) } } - /* Helper for build_omp_regions. Scan the dominator tree starting at block BB. PARENT is the region that contains BB. 
If SINGLE_TREE is true, the function ends once a single tree is built (otherwise, whole @@ -14768,12 +14934,14 @@ static void lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx) { tree block; - gimple *stmt = gsi_stmt (*gsi_p); + gimple *stmt = gsi_stmt (*gsi_p), *g; gomp_ordered *ord_stmt = as_a <gomp_ordered *> (stmt); gcall *x; gbind *bind; bool simd = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), OMP_CLAUSE_SIMD); + bool maybe_simt + = simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1; bool threads = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), OMP_CLAUSE_THREADS); @@ -14807,11 +14975,56 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx) 0); gimple_bind_add_stmt (bind, x); + tree counter = NULL_TREE, test = NULL_TREE, body = NULL_TREE; + if (maybe_simt) + { + counter = create_tmp_var (integer_type_node); + g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0); + gimple_call_set_lhs (g, counter); + gimple_bind_add_stmt (bind, g); + + body = create_artificial_label (UNKNOWN_LOCATION); + test = create_artificial_label (UNKNOWN_LOCATION); + gimple_bind_add_stmt (bind, gimple_build_label (body)); + + tree simt_pred = create_tmp_var (integer_type_node); + g = gimple_build_call_internal (IFN_GOMP_SIMT_ORDERED_PRED, 1, counter); + gimple_call_set_lhs (g, simt_pred); + gimple_bind_add_stmt (bind, g); + + tree t = create_artificial_label (UNKNOWN_LOCATION); + g = gimple_build_cond (EQ_EXPR, simt_pred, integer_zero_node, t, test); + gimple_bind_add_stmt (bind, g); + + gimple_bind_add_stmt (bind, gimple_build_label (t)); + } lower_omp (gimple_omp_body_ptr (stmt), ctx); gimple_omp_set_body (stmt, maybe_catch_exception (gimple_omp_body (stmt))); gimple_bind_add_seq (bind, gimple_omp_body (stmt)); gimple_omp_set_body (stmt, NULL); + if (maybe_simt) + { + gimple_bind_add_stmt (bind, gimple_build_label (test)); + g = gimple_build_assign (counter, MINUS_EXPR, counter, integer_one_node); + gimple_bind_add_stmt 
(bind, g); + + tree c = build2 (GE_EXPR, boolean_type_node, counter, integer_zero_node); + tree nonneg = create_tmp_var (integer_type_node); + gimple_seq tseq = NULL; + gimplify_assign (nonneg, fold_convert (integer_type_node, c), &tseq); + gimple_bind_add_seq (bind, tseq); + + g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY, 1, nonneg); + gimple_call_set_lhs (g, nonneg); + gimple_bind_add_stmt (bind, g); + + tree end = create_artificial_label (UNKNOWN_LOCATION); + g = gimple_build_cond (NE_EXPR, nonneg, integer_zero_node, body, end); + gimple_bind_add_stmt (bind, g); + + gimple_bind_add_stmt (bind, gimple_build_label (end)); + } if (simd) x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 1, build_int_cst (NULL_TREE, threads)); @@ -17932,7 +18145,7 @@ const pass_data pass_data_lower_omp = OPTGROUP_NONE, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_gimple_any, /* properties_required */ - PROP_gimple_lomp, /* properties_provided */ + PROP_gimple_lomp | PROP_gimple_lomp_dev, /* properties_provided */ 0, /* properties_destroyed */ 0, /* todo_flags_start */ 0, /* todo_flags_finish */ @@ -19864,6 +20077,109 @@ make_pass_oacc_device_lower (gcc::context *ctxt) { return new pass_oacc_device_lower (ctxt); } + + +/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets, + VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and + LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT + internal functions on non-SIMT targets, and likewise some SIMD internal + functions on SIMT targets. */ + +static unsigned int +execute_omp_device_lower () +{ + int vf = targetm.simt.vf ? 
targetm.simt.vf () : 1; + basic_block bb; + gimple_stmt_iterator gsi; + FOR_EACH_BB_FN (bb, cfun) + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt)) + continue; + tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE; + tree type = lhs ? TREE_TYPE (lhs) : integer_type_node; + switch (gimple_call_internal_fn (stmt)) + { + case IFN_GOMP_SIMT_LANE: + case IFN_GOMP_SIMT_LAST_LANE: + rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE; + break; + case IFN_GOMP_SIMT_VF: + rhs = build_int_cst (type, vf); + break; + case IFN_GOMP_SIMT_ORDERED_PRED: + rhs = vf == 1 ? integer_zero_node : NULL_TREE; + if (rhs || !lhs) + unlink_stmt_vdef (stmt); + break; + case IFN_GOMP_SIMT_VOTE_ANY: + case IFN_GOMP_SIMT_XCHG_BFLY: + case IFN_GOMP_SIMT_XCHG_IDX: + rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE; + break; + case IFN_GOMP_SIMD_LANE: + case IFN_GOMP_SIMD_LAST_LANE: + rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE; + break; + case IFN_GOMP_SIMD_VF: + rhs = vf != 1 ? build_one_cst (type) : NULL_TREE; + break; + default: + continue; + } + if (lhs && !rhs) + continue; + stmt = lhs ? 
gimple_build_assign (lhs, rhs) : gimple_build_nop (); + gsi_replace (&gsi, stmt, false); + } + if (vf != 1) + cfun->has_force_vectorize_loops = false; + return 0; +} + +namespace { + +const pass_data pass_data_omp_device_lower = +{ + GIMPLE_PASS, /* type */ + "ompdevlow", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + PROP_gimple_lomp_dev, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa, /* todo_flags_finish */ +}; + +class pass_omp_device_lower : public gimple_opt_pass +{ +public: + pass_omp_device_lower (gcc::context *ctxt) + : gimple_opt_pass (pass_data_omp_device_lower, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *fun) + { + /* FIXME: inlining does not propagate the lomp_dev property. */ + return 1 || !(fun->curr_properties & PROP_gimple_lomp_dev); + } + virtual unsigned int execute (function *) + { + return execute_omp_device_lower (); + } + +}; // class pass_expand_omp_ssa + +} // anon namespace + +gimple_opt_pass * +make_pass_omp_device_lower (gcc::context *ctxt) +{ + return new pass_omp_device_lower (ctxt); +} /* "omp declare target link" handling pass. */ diff --git a/gcc/passes.def b/gcc/passes.def index 85a5af088f3..2a470a7de07 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -183,6 +183,7 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_fixup_cfg); NEXT_PASS (pass_lower_eh_dispatch); NEXT_PASS (pass_oacc_device_lower); + NEXT_PASS (pass_omp_device_lower); NEXT_PASS (pass_omp_target_link); NEXT_PASS (pass_all_optimizations); PUSH_INSERT_PASSES_WITHIN (pass_all_optimizations) diff --git a/gcc/target-insns.def b/gcc/target-insns.def index a6a040eb173..e011a5a7f26 100644 --- a/gcc/target-insns.def +++ b/gcc/target-insns.def @@ -68,6 +68,12 @@ DEF_TARGET_INSN (oacc_dim_pos, (rtx x0, rtx x1)) DEF_TARGET_INSN (oacc_dim_size, (rtx x0, rtx x1)) DEF_TARGET_INSN (oacc_fork, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (oacc_join, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (omp_simt_lane, (rtx x0)) +DEF_TARGET_INSN (omp_simt_last_lane, (rtx x0, rtx x1)) +DEF_TARGET_INSN (omp_simt_ordered, (rtx x0, rtx x1)) +DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1)) +DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (probe_stack, (rtx x0)) DEF_TARGET_INSN (probe_stack_address, (rtx x0)) diff --git a/gcc/target.def b/gcc/target.def index bcdbc0e2452..2b357494ede 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1648,6 +1648,18 @@ int, (struct cgraph_node *), NULL) HOOK_VECTOR_END (simd_clone) +/* Functions relating to OpenMP SIMT vectorization transform. */ +#undef HOOK_PREFIX +#define HOOK_PREFIX "TARGET_SIMT_" +HOOK_VECTOR (TARGET_SIMT, simt) + +DEFHOOK +(vf, +"Return number of threads in SIMT thread group on the target.", +int, (void), NULL) + +HOOK_VECTOR_END (simt) + /* Functions relating to openacc. 
*/ #undef HOOK_PREFIX #define HOOK_PREFIX "TARGET_GOACC_" diff --git a/gcc/testsuite/ChangeLog.gomp-nvptx b/gcc/testsuite/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..24d26edeb64 --- /dev/null +++ b/gcc/testsuite/ChangeLog.gomp-nvptx @@ -0,0 +1,13 @@ +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * lib/target-supports.exp (check_effective_target_alloca): Use a + compile test. + +2016-05-19 Alexander Monakov <amonakov@ispras.ru> + + * gcc.target/nvptx/softstack.c: New test. + +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * gcc.target/nvptx/decl-shared.c: New test. + * gcc.target/nvptx/decl-shared-init.c: New test. diff --git a/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c b/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c new file mode 100644 index 00000000000..6a99b1c338a --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/decl-shared-init.c @@ -0,0 +1 @@ +int var __attribute__((shared)) = 0; /* { dg-error "static initialization .* not supported" } */ diff --git a/gcc/testsuite/gcc.target/nvptx/decl-shared.c b/gcc/testsuite/gcc.target/nvptx/decl-shared.c new file mode 100644 index 00000000000..367075cebe2 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/decl-shared.c @@ -0,0 +1,14 @@ +static int v_internal __attribute__((shared,used)); +int v_common __attribute__((shared)); +int v_extdef __attribute__((shared,nocommon)); +extern int v_extdecl __attribute__((shared)); + +int use() +{ + return v_extdecl; +} + +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.shared \[^,\r\n\]*v_internal" } } */ +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.weak .shared \[^,\r\n\]*v_common" } } */ +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.visible .shared \[^,\r\n\]*v_extdef" } } */ +/* { dg-final { scan-assembler "\[\r\n\]\[\t \]*.extern .shared \[^,\r\n\]*v_extdecl" } } */ diff --git a/gcc/testsuite/gcc.target/nvptx/softstack.c b/gcc/testsuite/gcc.target/nvptx/softstack.c new file mode 100644 index 00000000000..73e60f282a7 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/nvptx/softstack.c @@ -0,0 +1,23 @@ +/* { dg-options "-O2 -msoft-stack" } */ +/* { dg-do run } */ + +static __attribute__((noinline,noclone)) int f(int *p) +{ + return __sync_lock_test_and_set(p, 1); +} + +static __attribute__((noinline,noclone)) int g(int n) +{ + /* Check that variable-length stack allocation works. */ + int v[n]; + v[0] = 0; + /* Check that atomic operations can be applied to auto data. */ + return f(v) == 0 && v[0] == 1; +} + +int main() +{ + if (!g(1)) + __builtin_abort(); + return 0; +} diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index b683c09c1db..ce60cce9cae 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -763,7 +763,10 @@ proc check_effective_target_untyped_assembly {} { proc check_effective_target_alloca {} { if { [istarget nvptx-*-*] } { - return 0 + return [check_no_compiler_messages alloca assembly { + void f (void*); + void g (int n) { f (__builtin_alloca (n)); } + }] } return 1 } diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index da9ba1374b4..8befebe17b1 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -222,6 +222,7 @@ protected: of math functions; the current choices have been optimized. 
*/ +#define PROP_gimple_lomp_dev (1 << 16) /* done omp_device_lower */ #define PROP_trees \ (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp) @@ -417,6 +418,7 @@ extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt); extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt); extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt); extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt); extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt); extern gimple_opt_pass *make_pass_strlen (gcc::context *ctxt); extern gimple_opt_pass *make_pass_fold_builtins (gcc::context *ctxt); diff --git a/libgcc/ChangeLog.gomp-nvptx b/libgcc/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..f2cb0d88743 --- /dev/null +++ b/libgcc/ChangeLog.gomp-nvptx @@ -0,0 +1,27 @@ +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/stacks.c (__nvptx_stacks): Use 'nocommon' attribute to + emit a strong definition. + (__nvptx_uni): Ditto. + +2016-03-10 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/crt0.c (__nvptx_stacks): Define in C. Use it... + (__nvptx_uni): Ditto. + (__main): ...here instead of inline asm. + * config/nvptx/stacks.c (__nvptx_stacks): Define in C. + (__nvptx_uni): Ditto. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/crt0.c: New, rewritten in C from ... + * config/nvptx/crt0.s: ...this. Delete. + * config/nvptx/free.c: New, rewritten in C from ... + * config/nvptx/free.asm: ...this. Delete. + * config/nvptx/malloc.c: New, rewritten in C from ... + * config/nvptx/malloc.asm: ...this. Delete. + * config/nvptx/realloc.c: Handle out-of-memory condition. + * config/nvptx/nvptx-malloc.h (__nvptx_real_free, + __nvptx_real_malloc): Declare. + * config/nvptx/stacks.c: New. + * config/nvptx/t-nvptx: Adjust. 
diff --git a/libgcc/config/nvptx/crt0.c b/libgcc/config/nvptx/crt0.c index 3b7382d43af..5a7dbf6f22f 100644 --- a/libgcc/config/nvptx/crt0.c +++ b/libgcc/config/nvptx/crt0.c @@ -24,6 +24,14 @@ int *__exitval_ptr; extern void __attribute__((noreturn)) exit (int status); extern int main (int, void **); +/* Always setup soft stacks to allow testing with -msoft-stack but without + -mgomp. 32 is the maximum number of warps in a CTA: the definition here + must match the external declaration emitted by the compiler. */ +void *__nvptx_stacks[32] __attribute__((shared,nocommon)); + +/* Likewise for -muniform-simt. */ +unsigned __nvptx_uni[32] __attribute__((shared,nocommon)); + void __attribute__((kernel)) __main (int *rval_ptr, int argc, void **argv) { @@ -33,5 +41,9 @@ __main (int *rval_ptr, int argc, void **argv) if (rval_ptr) *rval_ptr = 255; + static char stack[131072] __attribute__((aligned(8))); + __nvptx_stacks[0] = stack + sizeof stack; + __nvptx_uni[0] = 0; + exit (main (argc, argv)); } diff --git a/libgcc/config/nvptx/nvptx-malloc.h b/libgcc/config/nvptx/mgomp.c index 9cbd84689b7..d8ca5818314 100644 --- a/libgcc/config/nvptx/nvptx-malloc.h +++ b/libgcc/config/nvptx/mgomp.c @@ -1,6 +1,6 @@ -/* Declarations for the malloc wrappers. +/* Define shared memory arrays for -msoft-stack and -muniform-simt. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2016 Free Software Foundation, Inc. This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -21,6 +21,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -extern void __nvptx_free (void *); -extern void *__nvptx_malloc (size_t); -extern void *__nvptx_realloc (void *, size_t); +/* OpenACC offloading does not use these symbols; thus, they are exposed + only for the -mgomp multilib. 
The same definitions are also provided + in crt0.c for the case of non-offloading compilation. 32 is the maximum + number of warps in a CTA. */ + +#if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__) +void *__nvptx_stacks[32] __attribute__((shared,nocommon)); +unsigned __nvptx_uni[32] __attribute__((shared,nocommon)); +#endif diff --git a/libgcc/config/nvptx/t-nvptx b/libgcc/config/nvptx/t-nvptx index daf252f2472..c4d20c94cbb 100644 --- a/libgcc/config/nvptx/t-nvptx +++ b/libgcc/config/nvptx/t-nvptx @@ -1,4 +1,5 @@ -LIB2ADD=$(srcdir)/config/nvptx/reduction.c +LIB2ADD=$(srcdir)/config/nvptx/reduction.c \ + $(srcdir)/config/nvptx/mgomp.c LIB2ADDEH= LIB2FUNCS_EXCLUDE=__main diff --git a/libgomp/ChangeLog.gomp-nvptx b/libgomp/ChangeLog.gomp-nvptx new file mode 100644 index 00000000000..65f64a2174c --- /dev/null +++ b/libgomp/ChangeLog.gomp-nvptx @@ -0,0 +1,296 @@ +2016-11-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c: Adjust comments. + +2016-05-06 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c (nvptx_thrs): Use 'nocommon' attribute to emit a + strong definition. + +2016-03-24 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c (gomp_thread_start): Work around NVIDIA driver + bug by adding an exit edge to the loop, + +2016-03-24 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/target.c (GOMP_teams): Do not call 'free'. + * config/nvptx/team.c (gomp_nvptx_main): Use 'alloca' instead of + 'malloc' to obtain storage. Do not call 'free'. + * team.c (gomp_free_thread) [__nvptx__]: Do not call 'free'. + +2016-03-11 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (map_fini): Make cuMemFreeHost error non-fatal. + +2016-03-04 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/bar.c: Remove wrong invocation of + gomp_barrier_wait_end from gomp_team_barrier_wait_end. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_stacks_size): New. 
+ (nvptx_stacks_alloc): New. + (nvptx_stacks_free): New. + (GOMP_OFFLOAD_run): Allocate soft-stacks storage from the host using + the above new functions. Use kernel launch interface that allows + checking for mismatched total size of entry function arguments. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c: (gomp_nvptx_main_1): Rename back to... + (gomp_nvptx_main): ...this; delete the wrapper. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + Revert + 2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_open_device): Adjust heap size. + +2016-02-15 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_adjust_launch_bounds): Adjust types. + (GOMP_OFFLOAD_run): Ditto. + +2016-01-21 Alexander Monakov <amonakov@ispras.ru> + + * config/posix/affinity.c: Move to... + * affinity.c: ...here (new file). Guard use of PThreads-specific + interface by LIBGOMP_USE_PTHREADS. + * config/nvptx/affinity.c: Delete (identical to affinity.c). + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (struct targ_fn_descriptor): Add new fields. + (struct ptx_device): Ditto. Set them... + (nvptx_open_device): ...here. + (GOMP_OFFLOAD_load_image): Set new targ_fn_descriptor fields. + (nvptx_adjust_launch_bounds): New. Use it... + (GOMP_OFFLOAD_run): ...here. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/icv-device.c (omp_get_num_teams): Update. + (omp_get_team_num): Ditto. + * config/nvptx/target.c (GOMP_teams): Update. + * config/nvptx/team.c (nvptx_thrs): Place in shared memory. + * icv.c (gomp_num_teams_var): Define. + * libgomp.h (gomp_num_teams_var): Declare. + (nvptx_thrs): Place in shared memory. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * testsuite/libgomp.fortran/fortran.exp (lang_link_flags): Pass + -foffload=-lgfortran in addition to -lgfortran. + * testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Ditto. 
+ +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/fortran.c: Delete. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/linux/lock.c (gomp_init_lock_30): Move to generic lock.c. + (gomp_destroy_lock_30): Ditto. + (gomp_set_lock_30): Ditto. + (gomp_unset_lock_30): Ditto. + (gomp_test_lock_30): Ditto. + (gomp_init_nest_lock_30): Ditto. + (gomp_destroy_nest_lock_30): Ditto. + (gomp_set_nest_lock_30): Ditto. + (gomp_unset_nest_lock_30): Ditto. + (gomp_test_nest_lock_30): Ditto. + * lock.c: New. + * config/nvptx/lock.c: New. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (struct ptx_device): New field (clock_khz). + (nvptx_open_device): Set it. + (nvptx_set_clocktick): New. Use it... + (GOMP_OFFLOAD_load_image): ...here. + +2016-01-20 Dmitry Melnik <dm@ispras.ru> + + * config/nvptx/time.c: New. + +2016-01-20 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/section.c: Delete. + * config/nvptx/splay-tree.c: Delete. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/task.c: Guard with #ifdef __nvptx_softstack__. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + Revert + 2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * task.c (GOMP_task): Use a fixed-size on-stack buffer or a heap + allocation instead of a variable-size on-stack allocation. + +2015-12-14 Alexander Monakov <amonakov@ispras.ru> + + Revert + 2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * taskloop.c (GOMP_taskloop): Avoid alloca on NVPTX. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/priority_queue.c: Delete. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_async_run): New (stub). + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/task.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/team.c (gomp_nvptx_main): Rename to... + (gomp_nvptx_main_1): ... 
this and mark noinline. + (gomp_nvptx_main): Wrap the above, set up __nvptx_uni and + __nvptx_stacks. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (nvptx_open_device): Adjust heap size. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/pool.h: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/doacross.h: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * taskloop.c (GOMP_taskloop): Avoid alloca on NVPTX. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_dev2dev): New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/affinity.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (link_ptx): Adjust log sizes. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_run): Start 8 warps. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/alloc.c: Delete. + * config/nvptx/barrier.c: Ditto. + * config/nvptx/iter.c: Ditto. + * config/nvptx/iter_ull.c: Ditto. + * config/nvptx/loop.c: Ditto. + * config/nvptx/loop_ull.c: Ditto. + * config/nvptx/ordered.c: Ditto. + * config/nvptx/parallel.c: Ditto. + * config/nvptx/single.c: Ditto. + * config/nvptx/task.c: Ditto. + * config/nvptx/work.c: Ditto. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * Makefile.am (libgomp_la_SOURCES): Add atomic.c. + * Makefile.in: Regenerate. + * critical.c: Split out GOMP_atomic_{start,end} into... + * atomic.c: ...here (new file). + * config/nvptx/critical.c: Delete. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/target.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/error.c: New. + +2015-12-09 Alexander Monakov <amonakov@ispras.ru> + + * libgomp.h [__nvptx__] (gomp_thread): New implementation. + * config/nvptx/team.c (gomp_nvptx_main): New. 
+ (gomp_thread_start): New (NVPTX-specific implementation). + (gomp_team_start): Ditto. + * team.c: Guard uses of PThreads-specific interfaces by + LIBGOMP_USE_PTHREADS. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/simple-bar.h: New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/posix/simple-bar.h: New file. Use it ... + * libgomp.h: ...here: new include. + (struct gomp_thread_pool): Change threads_dock member to + gomp_simple_barrier_t. + * team.c: Adjust all uses. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/bar.c: New file. + * config/nvptx/bar.h: Ditto. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/proc.c: New. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * plugin/plugin-nvptx.c (link_ptx): Do not set CU_JIT_TARGET. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/icv-device.c: New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * Makefile.am (libgomp_la_SOURCES): Add icv.c and icv-device.c. + * Makefile.in: Regenerate. + * env.c: Split out ICV definitions into... + * icv.c: ...here (new file) and... + * icv-device.c: ...here. New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * configure.ac [nvptx*-*-*] (libgomp_use_pthreads): Set and use it... + (LIBGOMP_USE_PTHREADS): ...here; new define. + * configure: Regenerate. + * config.h.in: Likewise. + * libgomp.h: Guard pthread.h inclusion. + (gomp_thread_attr): Guard by LIBGOMP_USE_PTHREADS. + (gomp_init_thread_affinity): Ditto. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/bar.h: New file. + +2015-12-08 Alexander Monakov <amonakov@ispras.ru> + + * config/nvptx/mutex.h: New file. + * config/nvptx/ptrlock.h: New file. + * config/nvptx/sem.h: New file. + +2015-12-08 Jakub Jelinek <jakub@redhat.com> + + * plugin/plugin-nvptx.c (nvptx_host2dev): Allow NULL 'nvthd'. + (nvptx_dev2host): Ditto. 
+ (GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400. + (GOMP_OFFLOAD_run): New. + diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am index a3e1c2b2c0f..4090336a773 100644 --- a/libgomp/Makefile.am +++ b/libgomp/Makefile.am @@ -58,12 +58,12 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \ libgomp_la_DEPENDENCIES = $(libgomp_version_dep) libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS) -libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ - iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ - task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ - time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \ - oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \ - oacc-plugin.c oacc-cuda.c priority_queue.c +libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \ + icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c ordered.c \ + parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \ + proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \ + splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \ + oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c include $(top_srcdir)/plugin/Makefrag.am diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index 88c8517fddd..a630f2b6438 100644 --- a/libgomp/Makefile.in +++ b/libgomp/Makefile.in @@ -173,14 +173,14 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL) --tag=CC \ @PLUGIN_NVPTX_TRUE@ $(toolexeclibdir) libgomp_la_LIBADD = @USE_FORTRAN_TRUE@am__objects_1 = openacc.lo -am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \ - error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \ - parallel.lo sections.lo single.lo task.lo team.lo work.lo \ - lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \ - fortran.lo affinity.lo target.lo splay-tree.lo \ - libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \ - 
oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \ - priority_queue.lo $(am__objects_1) +am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \ + env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \ + loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \ + single.lo task.lo team.lo work.lo lock.lo mutex.lo proc.lo \ + sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \ + target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \ + oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \ + oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1) libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/../depcomp @@ -429,13 +429,14 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \ libgomp_la_DEPENDENCIES = $(libgomp_version_dep) libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS) -libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ - iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c \ - single.c task.c team.c work.c lock.c mutex.c proc.c sem.c \ - bar.c ptrlock.c time.c fortran.c affinity.c target.c \ - splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \ - oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \ - priority_queue.c $(am__append_3) +libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \ + error.c icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c \ + ordered.c parallel.c sections.c single.c task.c team.c work.c \ + lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c \ + affinity.c target.c splay-tree.c libgomp-plugin.c \ + oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \ + oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ + $(am__append_3) # Nvidia PTX OpenACC plugin. 
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) @@ -598,12 +599,15 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bar.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/barrier.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/critical.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv-device.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@ diff --git a/libgomp/config/posix/affinity.c b/libgomp/affinity.c index 32986fa8281..3549e54b67c 100644 --- a/libgomp/config/posix/affinity.c +++ b/libgomp/affinity.c @@ -32,12 +32,14 @@ gomp_init_affinity (void) { } +#ifdef LIBGOMP_USE_PTHREADS void gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place) { (void) attr; (void) place; } +#endif void ** gomp_affinity_alloc (unsigned long count, bool quiet) diff --git a/libgomp/atomic.c b/libgomp/atomic.c new file mode 100644 index 00000000000..bad0736e21e --- /dev/null +++ b/libgomp/atomic.c @@ -0,0 +1,57 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains helpers for the ATOMIC construct. */ + +#include "libgomp.h" + +/* This mutex is used when atomic operations don't exist for the target + in the mode requested. The result is not globally atomic, but works so + long as all parallel references are within #pragma omp atomic directives. + According to responses received from omp@openmp.org, appears to be within + spec. Which makes sense, since that's how several other compilers + handle this situation as well. */ + +static gomp_mutex_t atomic_lock; + +void +GOMP_atomic_start (void) +{ + gomp_mutex_lock (&atomic_lock); +} + +void +GOMP_atomic_end (void) +{ + gomp_mutex_unlock (&atomic_lock); +} + +#if !GOMP_MUTEX_INIT_0 +static void __attribute__((constructor)) +initialize_atomic (void) +{ + gomp_mutex_init (&atomic_lock); +} +#endif diff --git a/libgomp/config.h.in b/libgomp/config.h.in index 226ac5358a7..e5526f6be82 100644 --- a/libgomp/config.h.in +++ b/libgomp/config.h.in @@ -94,6 +94,9 @@ /* Define to 1 if GNU symbol versioning is used for libgomp. 
*/ #undef LIBGOMP_GNU_SYMBOL_VERSIONING +/* Define to 1 if libgomp should use POSIX threads. */ +#undef LIBGOMP_USE_PTHREADS + /* Define to the sub-directory in which libtool stores uninstalled libraries. */ #undef LT_OBJDIR diff --git a/libgomp/config/linux/lock.c b/libgomp/config/linux/lock.c index a671b7e0a89..07845c98ad1 100644 --- a/libgomp/config/linux/lock.c +++ b/libgomp/config/linux/lock.c @@ -32,98 +32,8 @@ #include <sys/syscall.h> #include "wait.h" - -/* The internal gomp_mutex_t and the external non-recursive omp_lock_t - have the same form. Re-use it. */ - -void -gomp_init_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_init (lock); -} - -void -gomp_destroy_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_destroy (lock); -} - -void -gomp_set_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_lock (lock); -} - -void -gomp_unset_lock_30 (omp_lock_t *lock) -{ - gomp_mutex_unlock (lock); -} - -int -gomp_test_lock_30 (omp_lock_t *lock) -{ - int oldval = 0; - - return __atomic_compare_exchange_n (lock, &oldval, 1, false, - MEMMODEL_ACQUIRE, MEMMODEL_RELAXED); -} - -void -gomp_init_nest_lock_30 (omp_nest_lock_t *lock) -{ - memset (lock, '\0', sizeof (*lock)); -} - -void -gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock) -{ -} - -void -gomp_set_nest_lock_30 (omp_nest_lock_t *lock) -{ - void *me = gomp_icv (true); - - if (lock->owner != me) - { - gomp_mutex_lock (&lock->lock); - lock->owner = me; - } - - lock->count++; -} - -void -gomp_unset_nest_lock_30 (omp_nest_lock_t *lock) -{ - if (--lock->count == 0) - { - lock->owner = NULL; - gomp_mutex_unlock (&lock->lock); - } -} - -int -gomp_test_nest_lock_30 (omp_nest_lock_t *lock) -{ - void *me = gomp_icv (true); - int oldval; - - if (lock->owner == me) - return ++lock->count; - - oldval = 0; - if (__atomic_compare_exchange_n (&lock->lock, &oldval, 1, false, - MEMMODEL_ACQUIRE, MEMMODEL_RELAXED)) - { - lock->owner = me; - lock->count = 1; - return 1; - } - - return 0; -} +/* Reuse the generic implementation in terms of gomp_mutex_t. 
*/ +#include "../../lock.c" #ifdef LIBGOMP_GNU_SYMBOL_VERSIONING /* gomp_mutex_* can be safely locked in one thread and diff --git a/libgomp/config/nvptx/affinity.c b/libgomp/config/nvptx/affinity.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/affinity.c +++ /dev/null diff --git a/libgomp/config/nvptx/alloc.c b/libgomp/config/nvptx/alloc.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/alloc.c +++ /dev/null diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c index e69de29bb2d..a0d8a44dacf 100644 --- a/libgomp/config/nvptx/bar.c +++ b/libgomp/config/nvptx/bar.c @@ -0,0 +1,209 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a barrier synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and bar.sync instruction. 
*/ + +#include <limits.h> +#include "libgomp.h" + + +void +gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +{ + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. */ + bar->awaited = bar->total; + __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, + MEMMODEL_RELEASE); + } + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); +} + +void +gomp_barrier_wait (gomp_barrier_t *bar) +{ + gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +} + +/* Like gomp_barrier_wait, except that if the encountering thread + is not the last one to hit the barrier, it returns immediately. + The intended usage is that a thread which intends to gomp_barrier_destroy + this barrier calls gomp_barrier_wait, while all other threads + call gomp_barrier_wait_last. When gomp_barrier_wait returns, + the barrier can be safely destroyed. */ + +void +gomp_barrier_wait_last (gomp_barrier_t *bar) +{ +#if 0 + gomp_barrier_state_t state = gomp_barrier_wait_start (bar); + if (state & BAR_WAS_LAST) + gomp_barrier_wait_end (bar, state); +#else + gomp_barrier_wait (bar); +#endif +} + +void +gomp_team_barrier_wake (gomp_barrier_t *bar, int count) +{ + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); +} + +void +gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +{ + unsigned int generation, gen; + + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. 
*/ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + + bar->awaited = bar->total; + team->work_share_cancelled = 0; + if (__builtin_expect (team->task_count, 0)) + { + gomp_barrier_handle_tasks (state); + state &= ~BAR_WAS_LAST; + } + else + { + state &= ~BAR_CANCELLED; + state += BAR_INCR - BAR_WAS_LAST; + __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + return; + } + } + + generation = state; + state &= ~BAR_CANCELLED; + do + { + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) + { + gomp_barrier_handle_tasks (state); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + } + generation |= gen & BAR_WAITING_FOR_TASK; + } + while (gen != state + BAR_INCR); +} + +void +gomp_team_barrier_wait (gomp_barrier_t *bar) +{ + gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +} + +void +gomp_team_barrier_wait_final (gomp_barrier_t *bar) +{ + gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar); + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + bar->awaited_final = bar->total; + gomp_team_barrier_wait_end (bar, state); +} + +bool +gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, + gomp_barrier_state_t state) +{ + unsigned int generation, gen; + + if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. */ + /* BAR_CANCELLED should never be set in state here, because + cancellation means that at least one of the threads has been + cancelled, thus on a cancellable barrier we should never see + all threads to arrive. 
*/ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + + bar->awaited = bar->total; + team->work_share_cancelled = 0; + if (__builtin_expect (team->task_count, 0)) + { + gomp_barrier_handle_tasks (state); + state &= ~BAR_WAS_LAST; + } + else + { + state += BAR_INCR - BAR_WAS_LAST; + __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + return false; + } + } + + if (__builtin_expect (state & BAR_CANCELLED, 0)) + return true; + + generation = state; + do + { + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + if (__builtin_expect (gen & BAR_CANCELLED, 0)) + return true; + if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) + { + gomp_barrier_handle_tasks (state); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + } + generation |= gen & BAR_WAITING_FOR_TASK; + } + while (gen != state + BAR_INCR); + + return false; +} + +bool +gomp_team_barrier_wait_cancel (gomp_barrier_t *bar) +{ + return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar)); +} + +void +gomp_team_barrier_cancel (struct gomp_team *team) +{ + gomp_mutex_lock (&team->task_lock); + if (team->barrier.generation & BAR_CANCELLED) + { + gomp_mutex_unlock (&team->task_lock); + return; + } + team->barrier.generation |= BAR_CANCELLED; + gomp_mutex_unlock (&team->task_lock); + gomp_team_barrier_wake (&team->barrier, INT_MAX); +} diff --git a/libgomp/config/nvptx/bar.h b/libgomp/config/nvptx/bar.h new file mode 100644 index 00000000000..757edf1d7eb --- /dev/null +++ b/libgomp/config/nvptx/bar.h @@ -0,0 +1,166 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a barrier synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and bar.sync instruction. */ + +#ifndef GOMP_BARRIER_H +#define GOMP_BARRIER_H 1 + +#include "mutex.h" + +typedef struct +{ + unsigned total; + unsigned generation; + unsigned awaited; + unsigned awaited_final; +} gomp_barrier_t; + +typedef unsigned int gomp_barrier_state_t; + +/* The generation field contains a counter in the high bits, with a few + low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can + share space because WAS_LAST is never stored back to generation. 
*/ +#define BAR_TASK_PENDING 1 +#define BAR_WAS_LAST 1 +#define BAR_WAITING_FOR_TASK 2 +#define BAR_CANCELLED 4 +#define BAR_INCR 8 + +static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count) +{ + bar->total = count; + bar->awaited = count; + bar->awaited_final = count; + bar->generation = 0; +} + +static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count) +{ + __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL); + bar->total = count; +} + +static inline void gomp_barrier_destroy (gomp_barrier_t *bar) +{ +} + +extern void gomp_barrier_wait (gomp_barrier_t *); +extern void gomp_barrier_wait_last (gomp_barrier_t *); +extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t); +extern void gomp_team_barrier_wait (gomp_barrier_t *); +extern void gomp_team_barrier_wait_final (gomp_barrier_t *); +extern void gomp_team_barrier_wait_end (gomp_barrier_t *, + gomp_barrier_state_t); +extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *); +extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *, + gomp_barrier_state_t); +extern void gomp_team_barrier_wake (gomp_barrier_t *, int); +struct gomp_team; +extern void gomp_team_barrier_cancel (struct gomp_team *); + +static inline gomp_barrier_state_t +gomp_barrier_wait_start (gomp_barrier_t *bar) +{ + unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + ret &= -BAR_INCR | BAR_CANCELLED; + /* A memory barrier is needed before exiting from the various forms + of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section + 2.8.6 flush Construct, which says there is an implicit flush during + a barrier region. This is a convenient place to add the barrier, + so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE. 
*/ + if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0) + ret |= BAR_WAS_LAST; + return ret; +} + +static inline gomp_barrier_state_t +gomp_barrier_wait_cancel_start (gomp_barrier_t *bar) +{ + return gomp_barrier_wait_start (bar); +} + +/* This is like gomp_barrier_wait_start, except it decrements + bar->awaited_final rather than bar->awaited and should be used + for the gomp_team_end barrier only. */ +static inline gomp_barrier_state_t +gomp_barrier_wait_final_start (gomp_barrier_t *bar) +{ + unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + ret &= -BAR_INCR | BAR_CANCELLED; + /* See above gomp_barrier_wait_start comment. */ + if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0) + ret |= BAR_WAS_LAST; + return ret; +} + +static inline bool +gomp_barrier_last_thread (gomp_barrier_state_t state) +{ + return state & BAR_WAS_LAST; +} + +/* All the inlines below must be called with team->task_lock + held. */ + +static inline void +gomp_team_barrier_set_task_pending (gomp_barrier_t *bar) +{ + bar->generation |= BAR_TASK_PENDING; +} + +static inline void +gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar) +{ + bar->generation &= ~BAR_TASK_PENDING; +} + +static inline void +gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar) +{ + bar->generation |= BAR_WAITING_FOR_TASK; +} + +static inline bool +gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar) +{ + return (bar->generation & BAR_WAITING_FOR_TASK) != 0; +} + +static inline bool +gomp_team_barrier_cancelled (gomp_barrier_t *bar) +{ + return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0); +} + +static inline void +gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state) +{ + bar->generation = (state & -BAR_INCR) + BAR_INCR; +} + +#endif /* GOMP_BARRIER_H */ diff --git a/libgomp/config/nvptx/barrier.c b/libgomp/config/nvptx/barrier.c deleted file mode 100644 index e69de29bb2d..00000000000 --- 
a/libgomp/config/nvptx/barrier.c +++ /dev/null diff --git a/libgomp/config/nvptx/critical.c b/libgomp/config/nvptx/critical.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/critical.c +++ /dev/null diff --git a/libgomp/config/nvptx/doacross.h b/libgomp/config/nvptx/doacross.h new file mode 100644 index 00000000000..fd011d492bd --- /dev/null +++ b/libgomp/config/nvptx/doacross.h @@ -0,0 +1,60 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is the NVPTX implementation of doacross spinning. */ + +#ifndef GOMP_DOACROSS_H +#define GOMP_DOACROSS_H 1 + +#include "libgomp.h" + +static int zero; + +static inline int +cpu_relax (void) +{ + int r; + /* Here we need a long-latency operation to make the current warp yield. + We could use ld.cv, uncached load from system (host) memory, but that + would require allocating locked memory in the plugin. 
Alternatively, + we can use ld.cg, which evicts from L1 and caches in L2. */ + asm volatile ("ld.cg.s32 %0, [%1];" : "=r" (r) : "i" (&zero) : "memory"); + return r; +} + +static inline void doacross_spin (unsigned long *addr, unsigned long expected, + unsigned long cur) +{ + /* Prevent compiler from optimizing based on bounds of containing object. */ + asm ("" : "+r" (addr)); + do + { + int i = cpu_relax (); + cur = addr[i]; + } + while (cur <= expected); +} + +#endif /* GOMP_DOACROSS_H */ diff --git a/libgomp/config/nvptx/error.c b/libgomp/config/nvptx/error.c index e69de29bb2d..f3e74cd9449 100644 --- a/libgomp/config/nvptx/error.c +++ b/libgomp/config/nvptx/error.c @@ -0,0 +1,42 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains routines used to signal errors. On NVPTX, we have + one default output stream (stdout), so redirect everything there. 
*/ + +#include "libgomp.h" +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +#undef vfprintf +#undef fputs +#undef fputc + +#define vfprintf(stream, fmt, list) vprintf (fmt, list) +#define fputs(s, stream) printf ("%s", s) +#define fputc(c, stream) printf ("%c", c) + +#include "../../error.c" diff --git a/libgomp/config/nvptx/icv-device.c b/libgomp/config/nvptx/icv-device.c new file mode 100644 index 00000000000..831ba110e95 --- /dev/null +++ b/libgomp/config/nvptx/icv-device.c @@ -0,0 +1,74 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file defines OpenMP API entry points that accelerator targets are + expected to replace. 
*/ + +#include "libgomp.h" + +void +omp_set_default_device (int device_num __attribute__((unused))) +{ +} + +int +omp_get_default_device (void) +{ + return 0; +} + +int +omp_get_num_devices (void) +{ + return 0; +} + +int +omp_get_num_teams (void) +{ + return gomp_num_teams_var + 1; +} + +int +omp_get_team_num (void) +{ + int ctaid; + asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); + return ctaid; +} + +int +omp_is_initial_device (void) +{ + /* NVPTX is an accelerator-only target. */ + return 0; +} + +ialias (omp_set_default_device) +ialias (omp_get_default_device) +ialias (omp_get_num_devices) +ialias (omp_get_num_teams) +ialias (omp_get_team_num) +ialias (omp_is_initial_device) diff --git a/libgomp/config/nvptx/iter.c b/libgomp/config/nvptx/iter.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/iter.c +++ /dev/null diff --git a/libgomp/config/nvptx/iter_ull.c b/libgomp/config/nvptx/iter_ull.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/iter_ull.c +++ /dev/null diff --git a/libgomp/config/nvptx/lock.c b/libgomp/config/nvptx/lock.c index e69de29bb2d..7731704c6fb 100644 --- a/libgomp/config/nvptx/lock.c +++ b/libgomp/config/nvptx/lock.c @@ -0,0 +1,41 @@ +/* Copyright (C) 2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is a NVPTX specific implementation of the public OpenMP locking + primitives. */ + +/* Reuse the generic implementation in terms of gomp_mutex_t. */ +#include "../../lock.c" + +ialias (omp_init_lock) +ialias (omp_init_nest_lock) +ialias (omp_destroy_lock) +ialias (omp_destroy_nest_lock) +ialias (omp_set_lock) +ialias (omp_set_nest_lock) +ialias (omp_unset_lock) +ialias (omp_unset_nest_lock) +ialias (omp_test_lock) +ialias (omp_test_nest_lock) diff --git a/libgomp/config/nvptx/loop.c b/libgomp/config/nvptx/loop.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/loop.c +++ /dev/null diff --git a/libgomp/config/nvptx/loop_ull.c b/libgomp/config/nvptx/loop_ull.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/loop_ull.c +++ /dev/null diff --git a/libgomp/config/nvptx/mutex.h b/libgomp/config/nvptx/mutex.h new file mode 100644 index 00000000000..e408ca72933 --- /dev/null +++ b/libgomp/config/nvptx/mutex.h @@ -0,0 +1,60 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a mutex synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and busy waiting. */ + +#ifndef GOMP_MUTEX_H +#define GOMP_MUTEX_H 1 + +typedef int gomp_mutex_t; + +#define GOMP_MUTEX_INIT_0 1 + +static inline void +gomp_mutex_init (gomp_mutex_t *mutex) +{ + *mutex = 0; +} + +static inline void +gomp_mutex_destroy (gomp_mutex_t *mutex) +{ +} + +static inline void +gomp_mutex_lock (gomp_mutex_t *mutex) +{ + while (__sync_lock_test_and_set (mutex, 1)) + /* spin */ ; +} + +static inline void +gomp_mutex_unlock (gomp_mutex_t *mutex) +{ + __sync_lock_release (mutex); +} +#endif /* GOMP_MUTEX_H */ diff --git a/libgomp/config/nvptx/ordered.c b/libgomp/config/nvptx/ordered.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/ordered.c +++ /dev/null diff --git a/libgomp/config/nvptx/parallel.c b/libgomp/config/nvptx/parallel.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/parallel.c +++ /dev/null diff --git a/libgomp/config/nvptx/fortran.c b/libgomp/config/nvptx/pool.h index 71ba6ed73f5..70e233c7caa 100644 --- a/libgomp/config/nvptx/fortran.c +++ b/libgomp/config/nvptx/pool.h @@ -1,8 +1,5 @@ -/* OpenACC Runtime Fortran 
wrapper routines - - Copyright (C) 2014-2016 Free Software Foundation, Inc. - - Contributed by Mentor Embedded. +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> This file is part of the GNU Offloading and Multi Processing Library (libgomp). @@ -26,15 +23,27 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* Temporary hack; this will be provided by libgfortran. */ +/* This is the NVPTX implementation of the thread pool management + for libgomp. This type is private to the library. */ + +#ifndef GOMP_POOL_H +#define GOMP_POOL_H 1 + +#include "libgomp.h" + +/* Get the thread pool. */ + +static inline struct gomp_thread_pool * +gomp_get_thread_pool (struct gomp_thread *thr, unsigned nthreads) +{ + /* NVPTX is running with a fixed pool of pre-started threads. */ + return thr->thread_pool; +} -extern void _gfortran_abort (void); +static inline void +gomp_release_thread_pool (struct gomp_thread_pool *pool) +{ + /* Do nothing. */ +} -__asm__ ("// BEGIN GLOBAL FUNCTION DECL: _gfortran_abort\n" - ".visible .func _gfortran_abort;\n" - "// BEGIN GLOBAL FUNCTION DEF: _gfortran_abort\n" - ".visible .func _gfortran_abort\n" - "{\n" - "trap;\n" - "ret;\n" - "}\n"); +#endif /* GOMP_POOL_H */ diff --git a/libgomp/config/nvptx/priority_queue.c b/libgomp/config/nvptx/priority_queue.c deleted file mode 100644 index 63aecd249fb..00000000000 --- a/libgomp/config/nvptx/priority_queue.c +++ /dev/null @@ -1 +0,0 @@ -/* Empty stub for omp task priority support. */ diff --git a/libgomp/config/nvptx/proc.c b/libgomp/config/nvptx/proc.c index e69de29bb2d..8c1c3664660 100644 --- a/libgomp/config/nvptx/proc.c +++ b/libgomp/config/nvptx/proc.c @@ -0,0 +1,41 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains system specific routines related to counting + online processors and dynamic load balancing. */ + +#include "libgomp.h" + +unsigned +gomp_dynamic_max_threads (void) +{ + return gomp_icv (false)->nthreads_var; +} + +int +omp_get_num_procs (void) +{ + return gomp_icv (false)->nthreads_var; +} diff --git a/libgomp/config/nvptx/ptrlock.h b/libgomp/config/nvptx/ptrlock.h new file mode 100644 index 00000000000..c2eae75720d --- /dev/null +++ b/libgomp/config/nvptx/ptrlock.h @@ -0,0 +1,73 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a mutex synchronization + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and busy waiting. + + A ptrlock has four states: + 0/NULL Initial + 1 Owned by me, I get to write a pointer to ptrlock. + 2 Some thread is waiting on the ptrlock. + >2 Ptrlock contains a valid pointer. + It is not valid to gain the ptrlock and then write a NULL to it. 
*/ + +#ifndef GOMP_PTRLOCK_H +#define GOMP_PTRLOCK_H 1 + +typedef void *gomp_ptrlock_t; + +static inline void gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr) +{ + *ptrlock = ptr; +} + +static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock) +{ + uintptr_t v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE); + if (v > 2) + return (void *) v; + + if (v == 0 + && __atomic_compare_exchange_n (ptrlock, &v, 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_ACQUIRE)) + return NULL; + + while (v == 1) + v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE); + + return (void *) v; +} + +static inline void gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr) +{ + __atomic_store_n (ptrlock, ptr, MEMMODEL_RELEASE); +} + +static inline void gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock) +{ +} + +#endif /* GOMP_PTRLOCK_H */ diff --git a/libgomp/config/nvptx/sections.c b/libgomp/config/nvptx/sections.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/sections.c +++ /dev/null diff --git a/libgomp/config/nvptx/sem.h b/libgomp/config/nvptx/sem.h new file mode 100644 index 00000000000..82c0dfbf7e0 --- /dev/null +++ b/libgomp/config/nvptx/sem.h @@ -0,0 +1,65 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is an NVPTX specific implementation of a semaphore synchronization + mechanism for libgomp. This type is private to the library. This + semaphore implementation uses atomic instructions and busy waiting. */ + +#ifndef GOMP_SEM_H +#define GOMP_SEM_H 1 + +typedef int gomp_sem_t; + +static inline void +gomp_sem_init (gomp_sem_t *sem, int value) +{ + *sem = value; +} + +static inline void +gomp_sem_destroy (gomp_sem_t *sem) +{ +} + +static inline void +gomp_sem_wait (gomp_sem_t *sem) +{ + int count = __atomic_load_n (sem, MEMMODEL_ACQUIRE); + for (;;) + { + while (count == 0) + count = __atomic_load_n (sem, MEMMODEL_ACQUIRE); + if (__atomic_compare_exchange_n (sem, &count, count - 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_RELAXED)) + return; + } +} + +static inline void +gomp_sem_post (gomp_sem_t *sem) +{ + (void) __atomic_add_fetch (sem, 1, MEMMODEL_RELEASE); +} +#endif /* GOMP_SEM_H */ diff --git a/libgomp/config/nvptx/simple-bar.h b/libgomp/config/nvptx/simple-bar.h new file mode 100644 index 00000000000..e7b56d98826 --- /dev/null +++ b/libgomp/config/nvptx/simple-bar.h @@ -0,0 +1,70 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simplified barrier that is suitable for thread pool
+   synchronization.  Only a subset of full barrier API (bar.h) is exposed.
+   Here in the NVPTX-specific implementation, we expect that thread pool
+   corresponds to a PTX CTA (thread block).  */
+
+#ifndef GOMP_SIMPLE_BARRIER_H
+#define GOMP_SIMPLE_BARRIER_H 1
+
+typedef struct
+{
+  unsigned count;
+} gomp_simple_barrier_t;
+
+static inline void
+gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
+{
+  bar->count = count * 32;
+}
+
+/* Unused on NVPTX.
+static inline void +gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count) +{ + bar->count = count * 32; +} +*/ + +static inline void +gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar) +{ +} + +static inline void +gomp_simple_barrier_wait (gomp_simple_barrier_t *bar) +{ + asm volatile ("bar.sync 0, %0;" : : "r" (bar->count) : "memory"); +} + +static inline void +gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar) +{ + asm volatile ("bar.arrive 0, %0;" : : "r" (bar->count) : "memory"); +} + +#endif /* GOMP_SIMPLE_BARRIER_H */ diff --git a/libgomp/config/nvptx/single.c b/libgomp/config/nvptx/single.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/single.c +++ /dev/null diff --git a/libgomp/config/nvptx/splay-tree.c b/libgomp/config/nvptx/splay-tree.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/splay-tree.c +++ /dev/null diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c index e69de29bb2d..38ea7f7aa68 100644 --- a/libgomp/config/nvptx/target.c +++ b/libgomp/config/nvptx/target.c @@ -0,0 +1,49 @@ +/* Copyright (C) 2013-2016 Free Software Foundation, Inc. + Contributed by Jakub Jelinek <jakub@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#include "libgomp.h" +#include <limits.h> + +void +GOMP_teams (unsigned int num_teams, unsigned int thread_limit) +{ + if (thread_limit) + { + struct gomp_task_icv *icv = gomp_icv (true); + icv->thread_limit_var + = thread_limit > INT_MAX ? UINT_MAX : thread_limit; + } + unsigned int num_blocks, block_id; + asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks)); + asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id)); + if (!num_teams || num_teams >= num_blocks) + num_teams = num_blocks; + else if (block_id >= num_teams) + { + gomp_free_thread (nvptx_thrs); + asm ("exit;"); + } + gomp_num_teams_var = num_teams - 1; +} diff --git a/libgomp/config/nvptx/task.c b/libgomp/config/nvptx/task.c index e69de29bb2d..c183716c94e 100644 --- a/libgomp/config/nvptx/task.c +++ b/libgomp/config/nvptx/task.c @@ -0,0 +1,43 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles the maintenance of tasks in response to task
+   creation and termination.  */
+
+#ifdef __nvptx_softstack__
+
+#include "libgomp.h"
+
+/* NVPTX is an accelerator-only target, so this should never be called.  */
+
+bool
+gomp_target_task_fn (void *data)
+{
+  __builtin_unreachable ();
+}
+
+#include "../../task.c"
+
+#endif
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index e69de29bb2d..f7b5e3e81b5 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -0,0 +1,178 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles maintenance of threads on NVPTX.
*/ + +#if defined __nvptx_softstack__ && defined __nvptx_unisimt__ + +#include "libgomp.h" +#include <stdlib.h> +#include <string.h> + +struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon)); + +static void gomp_thread_start (struct gomp_thread_pool *); + + +/* This externally visible function handles target region entry. It + sets up a per-team thread pool and transfers control by calling FN (FN_DATA) + in the master thread or gomp_thread_start in other threads. + + The name of this function is part of the interface with the compiler: for + each target region, GCC emits a PTX .kernel function that sets up soft-stack + and uniform-simt state and calls this function, passing in FN the original + function outlined for the target region. */ + +void +gomp_nvptx_main (void (*fn) (void *), void *fn_data) +{ + int tid, ntids; + asm ("mov.u32 %0, %%tid.y;" : "=r" (tid)); + asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids)); + if (tid == 0) + { + gomp_global_icv.nthreads_var = ntids; + /* Starting additional threads is not supported. */ + gomp_global_icv.dyn_var = true; + + nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs)); + memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs)); + + struct gomp_thread_pool *pool = alloca (sizeof (*pool)); + pool->threads = alloca (ntids * sizeof (*pool->threads)); + for (tid = 0; tid < ntids; tid++) + pool->threads[tid] = nvptx_thrs + tid; + pool->threads_size = ntids; + pool->threads_used = ntids; + pool->threads_busy = 1; + pool->last_team = NULL; + gomp_simple_barrier_init (&pool->threads_dock, ntids); + + nvptx_thrs[0].thread_pool = pool; + asm ("bar.sync 0;"); + fn (fn_data); + + gomp_free_thread (nvptx_thrs); + } + else + { + asm ("bar.sync 0;"); + gomp_thread_start (nvptx_thrs[0].thread_pool); + } +} + +/* This function contains the idle loop in which a thread waits + to be called up to become part of a team. 
*/ + +static void +gomp_thread_start (struct gomp_thread_pool *pool) +{ + struct gomp_thread *thr = gomp_thread (); + + gomp_sem_init (&thr->release, 0); + thr->thread_pool = pool; + + do + { + gomp_simple_barrier_wait (&pool->threads_dock); + if (!thr->fn) + continue; + thr->fn (thr->data); + thr->fn = NULL; + + struct gomp_task *task = thr->task; + gomp_team_barrier_wait_final (&thr->ts.team->barrier); + gomp_finish_task (task); + } + /* Work around an NVIDIA driver bug: when generating sm_50 machine code, + it can trash stack pointer R1 in loops lacking exit edges. Add a cheap + artificial exit that the driver would not be able to optimize out. */ + while (nvptx_thrs); +} + +/* Launch a team. */ + +void +gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, + unsigned flags, struct gomp_team *team) +{ + struct gomp_thread *thr, *nthr; + struct gomp_task *task; + struct gomp_task_icv *icv; + struct gomp_thread_pool *pool; + unsigned long nthreads_var; + + thr = gomp_thread (); + pool = thr->thread_pool; + task = thr->task; + icv = task ? &task->icv : &gomp_global_icv; + + /* Always save the previous state, even if this isn't a nested team. + In particular, we should save any work share state from an outer + orphaned work share construct. */ + team->prev_ts = thr->ts; + + thr->ts.team = team; + thr->ts.team_id = 0; + ++thr->ts.level; + if (nthreads > 1) + ++thr->ts.active_level; + thr->ts.work_share = &team->work_shares[0]; + thr->ts.last_work_share = NULL; + thr->ts.single_count = 0; + thr->ts.static_trip = 0; + thr->task = &team->implicit_task[0]; + nthreads_var = icv->nthreads_var; + gomp_init_task (thr->task, task, icv); + team->implicit_task[0].icv.nthreads_var = nthreads_var; + + if (nthreads == 1) + return; + + /* Release existing idle threads. 
*/ + for (unsigned i = 1; i < nthreads; ++i) + { + nthr = pool->threads[i]; + nthr->ts.team = team; + nthr->ts.work_share = &team->work_shares[0]; + nthr->ts.last_work_share = NULL; + nthr->ts.team_id = i; + nthr->ts.level = team->prev_ts.level + 1; + nthr->ts.active_level = thr->ts.active_level; + nthr->ts.single_count = 0; + nthr->ts.static_trip = 0; + nthr->task = &team->implicit_task[i]; + gomp_init_task (nthr->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; + nthr->fn = fn; + nthr->data = data; + team->ordered_release[i] = &nthr->release; + } + + gomp_simple_barrier_wait (&pool->threads_dock); +} + +#include "../../team.c" +#endif diff --git a/libgomp/config/nvptx/time.c b/libgomp/config/nvptx/time.c index e69de29bb2d..88fb13050c0 100644 --- a/libgomp/config/nvptx/time.c +++ b/libgomp/config/nvptx/time.c @@ -0,0 +1,49 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Dmitry Melnik <dm@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +/* This file implements timer routines for NVPTX. It uses the %clock64 cycle + counter. */ + +#include "libgomp.h" + +/* This is set from host in plugin-nvptx.c. */ +double __nvptx_clocktick = 0; + +double +omp_get_wtime (void) +{ + uint64_t clock; + asm ("mov.u64 %0, %%clock64;" : "=r" (clock)); + return clock * __nvptx_clocktick; +} + +double +omp_get_wtick (void) +{ + return __nvptx_clocktick; +} + +ialias (omp_get_wtime) +ialias (omp_get_wtick) diff --git a/libgomp/config/nvptx/work.c b/libgomp/config/nvptx/work.c deleted file mode 100644 index e69de29bb2d..00000000000 --- a/libgomp/config/nvptx/work.c +++ /dev/null diff --git a/libgomp/config/posix/simple-bar.h b/libgomp/config/posix/simple-bar.h new file mode 100644 index 00000000000..b77491c19f0 --- /dev/null +++ b/libgomp/config/posix/simple-bar.h @@ -0,0 +1,69 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Alexander Monakov <amonakov@ispras.ru> + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +/* This is a simplified barrier that is suitable for thread pool + synchronizaton. Only a subset of full barrier API (bar.h) is exposed. */ + +#ifndef GOMP_SIMPLE_BARRIER_H +#define GOMP_SIMPLE_BARRIER_H 1 + +#include "bar.h" + +typedef struct +{ + gomp_barrier_t bar; +} gomp_simple_barrier_t; + +static inline void +gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count) +{ + gomp_barrier_init (&bar->bar, count); +} + +static inline void +gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count) +{ + gomp_barrier_reinit (&bar->bar, count); +} + +static inline void +gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar) +{ + gomp_barrier_destroy (&bar->bar); +} + +static inline void +gomp_simple_barrier_wait (gomp_simple_barrier_t *bar) +{ + gomp_barrier_wait (&bar->bar); +} + +static inline void +gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar) +{ + gomp_barrier_wait_last (&bar->bar); +} + +#endif /* GOMP_SIMPLE_BARRIER_H */ diff --git a/libgomp/configure b/libgomp/configure index 8d03eb6cddc..84d5876cce2 100755 --- a/libgomp/configure +++ b/libgomp/configure @@ -15067,6 +15067,7 @@ case "$host" in ;; nvptx*-*-*) # NVPTX does not support Pthreads, has its own code replacement. + libgomp_use_pthreads=no ;; *) # Check to see if -pthread or -lpthread is needed. Prefer the former. @@ -15112,6 +15113,12 @@ rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext esac +if test x$libgomp_use_pthreads != xno; then + +$as_echo "#define LIBGOMP_USE_PTHREADS 1" >>confdefs.h + +fi + # Plugins for offload execution, configure.ac fragment. -*- mode: autoconf -*- # # Copyright (C) 2014-2016 Free Software Foundation, Inc. diff --git a/libgomp/configure.ac b/libgomp/configure.ac index 2e41ca8aee5..5f1db7e1e0e 100644 --- a/libgomp/configure.ac +++ b/libgomp/configure.ac @@ -181,6 +181,7 @@ case "$host" in ;; nvptx*-*-*) # NVPTX does not support Pthreads, has its own code replacement. 
+ libgomp_use_pthreads=no ;; *) # Check to see if -pthread or -lpthread is needed. Prefer the former. @@ -202,6 +203,11 @@ case "$host" in [AC_MSG_ERROR([Pthreads are required to build libgomp])])]) esac +if test x$libgomp_use_pthreads != xno; then + AC_DEFINE(LIBGOMP_USE_PTHREADS, 1, + [Define to 1 if libgomp should use POSIX threads.]) +fi + m4_include([plugin/configfrag.ac]) # Check for functions needed. diff --git a/libgomp/critical.c b/libgomp/critical.c index 2b1f7f25013..cd520dbe5b3 100644 --- a/libgomp/critical.c +++ b/libgomp/critical.c @@ -115,33 +115,11 @@ GOMP_critical_name_end (void **pptr) gomp_mutex_unlock (plock); } -/* This mutex is used when atomic operations don't exist for the target - in the mode requested. The result is not globally atomic, but works so - long as all parallel references are within #pragma omp atomic directives. - According to responses received from omp@openmp.org, appears to be within - spec. Which makes sense, since that's how several other compilers - handle this situation as well. */ - -static gomp_mutex_t atomic_lock; - -void -GOMP_atomic_start (void) -{ - gomp_mutex_lock (&atomic_lock); -} - -void -GOMP_atomic_end (void) -{ - gomp_mutex_unlock (&atomic_lock); -} - #if !GOMP_MUTEX_INIT_0 static void __attribute__((constructor)) initialize_critical (void) { gomp_mutex_init (&default_lock); - gomp_mutex_init (&atomic_lock); #ifndef HAVE_SYNC_BUILTINS gomp_mutex_init (&create_lock_lock); #endif diff --git a/libgomp/env.c b/libgomp/env.c index ac05c3b7b75..7ba7663da9a 100644 --- a/libgomp/env.c +++ b/libgomp/env.c @@ -23,8 +23,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* This file defines the OpenMP internal control variables, and arranges - for them to be initialized from environment variables at startup. */ +/* This file arranges for OpenMP internal control variables to be initialized + from environment variables at startup. 
*/ #include "libgomp.h" #include "libgomp_f.h" @@ -55,35 +55,6 @@ # define strtoull(ptr, eptr, base) strtoul (ptr, eptr, base) #endif -struct gomp_task_icv gomp_global_icv = { - .nthreads_var = 1, - .thread_limit_var = UINT_MAX, - .run_sched_var = GFS_DYNAMIC, - .run_sched_chunk_size = 1, - .default_device_var = 0, - .dyn_var = false, - .nest_var = false, - .bind_var = omp_proc_bind_false, - .target_data = NULL -}; - -unsigned long gomp_max_active_levels_var = INT_MAX; -bool gomp_cancel_var = false; -int gomp_max_task_priority_var = 0; -#ifndef HAVE_SYNC_BUILTINS -gomp_mutex_t gomp_managed_threads_lock; -#endif -unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1; -unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; -unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; -char *gomp_bind_var_list; -unsigned long gomp_bind_var_list_len; -void **gomp_places_list; -unsigned long gomp_places_list_len; -int gomp_debug_var; -char *goacc_device_type; -int goacc_device_num; - /* Parse the OMP_SCHEDULE environment variable. */ static void @@ -1302,240 +1273,3 @@ initialize_env (void) goacc_runtime_initialize (); } - - -/* The public OpenMP API routines that access these variables. */ - -void -omp_set_num_threads (int n) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->nthreads_var = (n > 0 ? 
n : 1); -} - -void -omp_set_dynamic (int val) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->dyn_var = val; -} - -int -omp_get_dynamic (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->dyn_var; -} - -void -omp_set_nested (int val) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->nest_var = val; -} - -int -omp_get_nested (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->nest_var; -} - -void -omp_set_schedule (omp_sched_t kind, int chunk_size) -{ - struct gomp_task_icv *icv = gomp_icv (true); - switch (kind) - { - case omp_sched_static: - if (chunk_size < 1) - chunk_size = 0; - icv->run_sched_chunk_size = chunk_size; - break; - case omp_sched_dynamic: - case omp_sched_guided: - if (chunk_size < 1) - chunk_size = 1; - icv->run_sched_chunk_size = chunk_size; - break; - case omp_sched_auto: - break; - default: - return; - } - icv->run_sched_var = kind; -} - -void -omp_get_schedule (omp_sched_t *kind, int *chunk_size) -{ - struct gomp_task_icv *icv = gomp_icv (false); - *kind = icv->run_sched_var; - *chunk_size = icv->run_sched_chunk_size; -} - -int -omp_get_max_threads (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->nthreads_var; -} - -int -omp_get_thread_limit (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->thread_limit_var > INT_MAX ? 
INT_MAX : icv->thread_limit_var; -} - -void -omp_set_max_active_levels (int max_levels) -{ - if (max_levels >= 0) - gomp_max_active_levels_var = max_levels; -} - -int -omp_get_max_active_levels (void) -{ - return gomp_max_active_levels_var; -} - -int -omp_get_cancellation (void) -{ - return gomp_cancel_var; -} - -int -omp_get_max_task_priority (void) -{ - return gomp_max_task_priority_var; -} - -omp_proc_bind_t -omp_get_proc_bind (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->bind_var; -} - -void -omp_set_default_device (int device_num) -{ - struct gomp_task_icv *icv = gomp_icv (true); - icv->default_device_var = device_num >= 0 ? device_num : 0; -} - -int -omp_get_default_device (void) -{ - struct gomp_task_icv *icv = gomp_icv (false); - return icv->default_device_var; -} - -int -omp_get_num_devices (void) -{ - return gomp_get_num_devices (); -} - -int -omp_get_num_teams (void) -{ - /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ - return 1; -} - -int -omp_get_team_num (void) -{ - /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ - return 0; -} - -int -omp_is_initial_device (void) -{ - /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. 
*/ - return 1; -} - -int -omp_get_initial_device (void) -{ - return GOMP_DEVICE_HOST_FALLBACK; -} - -int -omp_get_num_places (void) -{ - return gomp_places_list_len; -} - -int -omp_get_place_num (void) -{ - if (gomp_places_list == NULL) - return -1; - - struct gomp_thread *thr = gomp_thread (); - if (thr->place == 0) - gomp_init_affinity (); - - return (int) thr->place - 1; -} - -int -omp_get_partition_num_places (void) -{ - if (gomp_places_list == NULL) - return 0; - - struct gomp_thread *thr = gomp_thread (); - if (thr->place == 0) - gomp_init_affinity (); - - return thr->ts.place_partition_len; -} - -void -omp_get_partition_place_nums (int *place_nums) -{ - if (gomp_places_list == NULL) - return; - - struct gomp_thread *thr = gomp_thread (); - if (thr->place == 0) - gomp_init_affinity (); - - unsigned int i; - for (i = 0; i < thr->ts.place_partition_len; i++) - *place_nums++ = thr->ts.place_partition_off + i; -} - -ialias (omp_set_dynamic) -ialias (omp_set_nested) -ialias (omp_set_num_threads) -ialias (omp_get_dynamic) -ialias (omp_get_nested) -ialias (omp_set_schedule) -ialias (omp_get_schedule) -ialias (omp_get_max_threads) -ialias (omp_get_thread_limit) -ialias (omp_set_max_active_levels) -ialias (omp_get_max_active_levels) -ialias (omp_get_cancellation) -ialias (omp_get_proc_bind) -ialias (omp_set_default_device) -ialias (omp_get_default_device) -ialias (omp_get_num_devices) -ialias (omp_get_num_teams) -ialias (omp_get_team_num) -ialias (omp_is_initial_device) -ialias (omp_get_initial_device) -ialias (omp_get_max_task_priority) -ialias (omp_get_num_places) -ialias (omp_get_place_num) -ialias (omp_get_partition_num_places) -ialias (omp_get_partition_place_nums) diff --git a/libgomp/icv-device.c b/libgomp/icv-device.c new file mode 100644 index 00000000000..9bbc0b9b767 --- /dev/null +++ b/libgomp/icv-device.c @@ -0,0 +1,77 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. 
+ + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file defines OpenMP API entry points that accelerator targets are + expected to replace. */ + +#include "libgomp.h" + +void +omp_set_default_device (int device_num) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->default_device_var = device_num >= 0 ? device_num : 0; +} + +int +omp_get_default_device (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->default_device_var; +} + +int +omp_get_num_devices (void) +{ + return gomp_get_num_devices (); +} + +int +omp_get_num_teams (void) +{ + /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ + return 1; +} + +int +omp_get_team_num (void) +{ + /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ + return 0; +} + +int +omp_is_initial_device (void) +{ + /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. 
*/ + return 1; +} + +ialias (omp_set_default_device) +ialias (omp_get_default_device) +ialias (omp_get_num_devices) +ialias (omp_get_num_teams) +ialias (omp_get_team_num) +ialias (omp_is_initial_device) diff --git a/libgomp/icv.c b/libgomp/icv.c new file mode 100644 index 00000000000..e58b961558e --- /dev/null +++ b/libgomp/icv.c @@ -0,0 +1,248 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This file defines the OpenMP internal control variables and associated + OpenMP API entry points. 
*/ + +#include "libgomp.h" +#include "gomp-constants.h" +#include <limits.h> + +struct gomp_task_icv gomp_global_icv = { + .nthreads_var = 1, + .thread_limit_var = UINT_MAX, + .run_sched_var = GFS_DYNAMIC, + .run_sched_chunk_size = 1, + .default_device_var = 0, + .dyn_var = false, + .nest_var = false, + .bind_var = omp_proc_bind_false, + .target_data = NULL +}; + +unsigned long gomp_max_active_levels_var = INT_MAX; +bool gomp_cancel_var = false; +int gomp_max_task_priority_var = 0; +#ifndef HAVE_SYNC_BUILTINS +gomp_mutex_t gomp_managed_threads_lock; +#endif +unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1; +unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; +unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; +char *gomp_bind_var_list; +unsigned long gomp_bind_var_list_len; +void **gomp_places_list; +unsigned long gomp_places_list_len; +int gomp_debug_var; +unsigned int gomp_num_teams_var; +char *goacc_device_type; +int goacc_device_num; + +void +omp_set_num_threads (int n) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->nthreads_var = (n > 0 ? 
n : 1); +} + +void +omp_set_dynamic (int val) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->dyn_var = val; +} + +int +omp_get_dynamic (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->dyn_var; +} + +void +omp_set_nested (int val) +{ + struct gomp_task_icv *icv = gomp_icv (true); + icv->nest_var = val; +} + +int +omp_get_nested (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->nest_var; +} + +void +omp_set_schedule (omp_sched_t kind, int chunk_size) +{ + struct gomp_task_icv *icv = gomp_icv (true); + switch (kind) + { + case omp_sched_static: + if (chunk_size < 1) + chunk_size = 0; + icv->run_sched_chunk_size = chunk_size; + break; + case omp_sched_dynamic: + case omp_sched_guided: + if (chunk_size < 1) + chunk_size = 1; + icv->run_sched_chunk_size = chunk_size; + break; + case omp_sched_auto: + break; + default: + return; + } + icv->run_sched_var = kind; +} + +void +omp_get_schedule (omp_sched_t *kind, int *chunk_size) +{ + struct gomp_task_icv *icv = gomp_icv (false); + *kind = icv->run_sched_var; + *chunk_size = icv->run_sched_chunk_size; +} + +int +omp_get_max_threads (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->nthreads_var; +} + +int +omp_get_thread_limit (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->thread_limit_var > INT_MAX ? 
INT_MAX : icv->thread_limit_var; +} + +void +omp_set_max_active_levels (int max_levels) +{ + if (max_levels >= 0) + gomp_max_active_levels_var = max_levels; +} + +int +omp_get_max_active_levels (void) +{ + return gomp_max_active_levels_var; +} + +int +omp_get_cancellation (void) +{ + return gomp_cancel_var; +} + +int +omp_get_max_task_priority (void) +{ + return gomp_max_task_priority_var; +} + +omp_proc_bind_t +omp_get_proc_bind (void) +{ + struct gomp_task_icv *icv = gomp_icv (false); + return icv->bind_var; +} + +int +omp_get_initial_device (void) +{ + return GOMP_DEVICE_HOST_FALLBACK; +} + +int +omp_get_num_places (void) +{ + return gomp_places_list_len; +} + +int +omp_get_place_num (void) +{ + if (gomp_places_list == NULL) + return -1; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + return (int) thr->place - 1; +} + +int +omp_get_partition_num_places (void) +{ + if (gomp_places_list == NULL) + return 0; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + return thr->ts.place_partition_len; +} + +void +omp_get_partition_place_nums (int *place_nums) +{ + if (gomp_places_list == NULL) + return; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + unsigned int i; + for (i = 0; i < thr->ts.place_partition_len; i++) + *place_nums++ = thr->ts.place_partition_off + i; +} + +ialias (omp_set_dynamic) +ialias (omp_set_nested) +ialias (omp_set_num_threads) +ialias (omp_get_dynamic) +ialias (omp_get_nested) +ialias (omp_set_schedule) +ialias (omp_get_schedule) +ialias (omp_get_max_threads) +ialias (omp_get_thread_limit) +ialias (omp_set_max_active_levels) +ialias (omp_get_max_active_levels) +ialias (omp_get_cancellation) +ialias (omp_get_proc_bind) +ialias (omp_get_initial_device) +ialias (omp_get_max_task_priority) +ialias (omp_get_num_places) +ialias (omp_get_place_num) +ialias (omp_get_partition_num_places) +ialias 
(omp_get_partition_place_nums) diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 7b2671ba49d..c86ac3d8778 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -45,7 +45,9 @@ #include "gstdint.h" #include "libgomp-plugin.h" +#ifdef HAVE_PTHREAD_H #include <pthread.h> +#endif #include <stdbool.h> #include <stdlib.h> #include <stdarg.h> @@ -122,6 +124,7 @@ struct htab; #include "sem.h" #include "mutex.h" #include "bar.h" +#include "simple-bar.h" #include "ptrlock.h" @@ -360,6 +363,7 @@ extern char *gomp_bind_var_list; extern unsigned long gomp_bind_var_list_len; extern void **gomp_places_list; extern unsigned long gomp_places_list_len; +extern unsigned int gomp_num_teams_var; extern int gomp_debug_var; extern int goacc_device_num; extern char *goacc_device_type; @@ -626,8 +630,8 @@ struct gomp_thread_pool /* Number of threads running in this contention group. */ unsigned long threads_busy; - /* This barrier holds and releases threads waiting in threads. */ - gomp_barrier_t threads_dock; + /* This barrier holds and releases threads waiting in thread pools. */ + gomp_simple_barrier_t threads_dock; }; enum gomp_cancel_kind @@ -642,7 +646,15 @@ enum gomp_cancel_kind /* ... and here is that TLS data. */ -#if defined HAVE_TLS || defined USE_EMUTLS +#if defined __nvptx__ +extern struct gomp_thread *nvptx_thrs __attribute__((shared)); +static inline struct gomp_thread *gomp_thread (void) +{ + int tid; + asm ("mov.u32 %0, %%tid.y;" : "=r" (tid)); + return nvptx_thrs + tid; +} +#elif defined HAVE_TLS || defined USE_EMUTLS extern __thread struct gomp_thread gomp_tls_data; static inline struct gomp_thread *gomp_thread (void) { @@ -671,17 +683,21 @@ static inline struct gomp_task_icv *gomp_icv (bool write) return &gomp_global_icv; } +#ifdef LIBGOMP_USE_PTHREADS /* The attributes to be used during thread creation. */ extern pthread_attr_t gomp_thread_attr; extern pthread_key_t gomp_thread_destructor; +#endif /* Function prototypes. 
*/ /* affinity.c */ extern void gomp_init_affinity (void); +#ifdef LIBGOMP_USE_PTHREADS extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int); +#endif extern void **gomp_affinity_alloc (unsigned long, bool); extern void gomp_affinity_init_place (void *); extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long, diff --git a/libgomp/lock.c b/libgomp/lock.c new file mode 100644 index 00000000000..783bd77dd90 --- /dev/null +++ b/libgomp/lock.c @@ -0,0 +1,123 @@ +/* Copyright (C) 2005-2016 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* This is a generic implementation of the public OpenMP locking primitives in + terms of internal gomp_mutex_t. It is not meant to be compiled on its own. + It is #include'd from config/{linux,nvptx}/lock.c. */ + +#include <string.h> +#include "libgomp.h" + +/* The internal gomp_mutex_t and the external non-recursive omp_lock_t + have the same form. Re-use it. 
*/ + +void +gomp_init_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_init (lock); +} + +void +gomp_destroy_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_destroy (lock); +} + +void +gomp_set_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_lock (lock); +} + +void +gomp_unset_lock_30 (omp_lock_t *lock) +{ + gomp_mutex_unlock (lock); +} + +int +gomp_test_lock_30 (omp_lock_t *lock) +{ + int oldval = 0; + + return __atomic_compare_exchange_n (lock, &oldval, 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_RELAXED); +} + +void +gomp_init_nest_lock_30 (omp_nest_lock_t *lock) +{ + memset (lock, '\0', sizeof (*lock)); +} + +void +gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock) +{ +} + +void +gomp_set_nest_lock_30 (omp_nest_lock_t *lock) +{ + void *me = gomp_icv (true); + + if (lock->owner != me) + { + gomp_mutex_lock (&lock->lock); + lock->owner = me; + } + + lock->count++; +} + +void +gomp_unset_nest_lock_30 (omp_nest_lock_t *lock) +{ + if (--lock->count == 0) + { + lock->owner = NULL; + gomp_mutex_unlock (&lock->lock); + } +} + +int +gomp_test_nest_lock_30 (omp_nest_lock_t *lock) +{ + void *me = gomp_icv (true); + int oldval; + + if (lock->owner == me) + return ++lock->count; + + oldval = 0; + if (__atomic_compare_exchange_n (&lock->lock, &oldval, 1, false, + MEMMODEL_ACQUIRE, MEMMODEL_RELAXED)) + { + lock->owner = me; + lock->count = 1; + return 1; + } + + return 0; +} diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 5ee350d4c1d..ca33c51db5a 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -41,6 +41,7 @@ #include <cuda.h> #include <stdbool.h> #include <stdint.h> +#include <limits.h> #include <string.h> #include <stdio.h> #include <unistd.h> @@ -274,6 +275,8 @@ struct targ_fn_descriptor { CUfunction fn; const struct targ_fn_launch *launch; + int regs_per_thread; + int max_threads_per_block; }; /* A loaded PTX image. 
*/ @@ -307,8 +310,12 @@ struct ptx_device bool overlap; bool map; bool concur; - int mode; bool mkern; + int mode; + int clock_khz; + int num_sms; + int regs_per_block; + int regs_per_sm; struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. */ @@ -658,6 +665,39 @@ nvptx_open_device (int n) &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); ptx_dev->mkern = pi; + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); + ptx_dev->clock_khz = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); + ptx_dev->num_sms = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); + ptx_dev->regs_per_block = pi; + + /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only + in CUDA 6.0 and newer. */ + r = cuDeviceGetAttribute (&pi, 82, dev); + /* Fallback: use limit of registers per block, which is usually equal. 
*/ + if (r == CUDA_ERROR_INVALID_VALUE) + pi = ptx_dev->regs_per_block; + else if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r)); + return NULL; + } + ptx_dev->regs_per_sm = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); + if (pi != 32) + { + GOMP_PLUGIN_error ("Only warp size 32 is supported"); + return NULL; + } + r = cuDeviceGetAttribute (&async_engines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); if (r != CUDA_SUCCESS) @@ -725,10 +765,8 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, CUjit_option opts[6]; void *optvals[6]; float elapsed = 0.0; -#define LOGSIZE 8192 - char elog[LOGSIZE]; - char ilog[LOGSIZE]; - unsigned long logsize = LOGSIZE; + char elog[1024]; + char ilog[16384]; CUlinkState linkstate; CUresult r; void *linkout; @@ -741,13 +779,13 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, optvals[1] = &ilog[0]; opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optvals[2] = (void *) logsize; + optvals[2] = (void *) sizeof ilog; opts[3] = CU_JIT_ERROR_LOG_BUFFER; optvals[3] = &elog[0]; opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optvals[4] = (void *) logsize; + optvals[4] = (void *) sizeof elog; opts[5] = CU_JIT_LOG_VERBOSE; optvals[5] = (void *) 1; @@ -1164,7 +1202,7 @@ nvptx_host2dev (void *d, const void *h, size_t s) } #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); @@ -1220,7 +1258,7 @@ nvptx_dev2host (void *h, const void *d, size_t s) } #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, 
CU_EVENT_DISABLE_TIMING); @@ -1518,7 +1556,7 @@ GOMP_OFFLOAD_get_name (void) unsigned int GOMP_OFFLOAD_get_caps (void) { - return GOMP_OFFLOAD_CAP_OPENACC_200; + return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400; } int @@ -1588,6 +1626,23 @@ GOMP_OFFLOAD_version (void) return GOMP_VERSION; } +/* Initialize __nvptx_clocktick, if present in MODULE. */ + +static void +nvptx_set_clocktick (CUmodule module, struct ptx_device *dev) +{ + CUdeviceptr dptr; + CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick"); + if (r == CUDA_ERROR_NOT_FOUND) + return; + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); + double __nvptx_clocktick = 1e-3 / dev->clock_khz; + r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick)); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); +} + /* Load the (partial) program described by TARGET_DATA to device number ORD. Allocate and return TARGET_TABLE. 
*/ @@ -1648,12 +1703,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++) { CUfunction function; + int nregs, mthrs; CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module, fn_descs[i].fn); + CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs, + CU_FUNC_ATTRIBUTE_NUM_REGS, function); + CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function); targ_fns->fn = function; targ_fns->launch = &fn_descs[i]; + targ_fns->regs_per_thread = nregs; + targ_fns->max_threads_per_block = mthrs; targ_tbl->start = (uintptr_t) targ_fns; targ_tbl->end = targ_tbl->start + 1; @@ -1671,6 +1733,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, targ_tbl->end = targ_tbl->start + bytes; } + nvptx_set_clocktick (module, dev); + return fn_entries + var_entries; } @@ -1736,6 +1800,15 @@ GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) && nvptx_host2dev (dst, src, n)); } +bool +GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) +{ + struct ptx_device *ptx_dev = ptx_devices[ord]; + CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, + ptx_dev->null_stream->stream); + return true; +} + void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; void @@ -1857,3 +1930,123 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream) { return nvptx_set_cuda_stream (async, stream); } + +/* Adjust launch dimensions: pick good values for number of blocks and warps + and ensure that number of warps does not exceed CUDA limits as well as GCC's + own limits. 
*/ + +static void +nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn, + struct ptx_device *ptx_dev, + int *teams_p, int *threads_p) +{ + int max_warps_block = fn->max_threads_per_block / 32; + /* Maximum 32 warps per block is an implementation limit in NVPTX backend + and libgcc, which matches documented limit of all GPUs as of 2015. */ + if (max_warps_block > 32) + max_warps_block = 32; + if (*threads_p <= 0) + *threads_p = 8; + if (*threads_p > max_warps_block) + *threads_p = max_warps_block; + + int regs_per_block = fn->regs_per_thread * 32 * *threads_p; + /* This is an estimate of how many blocks the device can host simultaneously. + Actual limit, which may be lower, can be queried with "occupancy control" + driver interface (since CUDA 6.0). */ + int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms; + if (*teams_p <= 0 || *teams_p > max_blocks) + *teams_p = max_blocks; +} + +/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP + target regions. */ + +static size_t +nvptx_stacks_size () +{ + return 128 * 1024; +} + +/* Return contiguous storage for NUM stacks, each SIZE bytes. */ + +static void * +nvptx_stacks_alloc (size_t size, int num) +{ + CUdeviceptr stacks; + CUresult r = cuMemAlloc (&stacks, size * num); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r)); + return (void *) stacks; +} + +/* Release storage previously allocated by nvptx_stacks_alloc. 
*/ + +static void +nvptx_stacks_free (void *p, int num) +{ + CUresult r = cuMemFree ((CUdeviceptr) p); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r)); +} + +void +GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) +{ + CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn; + CUresult r; + struct ptx_device *ptx_dev = ptx_devices[ord]; + const char *maybe_abort_msg = "(perhaps abort was called)"; + int teams = 0, threads = 0; + + if (!args) + GOMP_PLUGIN_fatal ("No target arguments provided"); + while (*args) + { + intptr_t id = (intptr_t) *args++, val; + if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM) + val = (intptr_t) *args++; + else + val = id >> GOMP_TARGET_ARG_VALUE_SHIFT; + if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL) + continue; + val = val > INT_MAX ? INT_MAX : val; + id &= GOMP_TARGET_ARG_ID_MASK; + if (id == GOMP_TARGET_ARG_NUM_TEAMS) + teams = val; + else if (id == GOMP_TARGET_ARG_THREAD_LIMIT) + threads = val; + } + nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads); + + size_t stack_size = nvptx_stacks_size (); + void *stacks = nvptx_stacks_alloc (stack_size, teams * threads); + void *fn_args[] = {tgt_vars, stacks, (void *) stack_size}; + size_t fn_args_size = sizeof fn_args; + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args, + CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size, + CU_LAUNCH_PARAM_END + }; + r = cuLaunchKernel (function, + teams, 1, 1, + 32, threads, 1, + 0, ptx_dev->null_stream->stream, NULL, config); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); + + r = cuCtxSynchronize (); + if (r == CUDA_ERROR_LAUNCH_FAILED) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), + maybe_abort_msg); + else if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); + nvptx_stacks_free (stacks, teams * threads); +} + +void +GOMP_OFFLOAD_async_run (int ord, 
void *tgt_fn, void *tgt_vars, void **args, + void *async_data) +{ + GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented"); +} diff --git a/libgomp/team.c b/libgomp/team.c index de5b915f78b..e3013451662 100644 --- a/libgomp/team.c +++ b/libgomp/team.c @@ -31,6 +31,7 @@ #include <stdlib.h> #include <string.h> +#ifdef LIBGOMP_USE_PTHREADS /* This attribute contains PTHREAD_CREATE_DETACHED. */ pthread_attr_t gomp_thread_attr; @@ -110,7 +111,7 @@ gomp_thread_start (void *xdata) { pool->threads[thr->ts.team_id] = thr; - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); do { struct gomp_team *team = thr->ts.team; @@ -120,7 +121,7 @@ gomp_thread_start (void *xdata) gomp_team_barrier_wait_final (&team->barrier); gomp_finish_task (task); - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); local_fn = thr->fn; local_data = thr->data; @@ -134,6 +135,7 @@ gomp_thread_start (void *xdata) thr->task = NULL; return NULL; } +#endif static inline struct gomp_team * get_last_team (unsigned nthreads) @@ -224,11 +226,17 @@ gomp_free_pool_helper (void *thread_pool) struct gomp_thread *thr = gomp_thread (); struct gomp_thread_pool *pool = (struct gomp_thread_pool *) thread_pool; - gomp_barrier_wait_last (&pool->threads_dock); + gomp_simple_barrier_wait_last (&pool->threads_dock); gomp_sem_destroy (&thr->release); thr->thread_pool = NULL; thr->task = NULL; +#ifdef LIBGOMP_USE_PTHREADS pthread_exit (NULL); +#elif defined(__nvptx__) + asm ("exit;"); +#else +#error gomp_free_pool_helper must terminate the thread +#endif } /* Free a thread pool and release its threads. */ @@ -250,12 +258,12 @@ gomp_free_thread (void *arg __attribute__((unused))) nthr->data = pool; } /* This barrier undocks threads docked on pool->threads_dock. 
*/ - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); /* And this waits till all threads have called gomp_barrier_wait_last in gomp_free_pool_helper. */ - gomp_barrier_wait (&pool->threads_dock); + gomp_simple_barrier_wait (&pool->threads_dock); /* Now it is safe to destroy the barrier and free the pool. */ - gomp_barrier_destroy (&pool->threads_dock); + gomp_simple_barrier_destroy (&pool->threads_dock); #ifdef HAVE_SYNC_BUILTINS __sync_fetch_and_add (&gomp_managed_threads, @@ -266,10 +274,12 @@ gomp_free_thread (void *arg __attribute__((unused))) gomp_mutex_unlock (&gomp_managed_threads_lock); #endif } - free (pool->threads); if (pool->last_team) free_team (pool->last_team); +#ifndef __nvptx__ + free (pool->threads); free (pool); +#endif thr->thread_pool = NULL; } if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0)) @@ -284,6 +294,7 @@ gomp_free_thread (void *arg __attribute__((unused))) /* Launch a team. */ +#ifdef LIBGOMP_USE_PTHREADS void gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, unsigned flags, struct gomp_team *team) @@ -429,7 +440,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, else if (old_threads_used == 0) { n = 0; - gomp_barrier_init (&pool->threads_dock, nthreads); + gomp_simple_barrier_init (&pool->threads_dock, nthreads); } else { @@ -437,7 +448,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, /* Increase the barrier threshold to make sure all new threads arrive before the team is released. */ - gomp_barrier_reinit (&pool->threads_dock, nthreads); + gomp_simple_barrier_reinit (&pool->threads_dock, nthreads); } /* Not true yet, but soon will be. We're going to release all @@ -670,8 +681,8 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, threads and all the threads we're going to let die arrive before the team is released. 
*/ if (affinity_count) - gomp_barrier_reinit (&pool->threads_dock, - nthreads + affinity_count); + gomp_simple_barrier_reinit (&pool->threads_dock, + nthreads + affinity_count); } } @@ -812,7 +823,10 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, pthread_attr_destroy (&thread_attr); do_release: - gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock); + if (nested) + gomp_barrier_wait (&team->barrier); + else + gomp_simple_barrier_wait (&pool->threads_dock); /* Decrease the barrier threshold to match the number of threads that should arrive back at the end of this team. The extra @@ -830,7 +844,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, if (affinity_count) diff = -affinity_count; - gomp_barrier_reinit (&pool->threads_dock, nthreads); + gomp_simple_barrier_reinit (&pool->threads_dock, nthreads); #ifdef HAVE_SYNC_BUILTINS __sync_fetch_and_add (&gomp_managed_threads, diff); @@ -844,6 +858,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, && team->prev_ts.place_partition_len > 64) free (affinity_thr); } +#endif /* Terminate the current team. This is only to be called by the master @@ -919,6 +934,7 @@ gomp_team_end (void) } } +#ifdef LIBGOMP_USE_PTHREADS /* Constructors for this file. */ @@ -943,6 +959,7 @@ team_destructor (void) crashes. 
*/ pthread_key_delete (gomp_thread_destructor); } +#endif struct gomp_task_icv * gomp_new_icv (void) @@ -951,6 +968,8 @@ gomp_new_icv (void) struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task)); gomp_init_task (task, NULL, &gomp_global_icv); thr->task = task; +#ifdef LIBGOMP_USE_PTHREADS pthread_setspecific (gomp_thread_destructor, thr); +#endif return &task->icv; } diff --git a/libgomp/testsuite/libgomp.fortran/fortran.exp b/libgomp/testsuite/libgomp.fortran/fortran.exp index 9e6b6437751..d848ed4d47f 100644 --- a/libgomp/testsuite/libgomp.fortran/fortran.exp +++ b/libgomp/testsuite/libgomp.fortran/fortran.exp @@ -7,7 +7,7 @@ global ALWAYS_CFLAGS set shlib_ext [get_shlib_extension] set lang_library_path "../libgfortran/.libs" -set lang_link_flags "-lgfortran" +set lang_link_flags "-lgfortran -foffload=-lgfortran" if [info exists lang_include_flags] then { unset lang_include_flags } diff --git a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp index 2d6b647af22..663c9323b72 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp +++ b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp @@ -9,7 +9,7 @@ global ALWAYS_CFLAGS set shlib_ext [get_shlib_extension] set lang_library_path "../libgfortran/.libs" -set lang_link_flags "-lgfortran" +set lang_link_flags "-lgfortran -foffload=-lgfortran" if [info exists lang_include_flags] then { unset lang_include_flags } |