include/

* gomp-constants.h (GOMP_VERSION_NVIDIA_PTX): Increment. (GOMP_DIM_GANG, GOMP_DIM_WORKER, GOMP_DIM_VECTOR, GOMP_DIM_MAX, GOMP_DIM_MASK): New. (GOMP_LAUNCH_DIM, GOMP_LAUNCH_ASYNC, GOMP_LAUNCH_WAIT): New. (GOMP_LAUNCH_CODE_SHIFT, GOMP_LAUNCH_DEVICE_SHIFT, GOMP_LAUNCH_OP_SHIFT): New. (GOMP_LAUNCH_PACK, GOMP_LAUNCH_CODE, GOMP_LAUNCH_DEVICE, GOMP_LAUNCH_OP): New. (GOMP_LAUNCH_OP_MAX): New. libgomp/ * libgomp.h (acc_dispatch_t): Replace separate geometry args with array. * libgomp.map (GOACC_parallel_keyed): New. * oacc-parallel.c (goacc_wait): Take pointer to va_list. Adjust all callers. (GOACC_parallel_keyed): New interface. Lose geometry arguments and take keyed varargs list. Adjust call to exec_func. (GOACC_parallel): Force host fallback. * libgomp_g.h (GOACC_parallel): Remove. (GOACC_parallel_keyed): Declare. * plugin/plugin-nvptx.c (struct targ_fn_launch): New struct. (struct targ_fn_descriptor): Replace name field with launch field. (nvptx_exec): Lose separate geometry args, take array. Process dynamic dimensions and adjust. (struct nvptx_tdata): Replace fn_names field with fn_descs. (GOMP_OFFLOAD_load_image): Adjust for change in function table data. (GOMP_OFFLOAD_openacc_parallel): Adjust for change in dimension passing. * oacc-host.c (host_openacc_exec): Adjust for change in dimension passing. gcc/ * config/nvptx/nvptx.c: Include omp-low.h and gomp-constants.h. (nvptx_record_offload_symbol): Record function execution geometry. * config/nvptx/mkoffload.c (process): Include launch geometry in function data. * omp-low.c (oacc_launch_pack): New. (replace_oacc_fn_attrib): New. (set_oacc_fn_attrib): New. (get_oacc_fn_attrib): New. (expand_omp_target): Create keyed varargs for GOACC_parallel call generation. * omp-low.h (get_oacc_fn_attrib): Declare. * builtin-types.def (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. * tree.h (OMP_CLAUSE_EXPR): New. * omp-builtins.def (BUILT_IN_GOACC_PARALLEL): Change target fn name.
gcc/lto/ * lto-lang.c (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. gcc/c-family/ * c-common.c (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. gcc/fortran/ * f95-lang.c (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. * types.def (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. gcc/ada/ * gcc-interface/utils.c (DEF_FUNCTION_TYPE_VAR_6): Define. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@228220 138bc75d-0d04-0410-961f-82ee72b054a4
author: nathan <nathan@138bc75d-0d04-0410-961f-82ee72b054a4> 2015-09-28 19:37:33 +0000
committer: nathan <nathan@138bc75d-0d04-0410-961f-82ee72b054a4> 2015-09-28 19:37:33 +0000
commit: e561d5e184c4c65db03875b5ecde57b5dbd23775 (patch)
tree: 82bea03a2a53289a91a90b899f03a9be503a60df /libgomp/plugin/plugin-nvptx.c
parent: 28b6dd02a1ff3cad8c40f27a1590030a5680699f (diff)
download: gcc-e561d5e184c4c65db03875b5ecde57b5dbd23775.tar.gz
1 files changed, 46 insertions, 39 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a2f950db580..0c4e1afb50c 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -282,12 +282,20 @@ map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
   return;
 }
 
+/* Target data function launch information.  */
+
+struct targ_fn_launch
+{
+  const char *fn;
+  unsigned short dim[3];
+};
+
 /* Descriptor of a loaded function.  */
 
 struct targ_fn_descriptor
 {
   CUfunction fn;
-  const char *name;
+  const struct targ_fn_launch *launch;
 };
 
 /* A loaded PTX image.  */
@@ -929,8 +937,8 @@ event_add (enum ptx_event_type type, CUevent *e, void *h)
 
 void
 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-	  size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
-	  int vector_length, int async, void *targ_mem_desc)
+	    size_t *sizes, unsigned short *kinds, int async, unsigned *dims,
+	    void *targ_mem_desc)
 {
   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
   CUfunction function;
@@ -939,7 +947,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   struct ptx_stream *dev_str;
   void *kargs[1];
   void *hp, *dp;
-  unsigned int nthreads_in_block;
   struct nvptx_thread *nvthd = nvptx_thread ();
   const char *maybe_abort_msg = "(perhaps abort was called)";
 
@@ -948,6 +955,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
   assert (dev_str == nvthd->current_stream);
 
+  /* Initialize the launch dimensions.  Typically this is constant,
+     provided by the device compiler, but we must permit runtime
+     values.  */
+  for (i = 0; i != 3; i++)
+    if (targ_fn->launch->dim[i])
+      dims[i] = targ_fn->launch->dim[i];
+
+  if (dims[GOMP_DIM_GANG] != 1)
+    GOMP_PLUGIN_fatal ("non-unity num_gangs (%d) not supported",
+		       dims[GOMP_DIM_GANG]);
+  if (dims[GOMP_DIM_WORKER] != 1)
+    GOMP_PLUGIN_fatal ("non-unity num_workers (%d) not supported",
+		       dims[GOMP_DIM_WORKER]);
+
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
      the corresponding device pointer.  */
@@ -965,35 +986,21 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
 
-  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
+		     " gangs=%u, workers=%u, vectors=%u\n",
+		     __FUNCTION__, targ_fn->launch->fn,
+		     dims[0], dims[1], dims[2]);
 
   // OpenACC		CUDA
   //
-  // num_gangs		blocks
-  // num_workers	warps (where a warp is equivalent to 32 threads)
-  // vector length	threads
-  //
-
-  /* The openacc vector_length clause 'determines the vector length to use for
-     vector or SIMD operations'.  The question is how to map this to CUDA.
-
-     In CUDA, the warp size is the vector length of a CUDA device.  However, the
-     CUDA interface abstracts away from that, and only shows us warp size
-     indirectly in maximum number of threads per block, which is a product of
-     warp size and the number of hyperthreads of a multiprocessor.
-
-     We choose to map openacc vector_length directly onto the number of threads
-     in a block, in the x dimension.  This is reflected in gcc code generation
-     that uses ThreadIdx.x to access vector elements.
-
-     Attempting to use an openacc vector_length of more than the maximum number
-     of threads per block will result in a cuda error.  */
-  nthreads_in_block = vector_length;
+  // num_gangs		nctaid.x
+  // num_workers	ntid.y
+  // vector length	ntid.x
 
   kargs[0] = &dp;
   r = cuLaunchKernel (function,
-		      num_gangs, 1, 1,
-		      nthreads_in_block, 1, 1,
+		      dims[GOMP_DIM_GANG], 1, 1,
+		      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
 		      0, dev_str->stream, kargs, 0);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
@@ -1039,7 +1046,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 #endif
 
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
-		     targ_fn->name);
+		     targ_fn->launch->fn);
 
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
@@ -1567,7 +1574,7 @@ typedef struct nvptx_tdata
   const char *const *var_names;
   size_t var_num;
 
-  const char *const *fn_names;
+  const struct targ_fn_launch *fn_descs;
   size_t fn_num;
 } nvptx_tdata_t;
 
@@ -1588,7 +1595,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
 			 struct addr_pair **target_table)
 {
   CUmodule module;
-  const char *const *fn_names, *const *var_names;
+  const char *const *var_names;
+  const struct targ_fn_launch *fn_descs;
   unsigned int fn_entries, var_entries, i, j;
   CUresult r;
   struct targ_fn_descriptor *targ_fns;
@@ -1617,7 +1625,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   var_entries = img_header->var_num;
   var_names = img_header->var_names;
   fn_entries = img_header->fn_num;
-  fn_names = img_header->fn_names;
+  fn_descs = img_header->fn_descs;
 
   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
 				 * (fn_entries + var_entries));
@@ -1640,12 +1648,12 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
     {
       CUfunction function;
 
-      r = cuModuleGetFunction (&function, module, fn_names[i]);
+      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
       if (r != CUDA_SUCCESS)
 	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
 
       targ_fns->fn = function;
-      targ_fns->name = (const char *) fn_names[i];
+      targ_fns->launch = &fn_descs[i];
 
       targ_tbl->start = (uintptr_t) targ_fns;
       targ_tbl->end = targ_tbl->start + 1;
@@ -1724,13 +1732,12 @@ void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
 
 void
 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
-			       void **hostaddrs, void **devaddrs, size_t *sizes,
-			       unsigned short *kinds, int num_gangs,
-			       int num_workers, int vector_length, int async,
-			       void *targ_mem_desc)
+			       void **hostaddrs, void **devaddrs,
+			       size_t *sizes, unsigned short *kinds,
+			       int async, unsigned *dims, void *targ_mem_desc)
 {
-  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
-	    num_workers, vector_length, async, targ_mem_desc);
+  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
+	      async, dims, targ_mem_desc);
 }
 
 void
author	nathan <nathan@138bc75d-0d04-0410-961f-82ee72b054a4>	2015-09-28 19:37:33 +0000
committer	nathan <nathan@138bc75d-0d04-0410-961f-82ee72b054a4>	2015-09-28 19:37:33 +0000
commit	e561d5e184c4c65db03875b5ecde57b5dbd23775 (patch)
tree	82bea03a2a53289a91a90b899f03a9be503a60df /libgomp/plugin/plugin-nvptx.c
parent	28b6dd02a1ff3cad8c40f27a1590030a5680699f (diff)
download	gcc-e561d5e184c4c65db03875b5ecde57b5dbd23775.tar.gz