diff options
author | nathan <nathan@138bc75d-0d04-0410-961f-82ee72b054a4> | 2015-09-28 19:37:33 +0000 |
---|---|---|
committer | nathan <nathan@138bc75d-0d04-0410-961f-82ee72b054a4> | 2015-09-28 19:37:33 +0000 |
commit | e561d5e184c4c65db03875b5ecde57b5dbd23775 (patch) | |
tree | 82bea03a2a53289a91a90b899f03a9be503a60df /libgomp/plugin/plugin-nvptx.c | |
parent | 28b6dd02a1ff3cad8c40f27a1590030a5680699f (diff) | |
download | gcc-e561d5e184c4c65db03875b5ecde57b5dbd23775.tar.gz |
include/
* gomp-constants.h (GOMP_VERSION_NVIDIA_PTX): Increment.
(GOMP_DIM_GANG, GOMP_DIM_WORKER, GOMP_DIM_VECTOR, GOMP_DIM_MAX,
GOMP_DIM_MASK): New.
(GOMP_LAUNCH_DIM, GOMP_LAUNCH_ASYNC, GOMP_LAUNCH_WAIT): New.
(GOMP_LAUNCH_CODE_SHIFT, GOMP_LAUNCH_DEVICE_SHIFT,
GOMP_LAUNCH_OP_SHIFT): New.
(GOMP_LAUNCH_PACK, GOMP_LAUNCH_CODE, GOMP_LAUNCH_DEVICE,
GOMP_LAUNCH_OP): New.
(GOMP_LAUNCH_OP_MAX): New.
libgomp/
* libgomp.h (acc_dispatch_t): Replace separate geometry args with
array.
* libgomp.map (GOACC_parallel_keyed): New.
* oacc-parallel.c (goacc_wait): Take pointer to va_list. Adjust
all callers.
(GOACC_parallel_keyed): New interface. Lose geometry arguments
and take keyed varargs list. Adjust call to exec_func.
(GOACC_parallel): Force host fallback.
* libgomp_g.h (GOACC_parallel): Remove.
(GOACC_parallel_keyed): Declare.
* plugin/plugin-nvptx.c (struct targ_fn_launch): New struct.
(struct targ_fn_descriptor): Replace name field with launch field.
(nvptx_exec): Lose separate geometry args, take array. Process
dynamic dimensions and adjust.
(struct nvptx_tdata): Replace fn_names field with fn_descs.
(GOMP_OFFLOAD_load_image): Adjust for change in function table
data.
(GOMP_OFFLOAD_openacc_parallel): Adjust for change in dimension
passing.
* oacc-host.c (host_openacc_exec): Adjust for change in dimension
passing.
gcc/
* config/nvptx/nvptx.c: Include omp-low.h and gomp-constants.h.
(nvptx_record_offload_symbol): Record function execution geometry.
* config/nvptx/mkoffload.c (process): Include launch geometry in
function data.
* omp-low.c (oacc_launch_pack): New.
(replace_oacc_fn_attrib): New.
(set_oacc_fn_attrib): New.
(get_oacc_fn_attrib): New.
(expand_omp_target): Create keyed varargs for GOACC_parallel call
generation.
* omp-low.h (get_oacc_fn_attrib): Declare.
* builtin-types.def (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
* tree.h (OMP_CLAUSE_EXPR): New.
* omp-builtins.def (BUILT_IN_GOACC_PARALLEL): Change target fn name.
gcc/lto/
* lto-lang.c (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
gcc/c-family/
* c-common.c (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
gcc/fortran/
* f95-lang.c (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
* types.def (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
gcc/ada/
* gcc-interface/utils.c (DEF_FUNCTION_TYPE_VAR_6): Define.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@228220 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 85 |
1 files changed, 46 insertions, 39 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index a2f950db580..0c4e1afb50c 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -282,12 +282,20 @@ map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d) return; } +/* Target data function launch information. */ + +struct targ_fn_launch +{ + const char *fn; + unsigned short dim[3]; +}; + /* Descriptor of a loaded function. */ struct targ_fn_descriptor { CUfunction fn; - const char *name; + const struct targ_fn_launch *launch; }; /* A loaded PTX image. */ @@ -929,8 +937,8 @@ event_add (enum ptx_event_type type, CUevent *e, void *h) void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, - size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers, - int vector_length, int async, void *targ_mem_desc) + size_t *sizes, unsigned short *kinds, int async, unsigned *dims, + void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; @@ -939,7 +947,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, struct ptx_stream *dev_str; void *kargs[1]; void *hp, *dp; - unsigned int nthreads_in_block; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; @@ -948,6 +955,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, dev_str = select_stream_for_async (async, pthread_self (), false, NULL); assert (dev_str == nvthd->current_stream); + /* Initialize the launch dimensions. Typically this is constant, + provided by the device compiler, but we must permit runtime + values. 
*/ + for (i = 0; i != 3; i++) + if (targ_fn->launch->dim[i]) + dims[i] = targ_fn->launch->dim[i]; + + if (dims[GOMP_DIM_GANG] != 1) + GOMP_PLUGIN_fatal ("non-unity num_gangs (%d) not supported", + dims[GOMP_DIM_GANG]); + if (dims[GOMP_DIM_WORKER] != 1) + GOMP_PLUGIN_fatal ("non-unity num_workers (%d) not supported", + dims[GOMP_DIM_WORKER]); + /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ @@ -965,35 +986,21 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); - GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name); + GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" + " gangs=%u, workers=%u, vectors=%u\n", + __FUNCTION__, targ_fn->launch->fn, + dims[0], dims[1], dims[2]); // OpenACC CUDA // - // num_gangs blocks - // num_workers warps (where a warp is equivalent to 32 threads) - // vector length threads - // - - /* The openacc vector_length clause 'determines the vector length to use for - vector or SIMD operations'. The question is how to map this to CUDA. - - In CUDA, the warp size is the vector length of a CUDA device. However, the - CUDA interface abstracts away from that, and only shows us warp size - indirectly in maximum number of threads per block, which is a product of - warp size and the number of hyperthreads of a multiprocessor. - - We choose to map openacc vector_length directly onto the number of threads - in a block, in the x dimension. This is reflected in gcc code generation - that uses ThreadIdx.x to access vector elements. - - Attempting to use an openacc vector_length of more than the maximum number - of threads per block will result in a cuda error. 
*/ - nthreads_in_block = vector_length; + // num_gangs nctaid.x + // num_workers ntid.y + // vector length ntid.x kargs[0] = &dp; r = cuLaunchKernel (function, - num_gangs, 1, 1, - nthreads_in_block, 1, 1, + dims[GOMP_DIM_GANG], 1, 1, + dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); @@ -1039,7 +1046,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, #endif GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, - targ_fn->name); + targ_fn->launch->fn); #ifndef DISABLE_ASYNC if (async < acc_async_noval) @@ -1567,7 +1574,7 @@ typedef struct nvptx_tdata const char *const *var_names; size_t var_num; - const char *const *fn_names; + const struct targ_fn_launch *fn_descs; size_t fn_num; } nvptx_tdata_t; @@ -1588,7 +1595,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, struct addr_pair **target_table) { CUmodule module; - const char *const *fn_names, *const *var_names; + const char *const *var_names; + const struct targ_fn_launch *fn_descs; unsigned int fn_entries, var_entries, i, j; CUresult r; struct targ_fn_descriptor *targ_fns; @@ -1617,7 +1625,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, var_entries = img_header->var_num; var_names = img_header->var_names; fn_entries = img_header->fn_num; - fn_names = img_header->fn_names; + fn_descs = img_header->fn_descs; targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair) * (fn_entries + var_entries)); @@ -1640,12 +1648,12 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, { CUfunction function; - r = cuModuleGetFunction (&function, module, fn_names[i]); + r = cuModuleGetFunction (&function, module, fn_descs[i].fn); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r)); targ_fns->fn = function; - targ_fns->name = (const char *) 
fn_names[i]; + targ_fns->launch = &fn_descs[i]; targ_tbl->start = (uintptr_t) targ_fns; targ_tbl->end = targ_tbl->start + 1; @@ -1724,13 +1732,12 @@ void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; void GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, - void **hostaddrs, void **devaddrs, size_t *sizes, - unsigned short *kinds, int num_gangs, - int num_workers, int vector_length, int async, - void *targ_mem_desc) + void **hostaddrs, void **devaddrs, + size_t *sizes, unsigned short *kinds, + int async, unsigned *dims, void *targ_mem_desc) { - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs, - num_workers, vector_length, async, targ_mem_desc); + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, + async, dims, targ_mem_desc); } void |