summaryrefslogtreecommitdiff
path: root/libgomp
diff options
context:
space:
mode:
author Cesar Philippidis <cesar@codesourcery.com> 2018-08-13 05:04:24 -0700
committer Tom de Vries <vries@gcc.gnu.org> 2018-08-13 12:04:24 +0000
commit bd9b3d3d1a8d33e460ae137da0cb0d5a919e8f8f (patch)
tree 0729abf2a8dd6b64d2d8313c3c16b16b428c15ab /libgomp
parent cdf899781c7321987a9948e5ca0847e8b38da798 (diff)
download gcc-bd9b3d3d1a8d33e460ae137da0cb0d5a919e8f8f.tar.gz
[nvptx] Use CUDA driver API to select default runtime launch geometry
The CUDA driver API starting with version 6.5 offers a set of runtime functions to calculate several occupancy-related measures, as a replacement for the occupancy calculator spreadsheet. This patch adds a heuristic for default runtime launch geometry, based on the new runtime function cuOccupancyMaxPotentialBlockSize. Built on x86_64 with nvptx accelerator and ran the libgomp testsuite. 2018-08-13 Cesar Philippidis <cesar@codesourcery.com> Tom de Vries <tdevries@suse.de> PR target/85590 * plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. (cuOccupancyMaxPotentialBlockSize): Declare. * plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New CUDA_ONE_CALL_MAYBE_NULL. * plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define CUoccupancyB2DSize and declare cuOccupancyMaxPotentialBlockSize. (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the default num_gangs and num_workers when the driver supports it. Co-Authored-By: Tom de Vries <tdevries@suse.de> From-SVN: r263505
Diffstat (limited to 'libgomp')
-rw-r--r-- libgomp/ChangeLog 14
-rw-r--r-- libgomp/plugin/cuda-lib.def 1
-rw-r--r-- libgomp/plugin/cuda/cuda.h 3
-rw-r--r-- libgomp/plugin/plugin-nvptx.c 83
4 files changed, 89 insertions, 12 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 4aff6cd9a33..f54a695fb38 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,17 @@
+2018-08-13 Cesar Philippidis <cesar@codesourcery.com>
+ Tom de Vries <tdevries@suse.de>
+
+ PR target/85590
+ * plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
+ (cuOccupancyMaxPotentialBlockSize): Declare.
+ * plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
+ CUDA_ONE_CALL_MAYBE_NULL.
+ * plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
+ CUoccupancyB2DSize and declare
+ cuOccupancyMaxPotentialBlockSize.
+ (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
+ default num_gangs and num_workers when the driver supports it.
+
2018-08-08 Tom de Vries <tdevries@suse.de>
* plugin/cuda-lib.def (cuLinkAddData_v2, cuLinkCreate_v2): Declare using
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 29028b504a0..b2a4c2154eb 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..b4c1b29c5d8 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
typedef void *CUfunction;
typedef void *CUlinkState;
typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
typedef void *CUstream;
typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
CUresult cuModuleLoad (CUmodule *, const char *);
CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 6799a264976..bae1b05ccaa 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
+typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
#endif
#define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,77 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
{
bool default_dim_p[GOMP_DIM_MAX];
for (i = 0; i != GOMP_DIM_MAX; i++)
+ default_dim_p[i] = !dims[i];
+
+ if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
{
- default_dim_p[i] = !dims[i];
- if (default_dim_p[i])
- dims[i] = nvthd->ptx_dev->default_dims[i];
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ if (default_dim_p[i])
+ dims[i] = nvthd->ptx_dev->default_dims[i];
+
+ if (default_dim_p[GOMP_DIM_VECTOR])
+ dims[GOMP_DIM_VECTOR]
+ = MIN (dims[GOMP_DIM_VECTOR],
+ (targ_fn->max_threads_per_block / warp_size
+ * warp_size));
+
+ if (default_dim_p[GOMP_DIM_WORKER])
+ dims[GOMP_DIM_WORKER]
+ = MIN (dims[GOMP_DIM_WORKER],
+ targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
}
+ else
+ {
+ /* Handle the case that the compiler allows the runtime to choose
+ the vector-length conservatively, by ignoring
+ gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
+ it. */
+ int vectors = 0;
+ /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
+ gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
+ exceed targ_fn->max_threads_per_block. */
+ int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+ int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+ int grids, blocks;
+
+ CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+ &blocks, function, NULL, 0,
+ dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+ GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+ "grid = %d, block = %d\n", grids, blocks);
+
+ /* Keep the num_gangs proportional to the block size. In
+ the case where a block size is limited by shared-memory
+ or the register file capacity, the runtime will not
+ excessively over-assign gangs to the multiprocessor
+ units if their state is going to be swapped out even
+ more than necessary. The constant factor 2 is there to
+ prevent threads from idling when there is insufficient
+ work for them. */
+ if (gangs == 0)
+ gangs = 2 * grids * (blocks / warp_size);
+
+ if (vectors == 0)
+ vectors = warp_size;
+
+ if (workers == 0)
+ {
+ int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
+ ? vectors
+ : dims[GOMP_DIM_VECTOR]);
+ workers = blocks / actual_vectors;
+ }
- if (default_dim_p[GOMP_DIM_VECTOR])
- dims[GOMP_DIM_VECTOR]
- = MIN (dims[GOMP_DIM_VECTOR],
- (targ_fn->max_threads_per_block / warp_size * warp_size));
-
- if (default_dim_p[GOMP_DIM_WORKER])
- dims[GOMP_DIM_WORKER]
- = MIN (dims[GOMP_DIM_WORKER],
- targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ if (default_dim_p[i])
+ switch (i)
+ {
+ case GOMP_DIM_GANG: dims[i] = gangs; break;
+ case GOMP_DIM_WORKER: dims[i] = workers; break;
+ case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+ default: GOMP_PLUGIN_fatal ("invalid dim");
+ }
+ }
}
}