[OpenCL] Add '-cl-uniform-work-group-size' compile option

Summary: OpenCL 2.0 specification defines '-cl-uniform-work-group-size' option, which requires that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel and allows optimizations that are made possible by this restriction. The patch introduces the support of this option. To keep information about whether an OpenCL kernel has uniform work group size or not, clang generates 'uniform-work-group-size' function attribute for every kernel: - "uniform-work-group-size"="true" for OpenCL 1.2 and lower, - "uniform-work-group-size"="true" for OpenCL 2.0 and higher if '-cl-uniform-work-group-size' option was specified, - "uniform-work-group-size"="false" for OpenCL 2.0 and higher if no '-cl-uniform-work-group-size' options was specified. If the function is not an OpenCL kernel, 'uniform-work-group-size' attribute isn't generated. Patch by: krisb Reviewers: yaxunl, Anastasia, b-sumner Reviewed By: yaxunl, Anastasia Subscribers: nhaehnle, yaxunl, Anastasia, cfe-commits Differential Revision: https://reviews.llvm.org/D43570 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@325771 91177308-0d34-0410-b5e6-96231b3b80d8
author: Alexey Sotkin <alexey.sotkin@intel.com> 2018-02-22 11:54:14 +0000
committer: Alexey Sotkin <alexey.sotkin@intel.com> 2018-02-22 11:54:14 +0000
commit: 9acbe6ca409da5a824a2ee6e967dedddc7cf4b69 (patch)
tree: 0a3243db274d85243f6b462ea47365ded5a61d88
parent: 0972dc790621cb6c6718ef9a5090a993cc0724c2 (diff)
download: clang-9acbe6ca409da5a824a2ee6e967dedddc7cf4b69.tar.gz
9 files changed, 43 insertions, 3 deletions
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td
index a1b9810c54..50865f22f8 100644
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -518,6 +518,8 @@ def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group
   HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
 def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
   HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
+def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">;
 def client__name : JoinedOrSeparate<["-"], "client_name">;
 def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
 def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
diff --git a/include/clang/Frontend/CodeGenOptions.def b/include/clang/Frontend/CodeGenOptions.def
index 599b57c189..e987252b2f 100644
--- a/include/clang/Frontend/CodeGenOptions.def
+++ b/include/clang/Frontend/CodeGenOptions.def
@@ -128,6 +128,7 @@ CODEGENOPT(NoTrappingMath    , 1, 0) ///< Set when -fno-trapping-math is enabled
 CODEGENOPT(NoNaNsFPMath      , 1, 0) ///< Assume FP arguments, results not NaN.
 CODEGENOPT(FlushDenorm       , 1, 0) ///< Allow FP denorm numbers to be flushed to zero
 CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt
+CODEGENOPT(UniformWGSize     , 1, 0) ///< -cl-uniform-work-group-size
 CODEGENOPT(NoZeroInitializedInBSS , 1, 0) ///< -fno-zero-initialized-in-bss.
 /// \brief Method of Objective-C dispatch to use.
 ENUM_CODEGENOPT(ObjCDispatchMethod, ObjCDispatchMethodKind, 2, Legacy)
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index d62b080e31..959df69f22 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -1870,6 +1870,21 @@ void CodeGenModule::ConstructAttributeList(
     }
   }
 
+  if (TargetDecl && TargetDecl->hasAttr<OpenCLKernelAttr>()) {
+    if (getLangOpts().OpenCLVersion <= 120) {
+      // OpenCL v1.2 Work groups are always uniform
+      FuncAttrs.addAttribute("uniform-work-group-size", "true");
+    } else {
+      // OpenCL v2.0 Work groups may be whether uniform or not.
+      // '-cl-uniform-work-group-size' compile option gets a hint
+      // to the compiler that the global work-size be a multiple of
+      // the work-group size specified to clEnqueueNDRangeKernel
+      // (i.e. work groups are uniform).
+      FuncAttrs.addAttribute("uniform-work-group-size",
+                             llvm::toStringRef(CodeGenOpts.UniformWGSize));
+    }
+  }
+
   if (!AttrOnCallSite) {
     bool DisableTailCalls =
         CodeGenOpts.DisableTailCalls ||
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp
index 64bea9e56a..cc5cf0f042 100644
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -2379,6 +2379,7 @@ static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs) {
       options::OPT_cl_no_signed_zeros,
       options::OPT_cl_denorms_are_zero,
       options::OPT_cl_fp32_correctly_rounded_divide_sqrt,
+      options::OPT_cl_uniform_work_group_size
   };
 
   if (Arg *A = Args.getLastArg(options::OPT_cl_std_EQ)) {
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp
index 3bdc116b96..5be02c9682 100644
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -659,6 +659,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
   Opts.CorrectlyRoundedDivSqrt =
       Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
+  Opts.UniformWGSize =
+      Args.hasArg(OPT_cl_uniform_work_group_size);
   Opts.Reciprocals = Args.getAllArgValues(OPT_mrecip_EQ);
   Opts.ReciprocalMath = Args.hasArg(OPT_freciprocal_math);
   Opts.NoTrappingMath = Args.hasArg(OPT_fno_trapping_math);
diff --git a/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
index 1027fd740c..aec00e7601 100644
--- a/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
+++ b/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
@@ -425,7 +425,7 @@ struct_char_arr32 func_ret_struct_char_arr32()
   return s;
 }
 
-// CHECK: define i32 @func_transparent_union_ret() local_unnamed_addr #0 {
+// CHECK: define i32 @func_transparent_union_ret() local_unnamed_addr #1 {
 // CHECK: ret i32 0
 transparent_u func_transparent_union_ret()
 {
diff --git a/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/test/CodeGenOpenCL/cl-uniform-wg-size.cl
new file mode 100644
index 0000000000..76ace5dca2
--- /dev/null
+++ b/test/CodeGenOpenCL/cl-uniform-wg-size.cl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL1.2 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-NONUNIFORM
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -cl-uniform-work-group-size -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM
+
+kernel void ker() {};
+// CHECK: define{{.*}}@ker() #0
+
+void foo() {};
+// CHECK: define{{.*}}@foo() #1
+
+// CHECK-LABEL: attributes #0
+// CHECK-UNIFORM: "uniform-work-group-size"="true"
+// CHECK-NONUNIFORM: "uniform-work-group-size"="false"
+
+// CHECK-LABEL: attributes #1
+// CHECK-NOT: uniform-work-group-size
diff --git a/test/CodeGenOpenCL/convergent.cl b/test/CodeGenOpenCL/convergent.cl
index 285b637ca6..a011920761 100644
--- a/test/CodeGenOpenCL/convergent.cl
+++ b/test/CodeGenOpenCL/convergent.cl
@@ -127,7 +127,7 @@ void test_not_unroll() {
 // CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]]
 
 // CHECK-LABEL: @assume_convergent_asm
-// CHECK: tail call void asm sideeffect "s_barrier", ""() #4
+// CHECK: tail call void asm sideeffect "s_barrier", ""() #5
 kernel void assume_convergent_asm()
 {
   __asm__ volatile("s_barrier");
@@ -138,4 +138,5 @@ kernel void assume_convergent_asm()
 // CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} }
 // CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
 // CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #5 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
+// CHECK: attributes #5 = { {{[^}]*}}convergent{{[^}]*}} }
+// CHECK: attributes #6 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
diff --git a/test/Driver/opencl.cl b/test/Driver/opencl.cl
index d68d424b6e..8c421beeef 100644
--- a/test/Driver/opencl.cl
+++ b/test/Driver/opencl.cl
@@ -13,6 +13,7 @@
 // RUN: %clang -S -### -cl-no-signed-zeros %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SIGNED-ZEROS %s
 // RUN: %clang -S -### -cl-denorms-are-zero %s 2>&1 | FileCheck --check-prefix=CHECK-DENORMS-ARE-ZERO %s
 // RUN: %clang -S -### -cl-fp32-correctly-rounded-divide-sqrt %s 2>&1 | FileCheck --check-prefix=CHECK-ROUND-DIV %s
+// RUN: %clang -S -### -cl-uniform-work-group-size %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s
 // RUN: not %clang -cl-std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s
 // RUN: not %clang -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
 
@@ -31,6 +32,7 @@
 // CHECK-NO-SIGNED-ZEROS: "-cc1" {{.*}} "-cl-no-signed-zeros"
 // CHECK-DENORMS-ARE-ZERO: "-cc1" {{.*}} "-cl-denorms-are-zero"
 // CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt"
+// CHECK-UNIFORM-WG: "-cc1" {{.*}} "-cl-uniform-work-group-size"
 // CHECK-C99: error: invalid value 'c99' in '-cl-std=c99'
 // CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'
author	Alexey Sotkin <alexey.sotkin@intel.com>	2018-02-22 11:54:14 +0000
committer	Alexey Sotkin <alexey.sotkin@intel.com>	2018-02-22 11:54:14 +0000
commit	9acbe6ca409da5a824a2ee6e967dedddc7cf4b69 (patch)
tree	0a3243db274d85243f6b462ea47365ded5a61d88
parent	0972dc790621cb6c6718ef9a5090a993cc0724c2 (diff)
download	clang-9acbe6ca409da5a824a2ee6e967dedddc7cf4b69.tar.gz