author     Zhigang Gong <zhigang.gong@linux.intel.com>  2014-06-28 23:25:53 +0800
committer  Zhigang Gong <zhigang.gong@intel.com>        2014-07-01 10:31:10 +0800
commit     a1ebbf7d61fa357a645b28580aafa44a20d4197b (patch)
tree       dc28e41fbc19c5358b369f8f3aedddcaffcd1a7f
parent     ff4eaa52340ab666c0fb5f8b2476aaf855422a96 (diff)
runtime: fix max group size calculation issue.
If the kernel doesn't use SLM/barrier, there is no hard limitation on the max group size. And if the max work group size is more than 1024, the original 64 URB entry count will not be sufficient to hold all of the curbe payload. Change the entry count to the max thread count to fix this potential issue. I found this bug when I tried to run the Phoronix Test Suite's juliagpu test case on my MBA.

v2: Refine the max kernel work group size calculation mechanism. The wg_sz should not be a device member variable; it should be derived from the kernel's and the device's attributes at runtime. Also fix the wrong configuration for IVB GT1.

v3: Add an important max thread limitation in the GPGPU_WALKER command. For non-Baytrail, max thread depth * max thread height * max thread width must not exceed 64 (under either simd16 or simd8), no matter whether SLM/barrier is used. We overlooked that limitation before, so a simd8 kernel using a work group size of 1024 would exceed it and half of the threads would not be executed at all.

Signed-off-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Reviewed-by: "Song, Ruiling" <ruiling.song@intel.com>
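For reference, below is a minimal standalone sketch (not part of the patch) of the calculation introduced as cl_get_kernel_max_wg_sz() in src/cl_device_id.c. The function max_wg_sz(), its plain parameters, and the main() driver are illustrative assumptions; the real code reads these values from the cl_kernel and cl_device_id structures.

/* Standalone sketch of the max work group size derivation; the patch's
 * real implementation, cl_get_kernel_max_wg_sz() in src/cl_device_id.c,
 * pulls these values from the cl_kernel/cl_device_id structures. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define WALKER_MAX_THREADS 64 /* GPGPU_WALKER thread limit on non-Baytrail */

static size_t
max_wg_sz(int simd_width, bool uses_slm, bool is_baytrail_t,
          size_t eu_count, size_t threads_per_eu, size_t device_max_wg_sz)
{
  if (!uses_slm) {
    /* No SLM/barrier: bounded only by the 64-thread walker limit,
     * except Baytrail-T at simd8, which is bounded by its EU threads. */
    if (!is_baytrail_t || simd_width == 16)
      return simd_width * WALKER_MAX_THREADS;
    return eu_count * threads_per_eu * simd_width;
  }
  /* SLM/barrier: the group must fit a half slice; the device quotes
   * max_work_group_size for simd16, so scale it down for simd8. */
  return device_max_wg_sz / (16 / simd_width);
}

int main(void)
{
  /* IVB GT2-like figures: 16 EUs, 8 threads per EU, 1024 max (simd16). */
  printf("simd16, no SLM: %zu\n", max_wg_sz(16, false, false, 16, 8, 1024)); /* 1024 */
  printf("simd8,  no SLM: %zu\n", max_wg_sz(8,  false, false, 16, 8, 1024)); /* 512  */
  printf("simd8,  SLM:    %zu\n", max_wg_sz(8,  true,  false, 16, 8, 1024)); /* 512  */
  return 0;
}

In particular, a simd8 kernel tops out at 8 * 64 = 512 work items: a 1024-item group would need 128 hardware threads and trip the walker limit described in the v3 note.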
-rw-r--r--  src/cl_command_queue_gen7.c   6
-rw-r--r--  src/cl_device_id.c           63
-rw-r--r--  src/cl_device_id.h           12
-rw-r--r--  src/cl_kernel.c               2
-rw-r--r--  src/intel/intel_gpgpu.c       3
5 files changed, 53 insertions, 33 deletions
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 9af4829a..5b80d746 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -304,12 +304,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
kernel.curbe_sz = cst_sz;
- /* Barrier and SLM must fit into a single half slice */
- if(kernel.use_slm > 0 && simd_sz == 8 && local_sz > MAX_GROUP_SIZE_IN_HALFSLICE){
- fprintf(stderr, "Beignet: Work group CAN NOT large than %d when using barrier or local momery.\n", MAX_GROUP_SIZE_IN_HALFSLICE);
- return CL_OUT_OF_RESOURCES;
- }
-
if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
return CL_OUT_OF_RESOURCES;
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 0263f02f..4db580ee 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -40,21 +40,19 @@ static struct _cl_device_id intel_ivb_gt2_device = {
INIT_ICD(dispatch)
.max_compute_unit = 16,
.max_thread_per_unit = 8,
- .max_work_item_sizes = {512, 512, 512},
+ .max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
- .wg_sz = 1024,
#include "cl_gen7_device.h"
};
static struct _cl_device_id intel_ivb_gt1_device = {
INIT_ICD(dispatch)
- .max_compute_unit = 8,
- .max_thread_per_unit = 8,
+ .max_compute_unit = 6,
+ .max_thread_per_unit = 6,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
- .wg_sz = 512,
#include "cl_gen7_device.h"
};
@@ -63,9 +61,8 @@ static struct _cl_device_id intel_baytrail_t_device = {
.max_compute_unit = 4,
.max_thread_per_unit = 8,
.max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 256,
+ .max_work_group_size = 512,
.max_clock_frequency = 1000,
- .wg_sz = 256,
#include "cl_gen7_device.h"
};
@@ -74,10 +71,9 @@ static struct _cl_device_id intel_hsw_gt1_device = {
INIT_ICD(dispatch)
.max_compute_unit = 10,
.max_thread_per_unit = 7,
- .max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 512,
+ .max_work_item_sizes = {1024, 1024, 1024},
+ .max_work_group_size = 1024,
.max_clock_frequency = 1000,
- .wg_sz = 512,
#include "cl_gen75_device.h"
};
@@ -85,10 +81,9 @@ static struct _cl_device_id intel_hsw_gt2_device = {
INIT_ICD(dispatch)
.max_compute_unit = 20,
.max_thread_per_unit = 7,
- .max_work_item_sizes = {512, 512, 512},
+ .max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
- .wg_sz = 1024,
#include "cl_gen75_device.h"
};
@@ -96,10 +91,9 @@ static struct _cl_device_id intel_hsw_gt3_device = {
INIT_ICD(dispatch)
.max_compute_unit = 40,
.max_thread_per_unit = 7,
- .max_work_item_sizes = {512, 512, 512},
+ .max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
- .wg_sz = 2048,
#include "cl_gen75_device.h"
};
@@ -465,6 +459,26 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
_DECL_FIELD(FIELD)
#include "cl_kernel.h"
+#include "cl_program.h"
+
+LOCAL size_t
+cl_get_kernel_max_wg_sz(cl_kernel kernel)
+{
+ size_t work_group_size;
+ int simd_width = interp_kernel_get_simd_width(kernel->opaque);
+ int vendor_id = kernel->program->ctx->device->vendor_id;
+ if (!interp_kernel_use_slm(kernel->opaque)) {
+ if (!IS_BAYTRAIL_T(vendor_id) || simd_width == 16)
+ work_group_size = simd_width * 64;
+ else
+ work_group_size = kernel->program->ctx->device->max_compute_unit *
+ kernel->program->ctx->device->max_thread_per_unit * simd_width;
+ } else
+ work_group_size = kernel->program->ctx->device->max_work_group_size /
+ (16 / simd_width);
+ return work_group_size;
+}
+
LOCAL cl_int
cl_get_kernel_workgroup_info(cl_kernel kernel,
cl_device_id device,
@@ -484,13 +498,24 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
CHECK_KERNEL(kernel);
switch (param_name) {
- DECL_FIELD(WORK_GROUP_SIZE, device->wg_sz)
+ case CL_KERNEL_WORK_GROUP_SIZE:
+ {
+ if (param_value && param_value_size < sizeof(size_t))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(size_t);
+ if (param_value) {
+ size_t work_group_size = cl_get_kernel_max_wg_sz(kernel);
+ *(size_t*)param_value = work_group_size;
+ return CL_SUCCESS;
+ }
+ }
DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul)
case CL_KERNEL_LOCAL_MEM_SIZE:
- {
- size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
- _DECL_FIELD(local_mem_sz)
- }
+ {
+ size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
+ _DECL_FIELD(local_mem_sz)
+ }
DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
default:
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 422ef37f..c4f82279 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -25,11 +25,11 @@ struct _cl_device_id {
DEFINE_ICD(dispatch)
cl_device_type device_type;
cl_uint vendor_id;
- cl_uint max_compute_unit;
- cl_uint max_thread_per_unit;
- cl_uint max_work_item_dimensions;
- size_t max_work_item_sizes[3];
- size_t max_work_group_size;
+ cl_uint max_compute_unit; // maximum EU number
+ cl_uint max_thread_per_unit; // maximum EU threads per EU.
+ cl_uint max_work_item_dimensions; // should be 3.
+ size_t max_work_item_sizes[3]; // equal to maximum work group size.
+ size_t max_work_group_size; // maximum work group size under simd16 mode.
cl_uint preferred_vector_width_char;
cl_uint preferred_vector_width_short;
cl_uint preferred_vector_width_int;
@@ -101,7 +101,6 @@ struct _cl_device_id {
size_t driver_version_sz;
size_t built_in_kernels_sz;
/* Kernel specific info that we're assigning statically */
- size_t wg_sz;
size_t preferred_wg_sz_mul;
/* SubDevice specific info */
cl_device_id parent_device;
@@ -137,6 +136,7 @@ extern cl_int cl_get_kernel_workgroup_info(cl_kernel kernel,
size_t * param_value_size_ret);
/* Returns the Gen device ID */
extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
+extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
#endif /* __CL_DEVICE_ID_H__ */
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 5d0b36e1..aad3c047 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -410,7 +410,7 @@ cl_kernel_work_group_sz(cl_kernel ker,
for (i = 1; i < wk_dim; ++i)
sz *= local_wk_sz[i];
- if (sz > ker->program->ctx->device->max_work_group_size) {
+ if (sz > cl_get_kernel_max_wg_sz(ker)) {
err = CL_INVALID_WORK_ITEM_SIZE;
goto error;
}
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 3b895394..d403aa04 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -577,7 +577,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
/* URB */
- gpgpu->urb.num_cs_entries = 64;
+ gpgpu->urb.num_cs_entries = max_threads;
gpgpu->urb.size_cs_entry = size_cs_entry;
gpgpu->max_threads = max_threads;
@@ -1113,6 +1113,7 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
BEGIN_BATCH(gpgpu->batch, 11);
OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
+ assert(thread_n <= 64);
if (simd_sz == 16)
OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
else