summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--kernels/compiler_async_copy.cl16
-rw-r--r--kernels/compiler_async_stride_copy.cl16
-rw-r--r--utests/CMakeLists.txt2
-rw-r--r--utests/compiler_async_copy.cpp39
-rw-r--r--utests/compiler_async_stride_copy.cpp45
5 files changed, 118 insertions, 0 deletions
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
new file mode 100644
index 00000000..a2432a46
--- /dev/null
+++ b/kernels/compiler_async_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_copy(__global int2 *dst, __global int2 *src, __local int2 *localBuffer, int copiesPerWorkItem)
+{
+ event_t event;
+ int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+ int i;
+ event = async_work_group_copy((__local int2*)localBuffer, (__global const int2*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 );
+ wait_group_events( 1, &event );
+
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (int2)(3, 3);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ event = async_work_group_copy((__global int2*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const int2*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 );
+ wait_group_events( 1, &event );
+}
diff --git a/kernels/compiler_async_stride_copy.cl b/kernels/compiler_async_stride_copy.cl
new file mode 100644
index 00000000..a9265888
--- /dev/null
+++ b/kernels/compiler_async_stride_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_stride_copy(__global char4 *dst, __global char4 *src, __local char4 *localBuffer, int copiesPerWorkItem, int stride)
+{
+ event_t event;
+ int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+ int i;
+ event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+ wait_group_events( 1, &event );
+
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (char4)(3);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+ wait_group_events( 1, &event );
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index fe7156f3..97b75196 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -85,6 +85,8 @@ set (utests_sources
compiler_switch.cpp
compiler_math.cpp
compiler_atomic_functions.cpp
+ compiler_async_copy.cpp
+ compiler_async_stride_copy.cpp
compiler_insn_selection_min.cpp
compiler_insn_selection_max.cpp
compiler_insn_selection_masked_min_max.cpp
diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp
new file mode 100644
index 00000000..9384f85b
--- /dev/null
+++ b/utests/compiler_async_copy.cpp
@@ -0,0 +1,39 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_copy(void)
+{
+ const size_t n = 1024;
+ const size_t local_size = 32;
+ const int copiesPerWorkItem = 5;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_async_copy");
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(int)*2, NULL);
+ OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; ++i)
+ ((int*)buf_data[1])[i] = rand();
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = local_size;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ int *dst = (int*)buf_data[0];
+ int *src = (int*)buf_data[1];
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; i++)
+ OCL_ASSERT(dst[i] == src[i] + 3);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy);
diff --git a/utests/compiler_async_stride_copy.cpp b/utests/compiler_async_stride_copy.cpp
new file mode 100644
index 00000000..132f9177
--- /dev/null
+++ b/utests/compiler_async_stride_copy.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_stride_copy(void)
+{
+ const size_t n = 1024;
+ const size_t local_size = 128;
+ const int copiesPerWorkItem = 5;
+ const int stride =3;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_async_stride_copy");
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(char)*4, NULL);
+ OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+ OCL_SET_ARG(4, sizeof(int), &stride);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * 4 * stride; ++i)
+ ((char*)buf_data[1])[i] = rand() && 0xff;
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = local_size;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ char *dst = (char*)buf_data[0];
+ char *src = (char*)buf_data[1];
+ for (uint32_t i = 0; i < n * copiesPerWorkItem; i += stride * 4) {
+ OCL_ASSERT(dst[i + 0] == src[i + 0] + 3);
+ OCL_ASSERT(dst[i + 1] == src[i + 1] + 3);
+ OCL_ASSERT(dst[i + 2] == src[i + 2] + 3);
+ OCL_ASSERT(dst[i + 3] == src[i + 3] + 3);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_stride_copy);