diff options
-rw-r--r-- | kernels/compiler_async_copy.cl | 16 | ||||
-rw-r--r-- | kernels/compiler_async_stride_copy.cl | 16 | ||||
-rw-r--r-- | utests/CMakeLists.txt | 2 | ||||
-rw-r--r-- | utests/compiler_async_copy.cpp | 39 | ||||
-rw-r--r-- | utests/compiler_async_stride_copy.cpp | 45 |
5 files changed, 118 insertions, 0 deletions
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl new file mode 100644 index 00000000..a2432a46 --- /dev/null +++ b/kernels/compiler_async_copy.cl @@ -0,0 +1,16 @@ +__kernel void +compiler_async_copy(__global int2 *dst, __global int2 *src, __local int2 *localBuffer, int copiesPerWorkItem) +{ + event_t event; + int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); + int i; + event = async_work_group_copy((__local int2*)localBuffer, (__global const int2*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 ); + wait_group_events( 1, &event ); + + for(i=0; i<copiesPerWorkItem; i++) + localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (int2)(3, 3); + barrier(CLK_LOCAL_MEM_FENCE); + + event = async_work_group_copy((__global int2*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const int2*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 ); + wait_group_events( 1, &event ); +} diff --git a/kernels/compiler_async_stride_copy.cl b/kernels/compiler_async_stride_copy.cl new file mode 100644 index 00000000..a9265888 --- /dev/null +++ b/kernels/compiler_async_stride_copy.cl @@ -0,0 +1,16 @@ +__kernel void +compiler_async_stride_copy(__global char4 *dst, __global char4 *src, __local char4 *localBuffer, int copiesPerWorkItem, int stride) +{ + event_t event; + int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); + int i; + event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 ); + wait_group_events( 1, &event ); + + for(i=0; i<copiesPerWorkItem; i++) + localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (char4)(3); + barrier(CLK_LOCAL_MEM_FENCE); + + event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 ); + wait_group_events( 1, &event ); +} diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index fe7156f3..97b75196 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -85,6 +85,8 @@ set (utests_sources compiler_switch.cpp compiler_math.cpp compiler_atomic_functions.cpp + compiler_async_copy.cpp + compiler_async_stride_copy.cpp compiler_insn_selection_min.cpp compiler_insn_selection_max.cpp compiler_insn_selection_masked_min_max.cpp diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp new file mode 100644 index 00000000..9384f85b --- /dev/null +++ b/utests/compiler_async_copy.cpp @@ -0,0 +1,39 @@ +#include "utest_helper.hpp" + +static void compiler_async_copy(void) +{ + const size_t n = 1024; + const size_t local_size = 32; + const int copiesPerWorkItem = 5; + + // Setup kernel and buffers + OCL_CREATE_KERNEL("compiler_async_copy"); + OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL); + OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(int)*2, NULL); + OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem); + + OCL_MAP_BUFFER(1); + for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; ++i) + ((int*)buf_data[1])[i] = rand(); + OCL_UNMAP_BUFFER(1); + + // Run the kernel + globals[0] = n; + locals[0] = local_size; + OCL_NDRANGE(1); + OCL_MAP_BUFFER(0); + OCL_MAP_BUFFER(1); + + // Check results + int *dst = (int*)buf_data[0]; + int *src = (int*)buf_data[1]; + for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; i++) + OCL_ASSERT(dst[i] == src[i] + 3); + OCL_UNMAP_BUFFER(0); + OCL_UNMAP_BUFFER(1); +} + +MAKE_UTEST_FROM_FUNCTION(compiler_async_copy); diff --git a/utests/compiler_async_stride_copy.cpp b/utests/compiler_async_stride_copy.cpp new file mode 100644 index 00000000..132f9177 --- /dev/null +++ b/utests/compiler_async_stride_copy.cpp @@ -0,0 +1,45 @@ +#include "utest_helper.hpp" + +static void compiler_async_stride_copy(void) +{ + const size_t n = 1024; + const size_t local_size = 128; + const int copiesPerWorkItem = 5; + const int stride =3; + + // Setup kernel and buffers + OCL_CREATE_KERNEL("compiler_async_stride_copy"); + OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL); + OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(char)*4, NULL); + OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem); + OCL_SET_ARG(4, sizeof(int), &stride); + + OCL_MAP_BUFFER(1); + for (uint32_t i = 0; i < n * copiesPerWorkItem * 4 * stride; ++i) + ((char*)buf_data[1])[i] = rand() && 0xff; + OCL_UNMAP_BUFFER(1); + + // Run the kernel + globals[0] = n; + locals[0] = local_size; + OCL_NDRANGE(1); + OCL_MAP_BUFFER(0); + OCL_MAP_BUFFER(1); + + // Check results + char *dst = (char*)buf_data[0]; + char *src = (char*)buf_data[1]; + for (uint32_t i = 0; i < n * copiesPerWorkItem; i += stride * 4) { + OCL_ASSERT(dst[i + 0] == src[i + 0] + 3); + OCL_ASSERT(dst[i + 1] == src[i + 1] + 3); + OCL_ASSERT(dst[i + 2] == src[i + 2] + 3); + OCL_ASSERT(dst[i + 3] == src[i + 3] + 3); + } + OCL_UNMAP_BUFFER(0); + OCL_UNMAP_BUFFER(1); +} + +MAKE_UTEST_FROM_FUNCTION(compiler_async_stride_copy); |