diff options
-rw-r--r-- | kernels/compiler_sub_group_shuffle.cl | 22 | ||||
-rw-r--r-- | kernels/compiler_sub_group_shuffle_down.cl | 23 | ||||
-rw-r--r-- | kernels/compiler_sub_group_shuffle_up.cl | 23 | ||||
-rw-r--r-- | kernels/compiler_sub_group_shuffle_xor.cl | 23 | ||||
-rw-r--r-- | utests/compiler_sub_group_shuffle.cpp | 52 | ||||
-rw-r--r-- | utests/compiler_sub_group_shuffle_down.cpp | 54 | ||||
-rw-r--r-- | utests/compiler_sub_group_shuffle_up.cpp | 54 | ||||
-rw-r--r-- | utests/compiler_sub_group_shuffle_xor.cpp | 54 |
8 files changed, 289 insertions, 16 deletions
diff --git a/kernels/compiler_sub_group_shuffle.cl b/kernels/compiler_sub_group_shuffle.cl index 322da743..c771eea4 100644 --- a/kernels/compiler_sub_group_shuffle.cl +++ b/kernels/compiler_sub_group_shuffle.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -16,3 +16,23 @@ __kernel void compiler_sub_group_shuffle(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) + dst[0] = get_max_sub_group_size(); + dst++; + + short from = i; + int j = get_max_sub_group_size() - get_sub_group_local_id() - 1; + short o0 = get_sub_group_local_id(); + short o1 = intel_sub_group_shuffle(from, c); + short o2 = intel_sub_group_shuffle(from, 5); + short o3 = intel_sub_group_shuffle(from, j); + dst[i*4] = o0; + dst[i*4+1] = o1; + dst[i*4+2] = o2; + dst[i*4+3] = o3; +} +#endif diff --git a/kernels/compiler_sub_group_shuffle_down.cl b/kernels/compiler_sub_group_shuffle_down.cl index 769fc3fc..40bac056 100644 --- a/kernels/compiler_sub_group_shuffle_down.cl +++ b/kernels/compiler_sub_group_shuffle_down.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle_down(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_down_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_down(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_down_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) + dst[0] = get_max_sub_group_size(); + dst++; + + short from = i; + int j = get_max_sub_group_size() - get_sub_group_local_id() - 1; + int k = get_sub_group_local_id() + 1; + short o0 = intel_sub_group_shuffle_down((short)123, (short)456, c); + short o1 = intel_sub_group_shuffle_down((short)123, from, c); + short o2 = intel_sub_group_shuffle_down(from, (short)-from, k); + short o3 = intel_sub_group_shuffle_down(from, (short)321, j); + dst[i*4] = o0; + dst[i*4+1] = o1; + dst[i*4+2] = o2; + dst[i*4+3] = o3; +} +#endif diff --git a/kernels/compiler_sub_group_shuffle_up.cl b/kernels/compiler_sub_group_shuffle_up.cl index 5c5cee12..fd287d52 100644 --- a/kernels/compiler_sub_group_shuffle_up.cl +++ b/kernels/compiler_sub_group_shuffle_up.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle_up(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_up_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_up(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_up_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) + dst[0] = get_max_sub_group_size(); + dst++; + + short from = i; + int j = get_sub_group_local_id() + 1; + int k = get_max_sub_group_size() - get_sub_group_local_id() - 1; + short o0 = intel_sub_group_shuffle_up((short)123, (short)456, c); + short o1 = intel_sub_group_shuffle_up((short)123, from, c); + short o2 = intel_sub_group_shuffle_up(from, (short)-from, k); + short o3 = intel_sub_group_shuffle_up(from, (short)321, j); + dst[i*4] = o0; + dst[i*4+1] = o1; + dst[i*4+2] = o2; + dst[i*4+3] = o3; +} +#endif diff --git a/kernels/compiler_sub_group_shuffle_xor.cl b/kernels/compiler_sub_group_shuffle_xor.cl index 8bc15d35..df3dfe70 100644 --- a/kernels/compiler_sub_group_shuffle_xor.cl +++ b/kernels/compiler_sub_group_shuffle_xor.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_xor_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_xor(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_xor_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) + dst[0] = get_max_sub_group_size(); + dst++; + + short from = i; + int j = get_max_sub_group_size() - get_sub_group_local_id() - 1; + int k = get_sub_group_local_id() + 1; + short o0 = get_sub_group_local_id(); + short o1 = intel_sub_group_shuffle_xor(from, c); + short o2 = intel_sub_group_shuffle_xor(from, j); + short o3 = intel_sub_group_shuffle_xor(from, k); + dst[i*4] = o0; + dst[i*4+1] = o1; + dst[i*4+2] = o2; + dst[i*4+3] = o3; +} +#endif diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle.cpp index f33e9de5..2aadfeda 100644 --- a/utests/compiler_sub_group_shuffle.cpp +++ b/utests/compiler_sub_group_shuffle.cpp @@ -1,6 +1,6 @@ #include "utest_helper.hpp" -void compiler_sub_group_shuffle(void) +void compiler_sub_group_shuffle_int(void) { if(!cl_check_subgroups()) return; @@ -8,7 +8,8 @@ void compiler_sub_group_shuffle(void) const int32_t buf_size = 4 * n + 1; // Setup kernel and buffers - OCL_CREATE_KERNEL("compiler_sub_group_shuffle"); + OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle", + "compiler_sub_group_shuffle_int"); OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL); OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); @@ -43,5 +44,50 @@ void compiler_sub_group_shuffle(void) } OCL_UNMAP_BUFFER(0); } +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_int); +void compiler_sub_group_shuffle_short(void) +{ + if(!cl_check_subgroups_short()) + return; + const size_t n = 32; + const int32_t buf_size = 4 * n + 1; + + // Setup kernel and buffers + OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle.cl", + "compiler_sub_group_shuffle_short", + SOURCE, "-DSHORT"); + OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + + int c = 3; + OCL_SET_ARG(1, sizeof(int), &c); + + globals[0] = n; + locals[0] = 16; + + OCL_MAP_BUFFER(0); + for (int32_t i = 0; i < buf_size; ++i) + ((short*)buf_data[0])[i] = -1; + OCL_UNMAP_BUFFER(0); + + // Run the kernel on GPU + OCL_NDRANGE(1); -MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle); + // Compare + OCL_MAP_BUFFER(0); + short* dst = (short*)buf_data[0]; + int suggroupsize = dst[0]; + OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16); + + dst++; + for (int32_t i = 0; i < (int32_t) n; ++i){ + int round = i / suggroupsize; + int index = i % suggroupsize; + OCL_ASSERT(index == dst[4*i]); + OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]); + OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]); + OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]); + } + OCL_UNMAP_BUFFER(0); +} +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_short); diff --git a/utests/compiler_sub_group_shuffle_down.cpp b/utests/compiler_sub_group_shuffle_down.cpp index 8b232345..13f6e124 100644 --- a/utests/compiler_sub_group_shuffle_down.cpp +++ b/utests/compiler_sub_group_shuffle_down.cpp @@ -1,6 +1,6 @@ #include "utest_helper.hpp" -void compiler_sub_group_shuffle_down(void) +void compiler_sub_group_shuffle_down_int(void) { if(!cl_check_subgroups()) return; @@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_down(void) const int32_t buf_size = 4 * n + 1; // Setup kernel and buffers - OCL_CREATE_KERNEL("compiler_sub_group_shuffle_down"); + OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_down", + "compiler_sub_group_shuffle_down_int"); OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL); OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); @@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_down(void) } OCL_UNMAP_BUFFER(0); } +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down_int); -MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down); +void compiler_sub_group_shuffle_down_short(void) +{ + if(!cl_check_subgroups_short()) + return; + const size_t n = 32; + const int32_t buf_size = 4 * n + 1; + + // Setup kernel and buffers + OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_down.cl", + "compiler_sub_group_shuffle_down_short", + SOURCE, "-DSHORT"); + OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + + int c = 13; + OCL_SET_ARG(1, sizeof(int), &c); + + globals[0] = n; + locals[0] = 16; + + OCL_MAP_BUFFER(0); + for (int32_t i = 0; i < buf_size; ++i) + ((short*)buf_data[0])[i] = -1; + OCL_UNMAP_BUFFER(0); + + // Run the kernel on GPU + OCL_NDRANGE(1); + + // Compare + OCL_MAP_BUFFER(0); + short* dst = (short *)buf_data[0]; + short suggroupsize = dst[0]; + OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16); + + dst++; + for (int32_t i = 0; i < (int32_t) n; ++i){ + int round = i / suggroupsize; + int index = i % suggroupsize; + //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]); + OCL_ASSERT( (index + c >= suggroupsize ? 456 : 123) == dst[4*i]); + OCL_ASSERT( (index + c >= suggroupsize ? (round * suggroupsize + (i + c) % suggroupsize): 123) == dst[4*i+1]); + OCL_ASSERT( (index + index + 1 >= suggroupsize ? -(round * suggroupsize + (i + index + 1) % suggroupsize) : (round * suggroupsize + (i + index + 1) % suggroupsize)) == dst[4*i+2]); + OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]); + } + OCL_UNMAP_BUFFER(0); +} +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down_short); diff --git a/utests/compiler_sub_group_shuffle_up.cpp b/utests/compiler_sub_group_shuffle_up.cpp index d2e054b6..f79f03c0 100644 --- a/utests/compiler_sub_group_shuffle_up.cpp +++ b/utests/compiler_sub_group_shuffle_up.cpp @@ -1,6 +1,6 @@ #include "utest_helper.hpp" -void compiler_sub_group_shuffle_up(void) +void compiler_sub_group_shuffle_up_int(void) { if(!cl_check_subgroups()) return; @@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_up(void) const int32_t buf_size = 4 * n + 1; // Setup kernel and buffers - OCL_CREATE_KERNEL("compiler_sub_group_shuffle_up"); + OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_up", + "compiler_sub_group_shuffle_up_int"); OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL); OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); @@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_up(void) } OCL_UNMAP_BUFFER(0); } +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up_int); -MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up); +void compiler_sub_group_shuffle_up_short(void) +{ + if(!cl_check_subgroups_short()) + return; + const size_t n = 32; + const int32_t buf_size = 4 * n + 1; + + // Setup kernel and buffers + OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_up.cl", + "compiler_sub_group_shuffle_up_short", + SOURCE, "-DSHORT"); + OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + + int c = 3; + OCL_SET_ARG(1, sizeof(int), &c); + + globals[0] = n; + locals[0] = 16; + + OCL_MAP_BUFFER(0); + for (int32_t i = 0; i < buf_size; ++i) + ((short*)buf_data[0])[i] = -1; + OCL_UNMAP_BUFFER(0); + + // Run the kernel on GPU + OCL_NDRANGE(1); + + // Compare + OCL_MAP_BUFFER(0); + short* dst = (short *)buf_data[0]; + short suggroupsize = dst[0]; + OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16); + + dst++; + for (int32_t i = 0; i < (int32_t) n; ++i){ + int round = i / suggroupsize; + int index = i % suggroupsize; + //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]); + OCL_ASSERT( ((c - index) > 0 ? 123 : 456) == dst[4*i]); + OCL_ASSERT( ((c - index) > 0 ? 123 : (i - c)) == dst[4*i+1]); + OCL_ASSERT( ((suggroupsize - index - 1 - index) > 0 ? (i + index + 1) : -(i + index + 1 - suggroupsize)) == dst[4*i+2]); + OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]); + } + OCL_UNMAP_BUFFER(0); +} +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up_short); diff --git a/utests/compiler_sub_group_shuffle_xor.cpp b/utests/compiler_sub_group_shuffle_xor.cpp index 967ec3ed..b0ad3ee1 100644 --- a/utests/compiler_sub_group_shuffle_xor.cpp +++ b/utests/compiler_sub_group_shuffle_xor.cpp @@ -1,6 +1,6 @@ #include "utest_helper.hpp" -void compiler_sub_group_shuffle_xor(void) +void compiler_sub_group_shuffle_xor_int(void) { if(!cl_check_subgroups()) return; @@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_xor(void) const int32_t buf_size = 4 * n + 1; // Setup kernel and buffers - OCL_CREATE_KERNEL("compiler_sub_group_shuffle_xor"); + OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_xor", + "compiler_sub_group_shuffle_xor_int"); OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL); OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); @@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_xor(void) } OCL_UNMAP_BUFFER(0); } +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor_int); -MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor); +void compiler_sub_group_shuffle_xor_short(void) +{ + if(!cl_check_subgroups_short()) + return; + const size_t n = 32; + const int32_t buf_size = 4 * n + 1; + + // Setup kernel and buffers + OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_xor.cl", + "compiler_sub_group_shuffle_xor_short", + SOURCE, "-DSHORT"); + OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + + int c = 3; + OCL_SET_ARG(1, sizeof(int), &c); + + globals[0] = n; + locals[0] = 16; + + OCL_MAP_BUFFER(0); + for (int32_t i = 0; i < buf_size; ++i) + ((short*)buf_data[0])[i] = -1; + OCL_UNMAP_BUFFER(0); + + // Run the kernel on GPU + OCL_NDRANGE(1); + + // Compare + OCL_MAP_BUFFER(0); + short* dst = (short *)buf_data[0]; + short suggroupsize = dst[0]; + OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16); + + dst++; + for (int32_t i = 0; i < (int32_t) n; ++i){ + int round = i / suggroupsize; + int index = i % suggroupsize; + OCL_ASSERT(index == dst[4*i]); + //printf("%d %d %d %d\n", i, dst[4*i+1], dst[4*i+2], dst[4*i+3]); + OCL_ASSERT((round * suggroupsize + (c ^ index)) == dst[4*i+1]); + OCL_ASSERT((round * suggroupsize + (index ^ (suggroupsize - index -1))) == dst[4*i+2]); + OCL_ASSERT((round * suggroupsize + (index ^ (index + 1) % suggroupsize)) == dst[4*i+3]); + } + OCL_UNMAP_BUFFER(0); +} +MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor_short); |