diff options
-rw-r--r-- | src/cl_command_queue.c | 4 | ||||
-rw-r--r-- | src/cl_command_queue_gen7.c | 2 | ||||
-rw-r--r-- | src/cl_context.c | 1 | ||||
-rw-r--r-- | src/cl_device_id.c | 1 | ||||
-rw-r--r-- | src/cl_thread.c | 254 | ||||
-rw-r--r-- | src/cl_thread.h | 6 |
6 files changed, 192 insertions, 76 deletions
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 6a699c00..a2109d77 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -89,7 +89,7 @@ cl_command_queue_delete(cl_command_queue queue) queue->fulsim_out = NULL; } - cl_thread_data_destroy(queue->thread_data); + cl_thread_data_destroy(queue); queue->thread_data = NULL; cl_mem_delete(queue->perf); cl_context_delete(queue->ctx); @@ -430,7 +430,7 @@ cl_command_queue_flush(cl_command_queue queue) LOCAL cl_int cl_command_queue_finish(cl_command_queue queue) { - cl_gpgpu_sync(cl_get_thread_batch_buf()); + cl_gpgpu_sync(cl_get_thread_batch_buf(queue)); return CL_SUCCESS; } diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 891d6f18..d875021c 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -327,7 +327,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, /* Start a new batch buffer */ batch_sz = cl_kernel_compute_batch_sz(ker); cl_gpgpu_batch_reset(gpgpu, batch_sz); - cl_set_thread_batch_buf(cl_gpgpu_ref_batch_buf(gpgpu)); + cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu)); cl_gpgpu_batch_start(gpgpu); /* Issue the GPGPU_WALKER command */ diff --git a/src/cl_context.c b/src/cl_context.c index 8190e6a8..1911bf21 100644 --- a/src/cl_context.c +++ b/src/cl_context.c @@ -203,7 +203,6 @@ cl_context_delete(cl_context ctx) assert(ctx->buffers == NULL); assert(ctx->drv); cl_free(ctx->prop_user); - cl_set_thread_batch_buf(NULL); cl_driver_delete(ctx->drv); ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(ctx); diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 427c50e2..d2b3bed2 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -107,7 +107,6 @@ LOCAL cl_device_id cl_get_gt_device(void) { cl_device_id ret = NULL; - cl_set_thread_batch_buf(NULL); const int device_id = cl_driver_get_device_id(); cl_device_id device = NULL; diff --git a/src/cl_thread.c b/src/cl_thread.c index cadc3cdf..4692ed7c 100644 --- a/src/cl_thread.c +++ b/src/cl_thread.c @@ -15,113 +15,231 @@ * License along with this library. If not, see <http://www.gnu.org/licenses/>. * */ +#include <string.h> +#include <stdio.h> #include "cl_thread.h" #include "cl_alloc.h" #include "cl_utils.h" -static __thread void* thread_batch_buf = NULL; - -typedef struct _cl_thread_spec_data { +/* Because the cl_command_queue can be used in several threads simultaneously but + without add ref to it, we now handle it like this: + Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it + does not have a slot, assign it. + The resources are keeped in queue private, and resize it if needed. + When the thread exit, the slot will be set invalid. + When queue released, all the resources will be released. If user still enqueue, flush + or finish the queue after it has been released, the behavior is undefined. + TODO: Need to shrink the slot map. + */ + +static int thread_array_num = 1; +static int *thread_slot_map = NULL; +static int thread_magic_num = 1; +static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_key_t destroy_key; + +static __thread int thread_id = -1; +static __thread int thread_magic = -1; + +typedef struct _thread_spec_data { cl_gpgpu gpgpu ; int valid; -}cl_thread_spec_data; + void* thread_batch_buf; + int thread_magic; +} thread_spec_data; + +typedef struct _queue_thread_private { + thread_spec_data** threads_data; + int threads_data_num; + pthread_mutex_t thread_data_lock; +} queue_thread_private; + +static void thread_data_destructor(void *dummy) { + pthread_mutex_lock(&thread_queue_map_lock); + thread_slot_map[thread_id] = 0; + pthread_mutex_unlock(&thread_queue_map_lock); + free(dummy); +} + +static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create) +{ + queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data)); + thread_spec_data* spec = NULL; + int i = 0; + + if (thread_id == -1) { + void * dummy = malloc(sizeof(int)); + + pthread_mutex_lock(&thread_queue_map_lock); + for (i = 0; i < thread_array_num; i++) { + if (thread_slot_map[i] == 0) { + thread_id = i; + break; + } + } + + if (i == thread_array_num) { + thread_array_num *= 2; + thread_slot_map = realloc(thread_slot_map, sizeof(int) * thread_array_num); + memset(thread_slot_map + thread_array_num/2, 0, sizeof(int) * (thread_array_num/2)); + thread_id = thread_array_num/2; + } + + thread_slot_map[thread_id] = 1; + + thread_magic = thread_magic_num++; + pthread_mutex_unlock(&thread_queue_map_lock); -void cl_set_thread_batch_buf(void* buf) { - if (thread_batch_buf) { - cl_gpgpu_unref_batch_buf(thread_batch_buf); + pthread_setspecific(destroy_key, dummy); + } + + pthread_mutex_lock(&thread_private->thread_data_lock); + if (thread_array_num > thread_private->threads_data_num) {// just enlarge + int old_num = thread_private->threads_data_num; + thread_private->threads_data_num = thread_array_num; + thread_private->threads_data = realloc(thread_private->threads_data, + thread_private->threads_data_num * sizeof(void *)); + memset(thread_private->threads_data + old_num, 0, + sizeof(void*) * (thread_private->threads_data_num - old_num)); } - thread_batch_buf = buf; -} -void* cl_get_thread_batch_buf(void) { - return thread_batch_buf; + assert(thread_id != -1 && thread_id < thread_array_num); + spec = thread_private->threads_data[thread_id]; + if (!spec && create) { + spec = CALLOC(thread_spec_data); + spec->thread_magic = thread_magic; + thread_private->threads_data[thread_id] = spec; + } + + pthread_mutex_unlock(&thread_private->thread_data_lock); + + return spec; } -cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue) +void* cl_thread_data_create(void) { - pthread_key_t* key = queue->thread_data; - cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key); - - if (!thread_spec_data) { - TRY_ALLOC_NO_ERR(thread_spec_data, CALLOC(struct _cl_thread_spec_data)); - if (pthread_setspecific(*key, thread_spec_data)) { - cl_free(thread_spec_data); - return NULL; - } - } + queue_thread_private* thread_private = CALLOC(queue_thread_private); + + if (thread_private == NULL) + return NULL; - if (!thread_spec_data->valid) { - TRY_ALLOC_NO_ERR(thread_spec_data->gpgpu, cl_gpgpu_new(queue->ctx->drv)); - thread_spec_data->valid = 1; + if (thread_slot_map == NULL) { + pthread_mutex_lock(&thread_queue_map_lock); + thread_slot_map = calloc(thread_array_num, sizeof(int)); + pthread_mutex_unlock(&thread_queue_map_lock); + + pthread_key_create(&destroy_key, thread_data_destructor); } -error: - return thread_spec_data->gpgpu; + pthread_mutex_init(&thread_private->thread_data_lock, NULL); + + pthread_mutex_lock(&thread_private->thread_data_lock); + thread_private->threads_data = malloc(thread_array_num * sizeof(void *)); + memset(thread_private->threads_data, 0, sizeof(void*) * thread_array_num); + thread_private->threads_data_num = thread_array_num; + pthread_mutex_unlock(&thread_private->thread_data_lock); + + return thread_private; } -void cl_invalid_thread_gpgpu(cl_command_queue queue) +cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue) { - pthread_key_t* key = queue->thread_data; - cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key); + thread_spec_data* spec = __create_thread_spec_data(queue, 1); - if (!thread_spec_data) { - return; + if (!spec->thread_magic && spec->thread_magic != thread_magic) { + //We may get the slot from last thread. So free the resource. + spec->valid = 0; } - if (!thread_spec_data->valid) { - return; + if (!spec->valid) { + if (spec->thread_batch_buf) { + cl_gpgpu_unref_batch_buf(spec->thread_batch_buf); + spec->thread_batch_buf = NULL; + } + if (spec->gpgpu) { + cl_gpgpu_delete(spec->gpgpu); + spec->gpgpu = NULL; + } + TRY_ALLOC_NO_ERR(spec->gpgpu, cl_gpgpu_new(queue->ctx->drv)); + spec->valid = 1; } - assert(thread_spec_data->gpgpu); - cl_gpgpu_delete(thread_spec_data->gpgpu); - thread_spec_data->valid = 0; + error: + return spec->gpgpu; } -static void thread_data_destructor(void *data) { - cl_thread_spec_data* thread_spec_data = (cl_thread_spec_data *)data; +void cl_set_thread_batch_buf(cl_command_queue queue, void* buf) +{ + thread_spec_data* spec = __create_thread_spec_data(queue, 1); + + assert(spec && spec->thread_magic == thread_magic); - if (thread_batch_buf) { - cl_gpgpu_unref_batch_buf(thread_batch_buf); - thread_batch_buf = NULL; + if (spec->thread_batch_buf) { + cl_gpgpu_unref_batch_buf(spec->thread_batch_buf); } + spec->thread_batch_buf = buf; +} + +void* cl_get_thread_batch_buf(cl_command_queue queue) { + thread_spec_data* spec = __create_thread_spec_data(queue, 1); - if (thread_spec_data->valid) - cl_gpgpu_delete(thread_spec_data->gpgpu); - cl_free(thread_spec_data); + assert(spec && spec->thread_magic == thread_magic); + + return spec->thread_batch_buf; } -/* Create the thread specific data. */ -void* cl_thread_data_create(void) +void cl_invalid_thread_gpgpu(cl_command_queue queue) { - int rc = 0; + queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data)); + thread_spec_data* spec = NULL; - pthread_key_t *thread_specific_key = CALLOC(pthread_key_t); - if (thread_specific_key == NULL) - return NULL; - - rc = pthread_key_create(thread_specific_key, thread_data_destructor); + pthread_mutex_lock(&thread_private->thread_data_lock); + spec = thread_private->threads_data[thread_id]; + assert(spec); + pthread_mutex_unlock(&thread_private->thread_data_lock); - if (rc != 0) - return NULL; + if (!spec->valid) { + return; + } - return thread_specific_key; + assert(spec->gpgpu); + cl_gpgpu_delete(spec->gpgpu); + spec->gpgpu = NULL; + spec->valid = 0; } /* The destructor for clean the thread specific data. */ -void cl_thread_data_destroy(void * data) +void cl_thread_data_destroy(cl_command_queue queue) { - pthread_key_t *thread_specific_key = (pthread_key_t *)data; - - /* First release self spec data. */ - cl_thread_spec_data* thread_spec_data = - pthread_getspecific(*thread_specific_key); - if (thread_spec_data && thread_spec_data->valid) { - cl_gpgpu_delete(thread_spec_data->gpgpu); - if (thread_spec_data) - cl_free(thread_spec_data); + int i = 0; + queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data)); + int threads_data_num; + thread_spec_data** threads_data; + + pthread_mutex_lock(&thread_private->thread_data_lock); + assert(thread_private->threads_data_num == thread_array_num); + threads_data_num = thread_private->threads_data_num; + threads_data = thread_private->threads_data; + thread_private->threads_data_num = 0; + thread_private->threads_data = NULL; + pthread_mutex_unlock(&thread_private->thread_data_lock); + cl_free(thread_private); + queue->thread_data = NULL; + + for (i = 0; i < threads_data_num; i++) { + if (threads_data[i] != NULL && threads_data[i]->thread_batch_buf) { + cl_gpgpu_unref_batch_buf(threads_data[i]->thread_batch_buf); + threads_data[i]->thread_batch_buf = NULL; + } + + if (threads_data[i] != NULL && threads_data[i]->valid) { + cl_gpgpu_delete(threads_data[i]->gpgpu); + threads_data[i]->gpgpu = NULL; + threads_data[i]->valid = 0; + } + cl_free(threads_data[i]); } - pthread_key_delete(*thread_specific_key); - cl_free(thread_specific_key); + cl_free(threads_data); } diff --git a/src/cl_thread.h b/src/cl_thread.h index c8ab63c6..bf855a24 100644 --- a/src/cl_thread.h +++ b/src/cl_thread.h @@ -27,7 +27,7 @@ void* cl_thread_data_create(void); /* The destructor for clean the thread specific data. */ -void cl_thread_data_destroy(void * data); +void cl_thread_data_destroy(cl_command_queue queue); /* Used to get the gpgpu struct of each thread. */ cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue); @@ -36,9 +36,9 @@ cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue); void cl_invalid_thread_gpgpu(cl_command_queue queue); /* Used to set the batch buffer of each thread. */ -void cl_set_thread_batch_buf(void* buf); +void cl_set_thread_batch_buf(cl_command_queue queue, void* buf); /* Used to get the batch buffer of each thread. */ -void* cl_get_thread_batch_buf(void); +void* cl_get_thread_batch_buf(cl_command_queue queue); #endif /* __CL_THREAD_H__ */ |