diff options
author | Yongjia Zhang <zhang_yong_jia@126.com> | 2014-06-23 23:09:33 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2014-06-12 16:46:01 +0800 |
commit | 47874592adac08862cfefd5c7915b926cc37a900 (patch) | |
tree | 95e57b940a8744a61c39ea3ba97427d8a316b218 /src | |
parent | c5bdf04f9a3c295258f67fcdba39ef72fe5d9c8c (diff) | |
download | beignet-47874592adac08862cfefd5c7915b926cc37a900.tar.gz |
Fix the same kernel name issue of OCL_OUTPUT_KERNEL_PERF
Now it treats kernels with same kernel name and different build
options separately. When OCL_OUTPUT_KERNEL_PERF==1, it outputs the
time summary as before, but if OCL_OUTPUT_KERNEL_PERF==2, it will
output the time details including the kernel build options and
kernels with same kernel name but different build options will
output separately.
v2: use strncmp and strncpy instead of strcmp and strcpy.
Signed-off-by: Yongjia Zhang <yongjia.zhang@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/cl_api.c | 17 | ||||
-rw-r--r-- | src/performance.c | 113 | ||||
-rw-r--r-- | src/performance.h | 2 |
3 files changed, 92 insertions, 40 deletions
diff --git a/src/cl_api.c b/src/cl_api.c index 6206e109..c8d3cee6 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -1681,7 +1681,7 @@ clEnqueueCopyBuffer(cl_command_queue command_queue, } if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue); + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue); return 0; @@ -1785,7 +1785,7 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue, } if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue); + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue); error: return err; @@ -2027,7 +2027,7 @@ clEnqueueCopyImage(cl_command_queue command_queue, } if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue); + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue); error: return err; @@ -2091,7 +2091,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue, } if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue); + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue); error: return err; @@ -2155,7 +2155,7 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue, } if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue); + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue); error: return err; @@ -2556,7 +2556,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, } if(b_output_kernel_perf) - time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue); + { + if(kernel->program->build_opts != NULL) + time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue); + else + time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue); + } error: return err; } diff --git a/src/performance.c b/src/performance.c index a7854604..85cd481d 100644 --- a/src/performance.c +++ b/src/performance.c @@ -8,11 +8,12 @@ #define MAX_KERNEL_NAME_LENGTH 100 #define MAX_KERNEL_EXECUTION_COUNT 100000 - +#define MAX_KERNEL_BUILD_OPT 1000 typedef struct kernel_storage_node { char kernel_name[MAX_KERNEL_NAME_LENGTH]; float kernel_times[MAX_KERNEL_EXECUTION_COUNT]; + char build_option[MAX_KERNEL_BUILD_OPT]; int current_count; float kernel_sum_time; struct kernel_storage_node *next; @@ -81,11 +82,12 @@ static context_storage_node * find_context(cl_context context) return pre; } -static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name) +static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt) { if(NULL != prev_kernel_pointer && NULL != prev_context_pointer && p_context == prev_context_pointer && - !strcmp(kernel_name, prev_kernel_pointer->kernel_name)) + !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) && + !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT)) return prev_kernel_pointer; if(NULL == p_context) @@ -95,30 +97,38 @@ static kernel_storage_node * find_kernel(context_storage_node *p_context, const { p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node)); p_context->kernel_count++; - strcpy(p_context->kernels_storage->kernel_name,kernel_name); + strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH); + p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; + strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT); + p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0'; p_context->kernels_storage->current_count = 0; p_context->kernels_storage->kernel_sum_time = 0.0f; p_context->kernels_storage->next = NULL; return p_context->kernels_storage; } + kernel_storage_node *pre = p_context->kernels_storage; kernel_storage_node *cur = p_context->kernels_storage; - while(NULL != cur && strcmp(cur->kernel_name, kernel_name)) + while(NULL != cur && + (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) || + strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT))) { pre = cur; cur = cur->next; } if(NULL != cur) - { return cur; - } + p_context->kernel_count++; pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node)); pre = pre->next; pre->current_count = 0; pre->kernel_sum_time = 0.0f; pre->next = NULL; - strcpy(pre->kernel_name, kernel_name); + strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH); + pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; + strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT); + pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; return pre; } @@ -146,6 +156,8 @@ typedef struct time_element float kernel_sum_time; int kernel_execute_count; double dev; + float kernel_times[MAX_KERNEL_EXECUTION_COUNT]; + uint32_t time_index; } time_element; static int cmp(const void *a, const void *b) @@ -172,40 +184,62 @@ static void print_time_info() { printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++); printf(" ->>>> KERNELS TIME SUMMARY <<<<-\n"); + kernel_storage_node *p_kernel = p_context->kernels_storage; kernel_storage_node *p_tmp_kernel = p_kernel; time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count); - int i = 0, j = 0; + memset(te, 0, sizeof(time_element)*p_context->kernel_count); + int i = -1, j = 0, k = 0; while(NULL != p_tmp_kernel) { - te[i].kernel_execute_count = p_tmp_kernel->current_count; - strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name); - te[i].kernel_sum_time = p_tmp_kernel->kernel_sum_time; - float average = p_tmp_kernel->kernel_sum_time / p_tmp_kernel->current_count; - float sumsquare = 0.0f; + for(k=0; k<=i; k++) + { + if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH)) + break; + } + if(k == i+1) + { + i++; + k = i; + } + te[k].kernel_execute_count += p_tmp_kernel->current_count; + strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH); + te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; + te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time; for(j=0; j != p_tmp_kernel->current_count; ++j) - sumsquare += pow((p_tmp_kernel->kernel_times[j] - average), 2.0 ); - te[i++].dev = sqrt(sumsquare/p_tmp_kernel->current_count); + te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j]; p_tmp_kernel = p_tmp_kernel->next; } + + for(k=0; k<=i; k++) + { + float average = te[k].kernel_sum_time / te[k].kernel_execute_count; + double sumsquare = 0.0; + for(j=0; j<te[k].time_index; ++j) + sumsquare += pow((te[k].kernel_times[j] - average), 2.0); + te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count); + } + float sum_time = 0.0f; qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp); - for(i=0; i<p_context->kernel_count; ++i) - sum_time += te[i].kernel_sum_time; - for(i=0; i<p_context->kernel_count; ++i) + for(j=0; j<=i; ++j) + sum_time += te[j].kernel_sum_time; + + for(j=0; j<=i; ++j) { - printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d Ave(ms): %7.2f Dev: %.1lf%%] \n", - te[i].kernel_name, - te[i].kernel_sum_time / sum_time * 100, - te[i].kernel_sum_time, - te[i].kernel_execute_count, - te[i].kernel_sum_time / te[i].kernel_execute_count, - te[i].dev / te[i].kernel_sum_time * te[i].kernel_execute_count * 100); + printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d Ave(ms): %7.2f Dev: %.1lf%%]\n", + te[j].kernel_name, + te[j].kernel_sum_time / sum_time * 100, + te[j].kernel_sum_time, + te[j].kernel_execute_count, + te[j].kernel_sum_time / te[j].kernel_execute_count, + te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100); } free(te); printf(" Total : %.2f\n", sum_time); if(2 != b_output_kernel_perf) { + printf("[------------ CONTEXT ENDS------------]\n\n"); p_context = p_context->next; continue; } @@ -214,6 +248,18 @@ static void print_time_info() while(NULL != p_kernel) { printf(" [Kernel Name : %30s Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time); + if(*p_kernel->build_option != '\0') + { + int count = 0; + printf(" ->Build Options : "); + while(p_kernel->build_option[count] != '\0' ) + { + printf("%c", p_kernel->build_option[count++]); + if(count % 100 == 0) + printf("\n "); + } + printf("\n"); + } for(i=0; i!=p_kernel->current_count; ++i) printf(" Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]); p_kernel = p_kernel->next; @@ -225,7 +271,7 @@ static void print_time_info() } -static void insert(cl_context context, const char *kernel_name, float time) +static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time) { if(!atexit_registered) { @@ -233,7 +279,7 @@ static void insert(cl_context context, const char *kernel_name, float time) atexit(print_time_info); } context_storage_node *p_context = find_context(context); - kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name); + kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt); prev_context_pointer = p_context; prev_kernel_pointer = p_kernel; p_kernel->kernel_times[p_kernel->current_count++] = time; @@ -241,7 +287,8 @@ static void insert(cl_context context, const char *kernel_name, float time) if(p_kernel->kernel_sum_time > p_context->kernel_max_time) { p_context->kernel_max_time = p_kernel->kernel_sum_time; - strcpy(p_context->max_time_kernel_name, kernel_name); + strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH); + p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0'; } } @@ -253,9 +300,9 @@ static struct timeval start, end; void initialize_env_var() { char *env = getenv("OCL_OUTPUT_KERNEL_PERF"); - if(NULL == env || !strcmp(env,"0")) + if(NULL == env || !strncmp(env,"0", 1)) b_output_kernel_perf = 0; - else if(!strcmp(env,"1")) + else if(!strncmp(env,"1", 1)) b_output_kernel_perf = 1; else b_output_kernel_perf = 2; @@ -267,11 +314,11 @@ void time_start(cl_context context, const char * kernel_name, cl_command_queue c gettimeofday(&start, NULL); } -void time_end(cl_context context, const char * kernel_name, cl_command_queue cq) +void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq) { clFinish(cq); gettimeofday(&end, NULL); float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f; - insert(context, kernel_name, t); + insert(context, kernel_name, build_opt, t); pthread_mutex_unlock(&mutex); } diff --git a/src/performance.h b/src/performance.h index c747743c..1e750545 100644 --- a/src/performance.h +++ b/src/performance.h @@ -5,7 +5,7 @@ extern int b_output_kernel_perf; void time_start(cl_context context, const char * kernel_name, cl_command_queue cq); -void time_end(cl_context context, const char * kernel_name, cl_command_queue cq); +void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq); void initialize_env_var(); |