summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorYongjia Zhang <zhang_yong_jia@126.com>2014-06-23 23:09:33 +0800
committerZhigang Gong <zhigang.gong@intel.com>2014-06-12 16:46:01 +0800
commit47874592adac08862cfefd5c7915b926cc37a900 (patch)
tree95e57b940a8744a61c39ea3ba97427d8a316b218 /src
parentc5bdf04f9a3c295258f67fcdba39ef72fe5d9c8c (diff)
downloadbeignet-47874592adac08862cfefd5c7915b926cc37a900.tar.gz
Fix the same kernel name issue of OCL_OUTPUT_KERNEL_PERF
Now it treats kernels with same kernel name and different build options separately. When OCL_OUTPUT_KERNEL_PERF==1, it outputs the time summary as before, but if OCL_OUTPUT_KERNEL_PERF==2, it will output the time details including the kernel build options and kernels with same kernel name but different build options will output separately. v2: use strncmp and strncpy instead of strcmp and strcpy. Signed-off-by: Yongjia Zhang <yongjia.zhang@intel.com> Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Diffstat (limited to 'src')
-rw-r--r--src/cl_api.c17
-rw-r--r--src/performance.c113
-rw-r--r--src/performance.h2
3 files changed, 92 insertions, 40 deletions
diff --git a/src/cl_api.c b/src/cl_api.c
index 6206e109..c8d3cee6 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1681,7 +1681,7 @@ clEnqueueCopyBuffer(cl_command_queue command_queue,
}
if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue);
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue);
return 0;
@@ -1785,7 +1785,7 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
}
if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue);
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue);
error:
return err;
@@ -2027,7 +2027,7 @@ clEnqueueCopyImage(cl_command_queue command_queue,
}
if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue);
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue);
error:
return err;
@@ -2091,7 +2091,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
}
if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue);
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue);
error:
return err;
@@ -2155,7 +2155,7 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue,
}
if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue);
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue);
error:
return err;
@@ -2556,7 +2556,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
}
if(b_output_kernel_perf)
- time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue);
+ {
+ if(kernel->program->build_opts != NULL)
+ time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
+ else
+ time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
+ }
error:
return err;
}
diff --git a/src/performance.c b/src/performance.c
index a7854604..85cd481d 100644
--- a/src/performance.c
+++ b/src/performance.c
@@ -8,11 +8,12 @@
#define MAX_KERNEL_NAME_LENGTH 100
#define MAX_KERNEL_EXECUTION_COUNT 100000
-
+#define MAX_KERNEL_BUILD_OPT 1000
typedef struct kernel_storage_node
{
char kernel_name[MAX_KERNEL_NAME_LENGTH];
float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+ char build_option[MAX_KERNEL_BUILD_OPT];
int current_count;
float kernel_sum_time;
struct kernel_storage_node *next;
@@ -81,11 +82,12 @@ static context_storage_node * find_context(cl_context context)
return pre;
}
-static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name)
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt)
{
if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
p_context == prev_context_pointer &&
- !strcmp(kernel_name, prev_kernel_pointer->kernel_name))
+ !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) &&
+ !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT))
return prev_kernel_pointer;
if(NULL == p_context)
@@ -95,30 +97,38 @@ static kernel_storage_node * find_kernel(context_storage_node *p_context, const
{
p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
p_context->kernel_count++;
- strcpy(p_context->kernels_storage->kernel_name,kernel_name);
+ strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
+ p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+ p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
p_context->kernels_storage->current_count = 0;
p_context->kernels_storage->kernel_sum_time = 0.0f;
p_context->kernels_storage->next = NULL;
return p_context->kernels_storage;
}
+
kernel_storage_node *pre = p_context->kernels_storage;
kernel_storage_node *cur = p_context->kernels_storage;
- while(NULL != cur && strcmp(cur->kernel_name, kernel_name))
+ while(NULL != cur &&
+ (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) ||
+ strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT)))
{
pre = cur;
cur = cur->next;
}
if(NULL != cur)
- {
return cur;
- }
+
p_context->kernel_count++;
pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
pre = pre->next;
pre->current_count = 0;
pre->kernel_sum_time = 0.0f;
pre->next = NULL;
- strcpy(pre->kernel_name, kernel_name);
+ strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+ pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+ pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
return pre;
}
@@ -146,6 +156,8 @@ typedef struct time_element
float kernel_sum_time;
int kernel_execute_count;
double dev;
+ float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+ uint32_t time_index;
} time_element;
static int cmp(const void *a, const void *b)
@@ -172,40 +184,62 @@ static void print_time_info()
{
printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
printf(" ->>>> KERNELS TIME SUMMARY <<<<-\n");
+
kernel_storage_node *p_kernel = p_context->kernels_storage;
kernel_storage_node *p_tmp_kernel = p_kernel;
time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
- int i = 0, j = 0;
+ memset(te, 0, sizeof(time_element)*p_context->kernel_count);
+ int i = -1, j = 0, k = 0;
while(NULL != p_tmp_kernel)
{
- te[i].kernel_execute_count = p_tmp_kernel->current_count;
- strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name);
- te[i].kernel_sum_time = p_tmp_kernel->kernel_sum_time;
- float average = p_tmp_kernel->kernel_sum_time / p_tmp_kernel->current_count;
- float sumsquare = 0.0f;
+ for(k=0; k<=i; k++)
+ {
+ if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH))
+ break;
+ }
+ if(k == i+1)
+ {
+ i++;
+ k = i;
+ }
+ te[k].kernel_execute_count += p_tmp_kernel->current_count;
+ strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH);
+ te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time;
for(j=0; j != p_tmp_kernel->current_count; ++j)
- sumsquare += pow((p_tmp_kernel->kernel_times[j] - average), 2.0 );
- te[i++].dev = sqrt(sumsquare/p_tmp_kernel->current_count);
+ te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j];
p_tmp_kernel = p_tmp_kernel->next;
}
+
+ for(k=0; k<=i; k++)
+ {
+ float average = te[k].kernel_sum_time / te[k].kernel_execute_count;
+ double sumsquare = 0.0;
+ for(j=0; j<te[k].time_index; ++j)
+ sumsquare += pow((te[k].kernel_times[j] - average), 2.0);
+ te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count);
+ }
+
float sum_time = 0.0f;
qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
- for(i=0; i<p_context->kernel_count; ++i)
- sum_time += te[i].kernel_sum_time;
- for(i=0; i<p_context->kernel_count; ++i)
+ for(j=0; j<=i; ++j)
+ sum_time += te[j].kernel_sum_time;
+
+ for(j=0; j<=i; ++j)
{
- printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d Ave(ms): %7.2f Dev: %.1lf%%] \n",
- te[i].kernel_name,
- te[i].kernel_sum_time / sum_time * 100,
- te[i].kernel_sum_time,
- te[i].kernel_execute_count,
- te[i].kernel_sum_time / te[i].kernel_execute_count,
- te[i].dev / te[i].kernel_sum_time * te[i].kernel_execute_count * 100);
+ printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d Ave(ms): %7.2f Dev: %.1lf%%]\n",
+ te[j].kernel_name,
+ te[j].kernel_sum_time / sum_time * 100,
+ te[j].kernel_sum_time,
+ te[j].kernel_execute_count,
+ te[j].kernel_sum_time / te[j].kernel_execute_count,
+ te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100);
}
free(te);
printf(" Total : %.2f\n", sum_time);
if(2 != b_output_kernel_perf)
{
+ printf("[------------ CONTEXT ENDS------------]\n\n");
p_context = p_context->next;
continue;
}
@@ -214,6 +248,18 @@ static void print_time_info()
while(NULL != p_kernel)
{
printf(" [Kernel Name : %30s Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+ if(*p_kernel->build_option != '\0')
+ {
+ int count = 0;
+ printf(" ->Build Options : ");
+ while(p_kernel->build_option[count] != '\0' )
+ {
+ printf("%c", p_kernel->build_option[count++]);
+ if(count % 100 == 0)
+ printf("\n ");
+ }
+ printf("\n");
+ }
for(i=0; i!=p_kernel->current_count; ++i)
printf(" Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
p_kernel = p_kernel->next;
@@ -225,7 +271,7 @@ static void print_time_info()
}
-static void insert(cl_context context, const char *kernel_name, float time)
+static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time)
{
if(!atexit_registered)
{
@@ -233,7 +279,7 @@ static void insert(cl_context context, const char *kernel_name, float time)
atexit(print_time_info);
}
context_storage_node *p_context = find_context(context);
- kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name);
+ kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
prev_context_pointer = p_context;
prev_kernel_pointer = p_kernel;
p_kernel->kernel_times[p_kernel->current_count++] = time;
@@ -241,7 +287,8 @@ static void insert(cl_context context, const char *kernel_name, float time)
if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
{
p_context->kernel_max_time = p_kernel->kernel_sum_time;
- strcpy(p_context->max_time_kernel_name, kernel_name);
+ strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+ p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
}
}
@@ -253,9 +300,9 @@ static struct timeval start, end;
void initialize_env_var()
{
char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
- if(NULL == env || !strcmp(env,"0"))
+ if(NULL == env || !strncmp(env,"0", 1))
b_output_kernel_perf = 0;
- else if(!strcmp(env,"1"))
+ else if(!strncmp(env,"1", 1))
b_output_kernel_perf = 1;
else
b_output_kernel_perf = 2;
@@ -267,11 +314,11 @@ void time_start(cl_context context, const char * kernel_name, cl_command_queue c
gettimeofday(&start, NULL);
}
-void time_end(cl_context context, const char * kernel_name, cl_command_queue cq)
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
{
clFinish(cq);
gettimeofday(&end, NULL);
float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
- insert(context, kernel_name, t);
+ insert(context, kernel_name, build_opt, t);
pthread_mutex_unlock(&mutex);
}
diff --git a/src/performance.h b/src/performance.h
index c747743c..1e750545 100644
--- a/src/performance.h
+++ b/src/performance.h
@@ -5,7 +5,7 @@
extern int b_output_kernel_perf;
void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
-void time_end(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq);
void initialize_env_var();