From 6a3eddc4dd70c895c426f1f8231778eb98ea7ac3 Mon Sep 17 00:00:00 2001
From: Chuanbo Weng
Date: Fri, 6 Nov 2015 11:27:48 +0800
Subject: Add extensions intel_accelerator and basic intel_motion_estimation.

v2:
  1. Just upload the first vme_state.
  2. Remove duplicated code in check_opt1_extension.
  3. Check image format before cl_gpgpu_bind_image_for_vme.
  4. Fix error when getting the mv: since we assume this kernel runs in
     SIMD16 mode, dword 0 of grf 1 should be __gen_ocl_region(8,vme_result.s0),
     not __gen_ocl_region(0,vme_result.s1).
v3: Return CL_IMAGE_FORMAT_NOT_SUPPORTED if the image format is not the
    required one.
v4: Fix two conflicts after code rebase and workaround a curbe related bug.
v6: Treat simd8 and simd16 differently when getting the mv.

Signed-off-by: Guo Yejun
Signed-off-by: Chuanbo Weng
Reviewed-by: Ruiling Song
---
 include/CL/cl_ext.h                                    | 103 +++++++++
 src/CMakeLists.txt                                     |   4 +-
 src/cl_accelerator_intel.c                             |  86 ++++++++
 src/cl_accelerator_intel.h                             |  29 +++
 src/cl_api.c                                           | 107 +++++++++-
 src/cl_command_queue.c                                 |  17 +-
 src/cl_command_queue_gen7.c                            |   8 +-
 src/cl_context.c                                       |   1 +
 src/cl_context.h                                       |   3 +
 src/cl_driver.h                                        |  21 ++
 src/cl_driver_defs.c                                   |   2 +
 src/cl_extensions.c                                    |   4 +-
 src/cl_extensions.h                                    |   8 +
 src/cl_gen7_device.h                                   |   5 +-
 src/cl_gt_device.h                                     |   6 +-
 src/cl_internals.h                                     |   1 +
 src/cl_kernel.c                                        |  57 ++++-
 src/cl_kernel.h                                        |   6 +-
 src/cl_utils.h                                         |  12 ++
 src/intel/intel_gpgpu.c                                | 217 ++++++++++++++++++-
 src/intel/intel_structs.h                              | 120 +++++++++++
 .../cl_internal_block_motion_estimate_intel.cl         | 233 +++++++++++++++++++++
 22 files changed, 1017 insertions(+), 33 deletions(-)
 create mode 100644 src/cl_accelerator_intel.c
 create mode 100644 src/cl_accelerator_intel.h
 create mode 100644 src/kernels/cl_internal_block_motion_estimate_intel.cl

diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 710bea88..0a66d704 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -184,6 +184,109 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
 #define CL_PRINTF_CALLBACK_ARM     0x40B0
 #define CL_PRINTF_BUFFERSIZE_ARM   0x40B1
 
+/*********************************
+* cl_intel_accelerator extension *
+*********************************/
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+
+typedef struct _cl_accelerator_intel* cl_accelerator_intel;
+typedef cl_uint cl_accelerator_type_intel;
+typedef cl_uint cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel {
+  cl_uint mb_block_type;
+  cl_uint subpixel_mode;
+  cl_uint sad_adjust_mode;
+  cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+
+/* Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL            -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL       -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
+
+/* Deprecated Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL_DEPRECATED            -6000
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL_DEPRECATED       -6001
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL_DEPRECATED -6002
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL_DEPRECATED -6003
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL      0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL         0x4092
+#define CL_ACCELERATOR_TYPE_INTEL            0x4093
+
+/* cl_motion_detect_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL 0x0
+#define CL_ME_MB_TYPE_8x8_INTEL   0x1
+#define CL_ME_MB_TYPE_4x4_INTEL   0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL    0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL    0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL   0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL   0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_accelerator_intel
+    (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL
+(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
 #ifdef CL_VERSION_1_1
 /***********************************
 * cl_ext_device_fission extension *
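Usage sketch (annotation, not part of the patch): creating a motion-estimation
accelerator through the API declared above. The context variable and the
descriptor values are assumed for illustration.

  cl_motion_estimation_desc_intel desc = {
    CL_ME_MB_TYPE_16x16_INTEL,           /* mb_block_type */
    CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,   /* subpixel_mode */
    CL_ME_SAD_ADJUST_MODE_NONE_INTEL,    /* sad_adjust_mode */
    CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL /* search_path_type */
  };
  cl_int err = CL_SUCCESS;
  cl_accelerator_intel accel =
    clCreateAcceleratorINTEL(context, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,
                             sizeof(desc), &desc, &err);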
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40a9afbe..c917e76d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -56,7 +56,8 @@ cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign cl_internal_fill_buf_align128
 cl_internal_fill_image_1d cl_internal_fill_image_1d_array cl_internal_fill_image_2d
-cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d
+cl_internal_block_motion_estimate_intel)
 set (BUILT_IN_NAME cl_internal_built_in_kernel)
 MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
 MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
@@ -70,6 +71,7 @@ set(OPENCL_SRC
   cl_program.c
   cl_gbe_loader.cpp
   cl_sampler.c
+  cl_accelerator_intel.c
   cl_event.c
   cl_enqueue.c
   cl_image.c
diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
new file mode 100644
index 00000000..cda89635
--- /dev/null
+++ b/src/cl_accelerator_intel.c
@@ -0,0 +1,86 @@
+#include "cl_context.h"
+#include "cl_accelerator_intel.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include
+
+LOCAL cl_accelerator_intel
+cl_accelerator_intel_new(cl_context ctx,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret)
+{
+  cl_accelerator_intel accel = NULL;
+  cl_int err = CL_SUCCESS;
+
+  /* Allocate and initialize the structure itself */
+  TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
+  SET_ICD(accel->dispatch)
+  accel->ref_n = 1;
+  accel->magic = CL_MAGIC_ACCELERATOR_INTEL_HEADER;
+
+  if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) {
+    err = CL_INVALID_ACCELERATOR_TYPE_INTEL;
+    goto error;
+  }
+  accel->type = accel_type;
+
+  if (desc == NULL) { // TODO: also validate the fields inside desc
+    err = CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
+    goto error;
+  }
+  accel->desc.me = *(cl_motion_estimation_desc_intel*)desc;
+
+  /* Append the accelerator_intel to the context's accelerator_intel list */
+  /* is this really needed? */
+  pthread_mutex_lock(&ctx->accelerator_intel_lock);
+  accel->next = ctx->accels;
+  if (ctx->accels != NULL)
+    ctx->accels->prev = accel;
+  ctx->accels = accel;
+  pthread_mutex_unlock(&ctx->accelerator_intel_lock);
+
+  accel->ctx = ctx;
+  cl_context_add_ref(ctx);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return accel;
+error:
+  cl_accelerator_intel_delete(accel);
+  accel = NULL;
+  goto exit;
+}
+
+LOCAL void
+cl_accelerator_intel_add_ref(cl_accelerator_intel accel)
+{
+  atomic_inc(&accel->ref_n);
+}
+
+LOCAL void
+cl_accelerator_intel_delete(cl_accelerator_intel accel)
+{
+  if (UNLIKELY(accel == NULL))
+    return;
+  if (atomic_dec(&accel->ref_n) > 1)
+    return;
+
+  /* Remove the accelerator_intel from the context's accelerator_intel list */
+  pthread_mutex_lock(&accel->ctx->accelerator_intel_lock);
+  if (accel->prev)
+    accel->prev->next = accel->next;
+  if (accel->next)
+    accel->next->prev = accel->prev;
+  if (accel->ctx->accels == accel)
+    accel->ctx->accels = accel->next;
+  pthread_mutex_unlock(&accel->ctx->accelerator_intel_lock);
+
+  cl_context_delete(accel->ctx);
+  cl_free(accel);
+}
diff --git a/src/cl_accelerator_intel.h b/src/cl_accelerator_intel.h
new file mode 100644
index 00000000..cecfd2a8
--- /dev/null
+++ b/src/cl_accelerator_intel.h
@@ -0,0 +1,29 @@
+#ifndef __CL_ACCELERATOR_INTEL_H__
+#define __CL_ACCELERATOR_INTEL_H__
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include
+
+struct _cl_accelerator_intel {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;                  /* To identify it as an accelerator_intel object */
+  volatile int ref_n;              /* This object is reference counted */
+  cl_accelerator_intel prev, next; /* We chain in the allocator, why chain? */
+  cl_context ctx;                  /* Context it belongs to */
+  cl_accelerator_type_intel type;
+  union {
+    cl_motion_estimation_desc_intel me;
+  } desc;                          /* save desc before we decide how to handle it */
+};
+
+cl_accelerator_intel cl_accelerator_intel_new(cl_context ctx,
+                                              cl_accelerator_type_intel accel_type,
+                                              size_t desc_sz,
+                                              const void* desc,
+                                              cl_int* errcode_ret);
+
+void cl_accelerator_intel_add_ref(cl_accelerator_intel accel);
+void cl_accelerator_intel_delete(cl_accelerator_intel accel);
+
+#endif
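Usage sketch (annotation, not part of the patch): querying and releasing the
accelerator created above, matching the clGetAcceleratorInfoINTEL
implementation in cl_api.c below.

  cl_motion_estimation_desc_intel desc_out;
  cl_uint refcount = 0;
  clGetAcceleratorInfoINTEL(accel, CL_ACCELERATOR_DESCRIPTOR_INTEL,
                            sizeof(desc_out), &desc_out, NULL);
  clGetAcceleratorInfoINTEL(accel, CL_ACCELERATOR_REFERENCE_COUNT_INTEL,
                            sizeof(refcount), &refcount, NULL);
  clReleaseAcceleratorINTEL(accel); /* drops ref_n; the object is freed at zero */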
diff --git a/src/cl_api.c b/src/cl_api.c
index ec417d4e..ddd39cfc 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -28,6 +28,7 @@
 #include "cl_mem.h"
 #include "cl_image.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
 
@@ -2913,6 +2914,17 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
     goto error;
   }
 
+  if (kernel->vme) {
+    if (work_dim != 2) {
+      err = CL_INVALID_WORK_DIMENSION;
+      goto error;
+    }
+    if (local_work_size != NULL) {
+      err = CL_INVALID_WORK_GROUP_SIZE;
+      goto error;
+    }
+  }
+
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i) {
       if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
@@ -2946,22 +2958,31 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
     for (i = 0; i < work_dim; ++i)
       fixed_local_sz[i] = local_work_size[i];
   } else {
-    uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
-    for (i = 0; i< work_dim; i++) {
-      for (j = maxDimSize; j > 1; j--) {
-        if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
-          fixed_local_sz[i] = j;
-          maxGroupSize = maxGroupSize /j;
-          maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
-          break;  //choose next work_dim
+    if (kernel->vme) {
+      fixed_local_sz[0] = 16;
+      fixed_local_sz[1] = 1;
+    } else {
+      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may be too large
+      for (i = 0; i< work_dim; i++) {
+        for (j = maxDimSize; j > 1; j--) {
+          if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+            fixed_local_sz[i] = j;
+            maxGroupSize = maxGroupSize /j;
+            maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+            break;  //choose next work_dim
+          }
         }
       }
     }
   }
-  if (global_work_size != NULL)
+  if (kernel->vme) {
+    fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
+    fixed_global_sz[1] = (global_work_size[1]+15) / 16;
+  } else {
     for (i = 0; i < work_dim; ++i)
       fixed_global_sz[i] = global_work_size[i];
+  }
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i)
       fixed_global_off[i] = global_work_offset[i];
@@ -3192,6 +3213,10 @@ internal_clGetExtensionFunctionAddress(const char *func_name)
   EXTFUNC(clGetMemObjectFdIntel)
   EXTFUNC(clCreateBufferFromFdINTEL)
   EXTFUNC(clCreateImageFromFdINTEL)
+  EXTFUNC(clCreateAcceleratorINTEL)
+  EXTFUNC(clRetainAcceleratorINTEL)
+  EXTFUNC(clReleaseAcceleratorINTEL)
+  EXTFUNC(clGetAcceleratorInfoINTEL)
   return NULL;
 }
 
@@ -3419,3 +3444,67 @@ error:
     *errorcode_ret = err;
   return mem;
 }
+
+cl_accelerator_intel
+clCreateAcceleratorINTEL(cl_context context,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret)
+{
+  cl_accelerator_intel accel = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT(context);
+  accel = cl_accelerator_intel_new(context, accel_type, desc_sz, desc, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return accel;
+}
+
+cl_int
+clRetainAcceleratorINTEL(cl_accelerator_intel accel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+  cl_accelerator_intel_add_ref(accel);
+error:
+  return err;
+}
+
+cl_int
+clReleaseAcceleratorINTEL(cl_accelerator_intel accel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+  cl_accelerator_intel_delete(accel);
+error:
+  return err;
+}
+
+cl_int
+clGetAcceleratorInfoINTEL(cl_accelerator_intel accel,
+                          cl_accelerator_info_intel param_name,
+                          size_t param_value_size,
+                          void* param_value,
+                          size_t* param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+
+  if (param_name == CL_ACCELERATOR_REFERENCE_COUNT_INTEL) {
+    FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&accel->ref_n, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_CONTEXT_INTEL) {
+    FILL_GETINFO_RET (cl_context, 1, &accel->ctx, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_TYPE_INTEL) {
+    FILL_GETINFO_RET (cl_uint, 1, &accel->type, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_DESCRIPTOR_INTEL) {
+    FILL_GETINFO_RET (cl_motion_estimation_desc_intel, 1, &(accel->desc.me), CL_SUCCESS);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 9dc3fe64..033e7df7 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -135,10 +135,19 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
-    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
-                        image->intel_fmt, image->image_type, image->bpp,
-                        image->w, image->h, image->depth,
-                        image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+    if (k->vme) {
+      if ((image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8))
+        return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+      cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
+                                  image->intel_fmt, image->image_type, image->bpp,
+                                  image->w, image->h, image->depth,
+                                  image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+    }
+    else
+      cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
+                          image->intel_fmt, image->image_type, image->bpp,
+                          image->w, image->h, image->depth,
+                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
     // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
     // on demand.
     if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 2edc3be4..2a49ec24 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -367,9 +367,13 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
   /* Bind user images */
-  cl_command_queue_bind_image(queue, ker);
+  if (UNLIKELY((err = cl_command_queue_bind_image(queue, ker)) != CL_SUCCESS))
+    return err;
   /* Bind all samplers */
-  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
+  if (ker->vme)
+    cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
+  else
+    cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
   if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
     goto error;
diff --git a/src/cl_context.c b/src/cl_context.c
index c45e0aa7..a6bde7d2 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -177,6 +177,7 @@ cl_context_new(struct _cl_context_prop *props)
   pthread_mutex_init(&ctx->queue_lock, NULL);
   pthread_mutex_init(&ctx->buffer_lock, NULL);
   pthread_mutex_init(&ctx->sampler_lock, NULL);
+  pthread_mutex_init(&ctx->accelerator_intel_lock, NULL);
 
 exit:
   return ctx;
diff --git a/src/cl_context.h b/src/cl_context.h
index ef94823d..489e5d7d 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -21,6 +21,7 @@
 #define __CL_CONTEXT_H__
 
 #include "CL/cl.h"
+#include "CL/cl_ext.h"
 #include "cl_internals.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
@@ -107,11 +108,13 @@ struct _cl_context {
   cl_program programs;          /* All programs currently allocated */
   cl_mem buffers;               /* All memory object currently allocated */
   cl_sampler samplers;          /* All sampler object currently allocated */
+  cl_accelerator_intel accels;  /* All accelerator_intel object currently allocated */
   cl_event events;              /* All event object currently allocated */
   pthread_mutex_t queue_lock;   /* To allocate and deallocate queues */
   pthread_mutex_t program_lock; /* To allocate and deallocate programs */
   pthread_mutex_t buffer_lock;  /* To allocate and deallocate buffers */
   pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
+  pthread_mutex_t accelerator_intel_lock; /* To allocate and deallocate accelerator_intel */
   pthread_mutex_t event_lock;   /* To allocate and deallocate events */
 
   cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX]; /* All programs internal used, for example clEnqueuexxx api use */
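Note (annotation, not part of the patch): the VME branch of
clEnqueueNDRangeKernel above sizes the dispatch per 16x16 macroblock. A sketch
of the arithmetic for a hypothetical 720x480 frame:

  size_t width = 720, height = 480;        /* example frame size in pixels */
  size_t fixed_x = (width + 15) / 16 * 16; /* 720: 45 macroblock columns,
                                              16 work items per macroblock */
  size_t fixed_y = (height + 15) / 16;     /* 30 macroblock rows */
  /* local size is forced to {16, 1}: one work-group per macroblock */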
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 19afb433..9d986b1f 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -23,9 +23,11 @@
 #include
 #include
 #include "cl_driver_type.h"
+#include "CL/cl_ext.h"
 
 /* Various limitations we should remove actually */
 #define GEN_MAX_SURFACES 256
 #define GEN_MAX_SAMPLERS 16
+#define GEN_MAX_VME_STATES 8
 
 /**************************************************************************
  * cl_driver:
@@ -145,6 +147,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
 extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 
+typedef void (cl_gpgpu_bind_vme_state_cb)(cl_gpgpu, cl_accelerator_intel accel);
+extern cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state;
+
 /* get the default cache control value. */
 typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
 extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
@@ -165,6 +170,22 @@ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
 
 extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
 
+typedef void (cl_gpgpu_bind_image_for_vme_cb)(cl_gpgpu state,
+                                              uint32_t id,
+                                              cl_buffer obj_bo,
+                                              uint32_t obj_bo_offset,
+                                              uint32_t format,
+                                              uint32_t bpp,
+                                              uint32_t type,
+                                              int32_t w,
+                                              int32_t h,
+                                              int32_t depth,
+                                              int pitch,
+                                              int32_t slice_pitch,
+                                              cl_gpgpu_tiling tiling);
+
+extern cl_gpgpu_bind_image_for_vme_cb *cl_gpgpu_bind_image_for_vme;
+
 /* Setup a stack */
 typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
 extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index d25fd5d1..58c4f8f0 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -71,6 +71,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image_for_vme = NULL;
 LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
@@ -84,6 +85,7 @@ LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
 LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
 LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
 LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state = NULL;
 LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
 LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
 LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 3e714ac3..ba910c12 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -65,7 +65,9 @@ check_gl_extension(cl_extensions_t *extensions) {
 void
 check_intel_extension(cl_extensions_t *extensions)
 {
-  /* Should put those map/unmap extensions here. */
+  int id;
+  for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
+    extensions->extensions[id].base.ext_enabled = 1;
 }
 
 void
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 0006651e..f744fa3a 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -23,6 +23,10 @@
   DECL_EXT(khr_spir) \
   DECL_EXT(khr_icd)
 
+#define DECL_INTEL_EXTENSIONS \
+  DECL_EXT(intel_accelerator) \
+  DECL_EXT(intel_motion_estimation)
+
 #define DECL_GL_EXTENSIONS \
   DECL_EXT(khr_gl_sharing)\
   DECL_EXT(khr_gl_event)\
@@ -37,6 +41,7 @@
 #define DECL_ALL_EXTENSIONS \
   DECL_BASE_EXTENSIONS \
   DECL_OPT1_EXTENSIONS \
+  DECL_INTEL_EXTENSIONS \
   DECL_GL_EXTENSIONS \
   DECL_D3D_EXTENSIONS
 
@@ -54,6 +59,8 @@ cl_khr_extension_id_max
 #define BASE_EXT_END_ID EXT_ID(khr_fp64)
 #define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
 #define OPT1_EXT_END_ID EXT_ID(khr_icd)
+#define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
+#define INTEL_EXT_END_ID EXT_ID(intel_motion_estimation)
 #define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
 #define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
 
@@ -75,6 +82,7 @@ struct EXT_STRUCT_NAME(name) { \
 
 DECL_BASE_EXTENSIONS
 DECL_OPT1_EXTENSIONS
+DECL_INTEL_EXTENSIONS
 DECL_D3D_EXTENSIONS
 DECL_GL_EXTENSIONS
 #undef DECL_EXT
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 104e9297..e755cad6 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -27,5 +27,8 @@
 .max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
 .global_mem_size = 2 * 1024 * 1024 * 1024ul,
 
+//temporarily defined to export the builtin kernel block_motion_estimate_intel only for Gen7;
+//will be removed once HSW+ also supports it
+#define GEN7_DEVICE
 #include "cl_gt_device.h"
-
+#undef GEN7_DEVICE
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index e61ff613..d8089c23 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -114,7 +114,11 @@ DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
                  "__cl_fill_image_1d_array;"
                  "__cl_fill_image_2d;"
                  "__cl_fill_image_2d_array;"
-                 "__cl_fill_image_3d;")
+                 "__cl_fill_image_3d;"
+#ifdef GEN7_DEVICE
+                 "block_motion_estimate_intel;"
+#endif
+                 )
 
 DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
 DECL_INFO_STRING(spir_versions, "1.2")
diff --git a/src/cl_internals.h b/src/cl_internals.h
index cb3fc238..9aeb8c12 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -31,6 +31,7 @@
 #define CL_MAGIC_EVENT_HEADER 0x8324a9c810ebf90fLL
 #define CL_MAGIC_MEM_HEADER 0x381a27b9ce6504dfLL
 #define CL_MAGIC_DEAD_HEADER 0xdeaddeaddeaddeadLL
+#define CL_MAGIC_ACCELERATOR_INTEL_HEADER 0x7c6a08c9a7ac3e3fLL
 
 #endif /* __CL_INTERNALS_H__ */
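Usage sketch (annotation, not part of the patch): with the built-in kernel
string extended above, an application discovers and obtains the kernel as
usual. The device, context and err variables are assumed.

  char names[1024];
  clGetDeviceInfo(device, CL_DEVICE_BUILT_IN_KERNELS, sizeof(names), names, NULL);
  /* on Gen7 the list now contains "block_motion_estimate_intel" */
  cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, &device,
                        "block_motion_estimate_intel", &err);
  cl_kernel vme_kernel = clCreateKernel(prog, "block_motion_estimate_intel", &err);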
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 58a1224e..b2d1955f 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -27,6 +27,7 @@
 #include "cl_khr_icd.h"
 #include "CL/cl.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 
 #include
 #include
@@ -113,10 +114,22 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   arg_type = interp_kernel_get_arg_type(k->opaque, index);
   arg_sz = interp_kernel_get_arg_size(k->opaque, index);
 
-  if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
-    if (arg_type != GBE_ARG_SAMPLER ||
-        (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+  if (k->vme && index == 0) {
+    //the best method would be to return an arg type of GBE_ARG_ACCELERATOR_INTEL,
+    //but that is not straightforward since clang does not support it now.
+    //The easy way is to treat the typedef accelerator_intel_t as a struct;
+    //this makes the sizes mismatch, so use another size check method.
+    if (sz != sizeof(cl_accelerator_intel) || arg_sz != sizeof(cl_motion_estimation_desc_intel))
       return CL_INVALID_ARG_SIZE;
+    cl_accelerator_intel* accel = (cl_accelerator_intel*)value;
+    if ((*accel)->type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL)
+      return CL_INVALID_ACCELERATOR_TYPE_INTEL;
+  } else {
+    if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+      if (arg_type != GBE_ARG_SAMPLER ||
+          (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+        return CL_INVALID_ARG_SIZE;
+    }
   }
 
   if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
@@ -152,15 +165,30 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
 
   /* Copy the structure or the value directly into the curbe */
   if (arg_type == GBE_ARG_VALUE) {
-    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    if (offset >= 0) {
-      assert(offset + sz <= k->curbe_sz);
-      memcpy(k->curbe + offset, value, sz);
+    if (k->vme && index == 0) {
+      cl_accelerator_intel accel;
+      memcpy(&accel, value, sz);
+      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+      if (offset >= 0) {
+        assert(offset + sz <= k->curbe_sz);
+        memcpy(k->curbe + offset, &(accel->desc.me), arg_sz);
+      }
+      k->args[index].local_sz = 0;
+      k->args[index].is_set = 1;
+      k->args[index].mem = NULL;
+      k->accel = accel;
+      return CL_SUCCESS;
+    } else {
+      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+      if (offset >= 0) {
+        assert(offset + sz <= k->curbe_sz);
+        memcpy(k->curbe + offset, value, sz);
+      }
+      k->args[index].local_sz = 0;
+      k->args[index].is_set = 1;
+      k->args[index].mem = NULL;
+      return CL_SUCCESS;
+    }
-    k->args[index].local_sz = 0;
-    k->args[index].is_set = 1;
-    k->args[index].mem = NULL;
-    return CL_SUCCESS;
   }
 
   /* For a local pointer just save the size */
@@ -331,6 +359,12 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
     cl_buffer_subdata(k->bo, 0, code_sz, code);
   k->opaque = opaque;
 
+  const char* kname = cl_kernel_get_name(k);
+  if (strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
+    k->vme = 1;
+  else
+    k->vme = 0;
+
   /* Create the curbe */
   k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
 
@@ -367,6 +401,7 @@ cl_kernel_dup(cl_kernel from)
   SET_ICD(to->dispatch)
   to->bo = from->bo;
   to->opaque = from->opaque;
+  to->vme = from->vme;
   to->ref_n = 1;
   to->magic = CL_MAGIC_KERNEL_HEADER;
   to->program = from->program;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 140bbb10..7f59162c 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -24,6 +24,7 @@
 #include "cl_driver.h"
 #include "cl_gbe_loader.h"
 #include "CL/cl.h"
+#include "CL/cl_ext.h"
 
 #include
 #include
@@ -37,6 +38,7 @@ struct _gbe_kernel;
 typedef struct cl_argument {
   cl_mem mem;           /* For image and regular buffers */
   cl_sampler sampler;   /* For sampler. */
+  cl_accelerator_intel accel;
   unsigned char bti;
   uint32_t local_sz:31; /* For __local size specification */
   uint32_t is_set:1;    /* All args must be set before NDRange */
@@ -50,6 +52,7 @@ struct _cl_kernel {
   cl_buffer bo;          /* The code itself */
   cl_program program;    /* Owns this structure (and pointers) */
   gbe_kernel opaque;     /* (Opaque) compiler structure for the OCL kernel */
+  cl_accelerator_intel accel; /* accelerator */
   char *curbe;           /* One curbe per kernel */
   size_t curbe_sz;       /* Size of it */
   uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
@@ -63,8 +66,9 @@ struct _cl_kernel {
                             (i.e. global_work_size argument to clEnqueueNDRangeKernel.) */
   size_t stack_size;     /* stack size per work item. */
   cl_argument *args;     /* To track argument setting */
-  uint32_t arg_n:31;     /* Number of arguments */
+  uint32_t arg_n:30;     /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+  uint32_t vme:1;        /* True only if it is a built-in kernel for VME */
 };
 
 /* Allocate an empty kernel */
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 28fdef62..ee9d6143 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -202,6 +202,18 @@ do {                                        \
   }                                         \
 } while (0)
 
+#define CHECK_ACCELERATOR_INTEL(ACCELERATOR_INTEL)                             \
+do {                                                                           \
+  if (UNLIKELY(ACCELERATOR_INTEL == NULL)) {                                   \
+    err = CL_INVALID_ACCELERATOR_INTEL;                                        \
+    goto error;                                                                \
+  }                                                                            \
+  if (UNLIKELY(ACCELERATOR_INTEL->magic != CL_MAGIC_ACCELERATOR_INTEL_HEADER)) {\
+    err = CL_INVALID_ACCELERATOR_INTEL;                                        \
+    goto error;                                                                \
+  }                                                                            \
+} while (0)
+
 #define CHECK_KERNEL(KERNEL)                \
 do {                                        \
   if (UNLIKELY(KERNEL == NULL)) {           \
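Usage sketch (annotation, not part of the patch): argument setup matching the
size checks in cl_kernel.c above; arg 0 carries the accelerator handle, passed
with sizeof(cl_accelerator_intel). The cl_mem names are assumed.

  clSetKernelArg(vme_kernel, 0, sizeof(cl_accelerator_intel), &accel);
  clSetKernelArg(vme_kernel, 1, sizeof(cl_mem), &src_image);  /* CL_R, CL_UNORM_INT8 */
  clSetKernelArg(vme_kernel, 2, sizeof(cl_mem), &ref_image);  /* CL_R, CL_UNORM_INT8 */
  clSetKernelArg(vme_kernel, 3, sizeof(cl_mem), &pred_mv_buf);
  clSetKernelArg(vme_kernel, 4, sizeof(cl_mem), &mv_buf);
  clSetKernelArg(vme_kernel, 5, sizeof(cl_mem), &residual_buf);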
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 110c36f2..0c34ca97 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -38,6 +38,7 @@
 #include "cl_alloc.h"
 #include "cl_utils.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 
 #ifndef CL_VERSION_1_2
 #define CL_MEM_OBJECT_IMAGE1D 0x10F4
@@ -941,10 +942,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->aux_offset.idrt_offset = size_aux;
   size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
 
-  //sampler state must be 32 bytes aligned
+  //must be 32 bytes aligned;
+  //sampler state and vme state share the same buffer
   size_aux = ALIGN(size_aux, 32);
   gpgpu->aux_offset.sampler_state_offset = size_aux;
-  size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+  size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
+                  GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t));
 
   //sampler border color state must be 32 bytes aligned
   size_aux = ALIGN(size_aux, 32);
@@ -985,6 +988,22 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
           obj_bo);
 }
 
+static void
+intel_gpgpu_set_buf_reloc_for_vme_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+                               index * sizeof(gen7_surface_state_t);
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_RENDER,
+                    I915_GEM_DOMAIN_RENDER,
+                    obj_bo_offset,
+                    gpgpu->aux_offset.surface_heap_offset +
+                    heap->binding_table[index] +
+                    offsetof(gen7_media_surface_state_t, ss0),
+                    obj_bo);
+}
+
 static dri_bo*
 intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
 {
@@ -1240,6 +1259,55 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
   assert(index < GEN_MAX_SURFACES);
 }
 
+static void
+intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu,
+                                    uint32_t index,
+                                    dri_bo* obj_bo,
+                                    uint32_t obj_bo_offset,
+                                    uint32_t format,
+                                    cl_mem_object_type type,
+                                    uint32_t bpp,
+                                    int32_t w,
+                                    int32_t h,
+                                    int32_t depth,
+                                    int32_t pitch,
+                                    int32_t slice_pitch,
+                                    int32_t tiling)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_media_surface_state_t *ss = (gen7_media_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
+
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.base_addr = obj_bo->offset + obj_bo_offset;
+  ss->ss1.uv_offset_v_direction = 0;
+  ss->ss1.pic_struct = 0;
+  ss->ss1.width = w - 1;
+  ss->ss1.height = h - 1;
+  if (tiling == GPGPU_NO_TILE) {
+    ss->ss2.tile_mode = 0;
+  }
+  else if (tiling == GPGPU_TILE_X){
+    ss->ss2.tile_mode = 2;
+  }
+  else if (tiling == GPGPU_TILE_Y){
+    ss->ss2.tile_mode = 3;
+  }
+  ss->ss2.half_pitch_for_chroma = 0;
+  ss->ss2.surface_pitch = pitch - 1;
+  ss->ss2.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
+  ss->ss2.interleave_chroma = 0;
+  ss->ss2.surface_format = 12; //Y8_UNORM
+  ss->ss3.y_offset_for_u = 0;
+  ss->ss3.x_offset_for_u = 0;
+  ss->ss4.y_offset_for_v = 0;
+  ss->ss4.x_offset_for_v = 0;
+
+  intel_gpgpu_set_buf_reloc_for_vme_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+  assert(index < GEN_MAX_SURFACES);
+}
+
 static void
 intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
                              uint32_t index,
@@ -1676,6 +1744,149 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
   }
 }
 
+static void intel_gpgpu_insert_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel, uint32_t index)
+{
+  gen7_vme_state_t* vme = (gen7_vme_state_t*)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index;
+  memset(vme, 0, sizeof(*vme));
+  gen7_vme_search_path_state_t* sp = vme->sp;
+
+  if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL){
+    sp[0].dw0.SPD_0_X = 0;
+    sp[0].dw0.SPD_0_Y = 0;
+    sp[0].dw0.SPD_1_X = 0;
+    sp[0].dw0.SPD_1_Y = 0;
+    sp[0].dw0.SPD_2_X = 0;
+    sp[0].dw0.SPD_2_Y = 0;
+    sp[0].dw0.SPD_3_X = 0;
+    sp[0].dw0.SPD_3_Y = 0;
+  }
+  else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL){
+    sp[0].dw0.SPD_0_X = 1;
+    sp[0].dw0.SPD_0_Y = 0;
+    sp[0].dw0.SPD_1_X = 0;
+    sp[0].dw0.SPD_1_Y = 1;
+    sp[0].dw0.SPD_2_X = -1;
+    sp[0].dw0.SPD_2_Y = 0;
+    sp[0].dw0.SPD_3_X = 0;
+    sp[0].dw0.SPD_3_Y = 0;
+  }
+  else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL){
+    sp[0].dw0.SPD_0_X = 1;
+    sp[0].dw0.SPD_0_Y = 0;
+    sp[0].dw0.SPD_1_X = 1;
+    sp[0].dw0.SPD_1_Y = 0;
+    sp[0].dw0.SPD_2_X = 1;
+    sp[0].dw0.SPD_2_Y = 0;
+    sp[0].dw0.SPD_3_X = 1;
+    sp[0].dw0.SPD_3_Y = 0;
+
+    sp[1].dw0.SPD_0_X = 1;
+    sp[1].dw0.SPD_0_Y = 0;
+    sp[1].dw0.SPD_1_X = 1;
+    sp[1].dw0.SPD_1_Y = 0;
+    sp[1].dw0.SPD_2_X = 1;
+    sp[1].dw0.SPD_2_Y = 0;
+    sp[1].dw0.SPD_3_X = 0;
+    sp[1].dw0.SPD_3_Y = 1;
+
+    sp[2].dw0.SPD_0_X = -1;
+    sp[2].dw0.SPD_0_Y = 0;
+    sp[2].dw0.SPD_1_X = -1;
+    sp[2].dw0.SPD_1_Y = 0;
+    sp[2].dw0.SPD_2_X = -1;
+    sp[2].dw0.SPD_2_Y = 0;
+    sp[2].dw0.SPD_3_X = -1;
+    sp[2].dw0.SPD_3_Y = 0;
+
+    sp[3].dw0.SPD_0_X = -1;
+    sp[3].dw0.SPD_0_Y = 0;
+    sp[3].dw0.SPD_1_X = -1;
+    sp[3].dw0.SPD_1_Y = 0;
+    sp[3].dw0.SPD_2_X = -1;
+    sp[3].dw0.SPD_2_Y = 0;
+    sp[3].dw0.SPD_3_X = 0;
+    sp[3].dw0.SPD_3_Y = 1;
+
+    sp[4].dw0.SPD_0_X = 1;
+    sp[4].dw0.SPD_0_Y = 0;
+    sp[4].dw0.SPD_1_X = 1;
+    sp[4].dw0.SPD_1_Y = 0;
+    sp[4].dw0.SPD_2_X = 1;
+    sp[4].dw0.SPD_2_Y = 0;
+    sp[4].dw0.SPD_3_X = 1;
+    sp[4].dw0.SPD_3_Y = 0;
+
+    sp[5].dw0.SPD_0_X = 1;
+    sp[5].dw0.SPD_0_Y = 0;
+    sp[5].dw0.SPD_1_X = 1;
+    sp[5].dw0.SPD_1_Y = 0;
+    sp[5].dw0.SPD_2_X = 1;
+    sp[5].dw0.SPD_2_Y = 0;
+    sp[5].dw0.SPD_3_X = 0;
+    sp[5].dw0.SPD_3_Y = 1;
+
+    sp[6].dw0.SPD_0_X = -1;
+    sp[6].dw0.SPD_0_Y = 0;
+    sp[6].dw0.SPD_1_X = -1;
+    sp[6].dw0.SPD_1_Y = 0;
+    sp[6].dw0.SPD_2_X = -1;
+    sp[6].dw0.SPD_2_Y = 0;
+    sp[6].dw0.SPD_3_X = -1;
+    sp[6].dw0.SPD_3_Y = 0;
+
+    sp[7].dw0.SPD_0_X = -1;
+    sp[7].dw0.SPD_0_Y = 0;
+    sp[7].dw0.SPD_1_X = -1;
+    sp[7].dw0.SPD_1_Y = 0;
+    sp[7].dw0.SPD_2_X = -1;
+    sp[7].dw0.SPD_2_Y = 0;
+    sp[7].dw0.SPD_3_X = 0;
+    sp[7].dw0.SPD_3_Y = 1;
+
+    sp[8].dw0.SPD_0_X = 1;
+    sp[8].dw0.SPD_0_Y = 0;
+    sp[8].dw0.SPD_1_X = 1;
+    sp[8].dw0.SPD_1_Y = 0;
+    sp[8].dw0.SPD_2_X = 1;
+    sp[8].dw0.SPD_2_Y = 0;
+    sp[8].dw0.SPD_3_X = 1;
+    sp[8].dw0.SPD_3_Y = 0;
+
+    sp[9].dw0.SPD_0_X = 1;
+    sp[9].dw0.SPD_0_Y = 0;
+    sp[9].dw0.SPD_1_X = 1;
+    sp[9].dw0.SPD_1_Y = 0;
+    sp[9].dw0.SPD_2_X = 1;
+    sp[9].dw0.SPD_2_Y = 0;
+    sp[9].dw0.SPD_3_X = 0;
+    sp[9].dw0.SPD_3_Y = 1;
+
+    sp[10].dw0.SPD_0_X = -1;
+    sp[10].dw0.SPD_0_Y = 0;
+    sp[10].dw0.SPD_1_X = -1;
+    sp[10].dw0.SPD_1_Y = 0;
+    sp[10].dw0.SPD_2_X = -1;
+    sp[10].dw0.SPD_2_Y = 0;
+    sp[10].dw0.SPD_3_X = -1;
+    sp[10].dw0.SPD_3_Y = 0;
+
+    sp[11].dw0.SPD_0_X = -1;
+    sp[11].dw0.SPD_0_Y = 0;
+    sp[11].dw0.SPD_1_X = -1;
+    sp[11].dw0.SPD_1_Y = 0;
+    sp[11].dw0.SPD_2_X = -1;
+    sp[11].dw0.SPD_2_Y = 0;
+    sp[11].dw0.SPD_3_X = 0;
+    sp[11].dw0.SPD_3_Y = 0;
+  }
+}
+
+static void
+intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel)
+{
+  intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, 0);
+}
+
 static void
 intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
 {
@@ -2181,6 +2392,7 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7;
+  cl_gpgpu_bind_vme_state = (cl_gpgpu_bind_vme_state_cb *) intel_gpgpu_bind_vme_state_gen7;
   cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
   cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
   cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
@@ -2258,6 +2470,7 @@ intel_set_gpgpu_callbacks(int device_id)
   }
   else if (IS_IVYBRIDGE(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+    cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen7;
    if (IS_BAYTRAIL_T(device_id)) {
      intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
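Note (annotation, not part of the patch): each SPD dword above packs four
4-bit two's-complement (x, y) step deltas, so a search path can be decoded
back into absolute positions. A hypothetical decoder, assuming the bitfield
layout of gen7_vme_search_path_state_t defined below:

  static void walk_search_path(const gen7_vme_search_path_state_t *sp,
                               int n_dwords, int *x, int *y)
  {
    int i, j;
    for (i = 0; i < n_dwords; i++) {
      const uint32_t dw = *(const uint32_t *)&sp[i];
      for (j = 0; j < 4; j++) {
        int dx = (dw >> (8 * j)) & 0xf;      /* SPD_j_X */
        int dy = (dw >> (8 * j + 4)) & 0xf;  /* SPD_j_Y */
        if (dx > 7) dx -= 16;                /* sign-extend the 4-bit field */
        if (dy > 7) dy -= 16;
        *x += dx;
        *y += dy;
      }
    }
  }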
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index fd6a82b9..c112a160 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -381,6 +381,57 @@ typedef struct gen8_surface_state
   } ss15;
 } gen8_surface_state_t;
 
+typedef struct gen7_media_surface_state
+{
+  struct {
+    uint32_t base_addr;
+  } ss0;
+
+  struct {
+    uint32_t uv_offset_v_direction:2;
+    uint32_t pic_struct:2;
+    uint32_t width:14;
+    uint32_t height:14;
+  } ss1;
+
+  struct {
+    uint32_t tile_mode:2;
+    uint32_t half_pitch_for_chroma:1;
+    uint32_t surface_pitch:18;
+    uint32_t pad1:1;
+    uint32_t surface_object_control_state:4;
+    uint32_t pad0:1;
+    uint32_t interleave_chroma:1;
+    uint32_t surface_format:4;
+  } ss2;
+
+  struct {
+    uint32_t y_offset_for_u:14;
+    uint32_t pad1:2;
+    uint32_t x_offset_for_u:14;
+    uint32_t pad0:2;
+  } ss3;
+
+  struct {
+    uint32_t y_offset_for_v:15;
+    uint32_t pad1:1;
+    uint32_t x_offset_for_v:14;
+    uint32_t pad0:2;
+  } ss4;
+
+  struct {
+    uint32_t pad0;
+  } ss5;
+
+  struct {
+    uint32_t pad0;
+  } ss6;
+
+  struct {
+    uint32_t pad0;
+  } ss7;
+} gen7_media_surface_state_t;
+
 typedef union gen_surface_state
 {
   gen7_surface_state_t gen7_surface_state;
@@ -555,6 +606,75 @@ typedef struct gen8_pipe_control
   } dw5;
 } gen8_pipe_control_t;
 
+#define GEN7_NUM_VME_SEARCH_PATH_STATES 14
+#define GEN7_NUM_VME_RD_LUT_SETS 4
+
+typedef struct gen7_vme_search_path_state
+{
+  struct {
+    uint32_t SPD_0_X : BITFIELD_RANGE(0, 3);  //search path distance
+    uint32_t SPD_0_Y : BITFIELD_RANGE(4, 7);
+    uint32_t SPD_1_X : BITFIELD_RANGE(8, 11);
+    uint32_t SPD_1_Y : BITFIELD_RANGE(12, 15);
+    uint32_t SPD_2_X : BITFIELD_RANGE(16, 19);
+    uint32_t SPD_2_Y : BITFIELD_RANGE(20, 23);
+    uint32_t SPD_3_X : BITFIELD_RANGE(24, 27);
+    uint32_t SPD_3_Y : BITFIELD_RANGE(28, 31);
+  } dw0;
+} gen7_vme_search_path_state_t;
+
+typedef struct gen7_vme_rd_lut_set
+{
+  struct {
+    uint32_t LUT_MbMode_0 : BITFIELD_RANGE(0, 7);
+    uint32_t LUT_MbMode_1 : BITFIELD_RANGE(8, 15);
+    uint32_t LUT_MbMode_2 : BITFIELD_RANGE(16, 23);
+    uint32_t LUT_MbMode_3 : BITFIELD_RANGE(24, 31);
+  } dw0;
+
+  struct {
+    uint32_t LUT_MbMode_4 : BITFIELD_RANGE(0, 7);
+    uint32_t LUT_MbMode_5 : BITFIELD_RANGE(8, 15);
+    uint32_t LUT_MbMode_6 : BITFIELD_RANGE(16, 23);
+    uint32_t LUT_MbMode_7 : BITFIELD_RANGE(24, 31);
+  } dw1;
+
+  struct {
+    uint32_t LUT_MV_0 : BITFIELD_RANGE(0, 7);
+    uint32_t LUT_MV_1 : BITFIELD_RANGE(8, 15);
+    uint32_t LUT_MV_2 : BITFIELD_RANGE(16, 23);
+    uint32_t LUT_MV_3 : BITFIELD_RANGE(24, 31);
+  } dw2;
+
+  struct {
+    uint32_t LUT_MV_4 : BITFIELD_RANGE(0, 7);
+    uint32_t LUT_MV_5 : BITFIELD_RANGE(8, 15);
+    uint32_t LUT_MV_6 : BITFIELD_RANGE(16, 23);
+    uint32_t LUT_MV_7 : BITFIELD_RANGE(24, 31);
+  } dw3;
+} gen7_vme_rd_lut_set_t;
+
+typedef struct gen7_vme_state
+{
+  gen7_vme_search_path_state_t sp[GEN7_NUM_VME_SEARCH_PATH_STATES];
+
+  struct {
+    uint32_t LUT_MbMode_8_0 : BITFIELD_RANGE(0, 7);
+    uint32_t LUT_MbMode_9_0 : BITFIELD_RANGE(8, 15);
+    uint32_t LUT_MbMode_8_1 : BITFIELD_RANGE(16, 23);
+    uint32_t LUT_MbMode_9_1 : BITFIELD_RANGE(24, 31);
+  } dw14;
+
+  struct {
+    uint32_t LUT_MbMode_8_2 : BITFIELD_RANGE(0, 7);
+    uint32_t LUT_MbMode_9_2 : BITFIELD_RANGE(8, 15);
+    uint32_t LUT_MbMode_8_3 : BITFIELD_RANGE(16, 23);
+    uint32_t LUT_MbMode_9_3 : BITFIELD_RANGE(24, 31);
+  } dw15;
+
+  gen7_vme_rd_lut_set_t lut[GEN7_NUM_VME_RD_LUT_SETS];
+} gen7_vme_state_t;
+
 typedef struct gen6_sampler_state
 {
   struct {
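Note (annotation, not part of the patch): as a sanity check on the layout
above, 14 search-path dwords plus dw14/dw15 plus four 4-dword LUT sets total
32 dwords, i.e. 128 bytes per VME state. An illustrative compile-time
assertion would be:

  typedef char gen7_vme_state_size_check[
      sizeof(gen7_vme_state_t) == 32 * sizeof(uint32_t) ? 1 : -1];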
diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl
new file mode 100644
index 00000000..5a223381
--- /dev/null
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -0,0 +1,233 @@
+typedef struct _motion_estimation_desc_intel {
+  uint mb_block_type;
+  uint subpixel_mode;
+  uint sad_adjust_mode;
+  uint search_path_type;
+} accelerator_intel_t;
+
+__kernel __attribute__((reqd_work_group_size(16,1,1)))
+void block_motion_estimate_intel(accelerator_intel_t accel,
+                                 __read_only image2d_t src_image,
+                                 __read_only image2d_t ref_image,
+                                 __global short2 * prediction_motion_vector_buffer,
+                                 __global short2 * motion_vector_buffer,
+                                 __global ushort * residuals){
+
+  uint src_grf0_dw7;
+  uint src_grf0_dw6;
+  uint src_grf0_dw5;
+  uint src_grf0_dw4;
+  uint src_grf0_dw3;
+  uint src_grf0_dw2;
+  uint src_grf0_dw1;
+  uint src_grf0_dw0;
+  uint src_grf1_dw7;
+  uint src_grf1_dw6;
+  uint src_grf1_dw5;
+  uint src_grf1_dw4;
+  uint src_grf1_dw3;
+  uint src_grf1_dw2;
+  uint src_grf1_dw1;
+  uint src_grf1_dw0;
+  uint src_grf2_dw7;
+  uint src_grf2_dw6;
+  uint src_grf2_dw5;
+  uint src_grf2_dw4;
+  uint src_grf2_dw3;
+  uint src_grf2_dw2;
+  uint src_grf2_dw1;
+  uint src_grf2_dw0;
+  uint src_grf3_dw7;
+  uint src_grf3_dw6;
+  uint src_grf3_dw5;
+  uint src_grf3_dw4;
+  uint src_grf3_dw3;
+  uint src_grf3_dw2;
+  uint src_grf3_dw1;
+  uint src_grf3_dw0;
+  uint src_grf4_dw7;
+  uint src_grf4_dw6;
+  uint src_grf4_dw5;
+  uint src_grf4_dw4;
+  uint src_grf4_dw3;
+  uint src_grf4_dw2;
+  uint src_grf4_dw1;
+  uint src_grf4_dw0;
+
+  uint8 vme_result = (uint8)(0, 0, 0, 0, 0, 0, 0, 0);
+
+  int lgid_x = get_group_id(0);
+  int lgid_y = get_group_id(1);
+
+  uint2 srcCoord = 0;
+
+  srcCoord.x = lgid_x * 16;
+  srcCoord.y = lgid_y * 16;
+
+  //TODO: This line is just to workaround a curbe related bug caused by commit 061d214a6fc2876a0e24e094f87f2a172984bc23.
+  //After the fix, this line should be removed.
+  src_grf0_dw5 = accel.mb_block_type;
+
+  //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+  if(accel.search_path_type == 0x0){
+    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+    src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0);
+    //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+    src_grf0_dw1 = 0xfffefffe;
+    //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+    src_grf0_dw0 = 0xfffefffe;
+    //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+    src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+    //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+                 | (0 << 16) | (2 << 8) | (2);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
+  else if(accel.search_path_type == 0x1){
+    src_grf0_dw5 = (24 << 24) | (24 << 16) | (0 << 8) | (0);
+    src_grf0_dw1 = 0xfffcfffc;
+    src_grf0_dw0 = 0xfffcfffc;
+    src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+                 | (0 << 16) | (48 << 8) | (48);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
+  else if(accel.search_path_type == 0x5){
+    src_grf0_dw5 = (40 << 24) | (48 << 16) | (0 << 8) | (0);
+    src_grf0_dw1 = 0xfff4fff0;
+    src_grf0_dw0 = 0xfff4fff0;
+    src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+                 | (0 << 16) | (48 << 8) | (48);
+  }
+
+  //src_grf0_dw7 = Debug;
+  src_grf0_dw7 = 0;
+  //src_grf0_dw6 = Debug;
+  src_grf0_dw6 = 0;
+  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+  //src_grf0_dw4 = Ignored;
+  src_grf0_dw4 = 0;
+  //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+  src_grf0_dw3 = (0 << 31) | (0x7e << 24) | (0 << 22)
+  //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserved << 18)
+               | (0 << 20) | (0 << 19) | (0 << 18)
+  //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+               | (0 << 17) | (0 << 16) | (0 << 15)
+  //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+               | (0 << 14) | (0 << 12) | (0 << 11)
+  //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+               | (0 << 8) | (0 << 7) | (0 << 6)
+  //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+               | (0 << 4) | (0 << 3) | (0 << 2)
+  //| (Src_Size);
+               | (0);
+
+  //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+  src_grf0_dw2 = (srcCoord.y << 16) | (srcCoord.x);
+  //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+  //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+  /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+                | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+                | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+                | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/
+  src_grf1_dw7 = 0;
+  //src_grf1_dw6 = Reserved;
+  src_grf1_dw6 = 0;
+  /*src_grf1_dw5 = (Cost_Center1Y << 16) | (Cost_Center1X);
+  src_grf1_dw4 = (Cost_Center0Y << 16) | (Cost_Center0X);
+  src_grf1_dw3 = (Ime_Too_Good << 24 ) | (Ime_Too_Bad << 16) | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+  src_grf1_dw5 = 0;
+  src_grf1_dw4 = 0;
+  src_grf1_dw3 = 0;
+  //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+  //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+  /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+                | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserved << 22)
+                | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+  src_grf1_dw1 = (0 << 24) | (2);
+  /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+                | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+                | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+                | (Adaptive_En << 1) | (SkipModeEn);*/
+  src_grf1_dw0 = 0;
+  /*src_grf2_dw7 = Ref1_SkipCenter_3_Delta_XY;
+  src_grf2_dw6 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw5 = Ref1_SkipCenter_2_Delta_XY;
+  src_grf2_dw4 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw3 = Ref1_SkipCenter_1_Delta_XY;
+  src_grf2_dw2 = Ref0_SkipCenter_1_Delta_XY;
+  src_grf2_dw1 = Ref1_SkipCenter_0_Delta_XY;
+  src_grf2_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16) | (Ref0_Skip_Center_0_Delta_X);
+  src_grf3_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+  src_grf3_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+  src_grf3_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+  src_grf3_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+  src_grf3_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+  src_grf3_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+               | (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+  //src_grf3_dw1 = (?) | (Reserved) | ((Intra_16x16_Mode_Mask);
+  src_grf3_dw0 = (Reserved<<25) | (Intra_16x16_Mode_Mask << 16) | (Reserved) | (Intra_16x16_Mode_Mask);
+  src_grf4_dw7 = Reserved;
+  src_grf4_dw6 = Reserved;
+  src_grf4_dw5 = Reserved;
+  src_grf4_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+               | (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+               | (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+  //src_grf4_dw3 = (?) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+  src_grf4_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+  src_grf4_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+  src_grf4_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+               | (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);*/
+  src_grf2_dw7 = 0;
+  src_grf2_dw6 = 0;
+  src_grf2_dw5 = 0;
+  src_grf2_dw4 = 0;
+  src_grf2_dw3 = 0;
+  src_grf2_dw2 = 0;
+  src_grf2_dw1 = 0;
+  src_grf2_dw0 = 0;
+  src_grf3_dw7 = 0;
+  src_grf3_dw6 = 0;
+  src_grf3_dw5 = 0;
+  src_grf3_dw4 = 0;
+  src_grf3_dw3 = 0;
+  src_grf3_dw2 = 0;
+  src_grf3_dw1 = 0;
+  src_grf3_dw0 = 0;
+  src_grf4_dw7 = 0;
+  src_grf4_dw6 = 0;
+  src_grf4_dw5 = 0;
+  src_grf4_dw4 = 0;
+  src_grf4_dw3 = 0;
+  src_grf4_dw2 = 0;
+  src_grf4_dw1 = 0;
+  src_grf4_dw0 = 0;
+
+  vme_result = __gen_ocl_vme(src_image, ref_image,
+                             src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+                             src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+                             src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+                             src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+                             src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+                             src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+                             src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+                             src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+                             src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+                             src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+                             //msg_type, vme_search_path_lut, lut_sub,
+                             1, 0, 0);
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  int lid_x = get_local_id(0);
+  uint simd_width = get_sub_group_size();
+  uint write_back_grf1_dw0;
+  if(simd_width == 8)
+    write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);
+  else if(simd_width == 16)
+    write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0);
+  short2 val = as_short2( write_back_grf1_dw0 );
+  int index = lgid_y * get_num_groups(0) + lgid_x;
+  if( lid_x == 0 ){
+    motion_vector_buffer[index] = val;
+  }
+
+}
-- 
cgit v1.2.1
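End-to-end usage sketch (annotation, not part of the patch), tying the
earlier annotations together; queue, width, height and the buffers are
assumed:

  size_t global[2] = { width, height };  /* frame size in pixels */
  err = clEnqueueNDRangeKernel(queue, vme_kernel, 2, NULL, global,
                               NULL /* must be NULL for the VME kernel */,
                               0, NULL, NULL);
  clFinish(queue);
  /* mv_buf now holds one short2 motion vector per 16x16 macroblock,
   * stored row-major: index = mb_row * num_mb_columns + mb_column,
   * exactly as written by the kernel above. */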