summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChuanbo Weng <chuanbo.weng@intel.com>2015-11-06 11:27:48 +0800
committerYang Rong <rong.r.yang@intel.com>2015-11-10 12:22:18 +0800
commit6a3eddc4dd70c895c426f1f8231778eb98ea7ac3 (patch)
treeadf8c19a83b3ec13fc25307365c7afccef6a306c
parent9cac82def65594d0dc5f9b9402be6f1fb2c8fcd6 (diff)
downloadbeignet-6a3eddc4dd70c895c426f1f8231778eb98ea7ac3.tar.gz
Add extensions intel_accelerator and basic intel_motion_estimation.
v2: 1. Just upload the first vme_state. 2. Remove duplicated code in check_opt1_extension. 3. Check image format before cl_gpgpu_bind_image_for_vme. 4. Fix error of getting mv. Because we suppose this kernel run in SIMD16 mode, so dword 0 of grf 1 should be __gen_ocl_region(8,vme_result.s0), not __gen_ocl_region(0,vme_result.s1). v3: Return CL_IMAGE_FORMAT_NOT_SUPPORTED if image format is not the required one. v4: Fix two conflicts after code rebase and wordaround a curbe related bug. v6: Treat simd8 and simd16 differently when getting mv. Signed-off-by: Guo Yejun <yejun.guo@intel.com> Signed-off-by: Chuanbo Weng <chuanbo.weng@intel.com> Reviewed-by: Ruiling Song <ruiling.song@intel.com>
-rw-r--r--include/CL/cl_ext.h103
-rw-r--r--src/CMakeLists.txt4
-rw-r--r--src/cl_accelerator_intel.c86
-rw-r--r--src/cl_accelerator_intel.h29
-rw-r--r--src/cl_api.c107
-rw-r--r--src/cl_command_queue.c17
-rw-r--r--src/cl_command_queue_gen7.c8
-rw-r--r--src/cl_context.c1
-rw-r--r--src/cl_context.h3
-rw-r--r--src/cl_driver.h21
-rw-r--r--src/cl_driver_defs.c2
-rw-r--r--src/cl_extensions.c4
-rw-r--r--src/cl_extensions.h8
-rw-r--r--src/cl_gen7_device.h5
-rw-r--r--src/cl_gt_device.h6
-rw-r--r--src/cl_internals.h1
-rw-r--r--src/cl_kernel.c57
-rw-r--r--src/cl_kernel.h6
-rw-r--r--src/cl_utils.h12
-rw-r--r--src/intel/intel_gpgpu.c217
-rw-r--r--src/intel/intel_structs.h120
-rw-r--r--src/kernels/cl_internal_block_motion_estimate_intel.cl233
22 files changed, 1017 insertions, 33 deletions
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 710bea88..0a66d704 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -184,6 +184,109 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
#define CL_PRINTF_CALLBACK_ARM 0x40B0
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
+/*********************************
+* cl_intel_accelerator extension *
+*********************************/
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+
+typedef struct _cl_accelerator_intel* cl_accelerator_intel;
+typedef cl_uint cl_accelerator_type_intel;
+typedef cl_uint cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel {
+ cl_uint mb_block_type;
+ cl_uint subpixel_mode;
+ cl_uint sad_adjust_mode;
+ cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+
+/* Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
+
+/* Deprecated Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL_DEPRECATED -6000
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL_DEPRECATED -6001
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL_DEPRECATED -6002
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL_DEPRECATED -6003
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
+#define CL_ACCELERATOR_TYPE_INTEL 0x4093
+
+/*cl_motion_detect_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL 0x0
+#define CL_ME_MB_TYPE_8x8_INTEL 0x1
+#define CL_ME_MB_TYPE_4x4_INTEL 0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+ cl_context /* context */,
+ cl_accelerator_type_intel /* accelerator_type */,
+ size_t /* descriptor_size */,
+ const void* /* descriptor */,
+ cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_accelerator_intel
+ (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+ cl_context /* context */,
+ cl_accelerator_type_intel /* accelerator_type */,
+ size_t /* descriptor_size */,
+ const void* /* descriptor */,
+ cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL
+(
+ cl_accelerator_intel /* accelerator */,
+ cl_accelerator_info_intel /* param_name */,
+ size_t /* param_value_size */,
+ void* /* param_value */,
+ size_t* /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+ cl_accelerator_intel /* accelerator */,
+ cl_accelerator_info_intel /* param_name */,
+ size_t /* param_value_size */,
+ void* /* param_value */,
+ size_t* /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
#ifdef CL_VERSION_1_1
/***********************************
* cl_ext_device_fission extension *
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40a9afbe..c917e76d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -56,7 +56,8 @@ cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
cl_internal_fill_buf_align128 cl_internal_fill_image_1d
cl_internal_fill_image_1d_array cl_internal_fill_image_2d
-cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d
+cl_internal_block_motion_estimate_intel)
set (BUILT_IN_NAME cl_internal_built_in_kernel)
MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
@@ -70,6 +71,7 @@ set(OPENCL_SRC
cl_program.c
cl_gbe_loader.cpp
cl_sampler.c
+ cl_accelerator_intel.c
cl_event.c
cl_enqueue.c
cl_image.c
diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
new file mode 100644
index 00000000..cda89635
--- /dev/null
+++ b/src/cl_accelerator_intel.c
@@ -0,0 +1,86 @@
+#include "cl_context.h"
+#include "cl_accelerator_intel.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+
+LOCAL cl_accelerator_intel
+cl_accelerator_intel_new(cl_context ctx,
+ cl_accelerator_type_intel accel_type,
+ size_t desc_sz,
+ const void* desc,
+ cl_int* errcode_ret)
+{
+ cl_accelerator_intel accel = NULL;
+ cl_int err = CL_SUCCESS;
+
+ /* Allocate and inialize the structure itself */
+ TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
+ SET_ICD(accel->dispatch)
+ accel->ref_n = 1;
+ accel->magic = CL_MAGIC_ACCELERATOR_INTEL_HEADER;
+
+ if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) {
+ err = CL_INVALID_ACCELERATOR_TYPE_INTEL;
+ goto error;
+ }
+ accel->type = accel_type;
+
+ if (desc == NULL) { // and check inside desc
+ err = CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
+ goto error;
+ }
+ accel->desc.me = *(cl_motion_estimation_desc_intel*)desc;
+
+ /* Append the accelerator_intel in the context accelerator_intel list */
+ /* does this really needed? */
+ pthread_mutex_lock(&ctx->accelerator_intel_lock);
+ accel->next = ctx->accels;
+ if (ctx->accels != NULL)
+ ctx->accels->prev = accel;
+ ctx->accels = accel;
+ pthread_mutex_unlock(&ctx->accelerator_intel_lock);
+
+ accel->ctx = ctx;
+ cl_context_add_ref(ctx);
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return accel;
+error:
+ cl_accelerator_intel_delete(accel);
+ accel = NULL;
+ goto exit;
+}
+
+LOCAL void
+cl_accelerator_intel_add_ref(cl_accelerator_intel accel)
+{
+ atomic_inc(&accel->ref_n);
+}
+
+LOCAL void
+cl_accelerator_intel_delete(cl_accelerator_intel accel)
+{
+ if (UNLIKELY(accel == NULL))
+ return;
+ if (atomic_dec(&accel->ref_n) > 1)
+ return;
+
+ /* Remove the accelerator_intel in the context accelerator_intel list */
+ pthread_mutex_lock(&accel->ctx->accelerator_intel_lock);
+ if (accel->prev)
+ accel->prev->next = accel->next;
+ if (accel->next)
+ accel->next->prev = accel->prev;
+ if (accel->ctx->accels == accel)
+ accel->ctx->accels = accel->next;
+ pthread_mutex_unlock(&accel->ctx->accelerator_intel_lock);
+
+ cl_context_delete(accel->ctx);
+ cl_free(accel);
+}
diff --git a/src/cl_accelerator_intel.h b/src/cl_accelerator_intel.h
new file mode 100644
index 00000000..cecfd2a8
--- /dev/null
+++ b/src/cl_accelerator_intel.h
@@ -0,0 +1,29 @@
+#ifndef __CL_ACCELERATOR_INTEL_H__
+#define __CL_ACCELERATOR_INTEL_H__
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include <stdint.h>
+
+struct _cl_accelerator_intel {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a accelerator_intel object */
+ volatile int ref_n; /* This object is reference counted */
+ cl_accelerator_intel prev, next; /* We chain in the allocator, why chain? */
+ cl_context ctx; /* Context it belongs to */
+ cl_accelerator_type_intel type;
+ union {
+ cl_motion_estimation_desc_intel me;
+ }desc; /* save desc before we decide how to handle it */
+};
+
+cl_accelerator_intel cl_accelerator_intel_new(cl_context ctx,
+ cl_accelerator_type_intel accel_type,
+ size_t desc_sz,
+ const void* desc,
+ cl_int* errcode_ret);
+
+void cl_accelerator_intel_add_ref(cl_accelerator_intel accel);
+void cl_accelerator_intel_delete(cl_accelerator_intel accel);
+
+#endif
diff --git a/src/cl_api.c b/src/cl_api.c
index ec417d4e..ddd39cfc 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -28,6 +28,7 @@
#include "cl_mem.h"
#include "cl_image.h"
#include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
#include "cl_alloc.h"
#include "cl_utils.h"
@@ -2913,6 +2914,17 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
goto error;
}
+ if (kernel->vme) {
+ if (work_dim != 2) {
+ err = CL_INVALID_WORK_DIMENSION;
+ goto error;
+ }
+ if (local_work_size != NULL) {
+ err = CL_INVALID_WORK_GROUP_SIZE;
+ goto error;
+ }
+ }
+
if (global_work_offset != NULL)
for (i = 0; i < work_dim; ++i) {
if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
@@ -2946,22 +2958,31 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
for (i = 0; i < work_dim; ++i)
fixed_local_sz[i] = local_work_size[i];
} else {
- uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
- for (i = 0; i< work_dim; i++) {
- for (j = maxDimSize; j > 1; j--) {
- if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
- fixed_local_sz[i] = j;
- maxGroupSize = maxGroupSize /j;
- maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
- break; //choose next work_dim
+ if (kernel->vme) {
+ fixed_local_sz[0] = 16;
+ fixed_local_sz[1] = 1;
+ } else {
+ uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
+ for (i = 0; i< work_dim; i++) {
+ for (j = maxDimSize; j > 1; j--) {
+ if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+ fixed_local_sz[i] = j;
+ maxGroupSize = maxGroupSize /j;
+ maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+ break; //choose next work_dim
+ }
}
}
}
}
- if (global_work_size != NULL)
+ if (kernel->vme) {
+ fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
+ fixed_global_sz[1] = (global_work_size[1]+15) / 16;
+ } else {
for (i = 0; i < work_dim; ++i)
fixed_global_sz[i] = global_work_size[i];
+ }
if (global_work_offset != NULL)
for (i = 0; i < work_dim; ++i)
fixed_global_off[i] = global_work_offset[i];
@@ -3192,6 +3213,10 @@ internal_clGetExtensionFunctionAddress(const char *func_name)
EXTFUNC(clGetMemObjectFdIntel)
EXTFUNC(clCreateBufferFromFdINTEL)
EXTFUNC(clCreateImageFromFdINTEL)
+ EXTFUNC(clCreateAcceleratorINTEL)
+ EXTFUNC(clRetainAcceleratorINTEL)
+ EXTFUNC(clReleaseAcceleratorINTEL)
+ EXTFUNC(clGetAcceleratorInfoINTEL)
return NULL;
}
@@ -3419,3 +3444,67 @@ error:
*errorcode_ret = err;
return mem;
}
+
+cl_accelerator_intel
+clCreateAcceleratorINTEL(cl_context context,
+ cl_accelerator_type_intel accel_type,
+ size_t desc_sz,
+ const void* desc,
+ cl_int* errcode_ret)
+{
+
+ cl_accelerator_intel accel = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT(context);
+ accel = cl_accelerator_intel_new(context, accel_type, desc_sz, desc, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return accel;
+}
+
+cl_int
+clRetainAcceleratorINTEL(cl_accelerator_intel accel)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_ACCELERATOR_INTEL(accel);
+ cl_accelerator_intel_add_ref(accel);
+error:
+ return err;
+}
+
+cl_int
+clReleaseAcceleratorINTEL(cl_accelerator_intel accel)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_ACCELERATOR_INTEL(accel);
+ cl_accelerator_intel_delete(accel);
+error:
+ return err;
+}
+
+cl_int
+clGetAcceleratorInfoINTEL(cl_accelerator_intel accel,
+ cl_accelerator_info_intel param_name,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_ACCELERATOR_INTEL(accel);
+
+ if (param_name == CL_ACCELERATOR_REFERENCE_COUNT_INTEL) {
+ FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&accel->ref_n, CL_SUCCESS);
+ } else if (param_name == CL_ACCELERATOR_CONTEXT_INTEL) {
+ FILL_GETINFO_RET (cl_context, 1, &accel->ctx, CL_SUCCESS);
+ } else if (param_name == CL_ACCELERATOR_TYPE_INTEL) {
+ FILL_GETINFO_RET (cl_uint, 1, &accel->type, CL_SUCCESS);
+ } else if (param_name == CL_ACCELERATOR_DESCRIPTOR_INTEL) {
+ FILL_GETINFO_RET (cl_motion_estimation_desc_intel, 1, &(accel->desc.me), CL_SUCCESS);
+ } else{
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 9dc3fe64..033e7df7 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -135,10 +135,19 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
image = cl_mem_image(k->args[id].mem);
set_image_info(k->curbe, &k->images[i], image);
- cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
- image->intel_fmt, image->image_type, image->bpp,
- image->w, image->h, image->depth,
- image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+ if(k->vme){
+ if( (image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8) )
+ return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+ cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
+ image->intel_fmt, image->image_type, image->bpp,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+ }
+ else
+ cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
+ image->intel_fmt, image->image_type, image->bpp,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
// TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
// on demand.
if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 2edc3be4..2a49ec24 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -367,9 +367,13 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Bind user buffers */
cl_command_queue_bind_surface(queue, ker);
/* Bind user images */
- cl_command_queue_bind_image(queue, ker);
+ if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker) != CL_SUCCESS))
+ return err;
/* Bind all samplers */
- cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
+ if (ker->vme)
+ cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
+ else
+ cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
goto error;
diff --git a/src/cl_context.c b/src/cl_context.c
index c45e0aa7..a6bde7d2 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -177,6 +177,7 @@ cl_context_new(struct _cl_context_prop *props)
pthread_mutex_init(&ctx->queue_lock, NULL);
pthread_mutex_init(&ctx->buffer_lock, NULL);
pthread_mutex_init(&ctx->sampler_lock, NULL);
+ pthread_mutex_init(&ctx->accelerator_intel_lock, NULL);
exit:
return ctx;
diff --git a/src/cl_context.h b/src/cl_context.h
index ef94823d..489e5d7d 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -21,6 +21,7 @@
#define __CL_CONTEXT_H__
#include "CL/cl.h"
+#include "CL/cl_ext.h"
#include "cl_internals.h"
#include "cl_driver.h"
#include "cl_khr_icd.h"
@@ -107,11 +108,13 @@ struct _cl_context {
cl_program programs; /* All programs currently allocated */
cl_mem buffers; /* All memory object currently allocated */
cl_sampler samplers; /* All sampler object currently allocated */
+ cl_accelerator_intel accels; /* All accelerator_intel object currently allocated */
cl_event events; /* All event object currently allocated */
pthread_mutex_t queue_lock; /* To allocate and deallocate queues */
pthread_mutex_t program_lock; /* To allocate and deallocate programs */
pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
+ pthread_mutex_t accelerator_intel_lock; /* To allocate and deallocate accelerator_intel */
pthread_mutex_t event_lock; /* To allocate and deallocate events */
cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
/* All programs internal used, for example clEnqueuexxx api use */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 19afb433..9d986b1f 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -23,9 +23,11 @@
#include <stdint.h>
#include <stdlib.h>
#include "cl_driver_type.h"
+#include "CL/cl_ext.h"
/* Various limitations we should remove actually */
#define GEN_MAX_SURFACES 256
#define GEN_MAX_SAMPLERS 16
+#define GEN_MAX_VME_STATES 8
/**************************************************************************
* cl_driver:
@@ -145,6 +147,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
+typedef void (cl_gpgpu_bind_vme_state_cb)(cl_gpgpu, cl_accelerator_intel accel);
+extern cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state;
+
/* get the default cache control value. */
typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
@@ -165,6 +170,22 @@ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
+typedef void (cl_gpgpu_bind_image_for_vme_cb)(cl_gpgpu state,
+ uint32_t id,
+ cl_buffer obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ uint32_t bpp,
+ uint32_t type,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int pitch,
+ int32_t slice_pitch,
+ cl_gpgpu_tiling tiling);
+
+extern cl_gpgpu_bind_image_for_vme_cb *cl_gpgpu_bind_image_for_vme;
+
/* Setup a stack */
typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index d25fd5d1..58c4f8f0 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -71,6 +71,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image_for_vme = NULL;
LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
@@ -84,6 +85,7 @@ LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state = NULL;
LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 3e714ac3..ba910c12 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -65,7 +65,9 @@ check_gl_extension(cl_extensions_t *extensions) {
void
check_intel_extension(cl_extensions_t *extensions)
{
- /* Should put those map/unmap extensions here. */
+ int id;
+ for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
+ extensions->extensions[id].base.ext_enabled = 1;
}
void
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 0006651e..f744fa3a 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -23,6 +23,10 @@
DECL_EXT(khr_spir) \
DECL_EXT(khr_icd)
+#define DECL_INTEL_EXTENSIONS \
+ DECL_EXT(intel_accelerator) \
+ DECL_EXT(intel_motion_estimation)
+
#define DECL_GL_EXTENSIONS \
DECL_EXT(khr_gl_sharing)\
DECL_EXT(khr_gl_event)\
@@ -37,6 +41,7 @@
#define DECL_ALL_EXTENSIONS \
DECL_BASE_EXTENSIONS \
DECL_OPT1_EXTENSIONS \
+ DECL_INTEL_EXTENSIONS \
DECL_GL_EXTENSIONS \
DECL_D3D_EXTENSIONS
@@ -54,6 +59,8 @@ cl_khr_extension_id_max
#define BASE_EXT_END_ID EXT_ID(khr_fp64)
#define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
#define OPT1_EXT_END_ID EXT_ID(khr_icd)
+#define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
+#define INTEL_EXT_END_ID EXT_ID(intel_motion_estimation)
#define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
#define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
@@ -75,6 +82,7 @@ struct EXT_STRUCT_NAME(name) { \
DECL_BASE_EXTENSIONS
DECL_OPT1_EXTENSIONS
+DECL_INTEL_EXTENSIONS
DECL_D3D_EXTENSIONS
DECL_GL_EXTENSIONS
#undef DECL_EXT
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 104e9297..e755cad6 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -27,5 +27,8 @@
.max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
.global_mem_size = 2 * 1024 * 1024 * 1024ul,
+//temporarily define to only export builtin kernel block_motion_estimate_intel only for Gen7
+//will remove after HSW+ also support
+#define GEN7_DEVICE
#include "cl_gt_device.h"
-
+#undef GEN7_DEVICE
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index e61ff613..d8089c23 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -114,7 +114,11 @@ DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
"__cl_fill_image_1d_array;"
"__cl_fill_image_2d;"
"__cl_fill_image_2d_array;"
- "__cl_fill_image_3d;")
+ "__cl_fill_image_3d;"
+#ifdef GEN7_DEVICE
+ "block_motion_estimate_intel;"
+#endif
+ )
DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
DECL_INFO_STRING(spir_versions, "1.2")
diff --git a/src/cl_internals.h b/src/cl_internals.h
index cb3fc238..9aeb8c12 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -31,6 +31,7 @@
#define CL_MAGIC_EVENT_HEADER 0x8324a9c810ebf90fLL
#define CL_MAGIC_MEM_HEADER 0x381a27b9ce6504dfLL
#define CL_MAGIC_DEAD_HEADER 0xdeaddeaddeaddeadLL
+#define CL_MAGIC_ACCELERATOR_INTEL_HEADER 0x7c6a08c9a7ac3e3fLL
#endif /* __CL_INTERNALS_H__ */
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 58a1224e..b2d1955f 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -27,6 +27,7 @@
#include "cl_khr_icd.h"
#include "CL/cl.h"
#include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
#include <stdio.h>
#include <string.h>
@@ -113,10 +114,22 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
arg_type = interp_kernel_get_arg_type(k->opaque, index);
arg_sz = interp_kernel_get_arg_size(k->opaque, index);
- if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
- if (arg_type != GBE_ARG_SAMPLER ||
- (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+ if (k->vme && index == 0) {
+ //the best method is to return the arg type of GBE_ARG_ACCELERATOR_INTEL
+ //but it is not straightforward since clang does not support it now
+ //the easy way is to consider typedef accelerator_intel_t as a struct,
+ //this easy way makes the size mismatched, so use another size check method.
+ if (sz != sizeof(cl_accelerator_intel) || arg_sz != sizeof(cl_motion_estimation_desc_intel))
return CL_INVALID_ARG_SIZE;
+ cl_accelerator_intel* accel = (cl_accelerator_intel*)value;
+ if ((*accel)->type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL)
+ return CL_INVALID_ACCELERATOR_TYPE_INTEL;
+ } else {
+ if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+ if (arg_type != GBE_ARG_SAMPLER ||
+ (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+ return CL_INVALID_ARG_SIZE;
+ }
}
if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
@@ -152,15 +165,30 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
/* Copy the structure or the value directly into the curbe */
if (arg_type == GBE_ARG_VALUE) {
- offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- if (offset >= 0) {
- assert(offset + sz <= k->curbe_sz);
- memcpy(k->curbe + offset, value, sz);
+ if (k->vme && index == 0) {
+ cl_accelerator_intel accel;
+ memcpy(&accel, value, sz);
+ offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+ if (offset >= 0) {
+ assert(offset + sz <= k->curbe_sz);
+ memcpy(k->curbe + offset, &(accel->desc.me), arg_sz);
+ }
+ k->args[index].local_sz = 0;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ k->accel = accel;
+ return CL_SUCCESS;
+ } else {
+ offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+ if (offset >= 0) {
+ assert(offset + sz <= k->curbe_sz);
+ memcpy(k->curbe + offset, value, sz);
+ }
+ k->args[index].local_sz = 0;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ return CL_SUCCESS;
}
- k->args[index].local_sz = 0;
- k->args[index].is_set = 1;
- k->args[index].mem = NULL;
- return CL_SUCCESS;
}
/* For a local pointer just save the size */
@@ -331,6 +359,12 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
cl_buffer_subdata(k->bo, 0, code_sz, code);
k->opaque = opaque;
+ const char* kname = cl_kernel_get_name(k);
+ if (strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
+ k->vme = 1;
+ else
+ k->vme = 0;
+
/* Create the curbe */
k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
@@ -367,6 +401,7 @@ cl_kernel_dup(cl_kernel from)
SET_ICD(to->dispatch)
to->bo = from->bo;
to->opaque = from->opaque;
+ to->vme = from->vme;
to->ref_n = 1;
to->magic = CL_MAGIC_KERNEL_HEADER;
to->program = from->program;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 140bbb10..7f59162c 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -24,6 +24,7 @@
#include "cl_driver.h"
#include "cl_gbe_loader.h"
#include "CL/cl.h"
+#include "CL/cl_ext.h"
#include <stdint.h>
#include <stdlib.h>
@@ -37,6 +38,7 @@ struct _gbe_kernel;
typedef struct cl_argument {
cl_mem mem; /* For image and regular buffers */
cl_sampler sampler; /* For sampler. */
+ cl_accelerator_intel accel;
unsigned char bti;
uint32_t local_sz:31; /* For __local size specification */
uint32_t is_set:1; /* All args must be set before NDRange */
@@ -50,6 +52,7 @@ struct _cl_kernel {
cl_buffer bo; /* The code itself */
cl_program program; /* Owns this structure (and pointers) */
gbe_kernel opaque; /* (Opaque) compiler structure for the OCL kernel */
+ cl_accelerator_intel accel; /* accelerator */
char *curbe; /* One curbe per kernel */
size_t curbe_sz; /* Size of it */
uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
@@ -63,8 +66,9 @@ struct _cl_kernel {
(i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
size_t stack_size; /* stack size per work item. */
cl_argument *args; /* To track argument setting */
- uint32_t arg_n:31; /* Number of arguments */
+ uint32_t arg_n:30; /* Number of arguments */
uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+ uint32_t vme:1; /* True only if it is a built-in kernel for VME */
};
/* Allocate an empty kernel */
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 28fdef62..ee9d6143 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -202,6 +202,18 @@ do { \
} \
} while (0)
+#define CHECK_ACCELERATOR_INTEL(ACCELERATOR_INTEL) \
+do { \
+ if (UNLIKELY(ACCELERATOR_INTEL == NULL)) { \
+ err = CL_INVALID_ACCELERATOR_INTEL; \
+ goto error; \
+ } \
+ if (UNLIKELY(ACCELERATOR_INTEL->magic != CL_MAGIC_ACCELERATOR_INTEL_HEADER)) {\
+ err = CL_INVALID_ACCELERATOR_INTEL; \
+ goto error; \
+ } \
+} while (0)
+
#define CHECK_KERNEL(KERNEL) \
do { \
if (UNLIKELY(KERNEL == NULL)) { \
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 110c36f2..0c34ca97 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -38,6 +38,7 @@
#include "cl_alloc.h"
#include "cl_utils.h"
#include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
#ifndef CL_VERSION_1_2
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
@@ -941,10 +942,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
gpgpu->aux_offset.idrt_offset = size_aux;
size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
- //sampler state must be 32 bytes aligned
+ //must be 32 bytes aligned
+ //sampler state and vme state share the same buffer,
size_aux = ALIGN(size_aux, 32);
gpgpu->aux_offset.sampler_state_offset = size_aux;
- size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+ size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
+ GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t));
//sampler border color state must be 32 bytes aligned
size_aux = ALIGN(size_aux, 32);
@@ -985,6 +988,22 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
obj_bo);
}
+static void
+intel_gpgpu_set_buf_reloc_for_vme_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+ index * sizeof(gen7_surface_state_t);
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ obj_bo_offset,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_media_surface_state_t, ss0),
+ obj_bo);
+}
+
static dri_bo*
intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
{
@@ -1241,6 +1260,55 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
}
static void
+intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu,
+ uint32_t index,
+ dri_bo* obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ cl_mem_object_type type,
+ uint32_t bpp,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int32_t pitch,
+ int32_t slice_pitch,
+ int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen7_media_surface_state_t *ss = (gen7_media_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
+
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.base_addr = obj_bo->offset + obj_bo_offset;
+ ss->ss1.uv_offset_v_direction = 0;
+ ss->ss1.pic_struct = 0;
+ ss->ss1.width = w - 1;
+ ss->ss1.height = h - 1;
+ if (tiling == GPGPU_NO_TILE) {
+ ss->ss2.tile_mode = 0;
+ }
+ else if (tiling == GPGPU_TILE_X){
+ ss->ss2.tile_mode = 2;
+ }
+ else if (tiling == GPGPU_TILE_Y){
+ ss->ss2.tile_mode = 3;
+ }
+ ss->ss2.half_pitch_for_chroma = 0;
+ ss->ss2.surface_pitch = pitch - 1;
+ ss->ss2.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
+ ss->ss2.interleave_chroma = 0;
+ ss->ss2.surface_format = 12; //Y8_UNORM
+ ss->ss3.y_offset_for_u = 0;
+ ss->ss3.x_offset_for_u = 0;
+ ss->ss4.y_offset_for_v = 0;
+ ss->ss4.x_offset_for_v = 0;
+
+ intel_gpgpu_set_buf_reloc_for_vme_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+
+static void
intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
@@ -1676,6 +1744,149 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
}
}
+static void intel_gpgpu_insert_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel, uint32_t index)
+{
+ gen7_vme_state_t* vme = (gen7_vme_state_t*)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index;
+ memset(vme, 0, sizeof(*vme));
+ gen7_vme_search_path_state_t* sp = vme->sp;
+
+ if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL){
+ sp[0].dw0.SPD_0_X = 0;
+ sp[0].dw0.SPD_0_Y = 0;
+ sp[0].dw0.SPD_1_X = 0;
+ sp[0].dw0.SPD_1_Y = 0;
+ sp[0].dw0.SPD_2_X = 0;
+ sp[0].dw0.SPD_2_Y = 0;
+ sp[0].dw0.SPD_3_X = 0;
+ sp[0].dw0.SPD_3_Y = 0;
+ }
+ else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL){
+ sp[0].dw0.SPD_0_X = 1;
+ sp[0].dw0.SPD_0_Y = 0;
+ sp[0].dw0.SPD_1_X = 0;
+ sp[0].dw0.SPD_1_Y = 1;
+ sp[0].dw0.SPD_2_X = -1;
+ sp[0].dw0.SPD_2_Y = 0;
+ sp[0].dw0.SPD_3_X = 0;
+ sp[0].dw0.SPD_3_Y = 0;
+ }
+ else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL){
+ sp[0].dw0.SPD_0_X = 1;
+ sp[0].dw0.SPD_0_Y = 0;
+ sp[0].dw0.SPD_1_X = 1;
+ sp[0].dw0.SPD_1_Y = 0;
+ sp[0].dw0.SPD_2_X = 1;
+ sp[0].dw0.SPD_2_Y = 0;
+ sp[0].dw0.SPD_3_X = 1;
+ sp[0].dw0.SPD_3_Y = 0;
+
+ sp[1].dw0.SPD_0_X = 1;
+ sp[1].dw0.SPD_0_Y = 0;
+ sp[1].dw0.SPD_1_X = 1;
+ sp[1].dw0.SPD_1_Y = 0;
+ sp[1].dw0.SPD_2_X = 1;
+ sp[1].dw0.SPD_2_Y = 0;
+ sp[1].dw0.SPD_3_X = 0;
+ sp[1].dw0.SPD_3_Y = 1;
+
+ sp[2].dw0.SPD_0_X = -1;
+ sp[2].dw0.SPD_0_Y = 0;
+ sp[2].dw0.SPD_1_X = -1;
+ sp[2].dw0.SPD_1_Y = 0;
+ sp[2].dw0.SPD_2_X = -1;
+ sp[2].dw0.SPD_2_Y = 0;
+ sp[2].dw0.SPD_3_X = -1;
+ sp[2].dw0.SPD_3_Y = 0;
+
+ sp[3].dw0.SPD_0_X = -1;
+ sp[3].dw0.SPD_0_Y = 0;
+ sp[3].dw0.SPD_1_X = -1;
+ sp[3].dw0.SPD_1_Y = 0;
+ sp[3].dw0.SPD_2_X = -1;
+ sp[3].dw0.SPD_2_Y = 0;
+ sp[3].dw0.SPD_3_X = 0;
+ sp[3].dw0.SPD_3_Y = 1;
+
+ sp[4].dw0.SPD_0_X = 1;
+ sp[4].dw0.SPD_0_Y = 0;
+ sp[4].dw0.SPD_1_X = 1;
+ sp[4].dw0.SPD_1_Y = 0;
+ sp[4].dw0.SPD_2_X = 1;
+ sp[4].dw0.SPD_2_Y = 0;
+ sp[4].dw0.SPD_3_X = 1;
+ sp[4].dw0.SPD_3_Y = 0;
+
+ sp[5].dw0.SPD_0_X = 1;
+ sp[5].dw0.SPD_0_Y = 0;
+ sp[5].dw0.SPD_1_X = 1;
+ sp[5].dw0.SPD_1_Y = 0;
+ sp[5].dw0.SPD_2_X = 1;
+ sp[5].dw0.SPD_2_Y = 0;
+ sp[5].dw0.SPD_3_X = 0;
+ sp[5].dw0.SPD_3_Y = 1;
+
+ sp[6].dw0.SPD_0_X = -1;
+ sp[6].dw0.SPD_0_Y = 0;
+ sp[6].dw0.SPD_1_X = -1;
+ sp[6].dw0.SPD_1_Y = 0;
+ sp[6].dw0.SPD_2_X = -1;
+ sp[6].dw0.SPD_2_Y = 0;
+ sp[6].dw0.SPD_3_X = -1;
+ sp[6].dw0.SPD_3_Y = 0;
+
+ sp[7].dw0.SPD_0_X = -1;
+ sp[7].dw0.SPD_0_Y = 0;
+ sp[7].dw0.SPD_1_X = -1;
+ sp[7].dw0.SPD_1_Y = 0;
+ sp[7].dw0.SPD_2_X = -1;
+ sp[7].dw0.SPD_2_Y = 0;
+ sp[7].dw0.SPD_3_X = 0;
+ sp[7].dw0.SPD_3_Y = 1;
+
+ sp[8].dw0.SPD_0_X = 1;
+ sp[8].dw0.SPD_0_Y = 0;
+ sp[8].dw0.SPD_1_X = 1;
+ sp[8].dw0.SPD_1_Y = 0;
+ sp[8].dw0.SPD_2_X = 1;
+ sp[8].dw0.SPD_2_Y = 0;
+ sp[8].dw0.SPD_3_X = 1;
+ sp[8].dw0.SPD_3_Y = 0;
+
+ sp[9].dw0.SPD_0_X = 1;
+ sp[9].dw0.SPD_0_Y = 0;
+ sp[9].dw0.SPD_1_X = 1;
+ sp[9].dw0.SPD_1_Y = 0;
+ sp[9].dw0.SPD_2_X = 1;
+ sp[9].dw0.SPD_2_Y = 0;
+ sp[9].dw0.SPD_3_X = 0;
+ sp[9].dw0.SPD_3_Y = 1;
+
+ sp[10].dw0.SPD_0_X = -1;
+ sp[10].dw0.SPD_0_Y = 0;
+ sp[10].dw0.SPD_1_X = -1;
+ sp[10].dw0.SPD_1_Y = 0;
+ sp[10].dw0.SPD_2_X = -1;
+ sp[10].dw0.SPD_2_Y = 0;
+ sp[10].dw0.SPD_3_X = -1;
+ sp[10].dw0.SPD_3_Y = 0;
+
+ sp[11].dw0.SPD_0_X = -1;
+ sp[11].dw0.SPD_0_Y = 0;
+ sp[11].dw0.SPD_1_X = -1;
+ sp[11].dw0.SPD_1_Y = 0;
+ sp[11].dw0.SPD_2_X = -1;
+ sp[11].dw0.SPD_2_Y = 0;
+ sp[11].dw0.SPD_3_X = 0;
+ sp[11].dw0.SPD_3_Y = 0;
+ }
+}
+
+static void
+intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel)
+{
+ intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, 0);
+}
+
static void
intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
{
@@ -2181,6 +2392,7 @@ intel_set_gpgpu_callbacks(int device_id)
cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7;
+ cl_gpgpu_bind_vme_state = (cl_gpgpu_bind_vme_state_cb *) intel_gpgpu_bind_vme_state_gen7;
cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
@@ -2258,6 +2470,7 @@ intel_set_gpgpu_callbacks(int device_id)
}
else if (IS_IVYBRIDGE(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+ cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen7;
if (IS_BAYTRAIL_T(device_id)) {
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index fd6a82b9..c112a160 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -381,6 +381,57 @@ typedef struct gen8_surface_state
} ss15;
} gen8_surface_state_t;
+typedef struct gen7_media_surface_state
+{
+ struct {
+ uint32_t base_addr;
+ } ss0;
+
+ struct {
+ uint32_t uv_offset_v_direction:2;
+ uint32_t pic_struct:2;
+ uint32_t width:14;
+ uint32_t height:14;
+ } ss1;
+
+ struct {
+ uint32_t tile_mode:2;
+ uint32_t half_pitch_for_chroma:1;
+ uint32_t surface_pitch:18;
+ uint32_t pad1:1;
+ uint32_t surface_object_control_state:4;
+ uint32_t pad0:1;
+ uint32_t interleave_chroma:1;
+ uint32_t surface_format:4;
+ } ss2;
+
+ struct {
+ uint32_t y_offset_for_u:14;
+ uint32_t pad1:2;
+ uint32_t x_offset_for_u:14;
+ uint32_t pad0:2;
+ } ss3;
+
+ struct {
+ uint32_t y_offset_for_v:15;
+ uint32_t pad1:1;
+ uint32_t x_offset_for_v:14;
+ uint32_t pad0:2;
+ } ss4;
+
+ struct {
+ uint32_t pad0;
+ } ss5;
+
+ struct {
+ uint32_t pad0;
+ } ss6;
+
+ struct {
+ uint32_t pad0;
+ } ss7;
+} gen7_media_surface_state_t;
+
typedef union gen_surface_state
{
gen7_surface_state_t gen7_surface_state;
@@ -555,6 +606,75 @@ typedef struct gen8_pipe_control
} dw5;
} gen8_pipe_control_t;
+#define GEN7_NUM_VME_SEARCH_PATH_STATES 14
+#define GEN7_NUM_VME_RD_LUT_SETS 4
+
+typedef struct gen7_vme_search_path_state
+{
+ struct {
+ uint32_t SPD_0_X : BITFIELD_RANGE(0, 3); //search path distance
+ uint32_t SPD_0_Y : BITFIELD_RANGE(4, 7);
+ uint32_t SPD_1_X : BITFIELD_RANGE(8, 11);
+ uint32_t SPD_1_Y : BITFIELD_RANGE(12, 15);
+ uint32_t SPD_2_X : BITFIELD_RANGE(16, 19);
+ uint32_t SPD_2_Y : BITFIELD_RANGE(20, 23);
+ uint32_t SPD_3_X : BITFIELD_RANGE(24, 27);
+ uint32_t SPD_3_Y : BITFIELD_RANGE(28, 31);
+ }dw0;
+}gen7_vme_search_path_state_t;
+
+typedef struct gen7_vme_rd_lut_set
+{
+ struct {
+ uint32_t LUT_MbMode_0 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_1 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_2 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_3 : BITFIELD_RANGE(24, 31);
+ }dw0;
+
+ struct {
+ uint32_t LUT_MbMode_4 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_5 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_6 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_7 : BITFIELD_RANGE(24, 31);
+ }dw1;
+
+ struct {
+ uint32_t LUT_MV_0 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MV_1 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MV_2 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MV_3 : BITFIELD_RANGE(24, 31);
+ }dw2;
+
+ struct {
+ uint32_t LUT_MV_4 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MV_5 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MV_6 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MV_7 : BITFIELD_RANGE(24, 31);
+ }dw3;
+}gen7_vme_rd_lut_set_t;
+
+typedef struct gen7_vme_state
+{
+ gen7_vme_search_path_state_t sp[GEN7_NUM_VME_SEARCH_PATH_STATES];
+
+ struct {
+ uint32_t LUT_MbMode_8_0 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_9_0 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_8_1 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_9_1 : BITFIELD_RANGE(24, 31);
+ }dw14;
+
+ struct {
+ uint32_t LUT_MbMode_8_2 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_9_2 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_8_3 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_9_3 : BITFIELD_RANGE(24, 31);
+ }dw15;
+
+ gen7_vme_rd_lut_set_t lut[GEN7_NUM_VME_RD_LUT_SETS];
+}gen7_vme_state_t;
+
typedef struct gen6_sampler_state
{
struct {
diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl
new file mode 100644
index 00000000..5a223381
--- /dev/null
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -0,0 +1,233 @@
+typedef struct _motion_estimation_desc_intel {
+ uint mb_block_type;
+ uint subpixel_mode;
+ uint sad_adjust_mode;
+ uint search_path_type;
+} accelerator_intel_t;
+
+__kernel __attribute__((reqd_work_group_size(16,1,1)))
+void block_motion_estimate_intel(accelerator_intel_t accel,
+ __read_only image2d_t src_image,
+ __read_only image2d_t ref_image,
+ __global short2 * prediction_motion_vector_buffer,
+ __global short2 * motion_vector_buffer,
+ __global ushort * residuals){
+
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+
+ uint8 vme_result = (0, 0, 0, 0, 0, 0, 0, 0);
+
+ int lgid_x = get_group_id(0);
+ int lgid_y = get_group_id(1);
+
+ uint2 srcCoord = 0;
+
+ srcCoord.x = lgid_x * 16;
+ srcCoord.y = lgid_y * 16;
+
+ //TODO: This line of code is just to workaround a curbe related bug caused by commit 061d214a6fc2876a0e24e094f87f2a172984bc23
+ //After fix, this line should be removed.
+ src_grf0_dw5 = accel.mb_block_type;
+
+ //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+ if(accel.search_path_type == 0x0){
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+ src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = 0xfffefffe;
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = 0xfffefffe;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ | (0 << 16) | (2 << 8) | (2);
+ }
+ //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
+ else if(accel.search_path_type == 0x1){
+ src_grf0_dw5 = (24 << 24) | (24 << 16) | (0 << 8) | (0);
+ src_grf0_dw1 = 0xfffcfffc;
+ src_grf0_dw0 = 0xfffcfffc;
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ | (0 << 16) | (48 << 8) | (48);
+ }
+ //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
+ else if(accel.search_path_type == 0x5){
+ src_grf0_dw5 = (40 << 24) | (48 << 16) | (0 << 8) | (0);
+ src_grf0_dw1 = 0xfff4fff0;
+ src_grf0_dw0 = 0xfff4fff0;
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ | (0 << 16) | (48 << 8) | (48);
+ }
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (0x7e << 24) | (0 << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (0 << 20) | (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (0 << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (srcCoord.y << 16) | (srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/
+ src_grf1_dw7 = 0;
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = (Cost_Center1Y << 16) | (Cost_Center1X);
+ src_grf1_dw4 = (Cost_Center0Y << 16) | (Cost_Center0X);
+ src_grf1_dw3 = (Ime_Too_Good << 24 ) | (Ime_Too_Bad << 16) | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ src_grf1_dw1 = (0 << 24) | (2);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+ /*src_grf2_dw7 = Ref1_SkipCenter_3_Delta_XY;
+ src_grf2_dw6 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf2_dw5 = Ref1_SkipCenter_2_Delta_XY;
+ src_grf2_dw4 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf2_dw3 = Ref1_SkipCenter_1_Delta_XY;
+ src_grf2_dw2 = Ref0_SkipCenter_1_Delta_XY;
+ src_grf2_dw1 = Ref1_SkipCenter_0_Delta_XY;
+ src_grf2_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16) | (Ref0_Skip_Center_0_Delta_X);
+ src_grf3_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+ src_grf3_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+ src_grf3_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+ src_grf3_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+ src_grf3_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+ src_grf3_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+ | (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+ //src_grf3_dw1 = (?) | (Reserved) | ((Intra_16x16_Mode_Mask);
+ src_grf3_dw0 = (Reserved<<25) | (Intra_16x16_Mode_Mask << 16) | (Reserved) | (Intra_16x16_Mode_Mask);
+ src_grf4_dw7 = Reserved;
+ src_grf4_dw6 = Reserved;
+ src_grf4_dw5 = Reserved;
+ src_grf4_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+ | (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+ | (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+ //src_grf4_dw3 = (?) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+ src_grf4_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+ src_grf4_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+ src_grf4_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+ | (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);*/
+ src_grf2_dw7 = 0;
+ src_grf2_dw6 = 0;
+ src_grf2_dw5 = 0;
+ src_grf2_dw4 = 0;
+ src_grf2_dw3 = 0;
+ src_grf2_dw2 = 0;
+ src_grf2_dw1 = 0;
+ src_grf2_dw0 = 0;
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+ src_grf4_dw7 = 0;
+ src_grf4_dw6 = 0;
+ src_grf4_dw5 = 0;
+ src_grf4_dw4 = 0;
+ src_grf4_dw3 = 0;
+ src_grf4_dw2 = 0;
+ src_grf4_dw1 = 0;
+ src_grf4_dw0 = 0;
+
+ vme_result = __gen_ocl_vme(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ //msg_type, vme_search_path_lut, lut_sub,
+ 1, 0, 0);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int lid_x = get_local_id(0);
+ uint simd_width = get_sub_group_size();
+ uint write_back_grf1_dw0;
+ if(simd_width == 8)
+ write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);
+ else if(simd_width == 16)
+ write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0);
+ short2 val = as_short2( write_back_grf1_dw0 );
+ int index = lgid_y * get_num_groups(0) + lgid_x;
+ if( lid_x == 0 ){
+ motion_vector_buffer[index] = val;
+ }
+
+}