author    Junyan He <junyan.he@intel.com>    2017-06-11 13:50:25 +0800
committer Yang Rong <rong.r.yang@intel.com>  2017-08-02 17:16:30 +0800
commit    11da003c26b5c35e06656d06907cb5c151950fa3 (patch)
tree      9ef336ee9e9fdb00d329e3c98d9913119a9a22f9
parent    f574c9c450d35db11937e024c2db0ebd41c6980c (diff)
download  beignet-11da003c26b5c35e06656d06907cb5c151950fa3.tar.gz
Add GEN device's GPGPU helper functions.
Add all the helper functions needed for GEN device setup, e.g., BTI setup, sampler state and image parameters.

Signed-off-by: Junyan He <junyan.he@intel.com>
-rw-r--r--    runtime/gen/gen_gpgpu_func.c    1864
1 file changed, 1864 insertions, 0 deletions
diff --git a/runtime/gen/gen_gpgpu_func.c b/runtime/gen/gen_gpgpu_func.c
new file mode 100644
index 00000000..b07fea1a
--- /dev/null
+++ b/runtime/gen/gen_gpgpu_func.c
@@ -0,0 +1,1864 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/utsname.h>
+
+/* Cache control options for gen7 */
+typedef enum cl_cache_control {
+ cc_gtt = 0x0,
+ cc_l3 = 0x1,
+ cc_llc = 0x2,
+ cc_llc_l3 = 0x3
+} cl_cache_control;
+
+/* LLCCC Cache control options for gen75 */
+typedef enum cl_llccc_cache_control {
+ llccc_pte = 0x0 << 1,
+ llccc_uc = 0x1 << 1,
+ llccc_ec = 0x2 << 1,
+ llccc_ucllc = 0x3 << 1
+} cl_llccc_cache_control;
+
+/* L3 Cache control options for gen75 */
+typedef enum cl_l3_cache_control {
+ l3cc_uc = 0x0,
+ l3cc_ec = 0x1
+} cl_l3_cache_control;
+
+/* Target Cache control options for gen8 */
+typedef enum cl_target_cache_control {
+ tcc_ec_only = 0x0 << 3,
+ tcc_llc_only = 0x1 << 3,
+ tcc_llc_ec = 0x2 << 3,
+ tcc_llc_ec_l3 = 0x3 << 3
+} cl_target_cache_control;
+
+/* Memory type LLC/ELLC Cache control options for gen8 */
+typedef enum cl_mtllc_cache_control {
+ mtllc_pte = 0x0 << 5,
+ mtllc_none = 0x1 << 5,
+ mtllc_wt = 0x2 << 5,
+ mtllc_wb = 0x3 << 5
+} cl_mtllc_cache_control;
+
+/* Various limits that we should eventually remove */
+#define GEN_MAX_SURFACES 256
+#define GEN_MAX_SAMPLERS 16
+#define GEN_MAX_VME_STATES 8
+
+/* Describe texture tiling */
+typedef enum cl_gpgpu_tiling {
+ GPGPU_NO_TILE = 0,
+ GPGPU_TILE_X = 1,
+ GPGPU_TILE_Y = 2,
+} cl_gpgpu_tiling;
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen7(void)
+{
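+ /* Gen7 default: cache reads and writes in both LLC and L3. */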
+ return cc_llc_l3;
+}
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen75(void)
+{
+ return llccc_ec | l3cc_ec;
+}
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen8(void)
+{
+ return tcc_llc_ec_l3 | mtllc_wb;
+}
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen9(void)
+{
+ //Kernel-defined cache control (MOCS) table entry 2:
+ //L3CC: WB; LeCC: WB; TC: LLC/eLLC;
+ int major = 0, minor = 0;
+ int mocs_index = 0x2;
+
+ struct utsname buf;
+ uname(&buf);
+ sscanf(buf.release, "%d.%d", &major, &minor);
+ //Since Linux 4.3 the kernel redefines the MOCS table values,
+ //but before 4.3 the hardware default values are still used.
+ if (strcmp(buf.sysname, "Linux") == 0 &&
+ major == 4 && minor < 3) { /* the Linux kernel supports SKL from 4.x, so only check 4.x versions */
+ mocs_index = 0x9;
+ }
+
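+ /* Return the MOCS index shifted into position for the surface state's memory object control field. */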
+ return (mocs_index << 1);
+}
+
+static uint32_t
+gen_gpgpu_get_cache_ctrl(gen_gpgpu *gpgpu)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) || IS_CHERRYVIEW(gpgpu->device->device_id))
+ return __gen_gpgpu_get_cache_ctrl_gen8();
+
+ if (IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_get_cache_ctrl_gen9();
+
+ if (IS_HASWELL(gpgpu->device->device_id))
+ return __gen_gpgpu_get_cache_ctrl_gen75();
+
+ if (IS_IVYBRIDGE(gpgpu->device->device_id))
+ return __gen_gpgpu_get_cache_ctrl_gen7();
+
+ assert(0);
+ return 0;
+}
+
+static void
+__gen_gpgpu_setup_bti_gen7(gen_gpgpu *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+ size_t size, unsigned char index, uint32_t format)
+{
+ assert(size <= (2ul << 30));
+ size_t s = size - 1;
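+ /* For buffer surfaces the (size - 1) value is split across the width, height and depth fields below. */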
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen7_surface_state_t *ss0 = (gen7_surface_state_t *)&heap->surface[index * sizeof(gen7_surface_state_t)];
+ memset(ss0, 0, sizeof(gen7_surface_state_t));
+ ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss0->ss0.surface_format = format;
+ ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
+ if (format == I965_SURFACEFORMAT_RAW)
+ assert((ss0->ss2.width & 0x03) == 3);
+ ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss0->ss5.cache_control = gen_gpgpu_get_cache_ctrl(gpgpu);
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
+
+ ss0->ss1.base_addr = buf->offset + internal_offset;
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ internal_offset,
+ gpgpu->aux.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_surface_state_t, ss1),
+ buf);
+}
+
+static void
+__gen_gpgpu_setup_bti_gen75(gen_gpgpu *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+ size_t size, unsigned char index, uint32_t format)
+{
+ assert(size <= (2ul << 30));
+ size_t s = size - 1;
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen7_surface_state_t *ss0 = (gen7_surface_state_t *)&heap->surface[index * sizeof(gen7_surface_state_t)];
+ memset(ss0, 0, sizeof(gen7_surface_state_t));
+ ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss0->ss0.surface_format = format;
+ if (format != I965_SURFACEFORMAT_RAW) {
+ ss0->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+ ss0->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+ ss0->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+ ss0->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+ }
+ ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
+ if (format == I965_SURFACEFORMAT_RAW)
+ assert((ss0->ss2.width & 0x03) == 3);
+ ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss0->ss5.cache_control = gen_gpgpu_get_cache_ctrl(gpgpu);
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
+
+ ss0->ss1.base_addr = buf->offset + internal_offset;
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ internal_offset,
+ gpgpu->aux.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_surface_state_t, ss1),
+ buf);
+}
+
+static void
+__gen_gpgpu_setup_bti_gen8(gen_gpgpu *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+ size_t size, unsigned char index, uint32_t format)
+{
+ assert(size <= (2ul << 30));
+ size_t s = size - 1;
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen8_surface_state_t *ss0 = (gen8_surface_state_t *)&heap->surface[index * sizeof(gen8_surface_state_t)];
+ memset(ss0, 0, sizeof(gen8_surface_state_t));
+ ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss0->ss0.surface_format = format;
+ if (format != I965_SURFACEFORMAT_RAW) {
+ ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+ ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+ ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+ ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+ }
+ ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
+ if (format == I965_SURFACEFORMAT_RAW)
+ assert((ss0->ss2.width & 0x03) == 3);
+ ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss0->ss1.mem_obj_ctrl_state = gen_gpgpu_get_cache_ctrl(gpgpu);
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t);
+ ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff;
+ ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff;
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ internal_offset,
+ gpgpu->aux.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen8_surface_state_t, ss8),
+ buf);
+}
+
+static void
+__gen_gpgpu_setup_bti_gen9(gen_gpgpu *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+ size_t size, unsigned char index, uint32_t format)
+{
+ assert(size <= (4ul << 30));
+ size_t s = size - 1;
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen8_surface_state_t *ss0 = (gen8_surface_state_t *)&heap->surface[index * sizeof(gen8_surface_state_t)];
+ memset(ss0, 0, sizeof(gen8_surface_state_t));
+ ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss0->ss0.surface_format = format;
+ if (format != I965_SURFACEFORMAT_RAW) {
+ ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+ ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+ ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+ ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+ }
+ ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
+ if (format == I965_SURFACEFORMAT_RAW)
+ assert((ss0->ss2.width & 0x03) == 3);
+ ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss0->ss3.depth = (s >> 21) & 0x7ff; /* bits 31:21 of sz; per bspec only Gen9 supports this */
+ ss0->ss1.mem_obj_ctrl_state = gen_gpgpu_get_cache_ctrl(gpgpu);
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t);
+ ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff;
+ ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff;
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ internal_offset,
+ gpgpu->aux.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen8_surface_state_t, ss8),
+ buf);
+}
+
+static void
+gen_gpgpu_setup_bti(gen_gpgpu *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+ size_t size, unsigned char index, uint32_t format)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) || IS_CHERRYVIEW(gpgpu->device->device_id))
+ return __gen_gpgpu_setup_bti_gen8(gpgpu, buf, internal_offset, size, index, format);
+
+ if (IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_setup_bti_gen9(gpgpu, buf, internal_offset, size, index, format);
+
+ if (IS_HASWELL(gpgpu->device->device_id))
+ return __gen_gpgpu_setup_bti_gen75(gpgpu, buf, internal_offset, size, index, format);
+
+ if (IS_IVYBRIDGE(gpgpu->device->device_id))
+ return __gen_gpgpu_setup_bti_gen7(gpgpu, buf, internal_offset, size, index, format);
+
+ assert(0);
+}
+
+static cl_int
+gen_gpgpu_setup_aux(gen_gpgpu *gpu)
+{
+ uint32_t size_aux = 0;
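+ /* The aux buffer packs, in this order: surface heap, curbe, interface descriptors (idrt), sampler/VME state and sampler border color state. */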
+
+ /* begin with the surface heap to make sure it is page aligned,
+ because the state base address uses 20 bits for the address */
+ gpu->aux.surface_heap_offset = size_aux;
+ size_aux += sizeof(surface_heap_t);
+
+ //curbe must be 32-byte aligned (the offset is aligned to 64 here)
+ size_aux = ALIGN(size_aux, 64);
+ gpu->aux.curbe_offset = size_aux;
+ size_aux += gpu->thread.num_cs_entries * gpu->thread.size_cs_entry * 32;
+
+ //idrt must be 32-byte aligned
+ size_aux = ALIGN(size_aux, 32);
+ gpu->aux.idrt_offset = size_aux;
+ size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
+
+ //must be 32-byte aligned;
+ //sampler state and VME state share the same buffer
+ size_aux = ALIGN(size_aux, 32);
+ gpu->aux.sampler_state_offset = size_aux;
+ size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
+ GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t));
+
+ //sampler border color state must be 32-byte aligned
+ size_aux = ALIGN(size_aux, 32);
+ gpu->aux.sampler_border_color_state_offset = size_aux;
+ size_aux += GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t);
+
+ /* make sure aux buffer is page aligned */
+ size_aux = ALIGN(size_aux, 4096);
+
+ gpu->aux.aux_bo = dri_bo_alloc(gpu->bufmgr, "AUX_BUFFER", size_aux, 4096);
+ if (gpu->aux.aux_bo == NULL)
+ return CL_OUT_OF_RESOURCES;
+
+ if (dri_bo_map(gpu->aux.aux_bo, 1) != 0)
+ return CL_OUT_OF_RESOURCES;
+
+ memset(gpu->aux.aux_bo->virtual, 0, size_aux);
+ return CL_SUCCESS;
+}
+
+static void
+gen_gpgpu_finish_aux(gen_gpgpu *gpu)
+{
+ if (gpu->aux.aux_bo && gpu->aux.aux_bo->virtual)
+ dri_bo_unmap(gpu->aux.aux_bo);
+}
+
+static void
+__gen_gpgpu_set_buf_reloc_gen7(gen_gpgpu *gpgpu, int32_t index, dri_bo *obj_bo, uint32_t obj_bo_offset)
+{
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+ index * sizeof(gen7_surface_state_t);
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ obj_bo_offset,
+ gpgpu->aux.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_surface_state_t, ss1),
+ obj_bo);
+}
+
+static int
+__gen_is_surface_array(cl_mem_object_type type)
+{
+ if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+ type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ return 1;
+
+ return 0;
+}
+
+static int
+__get_surface_type(cl_mem_object_type type)
+{
+ switch (type) {
+ case CL_MEM_OBJECT_IMAGE1D:
+ case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+ return I965_SURFACE_1D;
+
+ case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+ case CL_MEM_OBJECT_IMAGE2D:
+ case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+ return I965_SURFACE_2D;
+
+ case CL_MEM_OBJECT_IMAGE3D:
+ return I965_SURFACE_3D;
+
+ default:
+ assert(0);
+ }
+ return 0;
+}
+
+/* Get the fixed surface type. If it is a 1D array image with a large index,
+ we need to fix it up to the 2D type due to a Gen7/Gen75 sampler issue
+ on integer type surfaces with clamp address mode and nearest filter mode.
+*/
+static uint32_t
+__gen_get_surface_type(gen_gpgpu *gpgpu, int index, cl_mem_object_type type)
+{
+ uint32_t surface_type;
+ //For now all platforms need this workaround, so the platform check is disabled;
+ //re-enable it when some platform no longer needs it.
+ if (/*((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
+ IS_HASWELL(gpgpu->drv->device_id) ||
+ IS_BROADWELL(gpgpu->drv->device_id) ||
+ IS_CHERRYVIEW(gpgpu->drv->device_id) ||
+ IS_SKYLAKE(gpgpu->drv->device_id) ||
+ IS_BROXTON(gpgpu->drv->device_id) ||
+ IS_KABYLAKE(gpgpu->drv_device_id))) && */
+ index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
+ type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ surface_type = I965_SURFACE_2D;
+ else
+ surface_type = __get_surface_type(type);
+ return surface_type;
+}
+
+static void
+__gen_gpgpu_bind_image_gen7(gen_gpgpu *gpgpu, uint32_t index, dri_bo *obj_bo, uint32_t obj_bo_offset,
+ uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h,
+ int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen7_surface_state_t *ss = (gen7_surface_state_t *)&heap->surface[index * sizeof(gen7_surface_state_t)];
+
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+ ss->ss0.surface_type = __gen_get_surface_type(gpgpu, index, type);
+ if (__gen_is_surface_array(type)) {
+ ss->ss0.surface_array = 1;
+ ss->ss0.surface_array_spacing = 1;
+ }
+ ss->ss0.surface_format = format;
+ ss->ss1.base_addr = obj_bo->offset + obj_bo_offset;
+ ss->ss2.width = w - 1;
+
+ ss->ss2.height = h - 1;
+ ss->ss3.depth = depth - 1;
+ ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+ ss->ss4.not_str_buf.min_array_element = 0;
+ ss->ss3.pitch = pitch - 1;
+ ss->ss5.cache_control = gen_gpgpu_get_cache_ctrl(gpgpu);
+ if (tiling == GPGPU_TILE_X) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
+ } else if (tiling == GPGPU_TILE_Y) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
+ }
+ ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+ __gen_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+__gen_gpgpu_bind_image_gen75(gen_gpgpu *gpgpu, uint32_t index, dri_bo *obj_bo, uint32_t obj_bo_offset,
+ uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h,
+ int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen7_surface_state_t *ss = (gen7_surface_state_t *)&heap->surface[index * sizeof(gen7_surface_state_t)];
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+ ss->ss0.surface_type = __gen_get_surface_type(gpgpu, index, type);
+ if (__gen_is_surface_array(type)) {
+ ss->ss0.surface_array = 1;
+ ss->ss0.surface_array_spacing = 1;
+ }
+ ss->ss0.surface_format = format;
+ ss->ss1.base_addr = obj_bo->offset + obj_bo_offset;
+ ss->ss2.width = w - 1;
+ ss->ss2.height = h - 1;
+ ss->ss3.depth = depth - 1;
+ ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+ ss->ss4.not_str_buf.min_array_element = 0;
+ ss->ss3.pitch = pitch - 1;
+ ss->ss5.cache_control = gen_gpgpu_get_cache_ctrl(gpgpu);
+ ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+ ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+ ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+ ss->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+ if (tiling == GPGPU_TILE_X) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
+ } else if (tiling == GPGPU_TILE_Y) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
+ }
+ ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+ __gen_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+__gen_gpgpu_bind_image_gen8(gen_gpgpu *gpgpu, uint32_t index, dri_bo *obj_bo, uint32_t obj_bo_offset,
+ uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h,
+ int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen8_surface_state_t *ss = (gen8_surface_state_t *)&heap->surface[index * sizeof(gen8_surface_state_t)];
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+ ss->ss0.surface_type = __gen_get_surface_type(gpgpu, index, type);
+ ss->ss0.surface_format = format;
+ if (__gen_is_surface_array(type)) {
+ ss->ss0.surface_array = 1;
+ ss->ss1.surface_qpitch = (h + 3) / 4;
+ }
+ ss->ss0.horizontal_alignment = 1;
+ ss->ss0.vertical_alignment = 1;
+
+ if (tiling == GPGPU_TILE_X) {
+ ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR;
+ } else if (tiling == GPGPU_TILE_Y) {
+ ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR;
+ } else
+ assert(tiling == GPGPU_NO_TILE); // W mode is not supported now.
+
+ ss->ss2.width = w - 1;
+ ss->ss2.height = h - 1;
+ ss->ss3.depth = depth - 1;
+
+ ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff;
+ ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff;
+
+ ss->ss4.render_target_view_ext = depth - 1;
+ ss->ss4.min_array_elt = 0;
+ ss->ss3.surface_pitch = pitch - 1;
+
+ ss->ss1.mem_obj_ctrl_state = gen_gpgpu_get_cache_ctrl(gpgpu);
+ ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+ ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+ ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+ ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+ ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+ index * surface_state_sz;
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ obj_bo_offset,
+ gpgpu->aux.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen8_surface_state_t, ss8),
+ obj_bo);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+__gen_gpgpu_bind_image_gen9(gen_gpgpu *gpgpu, uint32_t index, dri_bo *obj_bo, uint32_t obj_bo_offset,
+ uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h,
+ int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + gpgpu->aux.surface_heap_offset;
+ gen8_surface_state_t *ss = (gen8_surface_state_t *)&heap->surface[index * sizeof(gen8_surface_state_t)];
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+ ss->ss0.surface_type = __gen_get_surface_type(gpgpu, index, type);
+ ss->ss0.surface_format = format;
+ if (__gen_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_1D) {
+ ss->ss0.surface_array = 1;
+ ss->ss1.surface_qpitch = (slice_pitch / bpp + 3) / 4; //align_h
+ }
+
+ if (__gen_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_2D) {
+ ss->ss0.surface_array = 1;
+ ss->ss1.surface_qpitch = (slice_pitch / pitch + 3) / 4;
+ }
+
+ if (ss->ss0.surface_type == I965_SURFACE_3D)
+ ss->ss1.surface_qpitch = (slice_pitch / pitch + 3) / 4;
+
+ ss->ss0.horizontal_alignment = 1;
+ ss->ss0.vertical_alignment = 1;
+
+ if (tiling == GPGPU_TILE_X) {
+ ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR;
+ } else if (tiling == GPGPU_TILE_Y) {
+ ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR;
+ } else
+ assert(tiling == GPGPU_NO_TILE); // W mode is not supported now.
+
+ ss->ss2.width = w - 1;
+ ss->ss2.height = h - 1;
+ ss->ss3.depth = depth - 1;
+
+ ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff;
+ ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff;
+
+ ss->ss4.render_target_view_ext = depth - 1;
+ ss->ss4.min_array_elt = 0;
+ ss->ss3.surface_pitch = pitch - 1;
+
+ ss->ss1.mem_obj_ctrl_state = gen_gpgpu_get_cache_ctrl(gpgpu);
+ ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+ ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+ ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+ ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+ ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+ index * surface_state_sz;
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ obj_bo_offset,
+ gpgpu->aux.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen8_surface_state_t, ss8),
+ obj_bo);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+gen_gpgpu_bind_image(gen_gpgpu *gpgpu, uint32_t index, dri_bo *obj_bo, uint32_t obj_bo_offset,
+ uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h,
+ int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) || IS_CHERRYVIEW(gpgpu->device->device_id))
+ return __gen_gpgpu_bind_image_gen8(gpgpu, index, obj_bo, obj_bo_offset, format, type, bpp,
+ w, h, depth, pitch, slice_pitch, tiling);
+
+ if (IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_bind_image_gen9(gpgpu, index, obj_bo, obj_bo_offset, format, type, bpp,
+ w, h, depth, pitch, slice_pitch, tiling);
+
+ if (IS_HASWELL(gpgpu->device->device_id))
+ return __gen_gpgpu_bind_image_gen75(gpgpu, index, obj_bo, obj_bo_offset, format, type, bpp,
+ w, h, depth, pitch, slice_pitch, tiling);
+
+ if (IS_IVYBRIDGE(gpgpu->device->device_id))
+ return __gen_gpgpu_bind_image_gen7(gpgpu, index, obj_bo, obj_bo_offset, format, type, bpp,
+ w, h, depth, pitch, slice_pitch, tiling);
+
+ assert(0);
+}
+
+static int __translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
+{
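+ /* Map a CL addressing mode to the Gen texture coordinate mode; using_nearest is currently unused. */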
+ switch (cl_address_mode) {
+ case CLK_ADDRESS_NONE:
+ case CLK_ADDRESS_REPEAT:
+ return GEN_TEXCOORDMODE_WRAP;
+ case CLK_ADDRESS_CLAMP:
+ return GEN_TEXCOORDMODE_CLAMP_BORDER;
+ case CLK_ADDRESS_CLAMP_TO_EDGE:
+ return GEN_TEXCOORDMODE_CLAMP;
+ case CLK_ADDRESS_MIRRORED_REPEAT:
+ return GEN_TEXCOORDMODE_MIRROR;
+ default:
+ return GEN_TEXCOORDMODE_WRAP;
+ }
+}
+
+static void
+__gen_gpgpu_insert_sampler_gen7(gen_gpgpu *gpgpu, uint32_t index, uint32_t clk_sampler)
+{
+ int using_nearest = 0;
+ uint32_t wrap_mode;
+ gen7_sampler_state_t *sampler;
+
+ sampler = (gen7_sampler_state_t *)(gpgpu->aux.aux_bo->virtual + gpgpu->aux.sampler_state_offset) + index;
+ memset(sampler, 0, sizeof(*sampler));
+ assert((gpgpu->aux.aux_bo->offset + gpgpu->aux.sampler_border_color_state_offset) % 32 == 0);
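+ /* The border color pointer is programmed in 32-byte units, hence the alignment check above and the shift by 5. */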
+ sampler->ss2.default_color_pointer = (gpgpu->aux.aux_bo->offset + gpgpu->aux.sampler_border_color_state_offset) >> 5;
+ if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
+ sampler->ss3.non_normalized_coord = 1;
+ else
+ sampler->ss3.non_normalized_coord = 0;
+
+ switch (clk_sampler & __CLK_FILTER_MASK) {
+ case CLK_FILTER_NEAREST:
+ sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
+ sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+ sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
+ using_nearest = 1;
+ break;
+ case CLK_FILTER_LINEAR:
+ sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
+ sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+ sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
+ break;
+ }
+
+ wrap_mode = __translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
+ sampler->ss3.s_wrap_mode = wrap_mode;
+ /* XXX the mesa i965 driver code points out that if the surface is a 1D surface, we may need
+ * to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
+ sampler->ss3.t_wrap_mode = wrap_mode;
+ sampler->ss3.r_wrap_mode = wrap_mode;
+
+ sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+ sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+ sampler->ss0.base_level = 0;
+
+ sampler->ss1.max_lod = 0;
+ sampler->ss1.min_lod = 0;
+
+ if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST)
+ sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
+ GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
+ GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
+ if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
+ sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
+ GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
+ GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_SAMPLER, 0,
+ gpgpu->aux.sampler_border_color_state_offset,
+ gpgpu->aux.sampler_state_offset +
+ index * sizeof(gen7_sampler_state_t) +
+ offsetof(gen7_sampler_state_t, ss2),
+ gpgpu->aux.aux_bo);
+}
+
+static void
+__gen_gpgpu_insert_sampler_gen8(gen_gpgpu *gpgpu, uint32_t index, uint32_t clk_sampler)
+{
+ int using_nearest = 0;
+ uint32_t wrap_mode;
+ gen8_sampler_state_t *sampler;
+
+ sampler = (gen8_sampler_state_t *)(gpgpu->aux.aux_bo->virtual + gpgpu->aux.sampler_state_offset) + index;
+ memset(sampler, 0, sizeof(*sampler));
+ assert((gpgpu->aux.aux_bo->offset + gpgpu->aux.sampler_border_color_state_offset) % 32 == 0);
+ if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
+ sampler->ss3.non_normalized_coord = 1;
+ else
+ sampler->ss3.non_normalized_coord = 0;
+
+ switch (clk_sampler & __CLK_FILTER_MASK) {
+ case CLK_FILTER_NEAREST:
+ sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
+ sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+ sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
+ using_nearest = 1;
+ break;
+ case CLK_FILTER_LINEAR:
+ sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
+ sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+ sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
+ break;
+ }
+
+ wrap_mode = __translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
+ sampler->ss3.s_wrap_mode = wrap_mode;
+ /* XXX the mesa i965 driver code points out that if the surface is a 1D surface, we may need
+ * to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
+ sampler->ss3.t_wrap_mode = wrap_mode;
+ sampler->ss3.r_wrap_mode = wrap_mode;
+
+ sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+ sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+ sampler->ss0.base_level = 0;
+
+ sampler->ss1.max_lod = 0;
+ sampler->ss1.min_lod = 0;
+
+ if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST)
+ sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
+ GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
+ GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
+ if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
+ sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
+ GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
+ GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+}
+
+static void
+gen_gpgpu_bind_sampler(gen_gpgpu *gpgpu, uint32_t *samplers, size_t sampler_sz)
+{
+ int index;
+ assert(sampler_sz <= GEN_MAX_SAMPLERS);
+ cl_uint device_id = gpgpu->device->device_id;
+
+ for (index = 0; index < sampler_sz; index++) {
+ if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id) || IS_GEN9(device_id)) {
+ __gen_gpgpu_insert_sampler_gen8(gpgpu, index, samplers[index]);
+ continue;
+ }
+
+ __gen_gpgpu_insert_sampler_gen7(gpgpu, index, samplers[index]);
+ }
+}
+
+static void
+gen_gpgpu_alloc_constant_buffer(gen_gpgpu *gpgpu, uint32_t size, uint8_t bti)
+{
+ gpgpu->mem.const_bo = drm_intel_bo_alloc(gpgpu->bufmgr, "CONSTANT_BUFFER", size, 64);
+ if (gpgpu->mem.const_bo == NULL)
+ return;
+
+ gen_gpgpu_setup_bti(gpgpu, gpgpu->mem.const_bo, 0, size, bti, I965_SURFACEFORMAT_R32G32B32A32_UINT);
+}
+
+static void
+__gen_gpgpu_build_idrt_gen7(gen_gpgpu *gpgpu)
+{
+ gen6_interface_descriptor_t *desc;
+
+ desc = (gen6_interface_descriptor_t *)(gpgpu->aux.aux_bo->virtual + gpgpu->aux.idrt_offset);
+
+ memset(desc, 0, sizeof(*desc));
+ desc->desc0.kernel_start_pointer = (gpgpu->kernel_bo->offset) >> 6; /* reloc */
+ desc->desc1.single_program_flow = 0;
+ desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
+ desc->desc5.rounding_mode = 0; /* round to nearest even */
+
+ assert((gpgpu->aux.aux_bo->offset + gpgpu->aux.sampler_state_offset) % 32 == 0);
+ desc->desc2.sampler_state_pointer = (gpgpu->aux.aux_bo->offset + gpgpu->aux.sampler_state_offset) >> 5;
+ desc->desc3.binding_table_entry_count = 0; /* no prefetch */
+ desc->desc3.binding_table_pointer = 0;
+ desc->desc4.curbe_read_len = (gpgpu->thread.curbe_size) / 32;
+ desc->desc4.curbe_read_offset = 0;
+
+ /* Barriers / SLM are automatically handled on Gen7+ */
+ size_t slm_sz = gpgpu->mem.local_mem_size;
+ desc->desc5.group_threads_num = gpgpu->thread.barrier_slm_used ? gpgpu->thread.thread_num : 0;
+ desc->desc5.barrier_enable = gpgpu->thread.barrier_slm_used;
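+ /* Round the SLM size up to the next supported size and program it in 4KB units (hence the shift by 12 below). */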
+ if (slm_sz <= 4 * KB)
+ slm_sz = 4 * KB;
+ else if (slm_sz <= 8 * KB)
+ slm_sz = 8 * KB;
+ else if (slm_sz <= 16 * KB)
+ slm_sz = 16 * KB;
+ else if (slm_sz <= 32 * KB)
+ slm_sz = 32 * KB;
+ else
+ slm_sz = 64 * KB;
+ slm_sz = slm_sz >> 12;
+ desc->desc5.slm_sz = slm_sz;
+
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_INSTRUCTION, 0,
+ 0,
+ gpgpu->aux.idrt_offset + offsetof(gen6_interface_descriptor_t, desc0),
+ gpgpu->kernel_bo);
+
+ dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_SAMPLER, 0,
+ gpgpu->aux.sampler_state_offset,
+ gpgpu->aux.idrt_offset + offsetof(gen6_interface_descriptor_t, desc2),
+ gpgpu->aux.aux_bo);
+}
+
+static void
+__gen_gpgpu_build_idrt_gen8(gen_gpgpu *gpgpu)
+{
+ gen8_interface_descriptor_t *desc;
+
+ desc = (gen8_interface_descriptor_t *)(gpgpu->aux.aux_bo->virtual + gpgpu->aux.idrt_offset);
+
+ memset(desc, 0, sizeof(*desc));
+ desc->desc0.kernel_start_pointer = 0; /* reloc */
+ desc->desc2.single_program_flow = 0;
+ desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */
+ desc->desc6.rounding_mode = 0; /* round to nearest even */
+
+ assert((gpgpu->aux.aux_bo->offset + gpgpu->aux.sampler_state_offset) % 32 == 0);
+ desc->desc3.sampler_state_pointer = gpgpu->aux.sampler_state_offset >> 5;
+ desc->desc4.binding_table_entry_count = 0; /* no prefetch */
+ desc->desc4.binding_table_pointer = 0;
+ desc->desc5.curbe_read_len = (gpgpu->thread.curbe_size) / 32;
+ desc->desc5.curbe_read_offset = 0;
+
+ /* Barriers / SLM are automatically handled on Gen7+ */
+ size_t slm_sz = gpgpu->mem.local_mem_size;
+ /* Per bspec, group_threads_num should not be set to 0 even if the barrier is disabled */
+ desc->desc6.group_threads_num = gpgpu->thread.thread_num;
+ desc->desc6.barrier_enable = gpgpu->thread.barrier_slm_used;
+ if (slm_sz == 0)
+ slm_sz = 0;
+ else if (slm_sz <= 4 * KB)
+ slm_sz = 4 * KB;
+ else if (slm_sz <= 8 * KB)
+ slm_sz = 8 * KB;
+ else if (slm_sz <= 16 * KB)
+ slm_sz = 16 * KB;
+ else if (slm_sz <= 32 * KB)
+ slm_sz = 32 * KB;
+ else
+ slm_sz = 64 * KB;
+ slm_sz = slm_sz >> 12;
+ desc->desc6.slm_sz = slm_sz;
+}
+
+static void
+__gen_gpgpu_build_idrt_gen9(gen_gpgpu *gpgpu)
+{
+ gen8_interface_descriptor_t *desc;
+
+ desc = (gen8_interface_descriptor_t *)(gpgpu->aux.aux_bo->virtual + gpgpu->aux.idrt_offset);
+
+ memset(desc, 0, sizeof(*desc));
+ desc->desc0.kernel_start_pointer = 0; /* reloc */
+ desc->desc2.single_program_flow = 0;
+ desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */
+ desc->desc6.rounding_mode = 0; /* round to nearest even */
+
+ assert((gpgpu->aux.aux_bo->offset + gpgpu->aux.sampler_state_offset) % 32 == 0);
+ desc->desc3.sampler_state_pointer = gpgpu->aux.sampler_state_offset >> 5;
+ desc->desc4.binding_table_entry_count = 0; /* no prefetch */
+ desc->desc4.binding_table_pointer = 0;
+ desc->desc5.curbe_read_len = (gpgpu->thread.curbe_size) / 32;
+ desc->desc5.curbe_read_offset = 0;
+
+ /* Barriers / SLM are automatically handled on Gen7+ */
+ size_t slm_sz = gpgpu->mem.local_mem_size;
+ /* Per bspec, group_threads_num should not be set to 0 even if the barrier is disabled */
+ desc->desc6.group_threads_num = gpgpu->thread.thread_num;
+ desc->desc6.barrier_enable = gpgpu->thread.barrier_slm_used;
+
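+ /* Gen9 programs the SLM size as an encoded value: 1 = 1KB, 2 = 2KB, 3 = 4KB, ... 7 = 64KB. */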
+ if (slm_sz == 0)
+ slm_sz = 0;
+ else if (slm_sz <= 1 * KB)
+ slm_sz = 1;
+ else if (slm_sz <= 2 * KB)
+ slm_sz = 2;
+ else if (slm_sz <= 4 * KB)
+ slm_sz = 3;
+ else if (slm_sz <= 8 * KB)
+ slm_sz = 4;
+ else if (slm_sz <= 16 * KB)
+ slm_sz = 5;
+ else if (slm_sz <= 32 * KB)
+ slm_sz = 6;
+ else
+ slm_sz = 7;
+ desc->desc6.slm_sz = slm_sz;
+}
+
+static void
+gen_gpgpu_build_idrt(gen_gpgpu *gpgpu)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) || IS_CHERRYVIEW(gpgpu->device->device_id))
+ return __gen_gpgpu_build_idrt_gen8(gpgpu);
+
+ if (IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_build_idrt_gen9(gpgpu);
+
+ return __gen_gpgpu_build_idrt_gen7(gpgpu);
+
+ assert(0);
+}
+
+static void
+__gen_gpgpu_upload_curbes_gen7(gen_gpgpu *gpgpu, const void *data, uint32_t size)
+{
+ unsigned char *curbe = NULL;
+ uint32_t i, j;
+
+ /* Upload the data first */
+ assert(gpgpu->aux.aux_bo->virtual);
+ curbe = (unsigned char *)(gpgpu->aux.aux_bo->virtual + gpgpu->aux.curbe_offset);
+ memcpy(curbe, data, size);
+
+ /* Now put all the relocations for our flat address space */
+ for (i = 0; i < gpgpu->thread.thread_num; ++i)
+ for (j = 0; j < gpgpu->mem.binded_n; ++j) {
+ *(uint32_t *)(curbe + gpgpu->mem.binded_offset[j] + i * gpgpu->thread.curbe_size) =
+ gpgpu->mem.binded_buf[j]->offset64 + gpgpu->mem.target_buf_offset[j];
+ drm_intel_bo_emit_reloc(gpgpu->aux.aux_bo,
+ gpgpu->aux.curbe_offset + gpgpu->mem.binded_offset[j] + i * gpgpu->thread.curbe_size,
+ gpgpu->mem.binded_buf[j],
+ gpgpu->mem.target_buf_offset[j],
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER);
+ }
+}
+
+static void
+__gen_gpgpu_upload_curbes_gen8(gen_gpgpu *gpgpu, const void *data, uint32_t size)
+{
+ unsigned char *curbe = NULL;
+ uint32_t i, j;
+
+ /* Upload the data first */
+ assert(gpgpu->aux.aux_bo->virtual);
+ curbe = (unsigned char *)(gpgpu->aux.aux_bo->virtual + gpgpu->aux.curbe_offset);
+ memcpy(curbe, data, size);
+
+ /* Now put all the relocations for our flat address space */
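+ /* Unlike the Gen7 path, the full offset64 is written as a size_t (64-bit on 64-bit builds) since Gen8+ uses larger graphics addresses. */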
+ for (i = 0; i < gpgpu->thread.thread_num; ++i)
+ for (j = 0; j < gpgpu->mem.binded_n; ++j) {
+ *(size_t *)(curbe + gpgpu->mem.binded_offset[j] + i * gpgpu->thread.curbe_size) =
+ gpgpu->mem.binded_buf[j]->offset64 + gpgpu->mem.target_buf_offset[j];
+ drm_intel_bo_emit_reloc(gpgpu->aux.aux_bo,
+ gpgpu->aux.curbe_offset + gpgpu->mem.binded_offset[j] + i * gpgpu->thread.curbe_size,
+ gpgpu->mem.binded_buf[j],
+ gpgpu->mem.target_buf_offset[j],
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER);
+ }
+}
+
+static void
+gen_gpgpu_upload_curbes(gen_gpgpu *gpgpu, const void *data, uint32_t size)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_upload_curbes_gen8(gpgpu, data, size);
+
+ return __gen_gpgpu_upload_curbes_gen7(gpgpu, data, size);
+}
+
+static void
+__gen_gpgpu_pipe_control_gen7(gen_gpgpu *gpgpu)
+{
+ gen6_pipe_control_t *pc = (gen6_pipe_control_t *)
+ intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
+ memset(pc, 0, sizeof(*pc));
+ pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
+ pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+ pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+ pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+ pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+ pc->dw1.render_target_cache_flush_enable = 1;
+ pc->dw1.texture_cache_invalidation_enable = 1;
+ pc->dw1.cs_stall = 1;
+ pc->dw1.dc_flush_enable = 1;
+ //pc->dw1.instruction_cache_invalidate_enable = 1;
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_pipe_control_gen75(gen_gpgpu *gpgpu)
+{
+ gen6_pipe_control_t *pc = (gen6_pipe_control_t *)
+ intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
+ memset(pc, 0, sizeof(*pc));
+ pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
+ pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+ pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+ pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+ pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+ pc->dw1.cs_stall = 1;
+ pc->dw1.dc_flush_enable = 1;
+
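+ /* A second PIPE_CONTROL follows: flush the render target cache and invalidate the texture cache after the CS stall / DC flush above. */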
+ pc = (gen6_pipe_control_t *)
+ intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
+ memset(pc, 0, sizeof(*pc));
+ pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
+ pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+ pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+ pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+ pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+ pc->dw1.render_target_cache_flush_enable = 1;
+ pc->dw1.texture_cache_invalidation_enable = 1;
+ pc->dw1.cs_stall = 1;
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_pipe_control_gen8(gen_gpgpu *gpgpu)
+{
+ gen8_pipe_control_t *pc = (gen8_pipe_control_t *)
+ intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen8_pipe_control_t));
+ memset(pc, 0, sizeof(*pc));
+ pc->dw0.length = SIZEOF32(gen8_pipe_control_t) - 2;
+ pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+ pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+ pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+ pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+ pc->dw1.render_target_cache_flush_enable = 1;
+ pc->dw1.texture_cache_invalidation_enable = 1;
+ pc->dw1.cs_stall = 1;
+ pc->dw1.dc_flush_enable = 1;
+ //pc->dw1.instruction_cache_invalidate_enable = 1;
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+gen_gpgpu_pipe_control(gen_gpgpu *gpgpu)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_pipe_control_gen8(gpgpu);
+
+ if (IS_HASWELL(gpgpu->device->device_id))
+ return __gen_gpgpu_pipe_control_gen75(gpgpu);
+
+ if (IS_IVYBRIDGE(gpgpu->device->device_id))
+ return __gen_gpgpu_pipe_control_gen7(gpgpu);
+
+ assert(0);
+ return;
+}
+
+static const uint32_t gpgpu_l3_config_reg1[] = {
+ 0x00080040, 0x02040040, 0x00800040, 0x01000038,
+ 0x02000030, 0x01000038, 0x00000038, 0x00000040,
+ 0x0A140091, 0x09100091, 0x08900091, 0x08900091,
+ 0x010000a1,
+};
+
+static const uint32_t gpgpu_l3_config_reg2[] = {
+ 0x00000000, 0x00000000, 0x00080410, 0x00080410,
+ 0x00040410, 0x00040420, 0x00080420, 0x00080020,
+ 0x00204080, 0x00244890, 0x00284490, 0x002444A0,
+ 0x00040810,
+};
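+
+/* Index 4 in these tables is the no-SLM configuration and index 12 the SLM configuration used below. */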
+
+static void
+__gen_gpgpu_set_L3_gen7(gen_gpgpu *gpgpu, uint32_t use_slm)
+{
+ BEGIN_BATCH(gpgpu->batch, 9);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x00A00000);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+ ADVANCE_BATCH(gpgpu->batch);
+
+ gen_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+__gen_gpgpu_set_L3_baytrail(gen_gpgpu *gpgpu, uint32_t use_slm)
+{
+ BEGIN_BATCH(gpgpu->batch, 9);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x00D30000); /* General credit : High credit = 26 : 6 */
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, 0x01020021); /* {SLM=64, URB=96, DC=16, RO=16, Sum=192} */
+ else
+ OUT_BATCH(gpgpu->batch, 0x02040040); /* {SLM=0, URB=128, DC=32, RO=32, Sum=192} */
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x0); /* {I/S=0, Const=0, Tex=0} */
+
+ ADVANCE_BATCH(gpgpu->batch);
+
+ gen_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+__gen_gpgpu_set_L3_gen75(gen_gpgpu *gpgpu, uint32_t use_slm)
+{
+ /* Still set up L3 in the batch buffer for fulsim. */
+ if (gpgpu->atomic_test_result != 0) {
+ BEGIN_BATCH(gpgpu->batch, 15);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ /* FIXME: the KMD always disables atomics in L3 for some reason.
+ I checked the spec and don't think we need that workaround now.
+ Until a patch is sent to the kernel, just enable it here. */
+ OUT_BATCH(gpgpu->batch, HSW_SCRATCH1_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0); /* enable atomic in L3 */
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, HSW_ROW_CHICKEN3_HDC_OFFSET);
+ OUT_BATCH(gpgpu->batch, (1 << 6ul) << 16); /* enable atomic in L3 */
+ } else {
+ BEGIN_BATCH(gpgpu->batch, 9);
+ }
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x08800000);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+ ADVANCE_BATCH(gpgpu->batch);
+
+ //if(use_slm)
+ // gpgpu->batch->enable_slm = 1;
+ gen_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+__gen_gpgpu_set_L3_gen8(gen_gpgpu *gpgpu, uint32_t use_slm)
+{
+ BEGIN_BATCH(gpgpu->batch, 3);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN8_L3_CNTL_REG_ADDRESS_OFFSET);
+ // FIXME: this is a workaround for random hangs when switching SLM between enabled and disabled
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, 0x60000121); /* {SLM=192, URB=128, Rest=384} */
+ else
+ OUT_BATCH(gpgpu->batch, 0x60000160); /* {SLM=0, URB=384, Rest=384, Sum=768} */
+
+ //if(use_slm)
+ // gpgpu->batch->enable_slm = 1;
+ gen_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+gen_gpgpu_set_L3(gen_gpgpu *gpgpu, uint32_t use_slm)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_set_L3_gen8(gpgpu, use_slm);
+
+ if (IS_HASWELL(gpgpu->device->device_id))
+ return __gen_gpgpu_set_L3_gen75(gpgpu, use_slm);
+
+ if (IS_BAYTRAIL_T(gpgpu->device->device_id))
+ return __gen_gpgpu_set_L3_baytrail(gpgpu, use_slm);
+
+ if (IS_IVYBRIDGE(gpgpu->device->device_id))
+ return __gen_gpgpu_set_L3_gen7(gpgpu, use_slm);
+
+ assert(0);
+ return;
+}
+
+static void
+__gen_gpgpu_select_pipeline_gen7(gen_gpgpu *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 1);
+ OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_select_pipeline_gen9(gen_gpgpu *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 1);
+ OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MASK | PIPELINE_SELECT_GPGPU);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+gen_gpgpu_select_pipeline(gen_gpgpu *gpgpu)
+{
+ if (IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_select_pipeline_gen9(gpgpu);
+
+ return __gen_gpgpu_select_pipeline_gen7(gpgpu);
+}
+
+static void
+__gen_gpgpu_set_base_address_gen7(gen_gpgpu *gpgpu)
+{
+ const uint32_t def_cc = gen_gpgpu_get_cache_ctrl(gpgpu); /* default Cache Control value */
+ BEGIN_BATCH(gpgpu->batch, 10);
+ OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
+ /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3) | BASE_ADDRESS_MODIFY); /* General State Base Addr */
+ /* 0, State Mem Obj CC */
+ /* We use a state base address for the surface heap since IVB clamps the
+ * binding table pointer to 11 bits, so we cannot use pointers directly while
+ * using the surface heap.
+ */
+ assert(gpgpu->aux.surface_heap_offset % 4096 == 0);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ gpgpu->aux.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3) | BASE_ADDRESS_MODIFY));
+
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
+
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+ * to a valid bound value, otherwise, the border color pointer may be rejected and you
+ * may get incorrect border color. This is a known hardware bug. */
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_set_base_address_gen8(gen_gpgpu *gpgpu)
+{
+ const uint32_t def_cc = gen_gpgpu_get_cache_ctrl(gpgpu); /* default Cache Control value */
+ BEGIN_BATCH(gpgpu->batch, 16);
+ OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 14);
+ /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY); /* General State Base Addr */
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16));
+ /* 0, State Mem Obj CC */
+ /* We use a state base address for the surface heap since IVB clamps the
+ * binding table pointer to 11 bits, so we cannot use pointers directly while
+ * using the surface heap.
+ */
+ assert(gpgpu->aux.surface_heap_offset % 4096 == 0);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_SAMPLER,
+ I915_GEM_DOMAIN_SAMPLER,
+ gpgpu->aux.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY));
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ (0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+ OUT_BATCH(gpgpu->batch, 0);
+ //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
+ OUT_RELOC(gpgpu->batch, gpgpu->kernel_bo,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ 0 + (0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY));
+ OUT_BATCH(gpgpu->batch, 0);
+
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+ * to a valid bound value, otherwise, the border color pointer may be rejected and you
+ * may get incorrect border color. This is a known hardware bug. */
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_set_base_address_gen9(gen_gpgpu *gpgpu)
+{
+ const uint32_t def_cc = gen_gpgpu_get_cache_ctrl(gpgpu); /* default Cache Control value */
+ BEGIN_BATCH(gpgpu->batch, 19);
+ OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 17);
+ /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY); /* General State Base Addr */
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16));
+ /* 0, State Mem Obj CC */
+ /* We use a state base address for the surface heap since IVB clamps the
+ * binding table pointer to 11 bits, so we cannot use pointers directly while
+ * using the surface heap.
+ */
+ assert(gpgpu->aux.surface_heap_offset % 4096 == 0);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_SAMPLER,
+ I915_GEM_DOMAIN_SAMPLER,
+ gpgpu->aux.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY));
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux.aux_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ (0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+ OUT_BATCH(gpgpu->batch, 0);
+ //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
+ OUT_RELOC(gpgpu->batch, gpgpu->kernel_bo,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ 0 + (0 | (def_cc << 4) | (0 << 1) | BASE_ADDRESS_MODIFY));
+ OUT_BATCH(gpgpu->batch, 0);
+
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+ * to a valid bound value, otherwise, the border color pointer may be rejected and you
+ * may get incorrect border color. This is a known hardware bug. */
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ /* Bindless surface state base address */
+ OUT_BATCH(gpgpu->batch, (def_cc << 4) | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0xfffff000);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+gen_gpgpu_set_base_address(gen_gpgpu *gpgpu)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) || IS_CHERRYVIEW(gpgpu->device->device_id))
+ return __gen_gpgpu_set_base_address_gen8(gpgpu);
+
+ if (IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_set_base_address_gen9(gpgpu);
+
+ return __gen_gpgpu_set_base_address_gen7(gpgpu);
+}
+
+static uint32_t
+__gen_gpgpu_get_scratch_index_gen7(uint32_t size)
+{
+ return size / 1024 - 1;
+}
+
+static uint32_t
+__gen_gpgpu_get_scratch_index_gen75(uint32_t size)
+{
+ //Aligned in the backend; if not a power of two, it must be aligned when the scratch bo is allocated.
+ assert((size & (size - 1)) == 0);
+ size = size >> 11;
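+ /* The encoded index is log2(size / 2KB). */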
+ uint32_t index = 0;
+ while ((size >>= 1) > 0)
+ index++; //get leading one
+
+ return index;
+}
+
+static uint32_t
+__gen_gpgpu_get_scratch_index_gen8(uint32_t size)
+{
+ //Aligned in the backend; if not a power of two, it must be aligned when the scratch bo is allocated.
+ assert((size & (size - 1)) == 0);
+ size = size >> 10;
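+ /* The encoded index is log2(size / 1KB). */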
+ uint32_t index = 0;
+ while ((size >>= 1) > 0)
+ index++; //get leading one
+
+ return index;
+}
+
+static uint32_t
+gen_gpgpu_get_scratch_index(gen_gpgpu *gpgpu, uint32_t size)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_get_scratch_index_gen8(size);
+
+ if (IS_HASWELL(gpgpu->device->device_id))
+ return __gen_gpgpu_get_scratch_index_gen75(size);
+
+ if (IS_IVYBRIDGE(gpgpu->device->device_id))
+ return __gen_gpgpu_get_scratch_index_gen7(size);
+
+ assert(0);
+ return -1;
+}
+
+static cl_int
+__gen_gpgpu_get_max_curbe_size(uint32_t device_id)
+{
+ if (IS_BAYTRAIL_T(device_id) ||
+ IS_IVB_GT1(device_id))
+ return 992;
+ else
+ return 2016;
+}
+
+static cl_int
+__gen_gpgpu_get_curbe_size(gen_gpgpu *gpgpu)
+{
+ int curbe_size = gpgpu->thread.size_cs_entry * gpgpu->thread.num_cs_entries;
+ int max_curbe_size = __gen_gpgpu_get_max_curbe_size(gpgpu->device->device_id);
+
+ if (curbe_size > max_curbe_size) {
+ CL_LOG_WARNING("warning, curbe size exceeds the limit.");
+ return max_curbe_size;
+ } else
+ return curbe_size;
+}
+
+static void
+__gen_gpgpu_load_vfe_state_gen7(gen_gpgpu *gpgpu)
+{
+ int32_t scratch_index;
+ BEGIN_BATCH(gpgpu->batch, 8);
+ OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8 - 2));
+
+ if (gpgpu->thread.per_thread_scratch > 0) {
+ scratch_index = gen_gpgpu_get_scratch_index(gpgpu, gpgpu->thread.per_thread_scratch);
+ OUT_RELOC(gpgpu->batch, gpgpu->mem.scratch_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ scratch_index);
+ } else {
+ OUT_BATCH(gpgpu->batch, 0);
+ }
+ /* max_thread | urb entries | (reset_gateway | bypass_gateway | gpgpu_mode) */
+ OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->thread.max_thread_num - 1) << 16) | (0 << 8) | 0xc4);
+ OUT_BATCH(gpgpu->batch, 0);
+ /* curbe_size */
+ OUT_BATCH(gpgpu->batch, __gen_gpgpu_get_curbe_size(gpgpu));
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_load_vfe_state_gen8(gen_gpgpu *gpgpu)
+{
+ int32_t scratch_index;
+ BEGIN_BATCH(gpgpu->batch, 9);
+ OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (9 - 2));
+
+ if (gpgpu->thread.per_thread_scratch > 0) {
+ scratch_index = gen_gpgpu_get_scratch_index(gpgpu, gpgpu->thread.per_thread_scratch);
+ OUT_RELOC(gpgpu->batch, gpgpu->mem.scratch_bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ scratch_index);
+ } else {
+ OUT_BATCH(gpgpu->batch, 0);
+ }
+ OUT_BATCH(gpgpu->batch, 0);
+
+ /* max_thread | urb entries | (reset_gateway | bypass_gateway | gpgpu_mode) */
+ OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->thread.max_thread_num - 1) << 16) | (2 << 8) | 0xc0); //urb entries can't be 0
+ OUT_BATCH(gpgpu->batch, 0);
+ /* urb entries size | curbe_size */
+ OUT_BATCH(gpgpu->batch, 2 << 16 | __gen_gpgpu_get_curbe_size(gpgpu));
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+gen_gpgpu_load_vfe_state(gen_gpgpu *gpgpu)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_load_vfe_state_gen8(gpgpu);
+
+ return __gen_gpgpu_load_vfe_state_gen7(gpgpu);
+}
+
+static void
+__gen_gpgpu_load_curbe_buffer_gen7(gen_gpgpu *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2, 0, 1) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
+ OUT_BATCH(gpgpu->batch, __gen_gpgpu_get_curbe_size(gpgpu) * 32);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux.aux_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux.curbe_offset);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_load_curbe_buffer_gen8(gen_gpgpu *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2, 0, 1) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
+ OUT_BATCH(gpgpu->batch, __gen_gpgpu_get_curbe_size(gpgpu) * 32);
+ OUT_BATCH(gpgpu->batch, gpgpu->aux.curbe_offset);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+gen_gpgpu_load_curbe_buffer(gen_gpgpu *gpgpu)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_load_curbe_buffer_gen8(gpgpu);
+
+ return __gen_gpgpu_load_curbe_buffer_gen7(gpgpu);
+}
+
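+/* Load the interface descriptor table: 1 << 5 = 32 bytes, i.e. one interface
+ * descriptor, located at idrt_offset in the aux buffer. */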
+static void
+__gen_gpgpu_load_idrt_gen7(gen_gpgpu *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2, 0, 2) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
+ OUT_BATCH(gpgpu->batch, 1 << 5);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux.aux_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux.idrt_offset);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+__gen_gpgpu_load_idrt_gen8(gen_gpgpu *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2, 0, 2) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
+ OUT_BATCH(gpgpu->batch, 1 << 5);
+ OUT_BATCH(gpgpu->batch, gpgpu->aux.idrt_offset);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+gen_gpgpu_load_idrt(gen_gpgpu *gpgpu)
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_load_idrt_gen8(gpgpu);
+
+ return __gen_gpgpu_load_idrt_gen7(gpgpu);
+}
+
+/* Emit a PIPE_CONTROL that writes the current GPU timestamp into the timestamp buffer. */
+static void
+gen_gpgpu_write_timestamp(gen_gpgpu *gpgpu, int idx)
+{
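+ /* Slot 0 is written at batch start and slot 1 at batch end; each slot is a
+    uint64_t at idx * sizeof(uint64_t) in time_stamp_bo. */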
+ BEGIN_BATCH(gpgpu->batch, 5);
+ OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5 - 2));
+ OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
+ OUT_RELOC(gpgpu->batch, gpgpu->mem.time_stamp_bo,
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
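+/* Emit the fixed GPGPU setup sequence at the start of each batch: flush,
+ * L3/SLM configuration, pipeline select, base addresses, VFE state, CURBE
+ * load and interface descriptor load, plus the optional start timestamp. */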
+static void
+gen_gpgpu_batch_start(gen_gpgpu *gpgpu)
+{
+ intel_batchbuffer_start_atomic(gpgpu->batch, 256);
+ gen_gpgpu_pipe_control(gpgpu);
+ gen_gpgpu_set_L3(gpgpu, gpgpu->thread.barrier_slm_used);
+ gen_gpgpu_select_pipeline(gpgpu);
+ gen_gpgpu_set_base_address(gpgpu);
+ gen_gpgpu_load_vfe_state(gpgpu);
+ gen_gpgpu_load_curbe_buffer(gpgpu);
+ gen_gpgpu_load_idrt(gpgpu);
+
+ /* Insert a PIPE_CONTROL to record the start timestamp */
+ if (gpgpu->mem.time_stamp_bo)
+ gen_gpgpu_write_timestamp(gpgpu, 0);
+}
+
+static void
+__gen_gpgpu_walker_gen7(gen_gpgpu *gpgpu, uint32_t simd_sz, uint32_t thread_n,
+ const size_t global_wk_off[3], const size_t global_dim_off[3],
+ const size_t global_wk_sz[3], const size_t local_wk_sz[3])
+{
+ const uint32_t global_wk_dim[3] = {
+ global_wk_sz[0] / local_wk_sz[0],
+ global_wk_sz[1] / local_wk_sz[1],
+ global_wk_sz[2] / local_wk_sz[2]};
+ uint32_t right_mask = ~0x0;
+ size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
+
+ assert(simd_sz == 8 || simd_sz == 16);
+
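+ /* The right execution mask disables the trailing lanes of the last SIMD
+    thread when the work-group size is not a multiple of the SIMD width.
+    For example, group_sz = 20 with simd_sz = 16 gives shift = 4, so
+    right_mask = 0xF and only 4 lanes run in the last thread; an exact
+    multiple keeps all simd_sz lanes enabled. */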
+ uint32_t shift = (group_sz & (simd_sz - 1));
+ shift = (shift == 0) ? simd_sz : shift;
+ right_mask = (1 << shift) - 1;
+
+ BEGIN_BATCH(gpgpu->batch, 11);
+ OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
+ OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
+ assert(thread_n <= 64);
+ if (simd_sz == 16)
+ OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n - 1)); /* SIMD16 | thread max */
+ else
+ OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n - 1)); /* SIMD8 | thread max */
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+ OUT_BATCH(gpgpu->batch, right_mask);
+ OUT_BATCH(gpgpu->batch, ~0x0); /* the walker height is always 1, so the bottom execution mask is all 1s */
+ ADVANCE_BATCH(gpgpu->batch);
+
+ BEGIN_BATCH(gpgpu->batch, 2);
+ OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
+ OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
+ ADVANCE_BATCH(gpgpu->batch);
+
+ if (IS_IVYBRIDGE(gpgpu->device->device_id))
+ gen_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+__gen_gpgpu_walker_gen8(gen_gpgpu *gpgpu, uint32_t simd_sz, uint32_t thread_n,
+ const size_t global_wk_off[3], const size_t global_dim_off[3],
+ const size_t global_wk_sz[3], const size_t local_wk_sz[3])
+{
+ const uint32_t global_wk_dim[3] = {
+ global_wk_sz[0] / local_wk_sz[0],
+ global_wk_sz[1] / local_wk_sz[1],
+ global_wk_sz[2] / local_wk_sz[2]};
+ uint32_t right_mask = ~0x0;
+ size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
+
+ assert(simd_sz == 8 || simd_sz == 16);
+
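+ /* Same trailing-lane execution mask computation as the Gen7 walker above. */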
+ uint32_t shift = (group_sz & (simd_sz - 1));
+ shift = (shift == 0) ? simd_sz : shift;
+ right_mask = (1 << shift) - 1;
+
+ BEGIN_BATCH(gpgpu->batch, 15);
+ OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 13);
+ OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
+ OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Length */
+ OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Start Address */
+ assert(thread_n <= 64);
+ if (simd_sz == 16)
+ OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n - 1)); /* SIMD16 | thread max */
+ else
+ OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n - 1)); /* SIMD8 | thread max */
+ OUT_BATCH(gpgpu->batch, global_dim_off[0]);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[0] + global_dim_off[0]);
+ OUT_BATCH(gpgpu->batch, global_dim_off[1]);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[1] + global_dim_off[1]);
+ OUT_BATCH(gpgpu->batch, global_dim_off[2]);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[2] + global_dim_off[2]);
+ OUT_BATCH(gpgpu->batch, right_mask);
+ OUT_BATCH(gpgpu->batch, ~0x0); /* the walker height is always 1, so the bottom execution mask is all 1s */
+ ADVANCE_BATCH(gpgpu->batch);
+
+ BEGIN_BATCH(gpgpu->batch, 2);
+ OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
+ OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
+ ADVANCE_BATCH(gpgpu->batch);
+
+ gen_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+gen_gpgpu_walker(gen_gpgpu *gpgpu, uint32_t simd_sz, uint32_t thread_n,
+ const size_t global_wk_off[3], const size_t global_dim_off[3],
+ const size_t global_wk_sz[3], const size_t local_wk_sz[3])
+{
+ if (IS_BROADWELL(gpgpu->device->device_id) ||
+ IS_CHERRYVIEW(gpgpu->device->device_id) || IS_GEN9(gpgpu->device->device_id))
+ return __gen_gpgpu_walker_gen8(gpgpu, simd_sz, thread_n, global_wk_off,
+ global_dim_off, global_wk_sz, local_wk_sz);
+
+ return __gen_gpgpu_walker_gen7(gpgpu, simd_sz, thread_n, global_wk_off,
+ global_dim_off, global_wk_sz, local_wk_sz);
+}
+
+static void
+__gen_gpgpu_post_action_gen7(gen_gpgpu *gpgpu, int32_t flush_mode)
+{
+ if (flush_mode)
+ gen_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+__gen_gpgpu_post_action_gen75(gen_gpgpu *gpgpu, int32_t flush_mode)
+{
+ /* Always flush before reprogramming the L3 configuration */
+ gen_gpgpu_pipe_control(gpgpu);
+
+ /* Restore the L3 control to disable SLM mode; otherwise it may
+    affect the 3D pipeline */
+ gen_gpgpu_set_L3(gpgpu, 0);
+}
+
+static void
+gen_gpgpu_post_action(gen_gpgpu *gpgpu, int32_t flush_mode)
+{
+ if (IS_HASWELL(gpgpu->device->device_id))
+ return __gen_gpgpu_post_action_gen75(gpgpu, flush_mode);
+
+ return __gen_gpgpu_post_action_gen7(gpgpu, flush_mode);
+}
+
+static void
+gen_gpgpu_batch_end(gen_gpgpu *gpgpu, int32_t flush_mode)
+{
+ /* Insert a PIPE_CONTROL to record the end timestamp */
+ if (gpgpu->mem.time_stamp_bo)
+ gen_gpgpu_write_timestamp(gpgpu, 1);
+
+ gen_gpgpu_post_action(gpgpu, flush_mode);
+ intel_batchbuffer_end_atomic(gpgpu->batch);
+}
+
+/* Read back one of the GPU execution timestamps (in nanoseconds). */
+LOCAL void
+gen_gpgpu_event_get_exec_timestamp(void *gpgpu_ctx, int index, uint64_t *ret_ts)
+{
+ gen_gpgpu_exec_ctx *gpu_exec_ctx = gpgpu_ctx;
+ uint64_t result = 0;
+ /* Use the timestamps of the last GPGPU state in this execution context */
+ gen_gpgpu *gpgpu = gpu_exec_ctx->all_gpu[gpu_exec_ctx->gpu_num - 1];
+ assert(gpgpu);
+
+ assert(gpgpu->mem.time_stamp_bo);
+ assert(index == 0 || index == 1);
+ drm_intel_gem_bo_map_gtt(gpgpu->mem.time_stamp_bo);
+ uint64_t *ptr = gpgpu->mem.time_stamp_bo->virtual;
+ result = ptr[index];
+
+ /* According to the BSpec the timestamp counter is 36 bits wide, but compared
+    with the counter obtained through the I/O-control register read path the
+    top 4 bits appear to be bogus. To keep the two counters consistent we
+    simply drop those 4 bits.
+  */
+ result = (result & 0x0FFFFFFFF) * 80; /* one tick is 80ns; convert to nanoseconds */
+ *ret_ts = result;
+
+ drm_intel_gem_bo_unmap_gtt(gpgpu->mem.time_stamp_bo);
+}
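+
+/* A minimal sketch of how a caller can turn the two timestamp slots into an
+ * elapsed GPU time; the variable names here are illustrative only:
+ *
+ *   uint64_t start_ns, end_ns;
+ *   gen_gpgpu_event_get_exec_timestamp(gpgpu_ctx, 0, &start_ns);
+ *   gen_gpgpu_event_get_exec_timestamp(gpgpu_ctx, 1, &end_ns);
+ *   uint64_t elapsed_ns = end_ns - start_ns;  // subject to 32-bit counter wrap
+ */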
+
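+/* MMIO offset of the GPU TIMESTAMP register read via drm_intel_reg_read below */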
+#define TIMESTAMP_ADDR 0x2358
+
+/* On IVB and HSW the result MUST be shifted on x86_64 systems */
+static uint64_t
+__gen_gpgpu_read_ts_reg_gen7(drm_intel_bufmgr *bufmgr)
+{
+ uint64_t result = 0;
+ drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+ /* On x86_64 the low 32 bits of the timestamp counter end up in the high 32
+    bits of the value returned by drm_intel_reg_read and bits 32-35 are lost,
+    while on i386 the value matches the BSpec. This looks like a kernel readq
+    bug, so shift right by 32 on x86_64 and keep only the low 32 bits on i386.
+  */
+ struct utsname buf;
+ uname(&buf);
+ /* The user space may be 32-bit while the kernel is 64-bit, so the compiler's
+    target flags cannot tell us the kernel architecture; use uname instead. */
+ /* x86_64 on Linux, amd64 on BSD */
+ if (strcmp(buf.machine, "x86_64") == 0 || strcmp(buf.machine, "amd64") == 0)
+ return result >> 32;
+ else
+ return result & 0x0ffffffff;
+}
+
+/* Baytrail's result only needs the high 4 bits cleared */
+static uint64_t
+__gen_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
+{
+ uint64_t result = 0;
+ drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+ return result & 0x0ffffffff;
+}
+
+static uint64_t
+gen_gpgpu_read_ts_reg(cl_device_id device, intel_driver_t *drv)
+{
+ if (IS_CHERRYVIEW(device->device_id) || IS_BAYTRAIL_T(device->device_id))
+ return __gen_gpgpu_read_ts_reg_baytrail(drv->bufmgr);
+
+ return __gen_gpgpu_read_ts_reg_gen7(drv->bufmgr);
+}
+
+LOCAL void
+gen_gpgpu_event_get_gpu_cur_timestamp(cl_device_id device, intel_driver_t *drv, uint64_t *ret_ts)
+{
+ uint64_t result = 0;
+
+ /* Get the ts that match the bspec */
+ result = gen_gpgpu_read_ts_reg(device, drv);
+ result *= 80;
+
+ *ret_ts = result;
+}