/* * Copyright © 2012 Intel Corporation * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see . * * Author: Benjamin Segovia * Alexei Soupikov */ #include #include #include #include #include #include #include #include #include #include #include #include #include "intel/intel_gpgpu.h" #include "intel/intel_defines.h" #include "intel/intel_structs.h" #include "program.h" // for BTI_RESERVED_NUM #include "cl_alloc.h" #include "cl_utils.h" #include "cl_sampler.h" #include "cl_accelerator_intel.h" #ifndef CL_VERSION_1_2 #define CL_MEM_OBJECT_IMAGE1D 0x10F4 #define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 #define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 #define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 #endif #define GEN_CMD_MEDIA_OBJECT (0x71000000) #define MO_TS_BIT (1 << 24) #define MO_RETAIN_BIT (1 << 28) #define SAMPLER_STATE_SIZE (16) #define TIMESTAMP_ADDR 0x2358 /* Stores both binding tables and surface states */ typedef struct surface_heap { uint32_t binding_table[256]; char surface[256*sizeof(gen_surface_state_t)]; } surface_heap_t; typedef struct intel_event { drm_intel_bo *buffer; drm_intel_bo *ts_buf; int status; } intel_event_t; #define MAX_IF_DESC 32 typedef struct intel_gpgpu intel_gpgpu_t; typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm); intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL; typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size); intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL; typedef void (intel_gpgpu_post_action_t)(intel_gpgpu_t *gpgpu, int32_t flush_mode); intel_gpgpu_post_action_t *intel_gpgpu_post_action = NULL; typedef uint64_t (intel_gpgpu_read_ts_reg_t)(drm_intel_bufmgr *bufmgr); intel_gpgpu_read_ts_reg_t *intel_gpgpu_read_ts_reg = NULL; typedef void (intel_gpgpu_set_base_address_t)(intel_gpgpu_t *gpgpu); intel_gpgpu_set_base_address_t *intel_gpgpu_set_base_address = NULL; typedef void (intel_gpgpu_setup_bti_t)(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, size_t size, unsigned char index, uint32_t format); intel_gpgpu_setup_bti_t *intel_gpgpu_setup_bti = NULL; typedef void (intel_gpgpu_load_vfe_state_t)(intel_gpgpu_t *gpgpu); intel_gpgpu_load_vfe_state_t *intel_gpgpu_load_vfe_state = NULL; typedef void (intel_gpgpu_build_idrt_t)(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel); intel_gpgpu_build_idrt_t *intel_gpgpu_build_idrt = NULL; typedef void (intel_gpgpu_load_curbe_buffer_t)(intel_gpgpu_t *gpgpu); intel_gpgpu_load_curbe_buffer_t *intel_gpgpu_load_curbe_buffer = NULL; typedef void (intel_gpgpu_load_idrt_t)(intel_gpgpu_t *gpgpu); intel_gpgpu_load_idrt_t *intel_gpgpu_load_idrt = NULL; typedef void (intel_gpgpu_pipe_control_t)(intel_gpgpu_t *gpgpu); intel_gpgpu_pipe_control_t *intel_gpgpu_pipe_control = NULL; typedef void (intel_gpgpu_select_pipeline_t)(intel_gpgpu_t *gpgpu); intel_gpgpu_select_pipeline_t *intel_gpgpu_select_pipeline = NULL; static void intel_gpgpu_sync(void *buf) { if (buf) drm_intel_bo_wait_rendering((drm_intel_bo 
*)buf); } static void *intel_gpgpu_ref_batch_buf(intel_gpgpu_t *gpgpu) { if (gpgpu->batch->last_bo) drm_intel_bo_reference(gpgpu->batch->last_bo); return gpgpu->batch->last_bo; } static void intel_gpgpu_unref_batch_buf(void *buf) { if (buf) drm_intel_bo_unreference((drm_intel_bo *)buf); } static void intel_gpgpu_delete_finished(intel_gpgpu_t *gpgpu) { if (gpgpu == NULL) return; if(gpgpu->time_stamp_b.bo) drm_intel_bo_unreference(gpgpu->time_stamp_b.bo); if(gpgpu->printf_b.bo) drm_intel_bo_unreference(gpgpu->printf_b.bo); if (gpgpu->aux_buf.bo) drm_intel_bo_unreference(gpgpu->aux_buf.bo); if (gpgpu->perf_b.bo) drm_intel_bo_unreference(gpgpu->perf_b.bo); if (gpgpu->stack_b.bo) drm_intel_bo_unreference(gpgpu->stack_b.bo); if (gpgpu->scratch_b.bo) drm_intel_bo_unreference(gpgpu->scratch_b.bo); if (gpgpu->profiling_b.bo) drm_intel_bo_unreference(gpgpu->profiling_b.bo); if(gpgpu->constant_b.bo) drm_intel_bo_unreference(gpgpu->constant_b.bo); intel_batchbuffer_delete(gpgpu->batch); cl_free(gpgpu); } /* Destroy the all intel_gpgpu, no matter finish or not, when driver destroy */ void intel_gpgpu_delete_all(intel_driver_t *drv) { struct intel_gpgpu_node *p; if(drv->gpgpu_list == NULL) return; PPTHREAD_MUTEX_LOCK(drv); while(drv->gpgpu_list) { p = drv->gpgpu_list; drv->gpgpu_list = p->next; intel_gpgpu_delete_finished(p->gpgpu); cl_free(p); } PPTHREAD_MUTEX_UNLOCK(drv); } static void intel_gpgpu_delete(intel_gpgpu_t *gpgpu) { if (gpgpu == NULL) return; intel_driver_t *drv = gpgpu->drv; struct intel_gpgpu_node *p, *node; PPTHREAD_MUTEX_LOCK(drv); p = drv->gpgpu_list; if(p) { node = p->next; while(node) { if(node->gpgpu->batch && node->gpgpu->batch->buffer && !drm_intel_bo_busy(node->gpgpu->batch->buffer)) { p->next = node->next; intel_gpgpu_delete_finished(node->gpgpu); cl_free(node); node = p->next; } else { p = node; node = node->next; } } node = drv->gpgpu_list; if(node->gpgpu->batch && node->gpgpu->batch->buffer && !drm_intel_bo_busy(node->gpgpu->batch->buffer)) { drv->gpgpu_list = drv->gpgpu_list->next; intel_gpgpu_delete_finished(node->gpgpu); cl_free(node); } } if (gpgpu == NULL) return; if(gpgpu->batch && gpgpu->batch->buffer && drm_intel_bo_busy(gpgpu->batch->buffer)) { TRY_ALLOC_NO_ERR (node, CALLOC(struct intel_gpgpu_node)); node->gpgpu = gpgpu; node->next = NULL; p = drv->gpgpu_list; if(p == NULL) drv->gpgpu_list= node; else { while(p->next) p = p->next; p->next = node; } } else intel_gpgpu_delete_finished(gpgpu); error: PPTHREAD_MUTEX_UNLOCK(drv); } static intel_gpgpu_t* intel_gpgpu_new(intel_driver_t *drv) { intel_gpgpu_t *state = NULL; TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t)); state->drv = drv; state->batch = intel_batchbuffer_new(state->drv); assert(state->batch); exit: return state; error: intel_gpgpu_delete(state); state = NULL; goto exit; } static void intel_gpgpu_select_pipeline_gen7(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 1); OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_select_pipeline_gen9(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 1); OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MASK | PIPELINE_SELECT_GPGPU); ADVANCE_BATCH(gpgpu->batch); } static uint32_t intel_gpgpu_get_cache_ctrl_gen7() { return cc_llc_l3; } static uint32_t intel_gpgpu_get_cache_ctrl_gen75() { return llccc_ec | l3cc_ec; } static uint32_t intel_gpgpu_get_cache_ctrl_gen8() { return tcc_llc_ec_l3 | mtllc_wb; } static uint32_t intel_gpgpu_get_cache_ctrl_gen9() { //Kernel-defined cache control 
registers 2: //L3CC: WB; LeCC: WB; TC: LLC/eLLC; int major = 0, minor = 0; int mocs_index = 0x2; struct utsname buf; uname(&buf); sscanf(buf.release, "%d.%d", &major, &minor); //From linux 4.3, kernel redefined the mocs table's value, //But before 4.3, still used the hw defautl value. if(strcmp(buf.sysname, "Linux") == 0 && major == 4 && minor < 3) { /* linux kernel support skl from 4.x, so check from 4 */ mocs_index = 0x9; } return (mocs_index << 1); } static void intel_gpgpu_set_base_address_gen7(intel_gpgpu_t *gpgpu) { const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */ BEGIN_BATCH(gpgpu->batch, 10); OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8); /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY); /* General State Base Addr */ /* 0, State Mem Obj CC */ /* We use a state base address for the surface heap since IVB clamp the * binding table pointer at 11 bits. So, we cannot use pointers directly while * using the surface heap */ assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0); OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY)); OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY); /* According to mesa i965 driver code, we must set the dynamic state access upper bound * to a valid bound value, otherwise, the border color pointer may be rejected and you * may get incorrect border color. This is a known hardware bug. */ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY); OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_set_base_address_gen8(intel_gpgpu_t *gpgpu) { const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */ BEGIN_BATCH(gpgpu->batch, 16); OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 14); /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY); /* General State Base Addr */ OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16)); /* 0, State Mem Obj CC */ /* We use a state base address for the surface heap since IVB clamp the * binding table pointer at 11 bits. 
So, we cannot use pointers directly while * using the surface heap */ assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0); OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_SAMPLER, I915_GEM_DOMAIN_SAMPLER, gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); OUT_BATCH(gpgpu->batch, 0); OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */ OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */ OUT_BATCH(gpgpu->batch, 0); //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */ OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); /* According to mesa i965 driver code, we must set the dynamic state access upper bound * to a valid bound value, otherwise, the border color pointer may be rejected and you * may get incorrect border color. This is a known hardware bug. */ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_set_base_address_gen9(intel_gpgpu_t *gpgpu) { const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */ BEGIN_BATCH(gpgpu->batch, 19); OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 17); /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY); /* General State Base Addr */ OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16)); /* 0, State Mem Obj CC */ /* We use a state base address for the surface heap since IVB clamp the * binding table pointer at 11 bits. So, we cannot use pointers directly while * using the surface heap */ assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0); OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_SAMPLER, I915_GEM_DOMAIN_SAMPLER, gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); OUT_BATCH(gpgpu->batch, 0); OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */ OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */ OUT_BATCH(gpgpu->batch, 0); //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */ OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); /* According to mesa i965 driver code, we must set the dynamic state access upper bound * to a valid bound value, otherwise, the border color pointer may be rejected and you * may get incorrect border color. This is a known hardware bug. 
*/ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY); /* Bindless surface state base address */ OUT_BATCH(gpgpu->batch, (def_cc << 4) | BASE_ADDRESS_MODIFY); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0xfffff000); ADVANCE_BATCH(gpgpu->batch); } uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) { return size / 1024 - 1; } uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) { //align in backend, if non pow2, must align when alloc scratch bo. assert((size & (size - 1)) == 0); size = size >> 11; uint32_t index = 0; while((size >>= 1) > 0) index++; //get leading one return index; } uint32_t intel_gpgpu_get_scratch_index_gen8(uint32_t size) { //align in backend, if non pow2, must align when alloc scratch bo. assert((size & (size - 1)) == 0); size = size >> 10; uint32_t index = 0; while((size >>= 1) > 0) index++; //get leading one return index; } static cl_int intel_gpgpu_get_max_curbe_size(uint32_t device_id) { if (IS_BAYTRAIL_T(device_id) || IS_IVB_GT1(device_id)) return 992; else return 2016; } static cl_int intel_gpgpu_get_curbe_size(intel_gpgpu_t *gpgpu) { int curbe_size = gpgpu->curb.size_cs_entry * gpgpu->curb.num_cs_entries; int max_curbe_size = intel_gpgpu_get_max_curbe_size(gpgpu->drv->device_id); if (curbe_size > max_curbe_size) { fprintf(stderr, "warning, curbe size exceed limitation.\n"); return max_curbe_size; } else return curbe_size; } static void intel_gpgpu_load_vfe_state_gen7(intel_gpgpu_t *gpgpu) { int32_t scratch_index; BEGIN_BATCH(gpgpu->batch, 8); OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2)); if(gpgpu->per_thread_scratch > 0) { scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch); OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, scratch_index); } else { OUT_BATCH(gpgpu->batch, 0); } /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */ OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (0 << 8) | 0xc4); OUT_BATCH(gpgpu->batch, 0); /* curbe_size */ OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu)); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_load_vfe_state_gen8(intel_gpgpu_t *gpgpu) { int32_t scratch_index; BEGIN_BATCH(gpgpu->batch, 9); OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (9-2)); if(gpgpu->per_thread_scratch > 0) { scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch); OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, scratch_index); } else { OUT_BATCH(gpgpu->batch, 0); } OUT_BATCH(gpgpu->batch, 0); /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */ OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (2 << 8) | 0xc0); //urb entries can't be 0 OUT_BATCH(gpgpu->batch, 0); /* urb entries size | curbe_size */ OUT_BATCH(gpgpu->batch, 2<<16 | intel_gpgpu_get_curbe_size(gpgpu)); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_load_curbe_buffer_gen7(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 4); OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ OUT_BATCH(gpgpu->batch, 0); /* mbz */ OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32); OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, 
I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.curbe_offset); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_load_curbe_buffer_gen8(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 4); OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ OUT_BATCH(gpgpu->batch, 0); /* mbz */ OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32); OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.curbe_offset); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_load_idrt_gen7(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 4); OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */ OUT_BATCH(gpgpu->batch, 0); /* mbz */ OUT_BATCH(gpgpu->batch, 1 << 5); OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.idrt_offset); ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_load_idrt_gen8(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 4); OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */ OUT_BATCH(gpgpu->batch, 0); /* mbz */ OUT_BATCH(gpgpu->batch, 1 << 5); OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.idrt_offset); ADVANCE_BATCH(gpgpu->batch); } static const uint32_t gpgpu_l3_config_reg1[] = { 0x00080040, 0x02040040, 0x00800040, 0x01000038, 0x02000030, 0x01000038, 0x00000038, 0x00000040, 0x0A140091, 0x09100091, 0x08900091, 0x08900091, 0x010000a1 }; static const uint32_t gpgpu_l3_config_reg2[] = { 0x00000000, 0x00000000, 0x00080410, 0x00080410, 0x00040410, 0x00040420, 0x00080420, 0x00080020, 0x00204080, 0x00244890, 0x00284490, 0x002444A0, 0x00040810 }; /* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */ static void intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx) { BEGIN_BATCH(gpgpu->batch, 5); OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2)); OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP); OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t)); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, 0); ADVANCE_BATCH(); } static void intel_gpgpu_pipe_control_gen7(intel_gpgpu_t *gpgpu) { gen6_pipe_control_t* pc = (gen6_pipe_control_t*) intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t)); memset(pc, 0, sizeof(*pc)); pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2; pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL; pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL; pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D; pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX; pc->dw1.render_target_cache_flush_enable = 1; pc->dw1.texture_cache_invalidation_enable = 1; pc->dw1.cs_stall = 1; pc->dw1.dc_flush_enable = 1; //pc->dw1.instruction_cache_invalidate_enable = 1; ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_pipe_control_gen75(intel_gpgpu_t *gpgpu) { gen6_pipe_control_t* pc = (gen6_pipe_control_t*) intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t)); memset(pc, 0, sizeof(*pc)); pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2; pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL; pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL; pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D; pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX; pc->dw1.cs_stall = 1; pc->dw1.dc_flush_enable = 1; pc = (gen6_pipe_control_t*) intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t)); memset(pc, 0, sizeof(*pc)); pc->dw0.length = 
SIZEOF32(gen6_pipe_control_t) - 2; pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL; pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL; pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D; pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX; pc->dw1.render_target_cache_flush_enable = 1; pc->dw1.texture_cache_invalidation_enable = 1; pc->dw1.cs_stall = 1; ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_pipe_control_gen8(intel_gpgpu_t *gpgpu) { gen8_pipe_control_t* pc = (gen8_pipe_control_t*) intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen8_pipe_control_t)); memset(pc, 0, sizeof(*pc)); pc->dw0.length = SIZEOF32(gen8_pipe_control_t) - 2; pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL; pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL; pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D; pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX; pc->dw1.render_target_cache_flush_enable = 1; pc->dw1.texture_cache_invalidation_enable = 1; pc->dw1.cs_stall = 1; pc->dw1.dc_flush_enable = 1; //pc->dw1.instruction_cache_invalidate_enable = 1; ADVANCE_BATCH(gpgpu->batch); } static void intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm) { BEGIN_BATCH(gpgpu->batch, 9); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET); OUT_BATCH(gpgpu->batch, 0x00A00000); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET); if (use_slm) OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]); else OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET); if (use_slm) OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]); else OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]); ADVANCE_BATCH(gpgpu->batch); intel_gpgpu_pipe_control(gpgpu); } static void intel_gpgpu_set_L3_baytrail(intel_gpgpu_t *gpgpu, uint32_t use_slm) { BEGIN_BATCH(gpgpu->batch, 9); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET); OUT_BATCH(gpgpu->batch, 0x00D30000); /* General credit : High credit = 26 : 6 */ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET); if (use_slm) OUT_BATCH(gpgpu->batch, 0x01020021); /* {SLM=64, URB=96, DC=16, RO=16, Sum=192} */ else OUT_BATCH(gpgpu->batch, 0x02040040); /* {SLM=0, URB=128, DC=32, RO=32, Sum=192} */ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET); OUT_BATCH(gpgpu->batch, 0x0); /* {I/S=0, Const=0, Tex=0} */ ADVANCE_BATCH(gpgpu->batch); intel_gpgpu_pipe_control(gpgpu); } static void intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm) { /* still set L3 in batch buffer for fulsim. */ if(gpgpu->drv->atomic_test_result != SELF_TEST_ATOMIC_FAIL) { BEGIN_BATCH(gpgpu->batch, 15); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ /* FIXME: KMD always disable the atomic in L3 for some reason. I checked the spec, and don't think we need that workaround now. Before I send a patch to kernel, let's just enable it here. 
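(Note, not from the original comment: the register writes that follow put 0 into HSW_SCRATCH1_OFFSET and write (1 << 6) << 16 into HSW_ROW_CHICKEN3_HDC_OFFSET; the latter appears to be a masked write, where the high 16 bits select which low bits are updated, so it clears bit 6, the L3 global-atomics-disable chicken bit, which matches the "enable atomic in L3" comments below.)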
*/ OUT_BATCH(gpgpu->batch, HSW_SCRATCH1_OFFSET); OUT_BATCH(gpgpu->batch, 0); /* enable atomic in L3 */ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, HSW_ROW_CHICKEN3_HDC_OFFSET); OUT_BATCH(gpgpu->batch, (1 << 6ul) << 16); /* enable atomic in L3 */ } else { BEGIN_BATCH(gpgpu->batch, 9); } OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET); OUT_BATCH(gpgpu->batch, 0x08800000); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET); if (use_slm) OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]); else OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET); if (use_slm) OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]); else OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]); ADVANCE_BATCH(gpgpu->batch); //if(use_slm) // gpgpu->batch->enable_slm = 1; intel_gpgpu_pipe_control(gpgpu); } static void intel_gpgpu_set_L3_gen8(intel_gpgpu_t *gpgpu, uint32_t use_slm) { BEGIN_BATCH(gpgpu->batch, 3); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN8_L3_CNTL_REG_ADDRESS_OFFSET); // FIXME, this is a workaround for switch SLM enable and disable random hang if(use_slm) OUT_BATCH(gpgpu->batch, 0x60000121); /* {SLM=192, URB=128, Rest=384} */ else OUT_BATCH(gpgpu->batch, 0x60000160); /* {SLM=0, URB=384, Rest=384, Sum=768} */ //if(use_slm) // gpgpu->batch->enable_slm = 1; intel_gpgpu_pipe_control(gpgpu); } static void intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu) { intel_batchbuffer_start_atomic(gpgpu->batch, 256); intel_gpgpu_pipe_control(gpgpu); assert(intel_gpgpu_set_L3); intel_gpgpu_set_L3(gpgpu, gpgpu->ker->use_slm); intel_gpgpu_select_pipeline(gpgpu); intel_gpgpu_set_base_address(gpgpu); intel_gpgpu_load_vfe_state(gpgpu); intel_gpgpu_load_curbe_buffer(gpgpu); intel_gpgpu_load_idrt(gpgpu); if (gpgpu->perf_b.bo) { BEGIN_BATCH(gpgpu->batch, 3); OUT_BATCH(gpgpu->batch, (0x28 << 23) | /* MI_REPORT_PERF_COUNT */ (3 - 2)); /* length-2 */ OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0 | /* Offset for the start "counters" */ 1); /* Use GTT and not PGTT */ OUT_BATCH(gpgpu->batch, 0); ADVANCE_BATCH(gpgpu->batch); } /* Insert PIPE_CONTROL for time stamp of start*/ if (gpgpu->time_stamp_b.bo) intel_gpgpu_write_timestamp(gpgpu, 0); } static void intel_gpgpu_post_action_gen7(intel_gpgpu_t *gpgpu, int32_t flush_mode) { if(flush_mode) intel_gpgpu_pipe_control(gpgpu); } static void intel_gpgpu_post_action_gen75(intel_gpgpu_t *gpgpu, int32_t flush_mode) { /* flush force for set L3 */ intel_gpgpu_pipe_control(gpgpu); /* Restore L3 control to disable SLM mode, otherwise, may affect 3D pipeline */ intel_gpgpu_set_L3(gpgpu, 0); } static void intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode) { /* Insert PIPE_CONTROL for time stamp of end*/ if (gpgpu->time_stamp_b.bo) intel_gpgpu_write_timestamp(gpgpu, 1); /* Insert the performance counter command */ if (gpgpu->perf_b.bo) { BEGIN_BATCH(gpgpu->batch, 3); OUT_BATCH(gpgpu->batch, (0x28 << 23) | /* MI_REPORT_PERF_COUNT */ (3 - 2)); /* length-2 */ OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 512 | /* Offset for the end "counters" */ 1); /* Use GTT and not PGTT */ OUT_BATCH(gpgpu->batch, 0); ADVANCE_BATCH(gpgpu->batch); } 
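/* Run the per-generation post action before closing the batch: on Gen7 it emits a
 * pipe control only when flush_mode is set, while on Gen75 it always flushes and
 * re-emits the L3 configuration with SLM disabled so the 3D pipeline is not left
 * with the SLM partitioning. */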
intel_gpgpu_post_action(gpgpu, flush_mode); intel_batchbuffer_end_atomic(gpgpu->batch); } static int intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz) { return intel_batchbuffer_reset(gpgpu->batch, sz); } static int intel_gpgpu_flush(intel_gpgpu_t *gpgpu) { if (!gpgpu->batch || !gpgpu->batch->buffer) return 0; return intel_batchbuffer_flush(gpgpu->batch); /* FIXME: Remove old assert here for binded buffer offset 0 which tried to guard possible NULL buffer pointer check in kernel, as in case like "runtime_null_kernel_arg", but that's wrong to just take buffer offset 0 as NULL, and cause failure for normal kernels which has no such NULL ptr check but with buffer offset 0 (which is possible now and will be normal if full PPGTT is on). Need to fix NULL ptr check otherwise. */ } static int intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling) { drm_intel_bo *bo; /* Binded buffers */ gpgpu->binded_n = 0; gpgpu->img_bitmap = 0; gpgpu->img_index_base = 3; gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1); /* URB */ gpgpu->curb.num_cs_entries = 64; gpgpu->curb.size_cs_entry = size_cs_entry; gpgpu->max_threads = max_threads; if (gpgpu->printf_b.bo) dri_bo_unreference(gpgpu->printf_b.bo); gpgpu->printf_b.bo = NULL; if (gpgpu->profiling_b.bo) dri_bo_unreference(gpgpu->profiling_b.bo); gpgpu->profiling_b.bo = NULL; /* Set the profile buffer*/ if(gpgpu->time_stamp_b.bo) dri_bo_unreference(gpgpu->time_stamp_b.bo); gpgpu->time_stamp_b.bo = NULL; if (profiling) { bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096); gpgpu->time_stamp_b.bo = bo; if (!bo) fprintf(stderr, "Could not allocate buffer for profiling.\n"); } /* stack */ if (gpgpu->stack_b.bo) dri_bo_unreference(gpgpu->stack_b.bo); gpgpu->stack_b.bo = NULL; /* Set the auxiliary buffer*/ uint32_t size_aux = 0; if(gpgpu->aux_buf.bo) dri_bo_unreference(gpgpu->aux_buf.bo); gpgpu->aux_buf.bo = NULL; /* begin with surface heap to make sure it's page aligned, because state base address use 20bit for the address */ gpgpu->aux_offset.surface_heap_offset = size_aux; size_aux += sizeof(surface_heap_t); //curbe must be 32 bytes aligned size_aux = ALIGN(size_aux, 64); gpgpu->aux_offset.curbe_offset = size_aux; size_aux += gpgpu->curb.num_cs_entries * gpgpu->curb.size_cs_entry * 32; //idrt must be 32 bytes aligned size_aux = ALIGN(size_aux, 32); gpgpu->aux_offset.idrt_offset = size_aux; size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor); //must be 32 bytes aligned //sampler state and vme state share the same buffer, size_aux = ALIGN(size_aux, 32); gpgpu->aux_offset.sampler_state_offset = size_aux; size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t), GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t)); //sampler border color state must be 32 bytes aligned size_aux = ALIGN(size_aux, 32); gpgpu->aux_offset.sampler_border_color_state_offset = size_aux; size_aux += GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t); /* make sure aux buffer is page aligned */ size_aux = ALIGN(size_aux, 4096); bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 4096); if (!bo || dri_bo_map(bo, 1) != 0) { fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno)); if (bo) dri_bo_unreference(bo); if (profiling && gpgpu->time_stamp_b.bo) dri_bo_unreference(gpgpu->time_stamp_b.bo); gpgpu->time_stamp_b.bo = NULL; return -1; } memset(bo->virtual, 0, size_aux); gpgpu->aux_buf.bo = bo; return 0; } static void intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t 
index, dri_bo* obj_bo, uint32_t obj_bo_offset) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t); dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, obj_bo_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen7_surface_state_t, ss1), obj_bo); } static void intel_gpgpu_set_buf_reloc_for_vme_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t); dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, obj_bo_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen7_media_surface_state_t, ss0), obj_bo); } static dri_bo* intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti) { if(gpgpu->constant_b.bo) dri_bo_unreference(gpgpu->constant_b.bo); gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size, 64); if (gpgpu->constant_b.bo == NULL) return NULL; intel_gpgpu_setup_bti(gpgpu, gpgpu->constant_b.bo, 0, size, bti, I965_SURFACEFORMAT_R32G32B32A32_UINT); return gpgpu->constant_b.bo; } static void intel_gpgpu_setup_bti_gen7(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, size_t size, unsigned char index, uint32_t format) { assert(size <= (2ul<<30)); size_t s = size - 1; surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)]; memset(ss0, 0, sizeof(gen7_surface_state_t)); ss0->ss0.surface_type = I965_SURFACE_BUFFER; ss0->ss0.surface_format = format; ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */ // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 4 byte. 
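/* Worked example (illustrative, not from the original source): the buffer size is
 * encoded as s = size - 1 split across the width, height and depth bitfields, e.g.
 * a 64 KB buffer gives s = 0xFFFF, so width = 0x7F (bits 6:0), height = 0x1FF
 * (bits 20:7) and depth = 0 (bits 30:21). For I965_SURFACEFORMAT_RAW the assert
 * below checks that the two low bits of s are set, i.e. that the size is a
 * multiple of 4 bytes as the bspec note above requires. */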
if(format == I965_SURFACEFORMAT_RAW) assert((ss0->ss2.width & 0x03) == 3); ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */ ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t); ss0->ss1.base_addr = buf->offset + internal_offset; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, internal_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen7_surface_state_t, ss1), buf); } static void intel_gpgpu_setup_bti_gen75(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, size_t size, unsigned char index, uint32_t format) { assert(size <= (2ul<<30)); size_t s = size - 1; surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)]; memset(ss0, 0, sizeof(gen7_surface_state_t)); ss0->ss0.surface_type = I965_SURFACE_BUFFER; ss0->ss0.surface_format = format; if(format != I965_SURFACEFORMAT_RAW) { ss0->ss7.shader_r = I965_SURCHAN_SELECT_RED; ss0->ss7.shader_g = I965_SURCHAN_SELECT_GREEN; ss0->ss7.shader_b = I965_SURCHAN_SELECT_BLUE; ss0->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA; } ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */ // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 4 byte. if(format == I965_SURFACEFORMAT_RAW) assert((ss0->ss2.width & 0x03) == 3); ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */ ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t); ss0->ss1.base_addr = buf->offset + internal_offset; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, internal_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen7_surface_state_t, ss1), buf); } static void intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, size_t size, unsigned char index, uint32_t format) { assert(size <= (2ul<<30)); size_t s = size - 1; surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen8_surface_state_t *ss0 = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)]; memset(ss0, 0, sizeof(gen8_surface_state_t)); ss0->ss0.surface_type = I965_SURFACE_BUFFER; ss0->ss0.surface_format = format; if(format != I965_SURFACEFORMAT_RAW) { ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED; ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN; ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE; ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA; } ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */ // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 4 byte. 
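/* On Gen8 and Gen9 the surface state set up below differs from Gen7 mainly in
 * addressing: the 64-bit buf->offset64 is split across ss8.surface_base_addr_lo
 * and ss9.surface_base_addr_hi (with the relocation targeting ss8), the cache
 * control moves to ss1.mem_obj_ctrl_state, and on Gen9 the depth field widens to
 * bits 31:21 so buffers up to 4 GB can be described. */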
if(format == I965_SURFACEFORMAT_RAW) assert((ss0->ss2.width & 0x03) == 3); ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */ ss0->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl(); heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t); ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff; ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, internal_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen8_surface_state_t, ss8), buf); } static void intel_gpgpu_setup_bti_gen9(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, size_t size, unsigned char index, uint32_t format) { assert(size <= (4ul<<30)); size_t s = size - 1; surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen8_surface_state_t *ss0 = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)]; memset(ss0, 0, sizeof(gen8_surface_state_t)); ss0->ss0.surface_type = I965_SURFACE_BUFFER; ss0->ss0.surface_format = format; if(format != I965_SURFACEFORMAT_RAW) { ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED; ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN; ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE; ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA; } ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */ // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 4 byte. if(format == I965_SURFACEFORMAT_RAW) assert((ss0->ss2.width & 0x03) == 3); ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ ss0->ss3.depth = (s >> 21) & 0x7ff; /* bits 31:21 of sz, from bespec only gen 9 support that*/ ss0->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl(); heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t); ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff; ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, internal_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen8_surface_state_t, ss8), buf); } static int intel_is_surface_array(cl_mem_object_type type) { if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY || type == CL_MEM_OBJECT_IMAGE2D_ARRAY) return 1; return 0; } static int intel_get_surface_type(cl_mem_object_type type) { switch (type) { case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_ARRAY: return I965_SURFACE_1D; case CL_MEM_OBJECT_IMAGE1D_BUFFER: case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: return I965_SURFACE_2D; case CL_MEM_OBJECT_IMAGE3D: return I965_SURFACE_3D; default: assert(0); } return 0; } /* Get fixed surface type. If it is a 1D array image with a large index, we need to fixup it to 2D type due to a Gen7/Gen75's sampler issue on a integer type surface with clamp address mode and nearest filter mode. 
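In practice this means a CL_MEM_OBJECT_IMAGE1D_ARRAY bound at binding table index BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM or above is described to the hardware as I965_SURFACE_2D instead of I965_SURFACE_1D, which get_surface_type() below now applies on all platforms.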
*/ static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type) { uint32_t surface_type; //Now all platforms need it, so disable platform, re-enable it //when some platform don't need this workaround if (/*((IS_IVYBRIDGE(gpgpu->drv->device_id) || IS_HASWELL(gpgpu->drv->device_id) || IS_BROADWELL(gpgpu->drv->device_id) || IS_CHERRYVIEW(gpgpu->drv->device_id) || IS_SKYLAKE(gpgpu->drv->device_id) || IS_BROXTON(gpgpu->drv->device_id) || IS_KABYLAKE(gpgpu->drv_device_id))) && */ index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM && type == CL_MEM_OBJECT_IMAGE1D_ARRAY) surface_type = I965_SURFACE_2D; else surface_type = intel_get_surface_type(type); return surface_type; } static void intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu, uint32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset, uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h, int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen7_surface_state_t *ss = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)]; memset(ss, 0, sizeof(*ss)); ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2 ss->ss0.surface_type = get_surface_type(gpgpu, index, type); if (intel_is_surface_array(type)) { ss->ss0.surface_array = 1; ss->ss0.surface_array_spacing = 1; } if (obj_bo_offset && tiling != GPGPU_NO_TILE) { uint32_t unaligned = obj_bo_offset; obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000; uint32_t h_ = (unaligned - obj_bo_offset )/ pitch; ss->ss5.y_offset = h_ / 2; } ss->ss0.surface_format = format; ss->ss1.base_addr = obj_bo->offset + obj_bo_offset; ss->ss2.width = w - 1; ss->ss2.height = h - 1; ss->ss3.depth = depth - 1; ss->ss4.not_str_buf.rt_view_extent = depth - 1; ss->ss4.not_str_buf.min_array_element = 0; ss->ss3.pitch = pitch - 1; ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); if (tiling == GPGPU_TILE_X) { ss->ss0.tiled_surface = 1; ss->ss0.tile_walk = I965_TILEWALK_XMAJOR; } else if (tiling == GPGPU_TILE_Y) { ss->ss0.tiled_surface = 1; ss->ss0.tile_walk = I965_TILEWALK_YMAJOR; } ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? 
*/ intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset); assert(index < GEN_MAX_SURFACES); } static void intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu, uint32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset, uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h, int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen7_media_surface_state_t *ss = (gen7_media_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)]; memset(ss, 0, sizeof(*ss)); ss->ss0.base_addr = obj_bo->offset + obj_bo_offset; ss->ss1.uv_offset_v_direction = 0; ss->ss1.pic_struct = 0; ss->ss1.width = w - 1; ss->ss1.height = h - 1; if (tiling == GPGPU_NO_TILE) { ss->ss2.tile_mode = 0; } else if (tiling == GPGPU_TILE_X){ ss->ss2.tile_mode = 2; } else if (tiling == GPGPU_TILE_Y){ ss->ss2.tile_mode = 3; } ss->ss2.half_pitch_for_chroma = 0; ss->ss2.surface_pitch = pitch - 1; ss->ss2.surface_object_control_state = cl_gpgpu_get_cache_ctrl(); ss->ss2.interleave_chroma = 0; ss->ss2.surface_format = 12; //Y8_UNORM ss->ss3.y_offset_for_u = 0; ss->ss3.x_offset_for_u = 0; ss->ss4.y_offset_for_v = 0; ss->ss4.x_offset_for_v = 0; intel_gpgpu_set_buf_reloc_for_vme_gen7(gpgpu, index, obj_bo, obj_bo_offset); assert(index < GEN_MAX_SURFACES); } static void intel_gpgpu_bind_image_for_vme_gen9(intel_gpgpu_t *gpgpu, uint32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset, uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h, int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen9_media_surface_state_t *ss = (gen9_media_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)]; memset(ss, 0, sizeof(gen8_surface_state_t)); ss->ss0.rotation = 0; //++ ss->ss1.uv_offset_v_direction = 0; ss->ss1.pic_struct = 0; ss->ss1.width = w - 1; ss->ss1.height = h - 1; if (tiling == GPGPU_NO_TILE) { ss->ss2.tile_mode = 0; } else if (tiling == GPGPU_TILE_X){ ss->ss2.tile_mode = 2; } else if (tiling == GPGPU_TILE_Y){ ss->ss2.tile_mode = 3; } ss->ss2.half_pitch_for_chroma = 0; ss->ss2.surface_pitch = pitch - 1; ss->ss2.address_control = 1; //++ CLAMP: 0; MIRROR:1; ss->ss2.mem_compress_enable = 0; //++ ss->ss2.mem_compress_mode = 0; //++ ss->ss2.uv_offset_v_direction_msb = 0; //++ ss->ss2.uv_offset_u_direction = 0; //++ ss->ss2.interleave_chroma = 0; ss->ss2.surface_format = 12; //Y8_UNORM //ss->ss2.surface_format = 4; //PLANAR_420_8 ss->ss3.y_offset_for_u = 0; ss->ss3.x_offset_for_u = 0; ss->ss4.y_offset_for_v = 0; ss->ss4.x_offset_for_v = 0; ss->ss5.surface_object_control_state = cl_gpgpu_get_cache_ctrl(); ss->ss5.tiled_res_mode = 0; //++ TRMODE_NONE: 0; TRMODE_TILEYF: 1; TRMODE_TILEYS:2 ss->ss5.vert_line_stride_offset = 0; //++ ss->ss5.vert_line_stride = 0; //++ ss->ss6.base_addr = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff; // ss->ss7.base_addr_high = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff; // heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * surface_state_sz; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, obj_bo_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen9_media_surface_state_t, ss6), obj_bo); assert(index < GEN_MAX_SURFACES); } static void intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu, uint32_t index, dri_bo* obj_bo, 
uint32_t obj_bo_offset, uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h, int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen7_surface_state_t *ss = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)]; memset(ss, 0, sizeof(*ss)); ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2 ss->ss0.surface_type = get_surface_type(gpgpu, index, type); if (intel_is_surface_array(type)) { ss->ss0.surface_array = 1; ss->ss0.surface_array_spacing = 1; } if (obj_bo_offset && tiling != GPGPU_NO_TILE) { uint32_t unaligned = obj_bo_offset; obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000; uint32_t h_ = (unaligned - obj_bo_offset )/ pitch; ss->ss5.y_offset = h_ / 2; } ss->ss0.surface_format = format; ss->ss1.base_addr = obj_bo->offset + obj_bo_offset; ss->ss2.width = w - 1; ss->ss2.height = h - 1; ss->ss3.depth = depth - 1; ss->ss4.not_str_buf.rt_view_extent = depth - 1; ss->ss4.not_str_buf.min_array_element = 0; ss->ss3.pitch = pitch - 1; ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); ss->ss7.shader_r = I965_SURCHAN_SELECT_RED; ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN; ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE; ss->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA; if (tiling == GPGPU_TILE_X) { ss->ss0.tiled_surface = 1; ss->ss0.tile_walk = I965_TILEWALK_XMAJOR; } else if (tiling == GPGPU_TILE_Y) { ss->ss0.tiled_surface = 1; ss->ss0.tile_walk = I965_TILEWALK_YMAJOR; } ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */ intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset); assert(index < GEN_MAX_SURFACES); } static void intel_gpgpu_bind_image_gen8(intel_gpgpu_t *gpgpu, uint32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset, uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h, int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen8_surface_state_t *ss = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)]; memset(ss, 0, sizeof(*ss)); ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2 ss->ss0.surface_type = get_surface_type(gpgpu, index, type); ss->ss0.surface_format = format; if (intel_is_surface_array(type)) { ss->ss0.surface_array = 1; ss->ss1.surface_qpitch = (h + 3)/4; } ss->ss0.horizontal_alignment = 1; ss->ss0.vertical_alignment = 1; if (tiling == GPGPU_TILE_X) { ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR; } else if (tiling == GPGPU_TILE_Y) { ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR; } else assert(tiling == GPGPU_NO_TILE);// W mode is not supported now. ss->ss2.width = w - 1; ss->ss2.height = h - 1; ss->ss3.depth = depth - 1; if(obj_bo_offset && tiling != GPGPU_NO_TILE) { uint32_t unaligned = obj_bo_offset; obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000; uint32_t h_ = (unaligned - obj_bo_offset) / pitch; ss->ss5.y_offset = h_ / 4; } ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff; ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff; ss->ss4.render_target_view_ext = depth - 1; ss->ss4.min_array_elt = 0; ss->ss3.surface_pitch = pitch - 1; ss->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl(); //NV12 surface. the height is 3/2 * h, so need set proper offset here. 
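/* Illustrative example (assuming h here is the full allocated NV12 height, i.e.
 * 3/2 of the luma height): a 1280x720 image is allocated 1080 rows, so the
 * statement below programs the UV plane to start at row 1080 * 2 / 3 = 720,
 * immediately after the luma plane. */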
if (format == I965_SURFACEFORMAT_PLANAR_420_8) ss->ss6.uv_plane_y_offset = h * 2 / 3; ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED; ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN; ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE; ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA; ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * surface_state_sz; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, obj_bo_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen8_surface_state_t, ss8), obj_bo); assert(index < GEN_MAX_SURFACES); } static void intel_gpgpu_bind_image_gen9(intel_gpgpu_t *gpgpu, uint32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset, uint32_t format, cl_mem_object_type type, uint32_t bpp, int32_t w, int32_t h, int32_t depth, int32_t pitch, int32_t slice_pitch, int32_t tiling) { surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; gen8_surface_state_t *ss = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)]; memset(ss, 0, sizeof(*ss)); ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2 ss->ss0.surface_type = get_surface_type(gpgpu, index, type); ss->ss0.surface_format = format; if (intel_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_1D) { ss->ss0.surface_array = 1; ss->ss1.surface_qpitch = (slice_pitch/bpp + 3)/4; //align_h } if (intel_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_2D) { ss->ss0.surface_array = 1; ss->ss1.surface_qpitch = (slice_pitch/pitch + 3)/4; } if(ss->ss0.surface_type == I965_SURFACE_3D) ss->ss1.surface_qpitch = (slice_pitch/pitch + 3)/4; ss->ss0.horizontal_alignment = 1; ss->ss0.vertical_alignment = 1; if (tiling == GPGPU_TILE_X) { ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR; } else if (tiling == GPGPU_TILE_Y) { ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR; } else assert(tiling == GPGPU_NO_TILE);// W mode is not supported now. ss->ss2.width = w - 1; ss->ss2.height = h - 1; ss->ss3.depth = depth - 1; if (obj_bo_offset && tiling != GPGPU_NO_TILE) { uint32_t unaligned = obj_bo_offset; obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000; uint32_t h_ = (unaligned - obj_bo_offset )/ pitch; ss->ss5.y_offset = h_ / 4; } ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff; ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff; ss->ss4.render_target_view_ext = depth - 1; ss->ss4.min_array_elt = 0; ss->ss3.surface_pitch = pitch - 1; //NV12 surface. the height is 3/2 * h, so need set proper offset here. if (format == I965_SURFACEFORMAT_PLANAR_420_8) ss->ss6.uv_plane_y_offset = h * 2 / 3; ss->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl(); ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED; ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN; ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE; ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA; ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? 
*/ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * surface_state_sz; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, obj_bo_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + offsetof(gen8_surface_state_t, ss8), obj_bo); assert(index < GEN_MAX_SURFACES); } static void intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, uint32_t internal_offset, size_t size, uint8_t bti) { assert(gpgpu->binded_n < max_buf_n); if(offset != -1) { gpgpu->binded_buf[gpgpu->binded_n] = buf; gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset; gpgpu->binded_offset[gpgpu->binded_n] = offset; gpgpu->binded_n++; } intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW); } static int intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size) { drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; drm_intel_bo* old = gpgpu->scratch_b.bo; uint32_t total = per_thread_size * gpgpu->max_threads; /* Per Bspec, scratch should 2X the desired size when EU index is not continuous */ if (IS_HASWELL(gpgpu->drv->device_id) || IS_CHERRYVIEW(gpgpu->drv->device_id) || PCI_CHIP_BROXTON_1 == gpgpu->drv->device_id || PCI_CHIP_BROXTON_3 == gpgpu->drv->device_id) total *= 2; gpgpu->per_thread_scratch = per_thread_size; if(old && old->size < total) { drm_intel_bo_unreference(old); old = NULL; } if(!old && total) { gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096); if (gpgpu->scratch_b.bo == NULL) return -1; } return 0; } static void intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint8_t bti) { drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64); cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)gpgpu->stack_b.bo, offset, 0, size, bti); } static void intel_gpgpu_build_idrt_gen7(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) { gen6_interface_descriptor_t *desc; drm_intel_bo *ker_bo = NULL; desc = (gen6_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset); memset(desc, 0, sizeof(*desc)); ker_bo = (drm_intel_bo *) kernel->bo; desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */ desc->desc1.single_program_flow = 0; desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */ desc->desc5.rounding_mode = 0; /* round to nearest even */ assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0); desc->desc2.sampler_state_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) >> 5; desc->desc3.binding_table_entry_count = 0; /* no prefetch */ desc->desc3.binding_table_pointer = 0; desc->desc4.curbe_read_len = kernel->curbe_sz / 32; desc->desc4.curbe_read_offset = 0; /* Barriers / SLM are automatically handled on Gen7+ */ if (gpgpu->drv->gen_ver == 7 || gpgpu->drv->gen_ver == 75) { size_t slm_sz = kernel->slm_sz; desc->desc5.group_threads_num = kernel->use_slm ? 
kernel->thread_n : 0; desc->desc5.barrier_enable = kernel->use_slm; if (slm_sz <= 4*KB) slm_sz = 4*KB; else if (slm_sz <= 8*KB) slm_sz = 8*KB; else if (slm_sz <= 16*KB) slm_sz = 16*KB; else if (slm_sz <= 32*KB) slm_sz = 32*KB; else slm_sz = 64*KB; slm_sz = slm_sz >> 12; desc->desc5.slm_sz = slm_sz; } else desc->desc5.group_threads_num = kernel->barrierID; /* BarrierID on GEN6 */ dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0, gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc0), ker_bo); dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_SAMPLER, 0, gpgpu->aux_offset.sampler_state_offset, gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc2), gpgpu->aux_buf.bo); } static void intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) { gen8_interface_descriptor_t *desc; desc = (gen8_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset); memset(desc, 0, sizeof(*desc)); desc->desc0.kernel_start_pointer = 0; /* reloc */ desc->desc2.single_program_flow = 0; desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */ desc->desc6.rounding_mode = 0; /* round to nearest even */ assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0); desc->desc3.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5; desc->desc4.binding_table_entry_count = 0; /* no prefetch */ desc->desc4.binding_table_pointer = 0; desc->desc5.curbe_read_len = kernel->curbe_sz / 32; desc->desc5.curbe_read_offset = 0; /* Barriers / SLM are automatically handled on Gen7+ */ size_t slm_sz = kernel->slm_sz; /* group_threads_num should not be set to 0 even if the barrier is disabled per bspec */ desc->desc6.group_threads_num = kernel->thread_n; desc->desc6.barrier_enable = kernel->use_slm; if (slm_sz == 0) slm_sz = 0; else if (slm_sz <= 4*KB) slm_sz = 4*KB; else if (slm_sz <= 8*KB) slm_sz = 8*KB; else if (slm_sz <= 16*KB) slm_sz = 16*KB; else if (slm_sz <= 32*KB) slm_sz = 32*KB; else slm_sz = 64*KB; slm_sz = slm_sz >> 12; desc->desc6.slm_sz = slm_sz; } static void intel_gpgpu_build_idrt_gen9(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) { gen8_interface_descriptor_t *desc; desc = (gen8_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset); memset(desc, 0, sizeof(*desc)); desc->desc0.kernel_start_pointer = 0; /* reloc */ desc->desc2.single_program_flow = 0; desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */ desc->desc6.rounding_mode = 0; /* round to nearest even */ assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0); desc->desc3.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5; desc->desc4.binding_table_entry_count = 0; /* no prefetch */ desc->desc4.binding_table_pointer = 0; desc->desc5.curbe_read_len = kernel->curbe_sz / 32; desc->desc5.curbe_read_offset = 0; /* Barriers / SLM are automatically handled on Gen7+ */ size_t slm_sz = kernel->slm_sz; /* group_threads_num should not be set to 0 even if the barrier is disabled per bspec */ desc->desc6.group_threads_num = kernel->thread_n; desc->desc6.barrier_enable = kernel->use_slm; if (slm_sz == 0) slm_sz = 0; else if (slm_sz <= 1*KB) slm_sz = 1; else if (slm_sz <= 2*KB) slm_sz = 2; else if (slm_sz <= 4*KB) slm_sz = 3; else if (slm_sz <= 8*KB) slm_sz = 4; else if (slm_sz <= 16*KB) slm_sz = 5; else if (slm_sz <= 32*KB) slm_sz = 6; else slm_sz = 7; desc->desc6.slm_sz = slm_sz; } static int intel_gpgpu_upload_curbes_gen7(intel_gpgpu_t 
*gpgpu, const void* data, uint32_t size) { unsigned char *curbe = NULL; cl_gpgpu_kernel *k = gpgpu->ker; uint32_t i, j; /* Upload the data first */ if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) { fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno)); return -1; } assert(gpgpu->aux_buf.bo->virtual); curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset); memcpy(curbe, data, size); /* Now put all the relocations for our flat address space */ for (i = 0; i < k->thread_n; ++i) for (j = 0; j < gpgpu->binded_n; ++j) { *(uint32_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j]; drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo, gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz, gpgpu->binded_buf[j], gpgpu->target_buf_offset[j], I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); } dri_bo_unmap(gpgpu->aux_buf.bo); return 0; } static int intel_gpgpu_upload_curbes_gen8(intel_gpgpu_t *gpgpu, const void* data, uint32_t size) { unsigned char *curbe = NULL; cl_gpgpu_kernel *k = gpgpu->ker; uint32_t i, j; /* Upload the data first */ if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) { fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno)); return -1; } assert(gpgpu->aux_buf.bo->virtual); curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset); memcpy(curbe, data, size); /* Now put all the relocations for our flat address space */ for (i = 0; i < k->thread_n; ++i) for (j = 0; j < gpgpu->binded_n; ++j) { *(size_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j]; drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo, gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz, gpgpu->binded_buf[j], gpgpu->target_buf_offset[j], I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); } dri_bo_unmap(gpgpu->aux_buf.bo); return 0; } static void intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n) { if (n) { const size_t sz = n * sizeof(gen6_sampler_state_t); memcpy(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset, data, sz); } } int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest) { switch( cl_address_mode ) { case CLK_ADDRESS_NONE: case CLK_ADDRESS_REPEAT: return GEN_TEXCOORDMODE_WRAP; case CLK_ADDRESS_CLAMP: return GEN_TEXCOORDMODE_CLAMP_BORDER; case CLK_ADDRESS_CLAMP_TO_EDGE: return GEN_TEXCOORDMODE_CLAMP; case CLK_ADDRESS_MIRRORED_REPEAT: return GEN_TEXCOORDMODE_MIRROR; default: return GEN_TEXCOORDMODE_WRAP; } } static void intel_gpgpu_insert_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel, uint32_t index) { gen7_vme_state_t* vme = (gen7_vme_state_t*)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index; memset(vme, 0, sizeof(*vme)); gen7_vme_search_path_state_t* sp = vme->sp; if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL){ sp[0].dw0.SPD_0_X = 0; sp[0].dw0.SPD_0_Y = 0; sp[0].dw0.SPD_1_X = 0; sp[0].dw0.SPD_1_Y = 0; sp[0].dw0.SPD_2_X = 0; sp[0].dw0.SPD_2_Y = 0; sp[0].dw0.SPD_3_X = 0; sp[0].dw0.SPD_3_Y = 0; } else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL){ sp[0].dw0.SPD_0_X = 1; sp[0].dw0.SPD_0_Y = 0; sp[0].dw0.SPD_1_X = 0; sp[0].dw0.SPD_1_Y = 1; sp[0].dw0.SPD_2_X = -1; sp[0].dw0.SPD_2_Y = 0; sp[0].dw0.SPD_3_X = 0; sp[0].dw0.SPD_3_Y = 0; } else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL){ sp[0].dw0.SPD_0_X = 1; 
sp[0].dw0.SPD_0_Y = 0; sp[0].dw0.SPD_1_X = 1; sp[0].dw0.SPD_1_Y = 0; sp[0].dw0.SPD_2_X = 1; sp[0].dw0.SPD_2_Y = 0; sp[0].dw0.SPD_3_X = 1; sp[0].dw0.SPD_3_Y = 0; sp[1].dw0.SPD_0_X = 1; sp[1].dw0.SPD_0_Y = 0; sp[1].dw0.SPD_1_X = 1; sp[1].dw0.SPD_1_Y = 0; sp[1].dw0.SPD_2_X = 1; sp[1].dw0.SPD_2_Y = 0; sp[1].dw0.SPD_3_X = 0; sp[1].dw0.SPD_3_Y = 1; sp[2].dw0.SPD_0_X = -1; sp[2].dw0.SPD_0_Y = 0; sp[2].dw0.SPD_1_X = -1; sp[2].dw0.SPD_1_Y = 0; sp[2].dw0.SPD_2_X = -1; sp[2].dw0.SPD_2_Y = 0; sp[2].dw0.SPD_3_X = -1; sp[2].dw0.SPD_3_Y = 0; sp[3].dw0.SPD_0_X = -1; sp[3].dw0.SPD_0_Y = 0; sp[3].dw0.SPD_1_X = -1; sp[3].dw0.SPD_1_Y = 0; sp[3].dw0.SPD_2_X = -1; sp[3].dw0.SPD_2_Y = 0; sp[3].dw0.SPD_3_X = 0; sp[3].dw0.SPD_3_Y = 1; sp[4].dw0.SPD_0_X = 1; sp[4].dw0.SPD_0_Y = 0; sp[4].dw0.SPD_1_X = 1; sp[4].dw0.SPD_1_Y = 0; sp[4].dw0.SPD_2_X = 1; sp[4].dw0.SPD_2_Y = 0; sp[4].dw0.SPD_3_X = 1; sp[4].dw0.SPD_3_Y = 0; sp[5].dw0.SPD_0_X = 1; sp[5].dw0.SPD_0_Y = 0; sp[5].dw0.SPD_1_X = 1; sp[5].dw0.SPD_1_Y = 0; sp[5].dw0.SPD_2_X = 1; sp[5].dw0.SPD_2_Y = 0; sp[5].dw0.SPD_3_X = 0; sp[5].dw0.SPD_3_Y = 1; sp[6].dw0.SPD_0_X = -1; sp[6].dw0.SPD_0_Y = 0; sp[6].dw0.SPD_1_X = -1; sp[6].dw0.SPD_1_Y = 0; sp[6].dw0.SPD_2_X = -1; sp[6].dw0.SPD_2_Y = 0; sp[6].dw0.SPD_3_X = -1; sp[6].dw0.SPD_3_Y = 0; sp[7].dw0.SPD_0_X = -1; sp[7].dw0.SPD_0_Y = 0; sp[7].dw0.SPD_1_X = -1; sp[7].dw0.SPD_1_Y = 0; sp[7].dw0.SPD_2_X = -1; sp[7].dw0.SPD_2_Y = 0; sp[7].dw0.SPD_3_X = 0; sp[7].dw0.SPD_3_Y = 1; sp[8].dw0.SPD_0_X = 1; sp[8].dw0.SPD_0_Y = 0; sp[8].dw0.SPD_1_X = 1; sp[8].dw0.SPD_1_Y = 0; sp[8].dw0.SPD_2_X = 1; sp[8].dw0.SPD_2_Y = 0; sp[8].dw0.SPD_3_X = 1; sp[8].dw0.SPD_3_Y = 0; sp[9].dw0.SPD_0_X = 1; sp[9].dw0.SPD_0_Y = 0; sp[9].dw0.SPD_1_X = 1; sp[9].dw0.SPD_1_Y = 0; sp[9].dw0.SPD_2_X = 1; sp[9].dw0.SPD_2_Y = 0; sp[9].dw0.SPD_3_X = 0; sp[9].dw0.SPD_3_Y = 1; sp[10].dw0.SPD_0_X = -1; sp[10].dw0.SPD_0_Y = 0; sp[10].dw0.SPD_1_X = -1; sp[10].dw0.SPD_1_Y = 0; sp[10].dw0.SPD_2_X = -1; sp[10].dw0.SPD_2_Y = 0; sp[10].dw0.SPD_3_X = -1; sp[10].dw0.SPD_3_Y = 0; sp[11].dw0.SPD_0_X = -1; sp[11].dw0.SPD_0_Y = 0; sp[11].dw0.SPD_1_X = -1; sp[11].dw0.SPD_1_Y = 0; sp[11].dw0.SPD_2_X = -1; sp[11].dw0.SPD_2_Y = 0; sp[11].dw0.SPD_3_X = 0; sp[11].dw0.SPD_3_Y = 0; } } static void intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel) { intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, 0); } static void intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler) { int using_nearest = 0; uint32_t wrap_mode; gen7_sampler_state_t *sampler; sampler = (gen7_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index; memset(sampler, 0, sizeof(*sampler)); assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0); sampler->ss2.default_color_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) >> 5; if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE) sampler->ss3.non_normalized_coord = 1; else sampler->ss3.non_normalized_coord = 0; switch (clk_sampler & __CLK_FILTER_MASK) { case CLK_FILTER_NEAREST: sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST; sampler->ss0.mip_filter = GEN_MIPFILTER_NONE; sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST; using_nearest = 1; break; case CLK_FILTER_LINEAR: sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR; sampler->ss0.mip_filter = GEN_MIPFILTER_NONE; sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR; break; } wrap_mode = translate_wrap_mode(clk_sampler & 
__CLK_ADDRESS_MASK, using_nearest); sampler->ss3.s_wrap_mode = wrap_mode; /* XXX mesa i965 driver code points out that if the surface is a 1D surface, we may need to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */ sampler->ss3.t_wrap_mode = wrap_mode; sampler->ss3.r_wrap_mode = wrap_mode; sampler->ss0.lod_preclamp = 1; /* OpenGL mode */ sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */ sampler->ss0.base_level = 0; sampler->ss1.max_lod = 0; sampler->ss1.min_lod = 0; if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST) sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN | GEN_ADDRESS_ROUNDING_ENABLE_V_MIN | GEN_ADDRESS_ROUNDING_ENABLE_R_MIN; if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST) sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG | GEN_ADDRESS_ROUNDING_ENABLE_V_MAG | GEN_ADDRESS_ROUNDING_ENABLE_R_MAG; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_SAMPLER, 0, gpgpu->aux_offset.sampler_border_color_state_offset, gpgpu->aux_offset.sampler_state_offset + index * sizeof(gen7_sampler_state_t) + offsetof(gen7_sampler_state_t, ss2), gpgpu->aux_buf.bo); } static void intel_gpgpu_insert_sampler_gen8(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler) { int using_nearest = 0; uint32_t wrap_mode; gen8_sampler_state_t *sampler; sampler = (gen8_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index; memset(sampler, 0, sizeof(*sampler)); assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0); if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE) sampler->ss3.non_normalized_coord = 1; else sampler->ss3.non_normalized_coord = 0; switch (clk_sampler & __CLK_FILTER_MASK) { case CLK_FILTER_NEAREST: sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST; sampler->ss0.mip_filter = GEN_MIPFILTER_NONE; sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST; using_nearest = 1; break; case CLK_FILTER_LINEAR: sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR; sampler->ss0.mip_filter = GEN_MIPFILTER_NONE; sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR; break; } wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest); sampler->ss3.s_wrap_mode = wrap_mode; /* XXX mesa i965 driver code points out that if the surface is a 1D surface, we may need to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP.
*/ sampler->ss3.t_wrap_mode = wrap_mode; sampler->ss3.r_wrap_mode = wrap_mode; sampler->ss0.lod_preclamp = 1; /* OpenGL mode */ sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */ sampler->ss0.base_level = 0; sampler->ss1.max_lod = 0; sampler->ss1.min_lod = 0; if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST) sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN | GEN_ADDRESS_ROUNDING_ENABLE_V_MIN | GEN_ADDRESS_ROUNDING_ENABLE_R_MIN; if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST) sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG | GEN_ADDRESS_ROUNDING_ENABLE_V_MAG | GEN_ADDRESS_ROUNDING_ENABLE_R_MAG; } static void intel_gpgpu_bind_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz) { int index; assert(sampler_sz <= GEN_MAX_SAMPLERS); for(index = 0; index < sampler_sz; index++) intel_gpgpu_insert_sampler_gen7(gpgpu, index, samplers[index]); } static void intel_gpgpu_bind_sampler_gen8(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz) { int index; assert(sampler_sz <= GEN_MAX_SAMPLERS); for(index = 0; index < sampler_sz; index++) intel_gpgpu_insert_sampler_gen8(gpgpu, index, samplers[index]); } static void intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) { gpgpu->ker = kernel; if (gpgpu->drv->null_bo) intel_gpgpu_setup_bti(gpgpu, gpgpu->drv->null_bo, 0, 64*1024, 0xfe, I965_SURFACEFORMAT_RAW); intel_gpgpu_build_idrt(gpgpu, kernel); dri_bo_unmap(gpgpu->aux_buf.bo); } static void intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf) { if (gpgpu->perf_b.bo) drm_intel_bo_unreference(gpgpu->perf_b.bo); drm_intel_bo_reference((drm_intel_bo*) perf); gpgpu->perf_b.bo = (drm_intel_bo*) perf; } static void intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu, uint32_t simd_sz, uint32_t thread_n, const size_t global_wk_off[3], const size_t global_dim_off[3], const size_t global_wk_sz[3], const size_t local_wk_sz[3]) { const uint32_t global_wk_dim[3] = { global_wk_sz[0] / local_wk_sz[0], global_wk_sz[1] / local_wk_sz[1], global_wk_sz[2] / local_wk_sz[2] }; uint32_t right_mask = ~0x0; size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2]; assert(simd_sz == 8 || simd_sz == 16); uint32_t shift = (group_sz & (simd_sz - 1));
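/* Informational note: when the workgroup size is not a multiple of the SIMD width, the
   last SIMD thread of each group has only group_sz % simd_sz useful lanes, so the right
   execution mask computed below enables just those lanes. For example, a local size of
   20 with SIMD16 gives shift = 20 & 15 = 4 and right_mask = 0xF; when the size divides
   evenly, shift = simd_sz and the mask enables all lanes. The gen8 walker below uses the
   same scheme. */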
shift = (shift == 0) ? simd_sz : shift; right_mask = (1 << shift) - 1; BEGIN_BATCH(gpgpu->batch, 11); OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9); OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ assert(thread_n <= 64); if (simd_sz == 16) OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ else OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */ OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, global_wk_dim[0]); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, global_wk_dim[1]); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, global_wk_dim[2]); OUT_BATCH(gpgpu->batch, right_mask); OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height to 1, so set the bottom mask to all 1s */ ADVANCE_BATCH(gpgpu->batch); BEGIN_BATCH(gpgpu->batch, 2); OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0); OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ ADVANCE_BATCH(gpgpu->batch); if (IS_IVYBRIDGE(gpgpu->drv->device_id)) intel_gpgpu_pipe_control(gpgpu); } static void intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu, uint32_t simd_sz, uint32_t thread_n, const size_t global_wk_off[3], const size_t global_dim_off[3], const size_t global_wk_sz[3], const size_t local_wk_sz[3]) { const uint32_t global_wk_dim[3] = { global_wk_sz[0] / local_wk_sz[0], global_wk_sz[1] / local_wk_sz[1], global_wk_sz[2] / local_wk_sz[2] }; uint32_t right_mask = ~0x0; size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2]; assert(simd_sz == 8 || simd_sz == 16); uint32_t shift = (group_sz & (simd_sz - 1)); shift = (shift == 0) ? simd_sz : shift; right_mask = (1 << shift) - 1; BEGIN_BATCH(gpgpu->batch, 15); OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 13); OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Length */ OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Start Address */ assert(thread_n <= 64); if (simd_sz == 16) OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ else OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */ OUT_BATCH(gpgpu->batch, global_dim_off[0]); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, global_wk_dim[0]+global_dim_off[0]); OUT_BATCH(gpgpu->batch, global_dim_off[1]); OUT_BATCH(gpgpu->batch, 0); OUT_BATCH(gpgpu->batch, global_wk_dim[1]+global_dim_off[1]); OUT_BATCH(gpgpu->batch, global_dim_off[2]); OUT_BATCH(gpgpu->batch, global_wk_dim[2]+global_dim_off[2]); OUT_BATCH(gpgpu->batch, right_mask); OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height to 1, so set the bottom mask to all 1s */ ADVANCE_BATCH(gpgpu->batch); BEGIN_BATCH(gpgpu->batch, 2); OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0); OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ ADVANCE_BATCH(gpgpu->batch); intel_gpgpu_pipe_control(gpgpu); } static intel_event_t* intel_gpgpu_event_new(intel_gpgpu_t *gpgpu) { intel_event_t *event = NULL; TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t)); event->buffer = gpgpu->batch->buffer; if (event->buffer) drm_intel_bo_reference(event->buffer); event->status = command_queued; if(gpgpu->time_stamp_b.bo) { event->ts_buf = gpgpu->time_stamp_b.bo; drm_intel_bo_reference(event->ts_buf); } exit: return event; error: cl_free(event); event = NULL; goto exit; }
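/* Illustrative sketch (not part of the driver): how the upper layer is expected to drive
   the event helpers below. The status field moves through command_queued
   (intel_gpgpu_event_new) -> command_running (intel_gpgpu_event_flush, once the batch has
   been flushed) -> command_complete (intel_gpgpu_event_update_status, once the batch
   buffer is no longer busy):

     intel_event_t *ev = intel_gpgpu_event_new(gpgpu);   // status == command_queued
     intel_gpgpu_flush(gpgpu);                           // submit the batch buffer
     intel_gpgpu_event_flush(ev);                        // status == command_running
     while (intel_gpgpu_event_update_status(ev, 0) != command_complete)
       ;                                                 // poll, or pass wait = 1 to block
     intel_gpgpu_event_delete(ev);
*/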
/* The upper layer already flushed the batch buffer; just update the internal status to command_running. */ static void intel_gpgpu_event_flush(intel_event_t *event) { assert(event->status == command_queued); event->status = command_running; } static int intel_gpgpu_event_update_status(intel_event_t *event, int wait) { if(event->status == command_complete) return event->status; if (event->buffer && event->status == command_running && !drm_intel_bo_busy(event->buffer)) { event->status = command_complete; drm_intel_bo_unreference(event->buffer); event->buffer = NULL; return event->status; } if(wait == 0) return event->status; if (event->buffer) { drm_intel_bo_wait_rendering(event->buffer); event->status = command_complete; drm_intel_bo_unreference(event->buffer); event->buffer = NULL; } return event->status; } static void intel_gpgpu_event_delete(intel_event_t *event) { if(event->buffer) drm_intel_bo_unreference(event->buffer); if(event->ts_buf) drm_intel_bo_unreference(event->ts_buf); cl_free(event); } /* IVB and HSW results MUST be shifted on x86_64 systems */ static uint64_t intel_gpgpu_read_ts_reg_gen7(drm_intel_bufmgr *bufmgr) { uint64_t result = 0; drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result); /* On x86_64 systems, the low 32 bits of the timestamp count are stored in the high 32 bits of the result returned by drm_intel_reg_read, and bits 32-35 are lost; on i386 systems the value matches the bspec. This seems to be a kernel readq bug. So shift right by 32 bits on x86_64, and keep only the low 32 bits on i386. */ struct utsname buf; uname(&buf); /* On some systems user space is 32 bit but the kernel is 64 bit, so we can't use the compiler's flags to determine the kernel's architecture; use uname to get it. */ /* x86_64 on Linux, amd64 on BSD */ if(strcmp(buf.machine, "x86_64") == 0 || strcmp(buf.machine, "amd64") == 0) return result >> 32; else return result & 0x0ffffffff; } /* Baytrail's result should have the high 4 bits cleared */ static uint64_t intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr) { uint64_t result = 0; drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result); return result & 0x0ffffffff; } /* We want to get the current GPU time. */ static void intel_gpgpu_event_get_gpu_cur_timestamp(intel_driver_t* gen_driver, uint64_t* ret_ts) { uint64_t result = 0; drm_intel_bufmgr *bufmgr = gen_driver->bufmgr; /* Get the timestamp that matches the bspec */ result = intel_gpgpu_read_ts_reg(bufmgr); result *= 80; *ret_ts = result; return; } /* Get the GPU execution time. */ static void intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, int index, uint64_t* ret_ts) { uint64_t result = 0; assert(gpgpu->time_stamp_b.bo); assert(index == 0 || index == 1); drm_intel_gem_bo_map_gtt(gpgpu->time_stamp_b.bo); uint64_t* ptr = gpgpu->time_stamp_b.bo->virtual; result = ptr[index]; /* According to the bspec, the timestamp counter should be 36 bits, but compared with the counter obtained through the IO control read, the top 4 bits appear to be bogus. To keep the timestamp counter consistent, we simply drop those 4 bits and keep the low 32 bits.
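   The raw counter advances once every 80 ns (which is why this function and
   intel_gpgpu_event_get_gpu_cur_timestamp both multiply by 80 to convert ticks to
   nanoseconds); as a rough sanity check, a delta of 1,000,000 ticks is 80,000,000 ns,
   i.e. 80 ms.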
*/ result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds *ret_ts = result; drm_intel_gem_bo_unmap_gtt(gpgpu->time_stamp_b.bo); } static int intel_gpgpu_set_profiling_buf(intel_gpgpu_t *gpgpu, uint32_t size, uint32_t offset, uint8_t bti) { drm_intel_bo *bo = NULL; gpgpu->profiling_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "Profiling buffer", size, 64); bo = gpgpu->profiling_b.bo; if (!bo || (drm_intel_bo_map(bo, 1) != 0)) { fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno)); return -1; } memset(bo->virtual, 0, size); drm_intel_bo_unmap(bo); cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)bo, offset, 0, size, bti); return 0; } static void intel_gpgpu_set_profiling_info(intel_gpgpu_t *gpgpu, void* profiling_info) { gpgpu->profiling_info = profiling_info; } static void* intel_gpgpu_get_profiling_info(intel_gpgpu_t *gpgpu) { return gpgpu->profiling_info; } static int intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti) { if (gpgpu->printf_b.bo) dri_bo_unreference(gpgpu->printf_b.bo); gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf buffer", size, 4096); if (!gpgpu->printf_b.bo || (drm_intel_bo_map(gpgpu->printf_b.bo, 1) != 0)) { fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno)); return -1; } memset(gpgpu->printf_b.bo->virtual, 0, size); *(uint32_t *)(gpgpu->printf_b.bo->virtual) = 4; // first four is for the length. drm_intel_bo_unmap(gpgpu->printf_b.bo); /* No need to bind, we do not need to emit reloc. */ intel_gpgpu_setup_bti(gpgpu, gpgpu->printf_b.bo, 0, size, bti, I965_SURFACEFORMAT_RAW); return 0; } static void* intel_gpgpu_map_profiling_buf(intel_gpgpu_t *gpgpu) { drm_intel_bo *bo = NULL; bo = gpgpu->profiling_b.bo; drm_intel_bo_map(bo, 1); return bo->virtual; } static void intel_gpgpu_unmap_profiling_buf_addr(intel_gpgpu_t *gpgpu) { drm_intel_bo *bo = NULL; bo = gpgpu->profiling_b.bo; drm_intel_bo_unmap(bo); } static void* intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu) { drm_intel_bo *bo = NULL; bo = gpgpu->printf_b.bo; drm_intel_bo_map(bo, 1); return bo->virtual; } static void intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu) { drm_intel_bo *bo = NULL; bo = gpgpu->printf_b.bo; drm_intel_bo_unmap(bo); } static void intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu) { drm_intel_bo_unreference(gpgpu->printf_b.bo); gpgpu->printf_b.bo = NULL; } static void intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info) { gpgpu->printf_info = printf_info; } static void* intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu) { return gpgpu->printf_info; } static void intel_gpgpu_set_kernel(intel_gpgpu_t *gpgpu, void * kernel) { gpgpu->kernel = kernel; } static void* intel_gpgpu_get_kernel(intel_gpgpu_t *gpgpu) { return gpgpu->kernel; } LOCAL void intel_set_gpgpu_callbacks(int device_id) { cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new; cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete; cl_gpgpu_sync = (cl_gpgpu_sync_cb *) intel_gpgpu_sync; cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf; cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack; cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init; cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters; cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer; cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup; cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) 
intel_gpgpu_upload_samplers; cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset; cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start; cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end; cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush; cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7; cl_gpgpu_bind_vme_state = (cl_gpgpu_bind_vme_state_cb *) intel_gpgpu_bind_vme_state_gen7; cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch; cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new; cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush; cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status; cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete; cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp; cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp; cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf; cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf; cl_gpgpu_set_profiling_buffer = (cl_gpgpu_set_profiling_buffer_cb *)intel_gpgpu_set_profiling_buf; cl_gpgpu_set_profiling_info = (cl_gpgpu_set_profiling_info_cb *)intel_gpgpu_set_profiling_info; cl_gpgpu_get_profiling_info = (cl_gpgpu_get_profiling_info_cb *)intel_gpgpu_get_profiling_info; cl_gpgpu_map_profiling_buffer = (cl_gpgpu_map_profiling_buffer_cb *)intel_gpgpu_map_profiling_buf; cl_gpgpu_unmap_profiling_buffer = (cl_gpgpu_unmap_profiling_buffer_cb *)intel_gpgpu_unmap_profiling_buf_addr; cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf; cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf; cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr; cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf; cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info; cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info; cl_gpgpu_set_kernel = (cl_gpgpu_set_kernel_cb *)intel_gpgpu_set_kernel; cl_gpgpu_get_kernel = (cl_gpgpu_get_kernel_cb *)intel_gpgpu_get_kernel; if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8; cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8; intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8; intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //BDW need not restore SLM, same as gen7 intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; if(IS_CHERRYVIEW(device_id)) intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail; intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8; intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8; intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8; cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8; intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen8; intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8; intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8; cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) 
intel_gpgpu_bind_sampler_gen8; intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8; intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7; cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8; return; } if (IS_GEN9(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9; cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen9; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8; cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9; intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8; intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //SKL need not restore SLM, same as gen7 intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; if(IS_GEMINILAKE(device_id)) intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail; intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen9; intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen9; intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8; cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8; intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen9; intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8; intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8; cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8; intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8; intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen9; cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8; return; } cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen7; intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7; intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen7; cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen7; intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen7; intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen7; intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen7; intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7; if (IS_HASWELL(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75; cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75; intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75; intel_gpgpu_post_action = intel_gpgpu_post_action_gen75; intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen75; intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen75; } else if (IS_IVYBRIDGE(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7; cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen7; if (IS_BAYTRAIL_T(device_id)) { intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail; intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail; } else { intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7; intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; } cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7; intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7; intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen7; intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen7; } }
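/* Illustrative sketch (not part of the driver): intel_set_gpgpu_callbacks() is the single
   switch point that binds both the public cl_gpgpu_* callbacks and the file-local
   intel_gpgpu_* function pointers to the gen7/gen75/gen8/gen9 variants. A caller is
   expected to select the variants once per device and then use only the generic
   callbacks; the argument lists below are abbreviated, and ctx, kernel and the walker
   arguments are placeholders for values owned by the upper OpenCL runtime layer:

     intel_set_gpgpu_callbacks(device_id);    // bind the per-generation entry points
     cl_gpgpu gpgpu = cl_gpgpu_new(ctx);      // allocate the per-queue GPGPU state
     cl_gpgpu_states_setup(gpgpu, kernel);    // IDRT, samplers, binding table
     cl_gpgpu_walker(gpgpu, simd_sz, thread_n, off, dim_off, global_sz, local_sz);
     cl_gpgpu_flush(gpgpu);                   // submit the batch buffer
*/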