/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
* Alexei Soupikov
*/
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <errno.h>
#include <sys/utsname.h>
#include "intel/intel_gpgpu.h"
#include "intel/intel_defines.h"
#include "intel/intel_structs.h"
#include "program.h" // for BTI_RESERVED_NUM
#include "cl_alloc.h"
#include "cl_utils.h"
#include "cl_sampler.h"
#include "cl_accelerator_intel.h"
#ifndef CL_VERSION_1_2
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
#endif
#define GEN_CMD_MEDIA_OBJECT (0x71000000)
#define MO_TS_BIT (1 << 24)
#define MO_RETAIN_BIT (1 << 28)
#define SAMPLER_STATE_SIZE (16)
#define TIMESTAMP_ADDR 0x2358
/* Stores both binding tables and surface states */
typedef struct surface_heap {
uint32_t binding_table[256];
char surface[256*sizeof(gen_surface_state_t)];
} surface_heap_t;
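/* Each binding_table entry holds the byte offset, from the start of the heap,
 * of the corresponding surface state stored in the surface[] array above. */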
typedef struct intel_event {
drm_intel_bo *buffer;
drm_intel_bo *ts_buf;
int status;
} intel_event_t;
#define MAX_IF_DESC 32
typedef struct intel_gpgpu intel_gpgpu_t;
typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size);
intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL;
typedef void (intel_gpgpu_post_action_t)(intel_gpgpu_t *gpgpu, int32_t flush_mode);
intel_gpgpu_post_action_t *intel_gpgpu_post_action = NULL;
typedef uint64_t (intel_gpgpu_read_ts_reg_t)(drm_intel_bufmgr *bufmgr);
intel_gpgpu_read_ts_reg_t *intel_gpgpu_read_ts_reg = NULL;
typedef void (intel_gpgpu_set_base_address_t)(intel_gpgpu_t *gpgpu);
intel_gpgpu_set_base_address_t *intel_gpgpu_set_base_address = NULL;
typedef void (intel_gpgpu_setup_bti_t)(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
size_t size, unsigned char index, uint32_t format);
intel_gpgpu_setup_bti_t *intel_gpgpu_setup_bti = NULL;
typedef void (intel_gpgpu_load_vfe_state_t)(intel_gpgpu_t *gpgpu);
intel_gpgpu_load_vfe_state_t *intel_gpgpu_load_vfe_state = NULL;
typedef void (intel_gpgpu_build_idrt_t)(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel);
intel_gpgpu_build_idrt_t *intel_gpgpu_build_idrt = NULL;
typedef void (intel_gpgpu_load_curbe_buffer_t)(intel_gpgpu_t *gpgpu);
intel_gpgpu_load_curbe_buffer_t *intel_gpgpu_load_curbe_buffer = NULL;
typedef void (intel_gpgpu_load_idrt_t)(intel_gpgpu_t *gpgpu);
intel_gpgpu_load_idrt_t *intel_gpgpu_load_idrt = NULL;
typedef void (intel_gpgpu_pipe_control_t)(intel_gpgpu_t *gpgpu);
intel_gpgpu_pipe_control_t *intel_gpgpu_pipe_control = NULL;
typedef void (intel_gpgpu_select_pipeline_t)(intel_gpgpu_t *gpgpu);
intel_gpgpu_select_pipeline_t *intel_gpgpu_select_pipeline = NULL;
static void
intel_gpgpu_sync(void *buf)
{
if (buf)
drm_intel_bo_wait_rendering((drm_intel_bo *)buf);
}
static void *intel_gpgpu_ref_batch_buf(intel_gpgpu_t *gpgpu)
{
if (gpgpu->batch->last_bo)
drm_intel_bo_reference(gpgpu->batch->last_bo);
return gpgpu->batch->last_bo;
}
static void intel_gpgpu_unref_batch_buf(void *buf)
{
if (buf)
drm_intel_bo_unreference((drm_intel_bo *)buf);
}
static void
intel_gpgpu_delete_finished(intel_gpgpu_t *gpgpu)
{
if (gpgpu == NULL)
return;
if(gpgpu->time_stamp_b.bo)
drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
if(gpgpu->printf_b.bo)
drm_intel_bo_unreference(gpgpu->printf_b.bo);
if (gpgpu->aux_buf.bo)
drm_intel_bo_unreference(gpgpu->aux_buf.bo);
if (gpgpu->perf_b.bo)
drm_intel_bo_unreference(gpgpu->perf_b.bo);
if (gpgpu->stack_b.bo)
drm_intel_bo_unreference(gpgpu->stack_b.bo);
if (gpgpu->scratch_b.bo)
drm_intel_bo_unreference(gpgpu->scratch_b.bo);
if (gpgpu->profiling_b.bo)
drm_intel_bo_unreference(gpgpu->profiling_b.bo);
if(gpgpu->constant_b.bo)
drm_intel_bo_unreference(gpgpu->constant_b.bo);
intel_batchbuffer_delete(gpgpu->batch);
cl_free(gpgpu);
}
/* Destroy all intel_gpgpu objects, finished or not, when the driver is destroyed */
void intel_gpgpu_delete_all(intel_driver_t *drv)
{
struct intel_gpgpu_node *p;
if(drv->gpgpu_list == NULL)
return;
PPTHREAD_MUTEX_LOCK(drv);
while(drv->gpgpu_list) {
p = drv->gpgpu_list;
drv->gpgpu_list = p->next;
intel_gpgpu_delete_finished(p->gpgpu);
cl_free(p);
}
PPTHREAD_MUTEX_UNLOCK(drv);
}
static void
intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
{
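  /* Reap any queued gpgpu whose batch has completed, then free this one
   * immediately if its own batch is idle, or queue it on drv->gpgpu_list
   * otherwise. */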
if (gpgpu == NULL)
return;
intel_driver_t *drv = gpgpu->drv;
struct intel_gpgpu_node *p, *node;
PPTHREAD_MUTEX_LOCK(drv);
p = drv->gpgpu_list;
if(p) {
node = p->next;
while(node) {
if(node->gpgpu->batch && node->gpgpu->batch->buffer &&
!drm_intel_bo_busy(node->gpgpu->batch->buffer)) {
p->next = node->next;
intel_gpgpu_delete_finished(node->gpgpu);
cl_free(node);
node = p->next;
} else {
p = node;
node = node->next;
}
}
node = drv->gpgpu_list;
if(node->gpgpu->batch && node->gpgpu->batch->buffer &&
!drm_intel_bo_busy(node->gpgpu->batch->buffer)) {
drv->gpgpu_list = drv->gpgpu_list->next;
intel_gpgpu_delete_finished(node->gpgpu);
cl_free(node);
}
}
if(gpgpu->batch && gpgpu->batch->buffer &&
drm_intel_bo_busy(gpgpu->batch->buffer)) {
TRY_ALLOC_NO_ERR (node, CALLOC(struct intel_gpgpu_node));
node->gpgpu = gpgpu;
node->next = NULL;
p = drv->gpgpu_list;
if(p == NULL)
drv->gpgpu_list= node;
else {
while(p->next)
p = p->next;
p->next = node;
}
} else
intel_gpgpu_delete_finished(gpgpu);
error:
PPTHREAD_MUTEX_UNLOCK(drv);
}
static intel_gpgpu_t*
intel_gpgpu_new(intel_driver_t *drv)
{
intel_gpgpu_t *state = NULL;
TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t));
state->drv = drv;
state->batch = intel_batchbuffer_new(state->drv);
assert(state->batch);
exit:
return state;
error:
intel_gpgpu_delete(state);
state = NULL;
goto exit;
}
static void
intel_gpgpu_select_pipeline_gen7(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 1);
OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_select_pipeline_gen9(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 1);
OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MASK | PIPELINE_SELECT_GPGPU);
ADVANCE_BATCH(gpgpu->batch);
}
static uint32_t
intel_gpgpu_get_cache_ctrl_gen7()
{
return cc_llc_l3;
}
static uint32_t
intel_gpgpu_get_cache_ctrl_gen75()
{
return llccc_ec | l3cc_ec;
}
static uint32_t
intel_gpgpu_get_cache_ctrl_gen8()
{
return tcc_llc_ec_l3 | mtllc_wb;
}
static uint32_t
intel_gpgpu_get_cache_ctrl_gen9()
{
  //Kernel-defined cache control: MOCS entry 2 ->
  //L3CC: WB; LeCC: WB; TC: LLC/eLLC;
int major = 0, minor = 0;
int mocs_index = 0x2;
struct utsname buf;
uname(&buf);
sscanf(buf.release, "%d.%d", &major, &minor);
  //Since Linux 4.3 the kernel redefines the MOCS table values,
  //but before 4.3 the hardware default values are still used.
  if(strcmp(buf.sysname, "Linux") == 0 &&
     major == 4 && minor < 3) { /* the kernel supports SKL from 4.x, so only check major == 4 */
mocs_index = 0x9;
}
return (mocs_index << 1);
}
static void
intel_gpgpu_set_base_address_gen7(intel_gpgpu_t *gpgpu)
{
const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
BEGIN_BATCH(gpgpu->batch, 10);
OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
/* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY); /* General State Base Addr */
/* 0, State Mem Obj CC */
  /* We use a state base address for the surface heap since IVB clamps the
   * binding table pointer to 11 bits, so we cannot use pointers directly
   * while using the surface heap.
   */
assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_INSTRUCTION,
I915_GEM_DOMAIN_INSTRUCTION,
gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY));
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
/* According to mesa i965 driver code, we must set the dynamic state access upper bound
* to a valid bound value, otherwise, the border color pointer may be rejected and you
* may get incorrect border color. This is a known hardware bug. */
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_set_base_address_gen8(intel_gpgpu_t *gpgpu)
{
const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
BEGIN_BATCH(gpgpu->batch, 16);
OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 14);
/* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY); /* General State Base Addr */
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16));
/* 0, State Mem Obj CC */
  /* We use a state base address for the surface heap since IVB clamps the
   * binding table pointer to 11 bits, so we cannot use pointers directly
   * while using the surface heap.
   */
assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_SAMPLER,
I915_GEM_DOMAIN_SAMPLER,
gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
OUT_BATCH(gpgpu->batch, 0);
OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
(0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
OUT_BATCH(gpgpu->batch, 0);
//OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo,
I915_GEM_DOMAIN_INSTRUCTION,
I915_GEM_DOMAIN_INSTRUCTION,
0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
/* According to mesa i965 driver code, we must set the dynamic state access upper bound
* to a valid bound value, otherwise, the border color pointer may be rejected and you
* may get incorrect border color. This is a known hardware bug. */
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_set_base_address_gen9(intel_gpgpu_t *gpgpu)
{
const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
BEGIN_BATCH(gpgpu->batch, 19);
OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 17);
/* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY); /* General State Base Addr */
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16));
/* 0, State Mem Obj CC */
  /* We use a state base address for the surface heap since IVB clamps the
   * binding table pointer to 11 bits, so we cannot use pointers directly
   * while using the surface heap.
   */
assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_SAMPLER,
I915_GEM_DOMAIN_SAMPLER,
gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
OUT_BATCH(gpgpu->batch, 0);
OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
(0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
OUT_BATCH(gpgpu->batch, 0);
//OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo,
I915_GEM_DOMAIN_INSTRUCTION,
I915_GEM_DOMAIN_INSTRUCTION,
0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
/* According to mesa i965 driver code, we must set the dynamic state access upper bound
* to a valid bound value, otherwise, the border color pointer may be rejected and you
* may get incorrect border color. This is a known hardware bug. */
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
/* Bindless surface state base address */
OUT_BATCH(gpgpu->batch, (def_cc << 4) | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0xfffff000);
ADVANCE_BATCH(gpgpu->batch);
}
uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
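  /* Gen7 encodes the per-thread scratch space as (size in KB) - 1, e.g. 8KB -> 7. */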
return size / 1024 - 1;
}
uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
  //Aligned in the backend; if the size is not a power of two, it must be aligned when the scratch bo is allocated.
assert((size & (size - 1)) == 0);
size = size >> 11;
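  /* The encoded value is log2(size / 2KB), e.g. 8KB -> 8192 >> 11 = 4 -> index 2. */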
uint32_t index = 0;
while((size >>= 1) > 0)
index++; //get leading one
return index;
}
uint32_t intel_gpgpu_get_scratch_index_gen8(uint32_t size) {
  //Aligned in the backend; if the size is not a power of two, it must be aligned when the scratch bo is allocated.
assert((size & (size - 1)) == 0);
size = size >> 10;
uint32_t index = 0;
while((size >>= 1) > 0)
index++; //get leading one
return index;
}
static cl_int
intel_gpgpu_get_max_curbe_size(uint32_t device_id)
{
if (IS_BAYTRAIL_T(device_id) ||
IS_IVB_GT1(device_id))
return 992;
else
return 2016;
}
static cl_int
intel_gpgpu_get_curbe_size(intel_gpgpu_t *gpgpu)
{
int curbe_size = gpgpu->curb.size_cs_entry * gpgpu->curb.num_cs_entries;
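  /* curbe_size is expressed in 32-byte units; the CURBE load commands
   * multiply it by 32 to get the length in bytes. */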
int max_curbe_size = intel_gpgpu_get_max_curbe_size(gpgpu->drv->device_id);
if (curbe_size > max_curbe_size) {
fprintf(stderr, "warning, curbe size exceed limitation.\n");
return max_curbe_size;
} else
return curbe_size;
}
static void
intel_gpgpu_load_vfe_state_gen7(intel_gpgpu_t *gpgpu)
{
int32_t scratch_index;
BEGIN_BATCH(gpgpu->batch, 8);
OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
if(gpgpu->per_thread_scratch > 0) {
scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
scratch_index);
}
else {
OUT_BATCH(gpgpu->batch, 0);
}
  /* max_thread | urb entries | (reset_gateway | bypass_gateway | gpgpu_mode) */
OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (0 << 8) | 0xc4);
OUT_BATCH(gpgpu->batch, 0);
/* curbe_size */
OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu));
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_load_vfe_state_gen8(intel_gpgpu_t *gpgpu)
{
int32_t scratch_index;
BEGIN_BATCH(gpgpu->batch, 9);
OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (9-2));
if(gpgpu->per_thread_scratch > 0) {
scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
scratch_index);
}
else {
OUT_BATCH(gpgpu->batch, 0);
}
OUT_BATCH(gpgpu->batch, 0);
  /* max_thread | urb entries | (reset_gateway | bypass_gateway | gpgpu_mode) */
OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (2 << 8) | 0xc0); //urb entries can't be 0
OUT_BATCH(gpgpu->batch, 0);
/* urb entries size | curbe_size */
OUT_BATCH(gpgpu->batch, 2<<16 | intel_gpgpu_get_curbe_size(gpgpu));
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_load_curbe_buffer_gen7(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
OUT_BATCH(gpgpu->batch, 0); /* mbz */
OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32);
OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.curbe_offset);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_load_curbe_buffer_gen8(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
OUT_BATCH(gpgpu->batch, 0); /* mbz */
OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32);
OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.curbe_offset);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_load_idrt_gen7(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
OUT_BATCH(gpgpu->batch, 0); /* mbz */
OUT_BATCH(gpgpu->batch, 1 << 5);
OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.idrt_offset);
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_load_idrt_gen8(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
OUT_BATCH(gpgpu->batch, 0); /* mbz */
OUT_BATCH(gpgpu->batch, 1 << 5);
OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.idrt_offset);
ADVANCE_BATCH(gpgpu->batch);
}
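/* Gen7/Gen75 L3 partitioning configurations: the SLM setup code below picks
 * entry 4 (no SLM) or entry 12 (SLM enabled) from these two tables. */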
static const uint32_t gpgpu_l3_config_reg1[] = {
0x00080040, 0x02040040, 0x00800040, 0x01000038,
0x02000030, 0x01000038, 0x00000038, 0x00000040,
0x0A140091, 0x09100091, 0x08900091, 0x08900091,
0x010000a1
};
static const uint32_t gpgpu_l3_config_reg2[] = {
0x00000000, 0x00000000, 0x00080410, 0x00080410,
0x00040410, 0x00040420, 0x00080420, 0x00080020,
0x00204080, 0x00244890, 0x00284490, 0x002444A0,
0x00040810
};
/* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */
static void
intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
{
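  /* idx 0 is the start timestamp and idx 1 the end timestamp; each is a
   * 64-bit value written at idx * sizeof(uint64_t) into time_stamp_b. */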
BEGIN_BATCH(gpgpu->batch, 5);
OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2));
OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo,
I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, 0);
  ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_pipe_control_gen7(intel_gpgpu_t *gpgpu)
{
gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
memset(pc, 0, sizeof(*pc));
pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
pc->dw1.render_target_cache_flush_enable = 1;
pc->dw1.texture_cache_invalidation_enable = 1;
pc->dw1.cs_stall = 1;
pc->dw1.dc_flush_enable = 1;
//pc->dw1.instruction_cache_invalidate_enable = 1;
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_pipe_control_gen75(intel_gpgpu_t *gpgpu)
{
gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
memset(pc, 0, sizeof(*pc));
pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
pc->dw1.cs_stall = 1;
pc->dw1.dc_flush_enable = 1;
pc = (gen6_pipe_control_t*)
intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
memset(pc, 0, sizeof(*pc));
pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
pc->dw1.render_target_cache_flush_enable = 1;
pc->dw1.texture_cache_invalidation_enable = 1;
pc->dw1.cs_stall = 1;
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_pipe_control_gen8(intel_gpgpu_t *gpgpu)
{
gen8_pipe_control_t* pc = (gen8_pipe_control_t*)
intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen8_pipe_control_t));
memset(pc, 0, sizeof(*pc));
pc->dw0.length = SIZEOF32(gen8_pipe_control_t) - 2;
pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
pc->dw1.render_target_cache_flush_enable = 1;
pc->dw1.texture_cache_invalidation_enable = 1;
pc->dw1.cs_stall = 1;
pc->dw1.dc_flush_enable = 1;
//pc->dw1.instruction_cache_invalidate_enable = 1;
ADVANCE_BATCH(gpgpu->batch);
}
static void
intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
BEGIN_BATCH(gpgpu->batch, 9);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
OUT_BATCH(gpgpu->batch, 0x00A00000);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
if (use_slm)
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
if (use_slm)
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
ADVANCE_BATCH(gpgpu->batch);
intel_gpgpu_pipe_control(gpgpu);
}
static void
intel_gpgpu_set_L3_baytrail(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
BEGIN_BATCH(gpgpu->batch, 9);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
OUT_BATCH(gpgpu->batch, 0x00D30000); /* General credit : High credit = 26 : 6 */
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
if (use_slm)
OUT_BATCH(gpgpu->batch, 0x01020021); /* {SLM=64, URB=96, DC=16, RO=16, Sum=192} */
else
OUT_BATCH(gpgpu->batch, 0x02040040); /* {SLM=0, URB=128, DC=32, RO=32, Sum=192} */
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
OUT_BATCH(gpgpu->batch, 0x0); /* {I/S=0, Const=0, Tex=0} */
ADVANCE_BATCH(gpgpu->batch);
intel_gpgpu_pipe_control(gpgpu);
}
static void
intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
/* still set L3 in batch buffer for fulsim. */
if(gpgpu->drv->atomic_test_result != SELF_TEST_ATOMIC_FAIL)
{
BEGIN_BATCH(gpgpu->batch, 15);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
    /* FIXME: the KMD always disables atomics in L3 for some reason.
       I checked the spec and don't think we need that workaround now.
       Until a patch is sent to the kernel, just enable it here. */
OUT_BATCH(gpgpu->batch, HSW_SCRATCH1_OFFSET);
OUT_BATCH(gpgpu->batch, 0); /* enable atomic in L3 */
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, HSW_ROW_CHICKEN3_HDC_OFFSET);
OUT_BATCH(gpgpu->batch, (1 << 6ul) << 16); /* enable atomic in L3 */
}
else
{
BEGIN_BATCH(gpgpu->batch, 9);
}
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
OUT_BATCH(gpgpu->batch, 0x08800000);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
if (use_slm)
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
if (use_slm)
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
ADVANCE_BATCH(gpgpu->batch);
//if(use_slm)
// gpgpu->batch->enable_slm = 1;
intel_gpgpu_pipe_control(gpgpu);
}
static void
intel_gpgpu_set_L3_gen8(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
BEGIN_BATCH(gpgpu->batch, 3);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN8_L3_CNTL_REG_ADDRESS_OFFSET);
  // FIXME: this is a workaround for random hangs when switching SLM between enabled and disabled
if(use_slm)
OUT_BATCH(gpgpu->batch, 0x60000121); /* {SLM=192, URB=128, Rest=384} */
else
OUT_BATCH(gpgpu->batch, 0x60000160); /* {SLM=0, URB=384, Rest=384, Sum=768} */
//if(use_slm)
// gpgpu->batch->enable_slm = 1;
intel_gpgpu_pipe_control(gpgpu);
}
static void
intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
{
intel_batchbuffer_start_atomic(gpgpu->batch, 256);
intel_gpgpu_pipe_control(gpgpu);
assert(intel_gpgpu_set_L3);
intel_gpgpu_set_L3(gpgpu, gpgpu->ker->use_slm);
intel_gpgpu_select_pipeline(gpgpu);
intel_gpgpu_set_base_address(gpgpu);
intel_gpgpu_load_vfe_state(gpgpu);
intel_gpgpu_load_curbe_buffer(gpgpu);
intel_gpgpu_load_idrt(gpgpu);
if (gpgpu->perf_b.bo) {
BEGIN_BATCH(gpgpu->batch, 3);
OUT_BATCH(gpgpu->batch,
(0x28 << 23) | /* MI_REPORT_PERF_COUNT */
(3 - 2)); /* length-2 */
OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
0 | /* Offset for the start "counters" */
1); /* Use GTT and not PGTT */
OUT_BATCH(gpgpu->batch, 0);
ADVANCE_BATCH(gpgpu->batch);
}
  /* Insert a PIPE_CONTROL for the start timestamp */
if (gpgpu->time_stamp_b.bo)
intel_gpgpu_write_timestamp(gpgpu, 0);
}
static void
intel_gpgpu_post_action_gen7(intel_gpgpu_t *gpgpu, int32_t flush_mode)
{
if(flush_mode)
intel_gpgpu_pipe_control(gpgpu);
}
static void
intel_gpgpu_post_action_gen75(intel_gpgpu_t *gpgpu, int32_t flush_mode)
{
/* flush force for set L3 */
intel_gpgpu_pipe_control(gpgpu);
  /* Restore the L3 control to disable SLM mode;
     otherwise it may affect the 3D pipeline. */
intel_gpgpu_set_L3(gpgpu, 0);
}
static void
intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
{
  /* Insert a PIPE_CONTROL for the end timestamp */
if (gpgpu->time_stamp_b.bo)
intel_gpgpu_write_timestamp(gpgpu, 1);
/* Insert the performance counter command */
if (gpgpu->perf_b.bo) {
BEGIN_BATCH(gpgpu->batch, 3);
OUT_BATCH(gpgpu->batch,
(0x28 << 23) | /* MI_REPORT_PERF_COUNT */
(3 - 2)); /* length-2 */
OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
512 | /* Offset for the end "counters" */
1); /* Use GTT and not PGTT */
OUT_BATCH(gpgpu->batch, 0);
ADVANCE_BATCH(gpgpu->batch);
}
intel_gpgpu_post_action(gpgpu, flush_mode);
intel_batchbuffer_end_atomic(gpgpu->batch);
}
static int
intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
{
return intel_batchbuffer_reset(gpgpu->batch, sz);
}
static int
intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
{
if (!gpgpu->batch || !gpgpu->batch->buffer)
return 0;
return intel_batchbuffer_flush(gpgpu->batch);
  /* FIXME:
     The old assert on a bound buffer offset of 0 was removed here. It tried to
     guard the possible NULL-pointer check in kernels (e.g. the
     "runtime_null_kernel_arg" case), but treating buffer offset 0 as NULL is
     wrong: it breaks normal kernels that have no such NULL-pointer check yet
     do get a buffer at offset 0 (which is possible now and will be common once
     full PPGTT is on). The NULL-pointer check needs to be fixed differently.
  */
}
static int
intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
uint32_t max_threads,
uint32_t size_cs_entry,
int profiling)
{
drm_intel_bo *bo;
  /* Bound buffers */
gpgpu->binded_n = 0;
gpgpu->img_bitmap = 0;
gpgpu->img_index_base = 3;
gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
/* URB */
gpgpu->curb.num_cs_entries = 64;
gpgpu->curb.size_cs_entry = size_cs_entry;
gpgpu->max_threads = max_threads;
if (gpgpu->printf_b.bo)
dri_bo_unreference(gpgpu->printf_b.bo);
gpgpu->printf_b.bo = NULL;
if (gpgpu->profiling_b.bo)
dri_bo_unreference(gpgpu->profiling_b.bo);
gpgpu->profiling_b.bo = NULL;
  /* Set the profiling buffer */
if(gpgpu->time_stamp_b.bo)
dri_bo_unreference(gpgpu->time_stamp_b.bo);
gpgpu->time_stamp_b.bo = NULL;
if (profiling) {
bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
gpgpu->time_stamp_b.bo = bo;
if (!bo)
fprintf(stderr, "Could not allocate buffer for profiling.\n");
}
/* stack */
if (gpgpu->stack_b.bo)
dri_bo_unreference(gpgpu->stack_b.bo);
gpgpu->stack_b.bo = NULL;
/* Set the auxiliary buffer*/
uint32_t size_aux = 0;
if(gpgpu->aux_buf.bo)
dri_bo_unreference(gpgpu->aux_buf.bo);
gpgpu->aux_buf.bo = NULL;
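  /* The aux buffer is laid out as: surface heap (binding table + surface
   * states) | curbe | interface descriptors (idrt) | sampler states |
   * sampler border colors, with the alignment constraints noted below. */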
  /* Begin with the surface heap to make sure it is page aligned,
     because STATE_BASE_ADDRESS only keeps the upper 20 bits of the address. */
gpgpu->aux_offset.surface_heap_offset = size_aux;
size_aux += sizeof(surface_heap_t);
  //curbe must be at least 32-byte aligned; align to 64 bytes here
size_aux = ALIGN(size_aux, 64);
gpgpu->aux_offset.curbe_offset = size_aux;
size_aux += gpgpu->curb.num_cs_entries * gpgpu->curb.size_cs_entry * 32;
//idrt must be 32 bytes aligned
size_aux = ALIGN(size_aux, 32);
gpgpu->aux_offset.idrt_offset = size_aux;
size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
  //must be 32-byte aligned
  //sampler state and VME state share the same buffer
size_aux = ALIGN(size_aux, 32);
gpgpu->aux_offset.sampler_state_offset = size_aux;
size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t));
//sampler border color state must be 32 bytes aligned
size_aux = ALIGN(size_aux, 32);
gpgpu->aux_offset.sampler_border_color_state_offset = size_aux;
size_aux += GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t);
/* make sure aux buffer is page aligned */
size_aux = ALIGN(size_aux, 4096);
bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 4096);
if (!bo || dri_bo_map(bo, 1) != 0) {
fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
if (bo)
dri_bo_unreference(bo);
if (profiling && gpgpu->time_stamp_b.bo)
dri_bo_unreference(gpgpu->time_stamp_b.bo);
gpgpu->time_stamp_b.bo = NULL;
return -1;
}
memset(bo->virtual, 0, size_aux);
gpgpu->aux_buf.bo = bo;
return 0;
}
static void
intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
heap->binding_table[index] = offsetof(surface_heap_t, surface) +
index * sizeof(gen7_surface_state_t);
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
obj_bo_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen7_surface_state_t, ss1),
obj_bo);
}
static void
intel_gpgpu_set_buf_reloc_for_vme_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
heap->binding_table[index] = offsetof(surface_heap_t, surface) +
index * sizeof(gen7_surface_state_t);
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
obj_bo_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen7_media_surface_state_t, ss0),
obj_bo);
}
static dri_bo*
intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
{
if(gpgpu->constant_b.bo)
dri_bo_unreference(gpgpu->constant_b.bo);
gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size, 64);
if (gpgpu->constant_b.bo == NULL)
return NULL;
intel_gpgpu_setup_bti(gpgpu, gpgpu->constant_b.bo, 0, size, bti, I965_SURFACEFORMAT_R32G32B32A32_UINT);
return gpgpu->constant_b.bo;
}
static void
intel_gpgpu_setup_bti_gen7(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
size_t size, unsigned char index, uint32_t format)
{
assert(size <= (2ul<<30));
size_t s = size - 1;
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
memset(ss0, 0, sizeof(gen7_surface_state_t));
ss0->ss0.surface_type = I965_SURFACE_BUFFER;
ss0->ss0.surface_format = format;
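  /* For buffer surfaces, (size - 1) is split across width (bits 6:0),
   * height (bits 20:7) and depth (bits 30:21); e.g. a 64MB buffer yields
   * width = 0x7f, height = 0x3fff, depth = 0x1f. */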
ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
  // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
if(format == I965_SURFACEFORMAT_RAW)
assert((ss0->ss2.width & 0x03) == 3);
ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
ss0->ss1.base_addr = buf->offset + internal_offset;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
internal_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen7_surface_state_t, ss1),
buf);
}
static void
intel_gpgpu_setup_bti_gen75(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
size_t size, unsigned char index, uint32_t format)
{
assert(size <= (2ul<<30));
size_t s = size - 1;
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
memset(ss0, 0, sizeof(gen7_surface_state_t));
ss0->ss0.surface_type = I965_SURFACE_BUFFER;
ss0->ss0.surface_format = format;
if(format != I965_SURFACEFORMAT_RAW) {
ss0->ss7.shader_r = I965_SURCHAN_SELECT_RED;
ss0->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
ss0->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
ss0->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
}
ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
  // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
if(format == I965_SURFACEFORMAT_RAW)
assert((ss0->ss2.width & 0x03) == 3);
ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
ss0->ss1.base_addr = buf->offset + internal_offset;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
internal_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen7_surface_state_t, ss1),
buf);
}
static void
intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
size_t size, unsigned char index, uint32_t format)
{
assert(size <= (2ul<<30));
size_t s = size - 1;
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen8_surface_state_t *ss0 = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
memset(ss0, 0, sizeof(gen8_surface_state_t));
ss0->ss0.surface_type = I965_SURFACE_BUFFER;
ss0->ss0.surface_format = format;
if(format != I965_SURFACEFORMAT_RAW) {
ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
}
ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
  // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
if(format == I965_SURFACEFORMAT_RAW)
assert((ss0->ss2.width & 0x03) == 3);
ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
ss0->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t);
ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff;
ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
internal_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen8_surface_state_t, ss8),
buf);
}
static void
intel_gpgpu_setup_bti_gen9(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
size_t size, unsigned char index, uint32_t format)
{
assert(size <= (4ul<<30));
size_t s = size - 1;
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen8_surface_state_t *ss0 = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
memset(ss0, 0, sizeof(gen8_surface_state_t));
ss0->ss0.surface_type = I965_SURFACE_BUFFER;
ss0->ss0.surface_format = format;
if(format != I965_SURFACEFORMAT_RAW) {
ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
}
ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
  // Per bspec, for I965_SURFACE_BUFFER with the RAW format the size must be a multiple of 4 bytes.
if(format == I965_SURFACEFORMAT_RAW)
assert((ss0->ss2.width & 0x03) == 3);
ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
  ss0->ss3.depth  = (s >> 21) & 0x7ff;   /* bits 31:21 of sz; per bspec only Gen9 supports this */
ss0->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t);
ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff;
ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
internal_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen8_surface_state_t, ss8),
buf);
}
static int
intel_is_surface_array(cl_mem_object_type type)
{
if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
return 1;
return 0;
}
static int
intel_get_surface_type(cl_mem_object_type type)
{
switch (type) {
case CL_MEM_OBJECT_IMAGE1D:
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
return I965_SURFACE_1D;
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
case CL_MEM_OBJECT_IMAGE2D:
case CL_MEM_OBJECT_IMAGE2D_ARRAY:
return I965_SURFACE_2D;
case CL_MEM_OBJECT_IMAGE3D:
return I965_SURFACE_3D;
default:
assert(0);
}
return 0;
}
/* Get the fixed surface type. If it is a 1D array image with a large index,
   we need to fix it up to a 2D type, due to a Gen7/Gen75 sampler issue
   on integer-type surfaces with clamp address mode and nearest filter mode.
 */
static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type)
{
uint32_t surface_type;
  //For now all platforms need it, so the platform check is disabled;
  //re-enable it when some platform no longer needs this workaround.
if (/*((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
IS_HASWELL(gpgpu->drv->device_id) ||
IS_BROADWELL(gpgpu->drv->device_id) ||
IS_CHERRYVIEW(gpgpu->drv->device_id) ||
IS_SKYLAKE(gpgpu->drv->device_id) ||
IS_BROXTON(gpgpu->drv->device_id) ||
        IS_KABYLAKE(gpgpu->drv->device_id))) && */
index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
surface_type = I965_SURFACE_2D;
else
surface_type = intel_get_surface_type(type);
return surface_type;
}
static void
intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
uint32_t obj_bo_offset,
uint32_t format,
cl_mem_object_type type,
uint32_t bpp,
int32_t w,
int32_t h,
int32_t depth,
int32_t pitch,
int32_t slice_pitch,
int32_t tiling)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen7_surface_state_t *ss = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
memset(ss, 0, sizeof(*ss));
ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
if (intel_is_surface_array(type)) {
ss->ss0.surface_array = 1;
ss->ss0.surface_array_spacing = 1;
}
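  /* A tiled surface must start on a 4KB-aligned address: align the bo offset
   * down to 4KB and fold the remaining full rows into ss5.y_offset (halved
   * here on Gen7; the Gen8+ paths divide by 4 instead). */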
if (obj_bo_offset && tiling != GPGPU_NO_TILE) {
uint32_t unaligned = obj_bo_offset;
obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000;
uint32_t h_ = (unaligned - obj_bo_offset )/ pitch;
ss->ss5.y_offset = h_ / 2;
}
ss->ss0.surface_format = format;
ss->ss1.base_addr = obj_bo->offset + obj_bo_offset;
ss->ss2.width = w - 1;
ss->ss2.height = h - 1;
ss->ss3.depth = depth - 1;
ss->ss4.not_str_buf.rt_view_extent = depth - 1;
ss->ss4.not_str_buf.min_array_element = 0;
ss->ss3.pitch = pitch - 1;
ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
if (tiling == GPGPU_TILE_X) {
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
} else if (tiling == GPGPU_TILE_Y) {
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
}
ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
assert(index < GEN_MAX_SURFACES);
}
static void
intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
uint32_t obj_bo_offset,
uint32_t format,
cl_mem_object_type type,
uint32_t bpp,
int32_t w,
int32_t h,
int32_t depth,
int32_t pitch,
int32_t slice_pitch,
int32_t tiling)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen7_media_surface_state_t *ss = (gen7_media_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
memset(ss, 0, sizeof(*ss));
ss->ss0.base_addr = obj_bo->offset + obj_bo_offset;
ss->ss1.uv_offset_v_direction = 0;
ss->ss1.pic_struct = 0;
ss->ss1.width = w - 1;
ss->ss1.height = h - 1;
if (tiling == GPGPU_NO_TILE) {
ss->ss2.tile_mode = 0;
}
else if (tiling == GPGPU_TILE_X){
ss->ss2.tile_mode = 2;
}
else if (tiling == GPGPU_TILE_Y){
ss->ss2.tile_mode = 3;
}
ss->ss2.half_pitch_for_chroma = 0;
ss->ss2.surface_pitch = pitch - 1;
ss->ss2.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
ss->ss2.interleave_chroma = 0;
ss->ss2.surface_format = 12; //Y8_UNORM
ss->ss3.y_offset_for_u = 0;
ss->ss3.x_offset_for_u = 0;
ss->ss4.y_offset_for_v = 0;
ss->ss4.x_offset_for_v = 0;
intel_gpgpu_set_buf_reloc_for_vme_gen7(gpgpu, index, obj_bo, obj_bo_offset);
assert(index < GEN_MAX_SURFACES);
}
static void
intel_gpgpu_bind_image_for_vme_gen9(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
uint32_t obj_bo_offset,
uint32_t format,
cl_mem_object_type type,
uint32_t bpp,
int32_t w,
int32_t h,
int32_t depth,
int32_t pitch,
int32_t slice_pitch,
int32_t tiling)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen9_media_surface_state_t *ss = (gen9_media_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
memset(ss, 0, sizeof(gen8_surface_state_t));
ss->ss0.rotation = 0; //++
ss->ss1.uv_offset_v_direction = 0;
ss->ss1.pic_struct = 0;
ss->ss1.width = w - 1;
ss->ss1.height = h - 1;
if (tiling == GPGPU_NO_TILE) {
ss->ss2.tile_mode = 0;
}
else if (tiling == GPGPU_TILE_X){
ss->ss2.tile_mode = 2;
}
else if (tiling == GPGPU_TILE_Y){
ss->ss2.tile_mode = 3;
}
ss->ss2.half_pitch_for_chroma = 0;
ss->ss2.surface_pitch = pitch - 1;
ss->ss2.address_control = 1; //++ CLAMP: 0; MIRROR:1;
ss->ss2.mem_compress_enable = 0; //++
ss->ss2.mem_compress_mode = 0; //++
ss->ss2.uv_offset_v_direction_msb = 0; //++
ss->ss2.uv_offset_u_direction = 0; //++
ss->ss2.interleave_chroma = 0;
ss->ss2.surface_format = 12; //Y8_UNORM
//ss->ss2.surface_format = 4; //PLANAR_420_8
ss->ss3.y_offset_for_u = 0;
ss->ss3.x_offset_for_u = 0;
ss->ss4.y_offset_for_v = 0;
ss->ss4.x_offset_for_v = 0;
ss->ss5.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
ss->ss5.tiled_res_mode = 0; //++ TRMODE_NONE: 0; TRMODE_TILEYF: 1; TRMODE_TILEYS:2
ss->ss5.vert_line_stride_offset = 0; //++
ss->ss5.vert_line_stride = 0; //++
ss->ss6.base_addr = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff; //
ss->ss7.base_addr_high = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff; //
heap->binding_table[index] = offsetof(surface_heap_t, surface) +
index * surface_state_sz;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
obj_bo_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen9_media_surface_state_t, ss6),
obj_bo);
assert(index < GEN_MAX_SURFACES);
}
static void
intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
uint32_t obj_bo_offset,
uint32_t format,
cl_mem_object_type type,
uint32_t bpp,
int32_t w,
int32_t h,
int32_t depth,
int32_t pitch,
int32_t slice_pitch,
int32_t tiling)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen7_surface_state_t *ss = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
memset(ss, 0, sizeof(*ss));
ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
if (intel_is_surface_array(type)) {
ss->ss0.surface_array = 1;
ss->ss0.surface_array_spacing = 1;
}
if (obj_bo_offset && tiling != GPGPU_NO_TILE) {
uint32_t unaligned = obj_bo_offset;
obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000;
uint32_t h_ = (unaligned - obj_bo_offset )/ pitch;
ss->ss5.y_offset = h_ / 2;
}
ss->ss0.surface_format = format;
ss->ss1.base_addr = obj_bo->offset + obj_bo_offset;
ss->ss2.width = w - 1;
ss->ss2.height = h - 1;
ss->ss3.depth = depth - 1;
ss->ss4.not_str_buf.rt_view_extent = depth - 1;
ss->ss4.not_str_buf.min_array_element = 0;
ss->ss3.pitch = pitch - 1;
ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
ss->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
if (tiling == GPGPU_TILE_X) {
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
} else if (tiling == GPGPU_TILE_Y) {
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
}
ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
assert(index < GEN_MAX_SURFACES);
}
static void
intel_gpgpu_bind_image_gen8(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
uint32_t obj_bo_offset,
uint32_t format,
cl_mem_object_type type,
uint32_t bpp,
int32_t w,
int32_t h,
int32_t depth,
int32_t pitch,
int32_t slice_pitch,
int32_t tiling)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen8_surface_state_t *ss = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
memset(ss, 0, sizeof(*ss));
ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
ss->ss0.surface_format = format;
if (intel_is_surface_array(type)) {
ss->ss0.surface_array = 1;
ss->ss1.surface_qpitch = (h + 3)/4;
}
ss->ss0.horizontal_alignment = 1;
ss->ss0.vertical_alignment = 1;
if (tiling == GPGPU_TILE_X) {
ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR;
} else if (tiling == GPGPU_TILE_Y) {
ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR;
} else
assert(tiling == GPGPU_NO_TILE);// W mode is not supported now.
ss->ss2.width = w - 1;
ss->ss2.height = h - 1;
ss->ss3.depth = depth - 1;
if(obj_bo_offset && tiling != GPGPU_NO_TILE) {
uint32_t unaligned = obj_bo_offset;
obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000;
uint32_t h_ = (unaligned - obj_bo_offset) / pitch;
ss->ss5.y_offset = h_ / 4;
}
ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff;
ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff;
ss->ss4.render_target_view_ext = depth - 1;
ss->ss4.min_array_elt = 0;
ss->ss3.surface_pitch = pitch - 1;
ss->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
  //NV12 surface: the allocated height is 3/2 * h, so set the proper UV plane offset here.
if (format == I965_SURFACEFORMAT_PLANAR_420_8)
ss->ss6.uv_plane_y_offset = h * 2 / 3;
ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
heap->binding_table[index] = offsetof(surface_heap_t, surface) +
index * surface_state_sz;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
obj_bo_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen8_surface_state_t, ss8),
obj_bo);
assert(index < GEN_MAX_SURFACES);
}
static void
intel_gpgpu_bind_image_gen9(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
uint32_t obj_bo_offset,
uint32_t format,
cl_mem_object_type type,
uint32_t bpp,
int32_t w,
int32_t h,
int32_t depth,
int32_t pitch,
int32_t slice_pitch,
int32_t tiling)
{
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen8_surface_state_t *ss = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
memset(ss, 0, sizeof(*ss));
ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
ss->ss0.surface_format = format;
if (intel_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_1D) {
ss->ss0.surface_array = 1;
ss->ss1.surface_qpitch = (slice_pitch/bpp + 3)/4; //align_h
}
if (intel_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_2D) {
ss->ss0.surface_array = 1;
ss->ss1.surface_qpitch = (slice_pitch/pitch + 3)/4;
}
if(ss->ss0.surface_type == I965_SURFACE_3D)
ss->ss1.surface_qpitch = (slice_pitch/pitch + 3)/4;
ss->ss0.horizontal_alignment = 1;
ss->ss0.vertical_alignment = 1;
if (tiling == GPGPU_TILE_X) {
ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR;
} else if (tiling == GPGPU_TILE_Y) {
ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR;
} else
assert(tiling == GPGPU_NO_TILE);// W mode is not supported now.
ss->ss2.width = w - 1;
ss->ss2.height = h - 1;
ss->ss3.depth = depth - 1;
if (obj_bo_offset && tiling != GPGPU_NO_TILE) {
uint32_t unaligned = obj_bo_offset;
obj_bo_offset = (obj_bo_offset / 0x1000) * 0x1000;
uint32_t h_ = (unaligned - obj_bo_offset )/ pitch;
ss->ss5.y_offset = h_ / 4;
}
ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff;
ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff;
ss->ss4.render_target_view_ext = depth - 1;
ss->ss4.min_array_elt = 0;
ss->ss3.surface_pitch = pitch - 1;
  //NV12 surface: the allocated height is 3/2 * h, so set the proper UV plane offset here.
if (format == I965_SURFACEFORMAT_PLANAR_420_8)
ss->ss6.uv_plane_y_offset = h * 2 / 3;
ss->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
heap->binding_table[index] = offsetof(surface_heap_t, surface) +
index * surface_state_sz;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
obj_bo_offset,
gpgpu->aux_offset.surface_heap_offset +
heap->binding_table[index] +
offsetof(gen8_surface_state_t, ss8),
obj_bo);
assert(index < GEN_MAX_SURFACES);
}
static void
intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
uint32_t internal_offset, size_t size, uint8_t bti)
{
assert(gpgpu->binded_n < max_buf_n);
if(offset != -1) {
gpgpu->binded_buf[gpgpu->binded_n] = buf;
gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
gpgpu->binded_offset[gpgpu->binded_n] = offset;
gpgpu->binded_n++;
}
intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW);
}
static int
intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
{
drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
drm_intel_bo* old = gpgpu->scratch_b.bo;
uint32_t total = per_thread_size * gpgpu->max_threads;
  /* Per bspec, scratch should be 2X the desired size when the EU index is not contiguous */
if (IS_HASWELL(gpgpu->drv->device_id) || IS_CHERRYVIEW(gpgpu->drv->device_id) ||
PCI_CHIP_BROXTON_1 == gpgpu->drv->device_id || PCI_CHIP_BROXTON_3 == gpgpu->drv->device_id)
total *= 2;
gpgpu->per_thread_scratch = per_thread_size;
if(old && old->size < total) {
drm_intel_bo_unreference(old);
old = NULL;
}
if(!old && total) {
gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
if (gpgpu->scratch_b.bo == NULL)
return -1;
}
return 0;
}
static void
intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint8_t bti)
{
drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)gpgpu->stack_b.bo, offset, 0, size, bti);
}
static void
intel_gpgpu_build_idrt_gen7(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
{
gen6_interface_descriptor_t *desc;
drm_intel_bo *ker_bo = NULL;
desc = (gen6_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
memset(desc, 0, sizeof(*desc));
ker_bo = (drm_intel_bo *) kernel->bo;
desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
desc->desc1.single_program_flow = 0;
desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
desc->desc5.rounding_mode = 0; /* round to nearest even */
assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
desc->desc2.sampler_state_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) >> 5;
desc->desc3.binding_table_entry_count = 0; /* no prefetch */
desc->desc3.binding_table_pointer = 0;
desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
desc->desc4.curbe_read_offset = 0;
/* Barriers / SLM are automatically handled on Gen7+ */
if (gpgpu->drv->gen_ver == 7 || gpgpu->drv->gen_ver == 75) {
size_t slm_sz = kernel->slm_sz;
desc->desc5.group_threads_num = kernel->use_slm ? kernel->thread_n : 0;
desc->desc5.barrier_enable = kernel->use_slm;
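    /* Gen7/Gen75 encode the SLM size in 4KB units: round up to the next
     * supported size (4/8/16/32/64KB), then shift right by 12. */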
if (slm_sz <= 4*KB)
slm_sz = 4*KB;
else if (slm_sz <= 8*KB)
slm_sz = 8*KB;
else if (slm_sz <= 16*KB)
slm_sz = 16*KB;
else if (slm_sz <= 32*KB)
slm_sz = 32*KB;
else
slm_sz = 64*KB;
slm_sz = slm_sz >> 12;
desc->desc5.slm_sz = slm_sz;
}
else
desc->desc5.group_threads_num = kernel->barrierID; /* BarrierID on GEN6 */
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
0,
gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc0),
ker_bo);
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_SAMPLER, 0,
gpgpu->aux_offset.sampler_state_offset,
gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc2),
gpgpu->aux_buf.bo);
}
static void
intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
{
gen8_interface_descriptor_t *desc;
desc = (gen8_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
memset(desc, 0, sizeof(*desc));
desc->desc0.kernel_start_pointer = 0; /* reloc */
desc->desc2.single_program_flow = 0;
desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */
desc->desc6.rounding_mode = 0; /* round to nearest even */
assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
desc->desc3.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5;
desc->desc4.binding_table_entry_count = 0; /* no prefetch */
desc->desc4.binding_table_pointer = 0;
desc->desc5.curbe_read_len = kernel->curbe_sz / 32;
desc->desc5.curbe_read_offset = 0;
/* Barriers / SLM are automatically handled on Gen7+ */
size_t slm_sz = kernel->slm_sz;
/* group_threads_num should not be set to 0 even if the barrier is disabled per bspec */
desc->desc6.group_threads_num = kernel->thread_n;
desc->desc6.barrier_enable = kernel->use_slm;
if (slm_sz == 0)
slm_sz = 0;
else if (slm_sz <= 4*KB)
slm_sz = 4*KB;
else if (slm_sz <= 8*KB)
slm_sz = 8*KB;
else if (slm_sz <= 16*KB)
slm_sz = 16*KB;
else if (slm_sz <= 32*KB)
slm_sz = 32*KB;
else
slm_sz = 64*KB;
slm_sz = slm_sz >> 12;
desc->desc6.slm_sz = slm_sz;
}
static void
intel_gpgpu_build_idrt_gen9(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
{
gen8_interface_descriptor_t *desc;
desc = (gen8_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
memset(desc, 0, sizeof(*desc));
desc->desc0.kernel_start_pointer = 0; /* reloc */
desc->desc2.single_program_flow = 0;
desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */
desc->desc6.rounding_mode = 0; /* round to nearest even */
assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
desc->desc3.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5;
desc->desc4.binding_table_entry_count = 0; /* no prefetch */
desc->desc4.binding_table_pointer = 0;
desc->desc5.curbe_read_len = kernel->curbe_sz / 32;
desc->desc5.curbe_read_offset = 0;
/* Barriers / SLM are automatically handled on Gen7+ */
size_t slm_sz = kernel->slm_sz;
/* Per the bspec, group_threads_num must not be set to 0 even when the barrier is disabled */
desc->desc6.group_threads_num = kernel->thread_n;
desc->desc6.barrier_enable = kernel->use_slm;
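/* Unlike the gen7/gen8 descriptors, gen9 uses an enumerated SLM size:
 * 1 -> 1 KB, 2 -> 2 KB, 3 -> 4 KB, 4 -> 8 KB, 5 -> 16 KB, 6 -> 32 KB,
 * 7 -> 64 KB, and 0 means no SLM. E.g. a 6 KB request is encoded as 4
 * (an 8 KB allocation). */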
if (slm_sz == 0)
slm_sz = 0;
else if (slm_sz <= 1*KB)
slm_sz = 1;
else if (slm_sz <= 2*KB)
slm_sz = 2;
else if (slm_sz <= 4*KB)
slm_sz = 3;
else if (slm_sz <= 8*KB)
slm_sz = 4;
else if (slm_sz <= 16*KB)
slm_sz = 5;
else if (slm_sz <= 32*KB)
slm_sz = 6;
else
slm_sz = 7;
desc->desc6.slm_sz = slm_sz;
}
static int
intel_gpgpu_upload_curbes_gen7(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
{
unsigned char *curbe = NULL;
cl_gpgpu_kernel *k = gpgpu->ker;
uint32_t i, j;
/* Upload the data first */
if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) {
fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
return -1;
}
assert(gpgpu->aux_buf.bo->virtual);
curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset);
memcpy(curbe, data, size);
/* Now put all the relocations for our flat address space */
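/* The curbe buffer holds one curbe_sz-byte payload per hardware thread.
 * binded_offset[j] is the byte offset of buffer j's pointer inside one
 * payload, so thread i's copy of that pointer lives at
 * curbe_offset + binded_offset[j] + i * curbe_sz. We patch in the current
 * GPU address and emit a relocation so the kernel can fix it up if the
 * buffer object moves. */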
for (i = 0; i < k->thread_n; ++i)
for (j = 0; j < gpgpu->binded_n; ++j) {
*(uint32_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j];
drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
gpgpu->binded_buf[j],
gpgpu->target_buf_offset[j],
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER);
}
dri_bo_unmap(gpgpu->aux_buf.bo);
return 0;
}
static int
intel_gpgpu_upload_curbes_gen8(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
{
unsigned char *curbe = NULL;
cl_gpgpu_kernel *k = gpgpu->ker;
uint32_t i, j;
/* Upload the data first */
if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) {
fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
return -1;
}
assert(gpgpu->aux_buf.bo->virtual);
curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset);
memcpy(curbe, data, size);
/* Now put all the relocations for our flat address space */
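/* Same layout as the gen7 path, but gen8 kernels consume full 64-bit
 * addresses, so the pointer is written through a size_t store (this assumes
 * a 64-bit user space where size_t is 8 bytes). */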
for (i = 0; i < k->thread_n; ++i)
for (j = 0; j < gpgpu->binded_n; ++j) {
*(size_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j];
drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
gpgpu->binded_buf[j],
gpgpu->target_buf_offset[j],
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER);
}
dri_bo_unmap(gpgpu->aux_buf.bo);
return 0;
}
static void
intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
{
if (n) {
const size_t sz = n * sizeof(gen6_sampler_state_t);
memcpy(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset, data, sz);
}
}
int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
{
switch( cl_address_mode ) {
case CLK_ADDRESS_NONE:
case CLK_ADDRESS_REPEAT:
return GEN_TEXCOORDMODE_WRAP;
case CLK_ADDRESS_CLAMP:
return GEN_TEXCOORDMODE_CLAMP_BORDER;
case CLK_ADDRESS_CLAMP_TO_EDGE:
return GEN_TEXCOORDMODE_CLAMP;
case CLK_ADDRESS_MIRRORED_REPEAT:
return GEN_TEXCOORDMODE_MIRROR;
default:
return GEN_TEXCOORDMODE_WRAP;
}
}
static void intel_gpgpu_insert_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel, uint32_t index)
{
gen7_vme_state_t* vme = (gen7_vme_state_t*)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index;
memset(vme, 0, sizeof(*vme));
gen7_vme_search_path_state_t* sp = vme->sp;
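/* Each dw0.SPD_n_X/Y pair appears to encode one signed step of the
 * motion-estimation search path; the branches below program progressively
 * larger search patterns matching the requested
 * CL_ME_SEARCH_PATH_RADIUS_* type. */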
if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL){
sp[0].dw0.SPD_0_X = 0;
sp[0].dw0.SPD_0_Y = 0;
sp[0].dw0.SPD_1_X = 0;
sp[0].dw0.SPD_1_Y = 0;
sp[0].dw0.SPD_2_X = 0;
sp[0].dw0.SPD_2_Y = 0;
sp[0].dw0.SPD_3_X = 0;
sp[0].dw0.SPD_3_Y = 0;
}
else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL){
sp[0].dw0.SPD_0_X = 1;
sp[0].dw0.SPD_0_Y = 0;
sp[0].dw0.SPD_1_X = 0;
sp[0].dw0.SPD_1_Y = 1;
sp[0].dw0.SPD_2_X = -1;
sp[0].dw0.SPD_2_Y = 0;
sp[0].dw0.SPD_3_X = 0;
sp[0].dw0.SPD_3_Y = 0;
}
else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL){
sp[0].dw0.SPD_0_X = 1;
sp[0].dw0.SPD_0_Y = 0;
sp[0].dw0.SPD_1_X = 1;
sp[0].dw0.SPD_1_Y = 0;
sp[0].dw0.SPD_2_X = 1;
sp[0].dw0.SPD_2_Y = 0;
sp[0].dw0.SPD_3_X = 1;
sp[0].dw0.SPD_3_Y = 0;
sp[1].dw0.SPD_0_X = 1;
sp[1].dw0.SPD_0_Y = 0;
sp[1].dw0.SPD_1_X = 1;
sp[1].dw0.SPD_1_Y = 0;
sp[1].dw0.SPD_2_X = 1;
sp[1].dw0.SPD_2_Y = 0;
sp[1].dw0.SPD_3_X = 0;
sp[1].dw0.SPD_3_Y = 1;
sp[2].dw0.SPD_0_X = -1;
sp[2].dw0.SPD_0_Y = 0;
sp[2].dw0.SPD_1_X = -1;
sp[2].dw0.SPD_1_Y = 0;
sp[2].dw0.SPD_2_X = -1;
sp[2].dw0.SPD_2_Y = 0;
sp[2].dw0.SPD_3_X = -1;
sp[2].dw0.SPD_3_Y = 0;
sp[3].dw0.SPD_0_X = -1;
sp[3].dw0.SPD_0_Y = 0;
sp[3].dw0.SPD_1_X = -1;
sp[3].dw0.SPD_1_Y = 0;
sp[3].dw0.SPD_2_X = -1;
sp[3].dw0.SPD_2_Y = 0;
sp[3].dw0.SPD_3_X = 0;
sp[3].dw0.SPD_3_Y = 1;
sp[4].dw0.SPD_0_X = 1;
sp[4].dw0.SPD_0_Y = 0;
sp[4].dw0.SPD_1_X = 1;
sp[4].dw0.SPD_1_Y = 0;
sp[4].dw0.SPD_2_X = 1;
sp[4].dw0.SPD_2_Y = 0;
sp[4].dw0.SPD_3_X = 1;
sp[4].dw0.SPD_3_Y = 0;
sp[5].dw0.SPD_0_X = 1;
sp[5].dw0.SPD_0_Y = 0;
sp[5].dw0.SPD_1_X = 1;
sp[5].dw0.SPD_1_Y = 0;
sp[5].dw0.SPD_2_X = 1;
sp[5].dw0.SPD_2_Y = 0;
sp[5].dw0.SPD_3_X = 0;
sp[5].dw0.SPD_3_Y = 1;
sp[6].dw0.SPD_0_X = -1;
sp[6].dw0.SPD_0_Y = 0;
sp[6].dw0.SPD_1_X = -1;
sp[6].dw0.SPD_1_Y = 0;
sp[6].dw0.SPD_2_X = -1;
sp[6].dw0.SPD_2_Y = 0;
sp[6].dw0.SPD_3_X = -1;
sp[6].dw0.SPD_3_Y = 0;
sp[7].dw0.SPD_0_X = -1;
sp[7].dw0.SPD_0_Y = 0;
sp[7].dw0.SPD_1_X = -1;
sp[7].dw0.SPD_1_Y = 0;
sp[7].dw0.SPD_2_X = -1;
sp[7].dw0.SPD_2_Y = 0;
sp[7].dw0.SPD_3_X = 0;
sp[7].dw0.SPD_3_Y = 1;
sp[8].dw0.SPD_0_X = 1;
sp[8].dw0.SPD_0_Y = 0;
sp[8].dw0.SPD_1_X = 1;
sp[8].dw0.SPD_1_Y = 0;
sp[8].dw0.SPD_2_X = 1;
sp[8].dw0.SPD_2_Y = 0;
sp[8].dw0.SPD_3_X = 1;
sp[8].dw0.SPD_3_Y = 0;
sp[9].dw0.SPD_0_X = 1;
sp[9].dw0.SPD_0_Y = 0;
sp[9].dw0.SPD_1_X = 1;
sp[9].dw0.SPD_1_Y = 0;
sp[9].dw0.SPD_2_X = 1;
sp[9].dw0.SPD_2_Y = 0;
sp[9].dw0.SPD_3_X = 0;
sp[9].dw0.SPD_3_Y = 1;
sp[10].dw0.SPD_0_X = -1;
sp[10].dw0.SPD_0_Y = 0;
sp[10].dw0.SPD_1_X = -1;
sp[10].dw0.SPD_1_Y = 0;
sp[10].dw0.SPD_2_X = -1;
sp[10].dw0.SPD_2_Y = 0;
sp[10].dw0.SPD_3_X = -1;
sp[10].dw0.SPD_3_Y = 0;
sp[11].dw0.SPD_0_X = -1;
sp[11].dw0.SPD_0_Y = 0;
sp[11].dw0.SPD_1_X = -1;
sp[11].dw0.SPD_1_Y = 0;
sp[11].dw0.SPD_2_X = -1;
sp[11].dw0.SPD_2_Y = 0;
sp[11].dw0.SPD_3_X = 0;
sp[11].dw0.SPD_3_Y = 0;
}
}
static void
intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel)
{
intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, 0);
}
static void
intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
{
int using_nearest = 0;
uint32_t wrap_mode;
gen7_sampler_state_t *sampler;
sampler = (gen7_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index;
memset(sampler, 0, sizeof(*sampler));
assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0);
sampler->ss2.default_color_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) >> 5;
if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
sampler->ss3.non_normalized_coord = 1;
else
sampler->ss3.non_normalized_coord = 0;
switch (clk_sampler & __CLK_FILTER_MASK) {
case CLK_FILTER_NEAREST:
sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
using_nearest = 1;
break;
case CLK_FILTER_LINEAR:
sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
break;
}
wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
sampler->ss3.s_wrap_mode = wrap_mode;
/* XXX the mesa i965 driver code points out that if the surface is a 1D surface,
 * we may need to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
sampler->ss3.t_wrap_mode = wrap_mode;
sampler->ss3.r_wrap_mode = wrap_mode;
sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
sampler->ss0.base_level = 0;
sampler->ss1.max_lod = 0;
sampler->ss1.min_lod = 0;
if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST)
sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
dri_bo_emit_reloc(gpgpu->aux_buf.bo,
I915_GEM_DOMAIN_SAMPLER, 0,
gpgpu->aux_offset.sampler_border_color_state_offset,
gpgpu->aux_offset.sampler_state_offset +
index * sizeof(gen7_sampler_state_t) +
offsetof(gen7_sampler_state_t, ss2),
gpgpu->aux_buf.bo);
}
static void
intel_gpgpu_insert_sampler_gen8(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
{
int using_nearest = 0;
uint32_t wrap_mode;
gen8_sampler_state_t *sampler;
sampler = (gen8_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index;
memset(sampler, 0, sizeof(*sampler));
assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0);
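/* Note: unlike intel_gpgpu_insert_sampler_gen7(), ss2.default_color_pointer
 * is not programmed and no border color relocation is emitted below. */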
if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
sampler->ss3.non_normalized_coord = 1;
else
sampler->ss3.non_normalized_coord = 0;
switch (clk_sampler & __CLK_FILTER_MASK) {
case CLK_FILTER_NEAREST:
sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
using_nearest = 1;
break;
case CLK_FILTER_LINEAR:
sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
break;
}
wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
sampler->ss3.s_wrap_mode = wrap_mode;
/* XXX the mesa i965 driver code points out that if the surface is a 1D surface,
 * we may need to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
sampler->ss3.t_wrap_mode = wrap_mode;
sampler->ss3.r_wrap_mode = wrap_mode;
sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
sampler->ss0.base_level = 0;
sampler->ss1.max_lod = 0;
sampler->ss1.min_lod = 0;
if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST)
sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
}
static void
intel_gpgpu_bind_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
{
int index;
assert(sampler_sz <= GEN_MAX_SAMPLERS);
for(index = 0; index < sampler_sz; index++)
intel_gpgpu_insert_sampler_gen7(gpgpu, index, samplers[index]);
}
static void
intel_gpgpu_bind_sampler_gen8(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
{
int index;
assert(sampler_sz <= GEN_MAX_SAMPLERS);
for(index = 0; index < sampler_sz; index++)
intel_gpgpu_insert_sampler_gen8(gpgpu, index, samplers[index]);
}
static void
intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
{
gpgpu->ker = kernel;
if (gpgpu->drv->null_bo)
intel_gpgpu_setup_bti(gpgpu, gpgpu->drv->null_bo, 0, 64*1024, 0xfe, I965_SURFACEFORMAT_RAW);
intel_gpgpu_build_idrt(gpgpu, kernel);
dri_bo_unmap(gpgpu->aux_buf.bo);
}
static void
intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf)
{
if (gpgpu->perf_b.bo)
drm_intel_bo_unreference(gpgpu->perf_b.bo);
drm_intel_bo_reference((drm_intel_bo*) perf);
gpgpu->perf_b.bo = (drm_intel_bo*) perf;
}
static void
intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
const uint32_t global_wk_dim[3] = {
global_wk_sz[0] / local_wk_sz[0],
global_wk_sz[1] / local_wk_sz[1],
global_wk_sz[2] / local_wk_sz[2]
};
uint32_t right_mask = ~0x0;
size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
assert(simd_sz == 8 || simd_sz == 16);
uint32_t shift = (group_sz & (simd_sz - 1));
shift = (shift == 0) ? simd_sz : shift;
right_mask = (1 << shift) - 1;
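/* right_mask enables the active SIMD lanes of the last thread in each group.
 * E.g. with group_sz = 100 and simd_sz = 16: 100 & 15 = 4, so
 * right_mask = 0xF and the last thread executes only 4 work items; when
 * group_sz is a multiple of simd_sz, all lanes stay enabled. */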
BEGIN_BATCH(gpgpu->batch, 11);
OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
assert(thread_n <= 64);
if (simd_sz == 16)
OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
else
OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
OUT_BATCH(gpgpu->batch, right_mask);
OUT_BATCH(gpgpu->batch, ~0x0); /* we always set the height to 1, so set the bottom mask to all 1s */
ADVANCE_BATCH(gpgpu->batch);
BEGIN_BATCH(gpgpu->batch, 2);
OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
ADVANCE_BATCH(gpgpu->batch);
if (IS_IVYBRIDGE(gpgpu->drv->device_id))
intel_gpgpu_pipe_control(gpgpu);
}
static void
intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
const uint32_t global_wk_dim[3] = {
global_wk_sz[0] / local_wk_sz[0],
global_wk_sz[1] / local_wk_sz[1],
global_wk_sz[2] / local_wk_sz[2]
};
uint32_t right_mask = ~0x0;
size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
assert(simd_sz == 8 || simd_sz == 16);
uint32_t shift = (group_sz & (simd_sz - 1));
shift = (shift == 0) ? simd_sz : shift;
right_mask = (1 << shift) - 1;
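/* 15 dwords total; the command's DWord Length field holds the total minus 2,
 * hence CMD_GPGPU_WALKER | 13 (the gen7 walker above emits 11 dwords with
 * | 9). Group IDs are walked from global_dim_off up to
 * global_dim_off + global_wk_dim in each dimension. */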
BEGIN_BATCH(gpgpu->batch, 15);
OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 13);
OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Length */
OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Start Address */
assert(thread_n <= 64);
if (simd_sz == 16)
OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
else
OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
OUT_BATCH(gpgpu->batch, global_dim_off[0]);
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[0]+global_dim_off[0]);
OUT_BATCH(gpgpu->batch, global_dim_off[1]);
OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[1]+global_dim_off[1]);
OUT_BATCH(gpgpu->batch, global_dim_off[2]);
OUT_BATCH(gpgpu->batch, global_wk_dim[2]+global_dim_off[2]);
OUT_BATCH(gpgpu->batch, right_mask);
OUT_BATCH(gpgpu->batch, ~0x0); /* we always set the height to 1, so set the bottom mask to all 1s */
ADVANCE_BATCH(gpgpu->batch);
BEGIN_BATCH(gpgpu->batch, 2);
OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
ADVANCE_BATCH(gpgpu->batch);
intel_gpgpu_pipe_control(gpgpu);
}
static intel_event_t*
intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
{
intel_event_t *event = NULL;
TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t));
event->buffer = gpgpu->batch->buffer;
if (event->buffer)
drm_intel_bo_reference(event->buffer);
event->status = command_queued;
if(gpgpu->time_stamp_b.bo) {
event->ts_buf = gpgpu->time_stamp_b.bo;
drm_intel_bo_reference(event->ts_buf);
}
exit:
return event;
error:
cl_free(event);
event = NULL;
goto exit;
}
/*
The upper layer has already flushed the batch buffer; just update the
internal status to command_running.
*/
static void
intel_gpgpu_event_flush(intel_event_t *event)
{
assert(event->status == command_queued);
event->status = command_running;
}
static int
intel_gpgpu_event_update_status(intel_event_t *event, int wait)
{
if(event->status == command_complete)
return event->status;
if (event->buffer &&
event->status == command_running &&
!drm_intel_bo_busy(event->buffer)) {
event->status = command_complete;
drm_intel_bo_unreference(event->buffer);
event->buffer = NULL;
return event->status;
}
if(wait == 0)
return event->status;
if (event->buffer) {
drm_intel_bo_wait_rendering(event->buffer);
event->status = command_complete;
drm_intel_bo_unreference(event->buffer);
event->buffer = NULL;
}
return event->status;
}
static void
intel_gpgpu_event_delete(intel_event_t *event)
{
if(event->buffer)
drm_intel_bo_unreference(event->buffer);
if(event->ts_buf)
drm_intel_bo_unreference(event->ts_buf);
cl_free(event);
}
/* On x86_64 systems the IVB and HSW result MUST be shifted */
static uint64_t
intel_gpgpu_read_ts_reg_gen7(drm_intel_bufmgr *bufmgr)
{
uint64_t result = 0;
drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
/* On x86_64 systems the low 32 bits of the timestamp counter end up in the
high 32 bits of the value returned by drm_intel_reg_read, and bits 32-35 are
lost; on i386 the value matches the bspec. This looks like a kernel readq
bug, so shift by 32 on x86_64 and keep only the low 32 bits on i386.
*/
struct utsname buf;
uname(&buf);
/* On some systems the user space is 32-bit while the kernel is 64-bit, so we
 * cannot rely on compiler flags to determine the kernel's architecture; use
 * uname() instead. */
/* x86_64 on Linux, amd64 on BSD */
if(strcmp(buf.machine, "x86_64") == 0 || strcmp(buf.machine, "amd64") == 0)
return result >> 32;
else
return result & 0x0ffffffff;
}
/* Baytrail's result should have its high 4 bits cleared */
static uint64_t
intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
{
uint64_t result = 0;
drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
return result & 0x0ffffffff;
}
/* Get the current GPU timestamp. */
static void
intel_gpgpu_event_get_gpu_cur_timestamp(intel_driver_t* gen_driver, uint64_t* ret_ts)
{
uint64_t result = 0;
drm_intel_bufmgr *bufmgr = gen_driver->bufmgr;
/* Get the timestamp value that matches the bspec */
result = intel_gpgpu_read_ts_reg(bufmgr);
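/* Convert timestamp ticks to nanoseconds, assuming the 80 ns (12.5 MHz)
 * timestamp clock used on these generations. */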
result *= 80;
*ret_ts = result;
return;
}
/* Get the GPU execution timestamp. */
static void
intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, int index, uint64_t* ret_ts)
{
uint64_t result = 0;
assert(gpgpu->time_stamp_b.bo);
assert(index == 0 || index == 1);
drm_intel_gem_bo_map_gtt(gpgpu->time_stamp_b.bo);
uint64_t* ptr = gpgpu->time_stamp_b.bo->virtual;
result = ptr[index];
/* According to the bspec the timestamp counter should be 36 bits wide, but
comparing it with the counter obtained through the register-read ioctl,
the top 4 bits appear to be bogus. To keep the two counters consistent,
we simply drop those 4 bits.
*/
result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds
*ret_ts = result;
drm_intel_gem_bo_unmap_gtt(gpgpu->time_stamp_b.bo);
}
static int
intel_gpgpu_set_profiling_buf(intel_gpgpu_t *gpgpu, uint32_t size, uint32_t offset, uint8_t bti)
{
drm_intel_bo *bo = NULL;
gpgpu->profiling_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "Profiling buffer", size, 64);
bo = gpgpu->profiling_b.bo;
if (!bo || (drm_intel_bo_map(bo, 1) != 0)) {
fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
return -1;
}
memset(bo->virtual, 0, size);
drm_intel_bo_unmap(bo);
cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)bo, offset, 0, size, bti);
return 0;
}
static void
intel_gpgpu_set_profiling_info(intel_gpgpu_t *gpgpu, void* profiling_info)
{
gpgpu->profiling_info = profiling_info;
}
static void*
intel_gpgpu_get_profiling_info(intel_gpgpu_t *gpgpu)
{
return gpgpu->profiling_info;
}
static int
intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
{
if (gpgpu->printf_b.bo)
dri_bo_unreference(gpgpu->printf_b.bo);
gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf buffer", size, 4096);
if (!gpgpu->printf_b.bo || (drm_intel_bo_map(gpgpu->printf_b.bo, 1) != 0)) {
fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
return -1;
}
memset(gpgpu->printf_b.bo->virtual, 0, size);
*(uint32_t *)(gpgpu->printf_b.bo->virtual) = 4; // the first 4 bytes are reserved for the length.
drm_intel_bo_unmap(gpgpu->printf_b.bo);
/* No need to bind via cl_gpgpu_bind_buf since we do not need to emit a relocation. */
intel_gpgpu_setup_bti(gpgpu, gpgpu->printf_b.bo, 0, size, bti, I965_SURFACEFORMAT_RAW);
return 0;
}
static void*
intel_gpgpu_map_profiling_buf(intel_gpgpu_t *gpgpu)
{
drm_intel_bo *bo = NULL;
bo = gpgpu->profiling_b.bo;
drm_intel_bo_map(bo, 1);
return bo->virtual;
}
static void
intel_gpgpu_unmap_profiling_buf_addr(intel_gpgpu_t *gpgpu)
{
drm_intel_bo *bo = NULL;
bo = gpgpu->profiling_b.bo;
drm_intel_bo_unmap(bo);
}
static void*
intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu)
{
drm_intel_bo *bo = NULL;
bo = gpgpu->printf_b.bo;
drm_intel_bo_map(bo, 1);
return bo->virtual;
}
static void
intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu)
{
drm_intel_bo *bo = NULL;
bo = gpgpu->printf_b.bo;
drm_intel_bo_unmap(bo);
}
static void
intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu)
{
drm_intel_bo_unreference(gpgpu->printf_b.bo);
gpgpu->printf_b.bo = NULL;
}
static void
intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info)
{
gpgpu->printf_info = printf_info;
}
static void*
intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu)
{
return gpgpu->printf_info;
}
static void
intel_gpgpu_set_kernel(intel_gpgpu_t *gpgpu, void * kernel)
{
gpgpu->kernel = kernel;
}
static void*
intel_gpgpu_get_kernel(intel_gpgpu_t *gpgpu)
{
return gpgpu->kernel;
}
LOCAL void
intel_set_gpgpu_callbacks(int device_id)
{
cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new;
cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
cl_gpgpu_sync = (cl_gpgpu_sync_cb *) intel_gpgpu_sync;
cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start;
cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7;
cl_gpgpu_bind_vme_state = (cl_gpgpu_bind_vme_state_cb *) intel_gpgpu_bind_vme_state_gen7;
cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status;
cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
cl_gpgpu_set_profiling_buffer = (cl_gpgpu_set_profiling_buffer_cb *)intel_gpgpu_set_profiling_buf;
cl_gpgpu_set_profiling_info = (cl_gpgpu_set_profiling_info_cb *)intel_gpgpu_set_profiling_info;
cl_gpgpu_get_profiling_info = (cl_gpgpu_get_profiling_info_cb *)intel_gpgpu_get_profiling_info;
cl_gpgpu_map_profiling_buffer = (cl_gpgpu_map_profiling_buffer_cb *)intel_gpgpu_map_profiling_buf;
cl_gpgpu_unmap_profiling_buffer = (cl_gpgpu_unmap_profiling_buffer_cb *)intel_gpgpu_unmap_profiling_buf_addr;
cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
cl_gpgpu_set_kernel = (cl_gpgpu_set_kernel_cb *)intel_gpgpu_set_kernel;
cl_gpgpu_get_kernel = (cl_gpgpu_get_kernel_cb *)intel_gpgpu_get_kernel;
if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8;
intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //BDW need not restore SLM, same as gen7
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
if(IS_CHERRYVIEW(device_id))
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8;
intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8;
intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8;
intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen8;
intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8;
intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8;
return;
}
if (IS_GEN9(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9;
cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen9;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9;
intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //SKL need not restore SLM, same as gen7
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
if(IS_GEMINILAKE(device_id))
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen9;
intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen9;
intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8;
intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen9;
intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8;
intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen9;
cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8;
return;
}
cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen7;
intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7;
intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen7;
cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen7;
intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen7;
intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen7;
intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen7;
intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
if (IS_HASWELL(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
intel_gpgpu_post_action = intel_gpgpu_post_action_gen75;
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb
intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen75;
intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen75;
}
else if (IS_IVYBRIDGE(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen7;
if (IS_BAYTRAIL_T(device_id)) {
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
} else {
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
}
cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
intel_gpgpu_post_action = intel_gpgpu_post_action_gen7;
intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen7;
intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen7;
}
}