summaryrefslogtreecommitdiff
path: root/src/amd/compiler/aco_assembler.cpp
diff options
context:
space:
mode:
authorDaniel Schürmann <daniel@schuermann.dev>2019-09-17 13:22:17 +0200
committerDaniel Schürmann <daniel@schuermann.dev>2019-09-19 12:10:00 +0200
commit93c8ebfa780ebd1495095e794731881aef29e7d3 (patch)
tree547268dbeabb0d17f14202d4429b3f6abfdb01c5 /src/amd/compiler/aco_assembler.cpp
parent99cbec0a5f463fef4d9c61f34482d9eb00293704 (diff)
downloadmesa-93c8ebfa780ebd1495095e794731881aef29e7d3.tar.gz
aco: Initial commit of independent AMD compiler
ACO (short for AMD Compiler) is a new compiler backend with the goal to replace LLVM for Radeon hardware for the RADV driver. ACO currently supports only VS, PS and CS on VI and Vega. There are some optimizations missing because of unmerged NIR changes which may decrease performance. Full commit history can be found at https://github.com/daniel-schuermann/mesa/commits/backend Co-authored-by: Daniel Schürmann <daniel@schuermann.dev> Co-authored-by: Rhys Perry <pendingchaos02@gmail.com> Co-authored-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Co-authored-by: Connor Abbott <cwabbott0@gmail.com> Co-authored-by: Michael Schellenberger Costa <mschellenbergercosta@googlemail.com> Co-authored-by: Timur Kristóf <timur.kristof@gmail.com> Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Diffstat (limited to 'src/amd/compiler/aco_assembler.cpp')
-rw-r--r--src/amd/compiler/aco_assembler.cpp497
1 files changed, 497 insertions, 0 deletions
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
new file mode 100644
index 00000000000..a6bf2a3e0db
--- /dev/null
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -0,0 +1,497 @@
+#include <map>
+
+#include "aco_ir.h"
+#include "common/sid.h"
+
+namespace aco {
+
+struct asm_context {
+ Program *program;
+ enum chip_class chip_class;
+ std::map<int, SOPP_instruction*> branches;
+ std::vector<unsigned> constaddrs;
+ const int16_t* opcode;
+ // TODO: keep track of branch instructions referring blocks
+ // and, when emitting the block, correct the offset in instr
+ asm_context(Program* program) : program(program), chip_class(program->chip_class) {
+ if (chip_class <= GFX9)
+ opcode = &instr_info.opcode_gfx9[0];
+ }
+};
+
+void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
+{
+ uint32_t instr_offset = out.size() * 4u;
+
+ /* lower remaining pseudo-instructions */
+ if (instr->opcode == aco_opcode::p_constaddr) {
+ unsigned dest = instr->definitions[0].physReg();
+ unsigned offset = instr->operands[0].constantValue();
+
+ /* s_getpc_b64 dest[0:1] */
+ uint32_t encoding = (0b101111101 << 23);
+ uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64];
+ if (opcode >= 55 && ctx.chip_class <= GFX9) {
+ assert(ctx.chip_class == GFX9 && opcode < 60);
+ opcode = opcode - 4;
+ }
+ encoding |= dest << 16;
+ encoding |= opcode << 8;
+ out.push_back(encoding);
+
+ /* s_add_u32 dest[0], dest[0], ... */
+ encoding = (0b10 << 30);
+ encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23;
+ encoding |= dest << 16;
+ encoding |= dest;
+ encoding |= 255 << 8;
+ out.push_back(encoding);
+ ctx.constaddrs.push_back(out.size());
+ out.push_back(-(instr_offset + 4) + offset);
+
+ /* s_addc_u32 dest[1], dest[1], 0 */
+ encoding = (0b10 << 30);
+ encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23;
+ encoding |= (dest + 1) << 16;
+ encoding |= dest + 1;
+ encoding |= 128 << 8;
+ out.push_back(encoding);
+ return;
+ }
+
+ uint32_t opcode = ctx.opcode[(int)instr->opcode];
+ if (opcode == (uint32_t)-1) {
+ fprintf(stderr, "Unsupported opcode: ");
+ aco_print_instr(instr, stderr);
+ abort();
+ }
+
+ switch (instr->format) {
+ case Format::SOP2: {
+ uint32_t encoding = (0b10 << 30);
+ encoding |= opcode << 23;
+ encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
+ encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0;
+ encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::SOPK: {
+ uint32_t encoding = (0b1011 << 28);
+ encoding |= opcode << 23;
+ encoding |=
+ !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ?
+ instr->definitions[0].physReg() << 16 :
+ !instr->operands.empty() && !(instr->operands[0].physReg() == scc) ?
+ instr->operands[0].physReg() << 16 : 0;
+ encoding |= static_cast<SOPK_instruction*>(instr)->imm;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::SOP1: {
+ uint32_t encoding = (0b101111101 << 23);
+ if (opcode >= 55 && ctx.chip_class <= GFX9) {
+ assert(ctx.chip_class == GFX9 && opcode < 60);
+ opcode = opcode - 4;
+ }
+ encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
+ encoding |= opcode << 8;
+ encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::SOPC: {
+ uint32_t encoding = (0b101111110 << 23);
+ encoding |= opcode << 16;
+ encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0;
+ encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::SOPP: {
+ SOPP_instruction* sopp = static_cast<SOPP_instruction*>(instr);
+ uint32_t encoding = (0b101111111 << 23);
+ encoding |= opcode << 16;
+ encoding |= (uint16_t) sopp->imm;
+ if (sopp->block != -1)
+ ctx.branches.insert({out.size(), sopp});
+ out.push_back(encoding);
+ break;
+ }
+ case Format::SMEM: {
+ SMEM_instruction* smem = static_cast<SMEM_instruction*>(instr);
+ uint32_t encoding = (0b110000 << 26);
+ encoding |= opcode << 18;
+ if (instr->operands.size() >= 2)
+ encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0;
+ bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
+ assert(!soe || ctx.chip_class >= GFX9);
+ encoding |= soe ? 1 << 14 : 0;
+ encoding |= smem->glc ? 1 << 16 : 0;
+ if (!instr->definitions.empty() || instr->operands.size() >= 3)
+ encoding |= (!instr->definitions.empty() ? instr->definitions[0].physReg() : instr->operands[2].physReg().reg) << 6;
+ if (instr->operands.size() >= 1)
+ encoding |= instr->operands[0].physReg() >> 1;
+ out.push_back(encoding);
+ encoding = 0;
+ if (instr->operands.size() >= 2)
+ encoding |= instr->operands[1].isConstant() ? instr->operands[1].constantValue() : instr->operands[1].physReg().reg;
+ encoding |= soe ? instr->operands.back().physReg() << 25 : 0;
+ out.push_back(encoding);
+ return;
+ }
+ case Format::VOP2: {
+ uint32_t encoding = 0;
+ encoding |= opcode << 25;
+ encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17;
+ encoding |= (0xFF & instr->operands[1].physReg().reg) << 9;
+ encoding |= instr->operands[0].physReg().reg;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::VOP1: {
+ uint32_t encoding = (0b0111111 << 25);
+ encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17;
+ encoding |= opcode << 9;
+ encoding |= instr->operands[0].physReg().reg;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::VOPC: {
+ uint32_t encoding = (0b0111110 << 25);
+ encoding |= opcode << 17;
+ encoding |= (0xFF & instr->operands[1].physReg().reg) << 9;
+ encoding |= instr->operands[0].physReg().reg;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::VINTRP: {
+ Interp_instruction* interp = static_cast<Interp_instruction*>(instr);
+ uint32_t encoding = (0b110101 << 26);
+ encoding |= (0xFF & instr->definitions[0].physReg().reg) << 18;
+ encoding |= opcode << 16;
+ encoding |= interp->attribute << 10;
+ encoding |= interp->component << 8;
+ if (instr->opcode == aco_opcode::v_interp_mov_f32)
+ encoding |= (0x3 & instr->operands[0].constantValue());
+ else
+ encoding |= (0xFF & instr->operands[0].physReg().reg);
+ out.push_back(encoding);
+ break;
+ }
+ case Format::DS: {
+ DS_instruction* ds = static_cast<DS_instruction*>(instr);
+ uint32_t encoding = (0b110110 << 26);
+ encoding |= opcode << 17;
+ encoding |= (ds->gds ? 1 : 0) << 16;
+ encoding |= ((0xFF & ds->offset1) << 8);
+ encoding |= (0xFFFF & ds->offset0);
+ out.push_back(encoding);
+ encoding = 0;
+ unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
+ encoding |= (0xFF & reg) << 24;
+ reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? instr->operands[2].physReg() : 0;
+ encoding |= (0xFF & reg) << 16;
+ reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0;
+ encoding |= (0xFF & reg) << 8;
+ encoding |= (0xFF & instr->operands[0].physReg().reg);
+ out.push_back(encoding);
+ break;
+ }
+ case Format::MUBUF: {
+ MUBUF_instruction* mubuf = static_cast<MUBUF_instruction*>(instr);
+ uint32_t encoding = (0b111000 << 26);
+ encoding |= opcode << 18;
+ encoding |= (mubuf->slc ? 1 : 0) << 17;
+ encoding |= (mubuf->lds ? 1 : 0) << 16;
+ encoding |= (mubuf->glc ? 1 : 0) << 14;
+ encoding |= (mubuf->idxen ? 1 : 0) << 13;
+ encoding |= (mubuf->offen ? 1 : 0) << 12;
+ encoding |= 0x0FFF & mubuf->offset;
+ out.push_back(encoding);
+ encoding = 0;
+ encoding |= instr->operands[2].physReg() << 24;
+ encoding |= (mubuf->tfe ? 1 : 0) << 23;
+ encoding |= (instr->operands[1].physReg() >> 2) << 16;
+ unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg().reg;
+ encoding |= (0xFF & reg) << 8;
+ encoding |= (0xFF & instr->operands[0].physReg().reg);
+ out.push_back(encoding);
+ break;
+ }
+ case Format::MTBUF: {
+ MTBUF_instruction* mtbuf = static_cast<MTBUF_instruction*>(instr);
+ uint32_t encoding = (0b111010 << 26);
+ encoding |= opcode << 15;
+ encoding |= (mtbuf->glc ? 1 : 0) << 14;
+ encoding |= (mtbuf->idxen ? 1 : 0) << 13;
+ encoding |= (mtbuf->offen ? 1 : 0) << 12;
+ encoding |= 0x0FFF & mtbuf->offset;
+ encoding |= (0xF & mtbuf->dfmt) << 19;
+ encoding |= (0x7 & mtbuf->nfmt) << 23;
+ out.push_back(encoding);
+ encoding = 0;
+ encoding |= instr->operands[2].physReg().reg << 24;
+ encoding |= (mtbuf->tfe ? 1 : 0) << 23;
+ encoding |= (mtbuf->slc ? 1 : 0) << 22;
+ encoding |= (instr->operands[1].physReg().reg >> 2) << 16;
+ unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg().reg : instr->definitions[0].physReg().reg;
+ encoding |= (0xFF & reg) << 8;
+ encoding |= (0xFF & instr->operands[0].physReg().reg);
+ out.push_back(encoding);
+ break;
+ }
+ case Format::MIMG: {
+ MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
+ uint32_t encoding = (0b111100 << 26);
+ encoding |= mimg->slc ? 1 << 25 : 0;
+ encoding |= opcode << 18;
+ encoding |= mimg->lwe ? 1 << 17 : 0;
+ encoding |= mimg->tfe ? 1 << 16 : 0;
+ encoding |= mimg->r128 ? 1 << 15 : 0;
+ encoding |= mimg->da ? 1 << 14 : 0;
+ encoding |= mimg->glc ? 1 << 13 : 0;
+ encoding |= mimg->unrm ? 1 << 12 : 0;
+ encoding |= (0xF & mimg->dmask) << 8;
+ out.push_back(encoding);
+ encoding = (0xFF & instr->operands[0].physReg().reg); /* VADDR */
+ if (!instr->definitions.empty()) {
+ encoding |= (0xFF & instr->definitions[0].physReg().reg) << 8; /* VDATA */
+ } else if (instr->operands.size() == 4) {
+ encoding |= (0xFF & instr->operands[3].physReg().reg) << 8; /* VDATA */
+ }
+ encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 16; /* T# (resource) */
+ if (instr->operands.size() > 2)
+ encoding |= (0x1F & (instr->operands[2].physReg() >> 2)) << 21; /* sampler */
+ // TODO VEGA: D16
+ out.push_back(encoding);
+ break;
+ }
+ case Format::FLAT:
+ case Format::SCRATCH:
+ case Format::GLOBAL: {
+ FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
+ uint32_t encoding = (0b110111 << 26);
+ encoding |= opcode << 18;
+ encoding |= flat->offset & 0x1fff;
+ if (instr->format == Format::SCRATCH)
+ encoding |= 1 << 14;
+ else if (instr->format == Format::GLOBAL)
+ encoding |= 2 << 14;
+ encoding |= flat->lds ? 1 << 13 : 0;
+ encoding |= flat->glc ? 1 << 13 : 0;
+ encoding |= flat->slc ? 1 << 13 : 0;
+ out.push_back(encoding);
+ encoding = (0xFF & instr->operands[0].physReg().reg);
+ if (!instr->definitions.empty())
+ encoding |= (0xFF & instr->definitions[0].physReg().reg) << 24;
+ else
+ encoding |= (0xFF & instr->operands[2].physReg().reg) << 8;
+ if (!instr->operands[1].isUndefined()) {
+ assert(instr->operands[1].physReg() != 0x7f);
+ assert(instr->format != Format::FLAT);
+ encoding |= instr->operands[1].physReg() << 16;
+ } else if (instr->format != Format::FLAT) {
+ encoding |= 0x7F << 16;
+ }
+ encoding |= flat->nv ? 1 << 23 : 0;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::EXP: {
+ Export_instruction* exp = static_cast<Export_instruction*>(instr);
+ uint32_t encoding = (0b110001 << 26);
+ encoding |= exp->valid_mask ? 0b1 << 12 : 0;
+ encoding |= exp->done ? 0b1 << 11 : 0;
+ encoding |= exp->compressed ? 0b1 << 10 : 0;
+ encoding |= exp->dest << 4;
+ encoding |= exp->enabled_mask;
+ out.push_back(encoding);
+ encoding = 0xFF & exp->operands[0].physReg().reg;
+ encoding |= (0xFF & exp->operands[1].physReg().reg) << 8;
+ encoding |= (0xFF & exp->operands[2].physReg().reg) << 16;
+ encoding |= (0xFF & exp->operands[3].physReg().reg) << 24;
+ out.push_back(encoding);
+ break;
+ }
+ case Format::PSEUDO:
+ case Format::PSEUDO_BARRIER:
+ unreachable("Pseudo instructions should be lowered before assembly.");
+ default:
+ if ((uint16_t) instr->format & (uint16_t) Format::VOP3A) {
+ VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr);
+
+ if ((uint16_t) instr->format & (uint16_t) Format::VOP2)
+ opcode = opcode + 0x100;
+ else if ((uint16_t) instr->format & (uint16_t) Format::VOP1)
+ opcode = opcode + 0x140;
+ else if ((uint16_t) instr->format & (uint16_t) Format::VOPC)
+ opcode = opcode + 0x0;
+ else if ((uint16_t) instr->format & (uint16_t) Format::VINTRP)
+ opcode = opcode + 0x270;
+
+ // TODO: op_sel
+ uint32_t encoding = (0b110100 << 26);
+ encoding |= opcode << 16;
+ encoding |= (vop3->clamp ? 1 : 0) << 15;
+ for (unsigned i = 0; i < 3; i++)
+ encoding |= vop3->abs[i] << (8+i);
+ if (instr->definitions.size() == 2)
+ encoding |= instr->definitions[1].physReg() << 8;
+ encoding |= (0xFF & instr->definitions[0].physReg().reg);
+ out.push_back(encoding);
+ encoding = 0;
+ if (instr->opcode == aco_opcode::v_interp_mov_f32) {
+ encoding = 0x3 & instr->operands[0].constantValue();
+ } else {
+ for (unsigned i = 0; i < instr->operands.size(); i++)
+ encoding |= instr->operands[i].physReg() << (i * 9);
+ }
+ encoding |= vop3->omod << 27;
+ for (unsigned i = 0; i < 3; i++)
+ encoding |= vop3->neg[i] << (29+i);
+ out.push_back(encoding);
+ return;
+
+ } else if (instr->isDPP()){
+ /* first emit the instruction without the DPP operand */
+ Operand dpp_op = instr->operands[0];
+ instr->operands[0] = Operand(PhysReg{250}, v1);
+ instr->format = (Format) ((uint32_t) instr->format & ~(1 << 14));
+ emit_instruction(ctx, out, instr);
+ DPP_instruction* dpp = static_cast<DPP_instruction*>(instr);
+ uint32_t encoding = (0xF & dpp->row_mask) << 28;
+ encoding |= (0xF & dpp->bank_mask) << 24;
+ encoding |= dpp->abs[1] << 23;
+ encoding |= dpp->neg[1] << 22;
+ encoding |= dpp->abs[0] << 21;
+ encoding |= dpp->neg[0] << 20;
+ encoding |= dpp->bound_ctrl << 19;
+ encoding |= dpp->dpp_ctrl << 8;
+ encoding |= (0xFF) & dpp_op.physReg().reg;
+ out.push_back(encoding);
+ return;
+ } else {
+ unreachable("unimplemented instruction format");
+ }
+ }
+
+ /* append literal dword */
+ for (const Operand& op : instr->operands) {
+ if (op.isLiteral()) {
+ out.push_back(op.constantValue());
+ break;
+ }
+ }
+}
+
+void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
+{
+ for (aco_ptr<Instruction>& instr : block.instructions) {
+#if 0
+ int start_idx = out.size();
+ std::cerr << "Encoding:\t" << std::endl;
+ aco_print_instr(&*instr, stderr);
+ std::cerr << std::endl;
+#endif
+ emit_instruction(ctx, out, instr.get());
+#if 0
+ for (int i = start_idx; i < out.size(); i++)
+ std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl;
+#endif
+ }
+}
+
+void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
+{
+ for (int idx = program->blocks.size() - 1; idx >= 0; idx--) {
+ Block& block = program->blocks[idx];
+ std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
+ bool endBlock = false;
+ bool exported = false;
+ while ( it != block.instructions.rend())
+ {
+ if ((*it)->format == Format::EXP && endBlock) {
+ Export_instruction* exp = static_cast<Export_instruction*>((*it).get());
+ if (program->stage & hw_vs) {
+ if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) {
+ exp->done = true;
+ exported = true;
+ break;
+ }
+ } else {
+ exp->done = true;
+ exp->valid_mask = true;
+ exported = true;
+ break;
+ }
+ } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec)
+ break;
+ else if ((*it)->opcode == aco_opcode::s_endpgm) {
+ if (endBlock)
+ break;
+ endBlock = true;
+ }
+ ++it;
+ }
+ if (!endBlock || exported)
+ continue;
+ /* we didn't find an Export instruction and have to insert a null export */
+ aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
+ for (unsigned i = 0; i < 4; i++)
+ exp->operands[i] = Operand(v1);
+ exp->enabled_mask = 0;
+ exp->compressed = false;
+ exp->done = true;
+ exp->valid_mask = program->stage & hw_fs;
+ if (program->stage & hw_fs)
+ exp->dest = 9; /* NULL */
+ else
+ exp->dest = V_008DFC_SQ_EXP_POS;
+ /* insert the null export 1 instruction before endpgm */
+ block.instructions.insert(block.instructions.end() - 1, std::move(exp));
+ }
+}
+
+void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
+{
+ for (std::pair<int, SOPP_instruction*> branch : ctx.branches)
+ {
+ int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
+ out[branch.first] |= (uint16_t) offset;
+ }
+}
+
+void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
+{
+ for (unsigned addr : ctx.constaddrs)
+ out[addr] += out.size() * 4u;
+}
+
+unsigned emit_program(Program* program,
+ std::vector<uint32_t>& code)
+{
+ asm_context ctx(program);
+
+ if (program->stage & (hw_vs | hw_fs))
+ fix_exports(ctx, code, program);
+
+ for (Block& block : program->blocks) {
+ block.offset = code.size();
+ emit_block(ctx, code, block);
+ }
+
+ fix_branches(ctx, code);
+ fix_constaddrs(ctx, code);
+
+ unsigned constant_data_offset = code.size() * sizeof(uint32_t);
+ while (program->constant_data.size() % 4u)
+ program->constant_data.push_back(0);
+ /* Copy constant data */
+ code.insert(code.end(), (uint32_t*)program->constant_data.data(),
+ (uint32_t*)(program->constant_data.data() + program->constant_data.size()));
+
+ return constant_data_offset;
+}
+
+}