summaryrefslogtreecommitdiff
path: root/gcc/config/nvptx/nvptx.c
diff options
context:
space:
mode:
authoramonakov <amonakov@138bc75d-0d04-0410-961f-82ee72b054a4>2016-11-16 17:17:00 +0000
committeramonakov <amonakov@138bc75d-0d04-0410-961f-82ee72b054a4>2016-11-16 17:17:00 +0000
commit7fce87689574db0ae92935e75a911ca6262db9e6 (patch)
tree9db609d99ee4957a92a3ad468eb36d855e6c1bc6 /gcc/config/nvptx/nvptx.c
parent35293eb1ed07c56046b40b7d32aa2883286af512 (diff)
downloadgcc-7fce87689574db0ae92935e75a911ca6262db9e6.tar.gz
nvptx backend prerequisites for OpenMP offloading
gcc/ * config/nvptx/mkoffload.c (main): Check that either OpenACC or OpenMP is selected. Pass -mgomp to offload compiler in OpenMP case. * config/nvptx/nvptx-protos.h (nvptx_shuffle_kind): Move enum declaration from nvptx.c. (nvptx_gen_shuffle): Declare. (nvptx_output_set_softstack): Declare. * config/nvptx/nvptx.c (nvptx_shuffle_kind): Move to nvptx-protos.h. (need_softstack_decl): New variable. (need_unisimt_decl): New variable. (diagnose_openacc_conflict): New. Use it... (nvptx_option_override): ...here. Handle TARGET_GOMP. (nvptx_encode_section_info): Handle "shared" attribute. (write_as_kernel): Restrict to OpenACC target regions. (init_softstack_frame): New. (nvptx_init_unisimt_predicate): New. (write_omp_entry): New. Use it... (nvptx_declare_function_name): ...here to emit OpenMP target region entrypoints. Handle TARGET_SOFT_STACK. Call nvptx_init_unisimt_predicate. (nvptx_output_set_softstack): New. (nvptx_get_drap_rtx): Return %argp as the DRAP if needed. (nvptx_gen_shuffle): Export. (nvptx_output_call_insn): Handle COND_EXEC patterns. Emit instruction predicate. (nvptx_print_operand): Fix handling of instruction predicates. (nvptx_get_unisimt_master): New helper function. (nvptx_get_unisimt_predicate): Ditto. (nvptx_call_insn_is_syscall_p): Ditto. (nvptx_unisimt_handle_set): Ditto. (nvptx_reorg_uniform_simt): New. Transform code for -muniform-simt. (nvptx_reorg): Call nvptx_reorg_uniform_simt. (nvptx_handle_shared_attribute): New. Use it... (nvptx_attribute_table): ... here (new entry). (nvptx_record_offload_symbol): Handle NULL attributes. (nvptx_file_end): Handle need_softstack_decl and need_unisimt_decl. (nvptx_simt_vf): New. (TARGET_SIMT_VF): Define. * config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define __nvptx_softstack or __nvptx_unisimt__ when -msoft-stack, or resp. -muniform-simt option is active. (STACK_SIZE_MODE): Define. (FIXED_REGISTERS): Adjust. (SOFTSTACK_SLOT_REGNUM): New. (SOFTSTACK_PREV_REGNUM): New. (REGISTER_NAMES): Adjust. 
(struct machine_function): New fields. * config/nvptx/nvptx.md (UNSPEC_SET_SOFTSTACK): New. (UNSPEC_VOTE_BALLOT): Ditto. (UNSPEC_LANEID): Ditto. (UNSPECV_NOUNROLL): Ditto. (atomic): New attribute. (predicable): New attribute. Generate predicated forms via define_cond_exec. (br_true): Mark as not predicable. (br_false): Ditto. (br_true_uni): Ditto. (br_false_uni): Ditto. (return): Ditto. (trap_if_true): Ditto. (trap_if_false): Ditto. (nvptx_fork): Ditto. (nvptx_forked): Ditto. (nvptx_joining): Ditto. (nvptx_join): Ditto. (nvptx_barsync): Ditto. (epilogue): Emit stack restore if TARGET_SOFT_STACK. (allocate_stack): Implement for TARGET_SOFT_STACK. Remove unused code. (allocate_stack_<mode>): Remove unused pattern. (set_softstack_insn): New pattern. (restore_stack_block): Handle for TARGET_SOFT_STACK. (nvptx_vote_ballot): New pattern. (omp_simt_lane): Ditto. (omp_simt_last_lane): Ditto. (omp_simt_ordered): Ditto. (omp_simt_vote_any): Ditto. (omp_simt_xchg_bfly): Ditto. (omp_simt_xchg_idx): Ditto. (nvptx_nounroll): Ditto. (atomic_compare_and_swap<mode>_1): Mark with atomic attribute. (atomic_exchange<mode>): Ditto. (atomic_fetch_add<mode>): Ditto. (atomic_fetch_addsf): Ditto. (atomic_fetch_<logic><mode>): Ditto. * config/nvptx/nvptx.opt: (msoft-stack): New option. (muniform-simt): Ditto. (mgomp): Ditto. * config/nvptx/t-nvptx (MULTILIB_OPTIONS): New. * doc/extend.texi (Nvidia PTX Variable Attributes): New section. * doc/invoke.texi (msoft-stack): Document. (muniform-simt): Document (mgomp): Document. * doc/tm.texi: Regenerate. * doc/tm.texi.in: (TARGET_SIMT_VF): New hook. * target.def: Define it. * target-insns.def (omp_simt_lane): New. (omp_simt_last_lane): New. (omp_simt_ordered): New. (omp_simt_vote_any): New. (omp_simt_xchg_bfly): New. (omp_simt_xchg_idx): New. libgcc/ * config/nvptx/crt0.c (__main): Setup __nvptx_stacks and __nvptx_uni. * config/nvptx/mgomp.c: New file. 
* config/nvptx/t-nvptx: Add mgomp.c gcc/testsuite/ * lib/target-supports.exp (check_effective_target_alloca): Use a compile test. * gcc.target/nvptx/softstack.c: New test. * gcc.target/nvptx/decl-shared.c: New test. * gcc.target/nvptx/decl-shared-init.c: New test. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@242503 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/nvptx/nvptx.c')
-rw-r--r--gcc/config/nvptx/nvptx.c418
1 files changed, 382 insertions, 36 deletions
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 782bbdecb37..405a91b2604 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -72,16 +72,6 @@
/* This file should be included last. */
#include "target-def.h"
-/* The kind of shuffe instruction. */
-enum nvptx_shuffle_kind
-{
- SHUFFLE_UP,
- SHUFFLE_DOWN,
- SHUFFLE_BFLY,
- SHUFFLE_IDX,
- SHUFFLE_MAX
-};
-
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
@@ -141,6 +131,12 @@ static GTY(()) rtx worker_red_sym;
/* Global lock variable, needed for 128bit worker & gang reductions. */
static GTY(()) tree global_lock_var;
+/* True if any function references __nvptx_stacks. */
+static bool need_softstack_decl;
+
+/* True if any function references __nvptx_uni. */
+static bool need_unisimt_decl;
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
@@ -151,6 +147,16 @@ nvptx_init_machine_status (void)
return p;
}
+/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
+ and -fopenacc is also enabled. */
+
+static void
+diagnose_openacc_conflict (bool optval, const char *optname)
+{
+ if (flag_openacc && optval)
+ error ("option %s is not supported together with -fopenacc", optname);
+}
+
/* Implement TARGET_OPTION_OVERRIDE. */
static void
@@ -188,6 +194,13 @@ nvptx_option_override (void)
worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
+ diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
+ diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
+ diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
+
+ if (TARGET_GOMP)
+ target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
@@ -238,9 +251,17 @@ nvptx_encode_section_info (tree decl, rtx rtl, int first)
if (TREE_CONSTANT (decl))
area = DATA_AREA_CONST;
else if (TREE_CODE (decl) == VAR_DECL)
- /* TODO: This would be a good place to check for a .shared or
- other section name. */
- area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
+ {
+ if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
+ {
+ area = DATA_AREA_SHARED;
+ if (DECL_INITIAL (decl))
+ error ("static initialization of variable %q+D in %<.shared%>"
+ " memory is not supported", decl);
+ }
+ else
+ area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
+ }
SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
}
@@ -718,7 +739,10 @@ static bool
write_as_kernel (tree attrs)
{
return (lookup_attribute ("kernel", attrs) != NULL_TREE
- || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
+ || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
+ && lookup_attribute ("oacc function", attrs) != NULL_TREE));
+ /* For OpenMP target regions, the corresponding kernel entry is emitted from
+ write_omp_entry as a separate function. */
}
/* Emit a linker marker for a function decl or defn. */
@@ -973,6 +997,67 @@ init_frame (FILE *file, int regno, unsigned align, unsigned size)
POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
+/* Emit soft stack frame setup sequence to FILE.
+   ALIGNMENT is the requested frame alignment (the visible caller passes
+   crtl->stack_alignment_needed / BITS_PER_UNIT) and SIZE is the frame size
+   (the visible caller passes get_frame_size ()); SIZE is rounded up to
+   KEEP_ALIGN before use.
+   The emitted PTX declares the stack, frame, soft-stack slot and
+   soft-stack "previous" registers, points the slot register at
+   __nvptx_stacks[tid.y], loads the previous stack pointer from that slot,
+   and derives the frame and stack pointers by subtracting SIZE and
+   crtl->outgoing_args_size respectively.  In a non-leaf function the new
+   stack pointer is also stored back into the slot.
+   As side effects this computes crtl->is_leaf, sets
+   cfun->machine->has_softstack and sets need_softstack_decl. */
+
+static void
+init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
+{
+ /* Maintain 64-bit stack alignment. */
+ unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+ size = ROUND_UP (size, keep_align);
+ int bits = POINTER_SIZE;
+ const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
+ const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
+ const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
+ const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
+ fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
+ fprintf (file, "\t{\n"); /* Temporaries below live in their own PTX scope. */
+ fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
+ fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
+ fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
+ fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
+ /* Scale tid.y by the pointer size in bytes to get the slot's byte offset;
+    the widening multiply is used when pointers are 64 bits wide. */
+ fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
+ bits == 64 ? ".wide" : ".lo", bits / 8);
+ fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
+
+ /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
+ fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
+
+ /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
+ fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
+ bits, reg_sspprev, reg_sspslot);
+
+ /* Initialize %frame = %sspprev - size. */
+ fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+ bits, reg_frame, reg_sspprev, size);
+
+ /* Apply alignment, if larger than 64.  The negated ALIGNMENT is used as
+    the AND mask (presumably ALIGNMENT is a power of two -- TODO confirm). */
+ if (alignment > keep_align)
+ fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
+ bits, reg_frame, reg_frame, -alignment);
+
+ /* From here on SIZE is the size of the outgoing arguments area, which is
+    expected to be a multiple of KEEP_ALIGN already. */
+ size = crtl->outgoing_args_size;
+ gcc_assert (size % keep_align == 0);
+
+ /* Initialize %stack. */
+ fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+ bits, reg_stack, reg_frame, size);
+
+ /* Usually 'crtl->is_leaf' is computed during register allocator
+ initialization, which is not done on NVPTX. Compute it now. */
+ gcc_assert (!crtl->is_leaf);
+ crtl->is_leaf = leaf_function_p ();
+ /* Only a non-leaf function stores its stack pointer into the slot. */
+ if (!crtl->is_leaf)
+ fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
+ bits, reg_sspslot, reg_stack);
+ fprintf (file, "\t}\n");
+ cfun->machine->has_softstack = true;
+ need_softstack_decl = true;
+}
+
/* Emit code to initialize the REGNO predicate register to indicate
whether we are not lane zero on the NAME axis. */
@@ -986,6 +1071,97 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
fprintf (file, "\t}\n");
}
+/* Emit code to initialize predicate and master lane index registers for
+ -muniform-simt code generation variant.
+   Writes to FILE a PTX block that loads __nvptx_uni[tid.y] into the
+   register of cfun->machine->unisimt_master, ANDs it with tid.x, and sets
+   the register of cfun->machine->unisimt_predicate to whether tid.x equals
+   that result.  Also sets need_unisimt_decl. */
+
+static void
+nvptx_init_unisimt_predicate (FILE *file)
+{
+ int bits = POINTER_SIZE;
+ int master = REGNO (cfun->machine->unisimt_master);
+ int pred = REGNO (cfun->machine->unisimt_predicate);
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
+ fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
+ fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
+ /* Byte offset of the 32-bit element indexed by tid.y (tid.y * 4); the
+    widening multiply is used when pointers are 64 bits wide. */
+ fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
+ bits == 64 ? ".wide" : ".lo");
+ fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
+ fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
+ fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
+ /* %ustmp0 is reused from here on to hold tid.x. */
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
+ /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */
+ fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
+ /* Compute predicate as 'tid.x == master'. */
+ fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+ fprintf (file, "\t}\n");
+ need_unisimt_decl = true;
+}
+
+/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
+
+     extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
+     void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
+     {
+       __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
+       __nvptx_uni[tid.y] = 0;
+       gomp_nvptx_main (ORIG, arg);
+     }
+   ORIG itself should not be emitted as a PTX .entry function.
+
+   FILE is the assembly output stream.  NAME is the name given to the
+   emitted .entry; ORIG is the name of the function handed to
+   gomp_nvptx_main.  The first call also appends an .extern declaration of
+   gomp_nvptx_main to func_decls.  Every call sets need_softstack_decl and
+   need_unisimt_decl, since the emitted kernel references __nvptx_stacks
+   and __nvptx_uni.  */
+
+static void
+write_omp_entry (FILE *file, const char *name, const char *orig)
+{
+  /* Declare gomp_nvptx_main only once per translation unit.  */
+  static bool gomp_nvptx_main_declared;
+  if (!gomp_nvptx_main_declared)
+    {
+      gomp_nvptx_main_declared = true;
+      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
+      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
+        << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
+    }
+  /* The template is one array holding two C strings: the text before the
+     place where ORIG goes, an embedded nul marking that place, and the text
+     after it.  */
+#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
+ (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
+{\n\
+ .reg.u32 %r<3>;\n\
+ .reg.u" PS " %R<4>;\n\
+ mov.u32 %r0, %tid.y;\n\
+ mov.u32 %r1, %ntid.y;\n\
+ mov.u32 %r2, %ctaid.x;\n\
+ cvt.u" PS ".u32 %R1, %r0;\n\
+ " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
+ mov.u" PS " %R0, __nvptx_stacks;\n\
+ " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
+ ld.param.u" PS " %R2, [%stack];\n\
+ ld.param.u" PS " %R3, [%sz];\n\
+ add.u" PS " %R2, %R2, %R3;\n\
+ mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
+ st.shared.u" PS " [%R0], %R2;\n\
+ mov.u" PS " %R0, __nvptx_uni;\n\
+ " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
+ mov.u32 %r0, 0;\n\
+ st.shared.u32 [%R0], %r0;\n\
+ mov.u" PS " %R0, \0;\n\
+ ld.param.u" PS " %R1, [%arg];\n\
+ {\n\
+ .param.u" PS " %P<2>;\n\
+ st.param.u" PS " [%P0], %R0;\n\
+ st.param.u" PS " [%P1], %R1;\n\
+ call.uni gomp_nvptx_main, (%P0, %P1);\n\
+ }\n\
+ ret.uni;\n\
+}\n"
+  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
+  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
+#undef ENTRY_TEMPLATE
+  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
+  /* Position ENTRY_2 after the embedded nul.  Measure the template that was
+     actually selected rather than entry64: the 32-bit and 64-bit prefixes
+     are only the same length if their mnemonics are padded to match, and a
+     mismatch would make ENTRY_2 start in the wrong place for the 32-bit
+     ABI.  */
+  const char *entry_2 = entry_1 + strlen (entry_1) + 1;
+  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
+  need_softstack_decl = need_unisimt_decl = true;
+}
+
/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
function, including local var decls and copies from the arguments to
local regs. */
@@ -997,6 +1173,14 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
tree result_type = TREE_TYPE (fntype);
int argno = 0;
+ if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
+ && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
+ {
+ char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
+ sprintf (buf, "%s$impl", name);
+ write_omp_entry (file, name, buf);
+ name = buf;
+ }
/* We construct the initial part of the function into a string
stream, in order to share the prototype writing code. */
std::stringstream s;
@@ -1034,19 +1218,24 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
fprintf (file, "%s", s.str().c_str());
- /* Declare a local var for outgoing varargs. */
- if (cfun->machine->has_varadic)
- init_frame (file, STACK_POINTER_REGNUM,
- UNITS_PER_WORD, crtl->outgoing_args_size);
-
- /* Declare a local variable for the frame. Force its size to be
- DImode-compatible. */
HOST_WIDE_INT sz = get_frame_size ();
- if (sz || cfun->machine->has_chain)
- init_frame (file, FRAME_POINTER_REGNUM,
- crtl->stack_alignment_needed / BITS_PER_UNIT,
- (sz + GET_MODE_SIZE (DImode) - 1)
- & ~(HOST_WIDE_INT)(GET_MODE_SIZE (DImode) - 1));
+ bool need_frameptr = sz || cfun->machine->has_chain;
+ int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
+ if (!TARGET_SOFT_STACK)
+ {
+ /* Declare a local var for outgoing varargs. */
+ if (cfun->machine->has_varadic)
+ init_frame (file, STACK_POINTER_REGNUM,
+ UNITS_PER_WORD, crtl->outgoing_args_size);
+
+ /* Declare a local variable for the frame. Force its size to be
+ DImode-compatible. */
+ if (need_frameptr)
+ init_frame (file, FRAME_POINTER_REGNUM, alignment,
+ ROUND_UP (sz, GET_MODE_SIZE (DImode)));
+ }
+ else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+ init_softstack_frame (file, alignment, sz);
/* Declare the pseudos we have as ptx registers. */
int maxregs = max_reg_num ();
@@ -1072,8 +1261,25 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
if (cfun->machine->axis_predicate[1])
nvptx_init_axis_predicate (file,
REGNO (cfun->machine->axis_predicate[1]), "x");
+ if (cfun->machine->unisimt_predicate)
+ nvptx_init_unisimt_predicate (file);
}
+/* Write to the assembly output a store of register SRC_REGNO into the
+   shared-memory location addressed by the soft stack slot register.
+   Nothing is written unless the current function has a soft stack and is
+   not a leaf.  Always returns the empty string.  */
+
+const char *
+nvptx_output_set_softstack (unsigned src_regno)
+{
+  if (!cfun->machine->has_softstack || crtl->is_leaf)
+    return "";
+
+  const char *slot_name = reg_names[SOFTSTACK_SLOT_REGNUM];
+  fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ", POINTER_SIZE, slot_name);
+  output_reg (asm_out_file, src_regno, VOIDmode);
+  fputs (";\n", asm_out_file);
+  return "";
+}
/* Output a return instruction. Also copy the return value to its outgoing
location. */
@@ -1113,6 +1319,8 @@ nvptx_function_ok_for_sibcall (tree, tree)
static rtx
nvptx_get_drap_rtx (void)
{
+ if (TARGET_SOFT_STACK && stack_realign_drap)
+ return arg_pointer_rtx;
return NULL_RTX;
}
@@ -1311,7 +1519,7 @@ nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
/* Generate an instruction or sequence to broadcast register REG
across the vectors of a single warp. */
-static rtx
+rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
rtx res;
@@ -1833,6 +2041,8 @@ nvptx_output_mov_insn (rtx dst, rtx src)
return "%.\tcvt%t0%t1\t%0, %1;";
}
+static void nvptx_print_operand (FILE *, rtx, int);
+
/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
involves writing .param declarations and in/out copies into them. For
indirect calls, also write the .callprototype. */
@@ -1844,6 +2054,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
static int labelno;
bool needs_tgt = register_operand (callee, Pmode);
rtx pat = PATTERN (insn);
+ if (GET_CODE (pat) == COND_EXEC)
+ pat = COND_EXEC_CODE (pat);
int arg_end = XVECLEN (pat, 0);
tree decl = NULL_TREE;
@@ -1888,6 +2100,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
fprintf (asm_out_file, ";\n");
}
+ /* The '.' stands for the call's predicate, if any. */
+ nvptx_print_operand (asm_out_file, NULL_RTX, '.');
fprintf (asm_out_file, "\t\tcall ");
if (result != NULL_RTX)
fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
@@ -1951,8 +2165,6 @@ nvptx_print_operand_punct_valid_p (unsigned char c)
return c == '.' || c== '#';
}
-static void nvptx_print_operand (FILE *, rtx, int);
-
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
static void
@@ -2013,12 +2225,10 @@ nvptx_print_operand (FILE *file, rtx x, int code)
x = current_insn_predicate;
if (x)
{
- unsigned int regno = REGNO (XEXP (x, 0));
- fputs ("[", file);
+ fputs ("@", file);
if (GET_CODE (x) == EQ)
fputs ("!", file);
- fputs (reg_names [regno], file);
- fputs ("]", file);
+ output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
}
return;
}
@@ -2313,6 +2523,89 @@ nvptx_reorg_subreg (void)
}
}
+/* Return the SImode "master lane index" register used for uniform-simt.
+   The register is created the first time it is requested and cached in
+   the current function's machine_function.  */
+
+static rtx
+nvptx_get_unisimt_master ()
+{
+  rtx &slot = cfun->machine->unisimt_master;
+  if (!slot)
+    slot = gen_reg_rtx (SImode);
+  return slot;
+}
+
+/* Return the BImode "predicate" register used for uniform-simt.  The
+   register is created the first time it is requested and cached in the
+   current function's machine_function.  */
+
+static rtx
+nvptx_get_unisimt_predicate ()
+{
+  rtx &slot = cfun->machine->unisimt_predicate;
+  if (!slot)
+    slot = gen_reg_rtx (BImode);
+  return slot;
+}
+
+/* Return true if call insn INSN directly calls one of the functions
+   provided by the CUDA runtime: malloc, free, vprintf.  Calls whose
+   address is not a SYMBOL_REF are never treated as such.  */
+
+static bool
+nvptx_call_insn_is_syscall_p (rtx_insn *insn)
+{
+  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only
+     the references with forced assembler name (leading '*') refer to PTX
+     syscalls.  For vprintf, both the normal and the forced-assembler-name
+     spelling are accepted.  */
+  static const char *const syscall_names[] =
+    { "vprintf", "*vprintf", "*malloc", "*free" };
+
+  rtx body = PATTERN (insn);
+  gcc_checking_assert (GET_CODE (body) == PARALLEL);
+
+  /* The call is the first element, possibly wrapped in a SET of the
+     result.  */
+  rtx call = XVECEXP (body, 0, 0);
+  if (GET_CODE (call) == SET)
+    call = SET_SRC (call);
+  gcc_checking_assert (GET_CODE (call) == CALL
+                       && GET_CODE (XEXP (call, 0)) == MEM);
+
+  rtx target = XEXP (XEXP (call, 0), 0);
+  if (GET_CODE (target) != SYMBOL_REF)
+    return false;
+
+  const char *callee = XSTR (target, 0);
+  const unsigned count = sizeof syscall_names / sizeof syscall_names[0];
+  for (unsigned ix = 0; ix < count; ix++)
+    if (strcmp (callee, syscall_names[ix]) == 0)
+      return true;
+  return false;
+}
+
+/* SET is one subexpression of INSN's pattern.  When it is a SET whose
+   destination is a register, emit after INSN a shuffle that propagates
+   that register's value from lane MASTER to the current lane.  Anything
+   else is left untouched.  */
+
+static void
+nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
+{
+  if (GET_CODE (set) != SET)
+    return;
+
+  rtx dest = SET_DEST (set);
+  if (!REG_P (dest))
+    return;
+
+  rtx shuffle = nvptx_gen_shuffle (dest, dest, master, SHUFFLE_IDX);
+  emit_insn_after (shuffle, insn);
+}
+
+/* Rework the insn stream for the uniform-simt code generation variant:
+   atomics and "syscalls" are made conditionally executed under the
+   uniform-simt predicate, and shuffle-based propagation is inserted for
+   the registers they set.  */
+
+static void
+nvptx_reorg_uniform_simt ()
+{
+  rtx_insn *following;
+
+  for (rtx_insn *insn = get_insns (); insn; insn = following)
+    {
+      /* Fetch the successor up front: shuffles are emitted after INSN
+         below and must not be visited.  */
+      following = NEXT_INSN (insn);
+
+      bool syscall_p = CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn);
+      bool atomic_p = (!syscall_p
+                       && NONJUMP_INSN_P (insn)
+                       && GET_CODE (PATTERN (insn)) == PARALLEL
+                       && get_attr_atomic (insn));
+      if (syscall_p || atomic_p)
+        {
+          rtx body = PATTERN (insn);
+
+          /* Propagate every register set by INSN from the master lane.  */
+          rtx master = nvptx_get_unisimt_master ();
+          int nelts = XVECLEN (body, 0);
+          for (int ix = 0; ix < nelts; ix++)
+            nvptx_unisimt_handle_set (XVECEXP (body, 0, ix), insn, master);
+
+          /* Wrap the original pattern so it runs only where the predicate
+             register is nonzero.  */
+          rtx cond = gen_rtx_NE (BImode, nvptx_get_unisimt_predicate (),
+                                 const0_rtx);
+          rtx guarded = gen_rtx_COND_EXEC (VOIDmode, cond, body);
+          validate_change (insn, &PATTERN (insn), guarded, false);
+        }
+    }
+}
+
/* Loop structure of the function. The entire function is described as
a NULL loop. */
@@ -3829,6 +4122,9 @@ nvptx_reorg (void)
/* Replace subregs. */
nvptx_reorg_subreg ();
+ if (TARGET_UNIFORM_SIMT)
+ nvptx_reorg_uniform_simt ();
+
regstat_free_n_sets_and_refs ();
df_finish_pass (true);
@@ -3857,12 +4153,36 @@ nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
return NULL_TREE;
}
+/* Attribute handler for "shared"; arguments as in
+   struct attribute_spec.handler.  The attribute is accepted only on
+   variables that are public or static; in every other case an error is
+   reported and *NO_ADD_ATTRS is set.  Always returns NULL_TREE.  */
+
+static tree
+nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
+                               int ARG_UNUSED (flags), bool *no_add_attrs)
+{
+  tree decl = *node;
+
+  if (TREE_CODE (decl) == VAR_DECL)
+    {
+      if (TREE_PUBLIC (decl) || TREE_STATIC (decl))
+        return NULL_TREE;
+      error ("%qE attribute not allowed with auto storage class", name);
+    }
+  else
+    error ("%qE attribute only applies to variables", name);
+
+  *no_add_attrs = true;
+  return NULL_TREE;
+}
+
/* Table of valid machine attributes. */
static const struct attribute_spec nvptx_attribute_table[] =
{
/* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
affects_type_identity } */
{ "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
+ { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
{ NULL, 0, 0, false, false, false, NULL, false }
};
@@ -3924,13 +4244,13 @@ nvptx_record_offload_symbol (tree decl)
case FUNCTION_DECL:
{
tree attr = get_oacc_fn_attrib (decl);
- tree dims = TREE_VALUE (attr);
- unsigned ix;
+ /* OpenMP offloading does not set this attribute. */
+ tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
- for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+ for (; dims; dims = TREE_CHAIN (dims))
{
int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
@@ -3991,6 +4311,21 @@ nvptx_file_end (void)
if (worker_red_size)
write_worker_buffer (asm_out_file, worker_red_sym,
worker_red_align, worker_red_size);
+
+ if (need_softstack_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
+ /* 32 is the maximum number of warps in a block. Even though it's an
+ external declaration, emit the array size explicitly; otherwise, it
+ may fail at PTX JIT time if the definition is later in link order. */
+ fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
+ POINTER_SIZE);
+ }
+ if (need_unisimt_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_uni");
+ fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
+ }
}
/* Expander for the shuffle builtins. */
@@ -4176,6 +4511,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
+/* TARGET_SIMT_VF target hook implementation.  Reports the number of
+   threads in a warp, which is PTX_VECTOR_LENGTH.  */
+
+static int
+nvptx_simt_vf ()
+{
+  const int warp_threads = PTX_VECTOR_LENGTH;
+  return warp_threads;
+}
+
/* Validate compute dimensions of an OpenACC offload or routine, fill
in non-unity defaults. FN_LEVEL indicates the level at which a
routine might spawn a loop. It is negative for non-routines. If
@@ -4944,6 +5287,9 @@ nvptx_goacc_reduction (gcall *call)
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl
+#undef TARGET_SIMT_VF
+#define TARGET_SIMT_VF nvptx_simt_vf
+
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims