summaryrefslogtreecommitdiff
path: root/gcc/config/nvptx/nvptx.c
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config/nvptx/nvptx.c')
-rw-r--r--gcc/config/nvptx/nvptx.c206
1 files changed, 198 insertions, 8 deletions
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index daeec2733ea..208b11555f2 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -74,6 +74,8 @@
/* This file should be included last. */
#include "target-def.h"
+#define WORKAROUND_PTXJIT_BUG 1
+
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
@@ -205,6 +207,17 @@ nvptx_option_override (void)
target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
+/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE. Takes no arguments and
+ returns nothing; its only effect is to clear
+ flag_reorder_blocks_and_partition. */
+
+static void
+nvptx_override_options_after_change (void)
+{
+ /* Workaround for PR81430: nvptx acceleration compilation is broken when
+ pass_partition_blocks runs, so block reordering-and-partitioning is
+ forced off here. This should be dealt with in the common code, not in
+ the target. */
+ flag_reorder_blocks_and_partition = 0;
+}
+
/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
deal with ptx idiosyncrasies. */
@@ -234,6 +247,11 @@ nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
case DFmode:
return ".f64";
+ case V2SImode:
+ return ".v2.u32";
+ case V2DImode:
+ return ".v2.u64";
+
default:
gcc_unreachable ();
}
@@ -2178,7 +2196,20 @@ nvptx_output_mov_insn (rtx dst, rtx src)
? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
- return "%.\tmov.b%T0\t%0, %1;";
+ {
+ if (GET_MODE_BITSIZE (dst_mode) == 128
+ && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
+ {
+ /* mov.b128 is not supported. */
+ if (dst_inner == V2DImode && src_inner == TImode)
+ return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
+ else if (dst_inner == TImode && src_inner == V2DImode)
+ return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
+
+ gcc_unreachable ();
+ }
+ return "%.\tmov.b%T0\t%0, %1;";
+ }
return "%.\tcvt%t0%t1\t%0, %1;";
}
@@ -2403,13 +2434,33 @@ nvptx_print_operand (FILE *file, rtx x, int code)
case 'u':
if (x_code == SUBREG)
{
- mode = GET_MODE (SUBREG_REG (x));
- if (split_mode_p (mode))
- mode = maybe_split_mode (mode);
+ machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
+ if (VECTOR_MODE_P (inner_mode)
+ && (GET_MODE_SIZE (mode)
+ <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
+ mode = GET_MODE_INNER (inner_mode);
+ else if (split_mode_p (inner_mode))
+ mode = maybe_split_mode (inner_mode);
+ else
+ mode = inner_mode;
}
fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
break;
+ case 'H':
+ case 'L':
+ {
+ rtx inner_x = SUBREG_REG (x);
+ machine_mode inner_mode = GET_MODE (inner_x);
+ machine_mode split = maybe_split_mode (inner_mode);
+
+ output_reg (file, REGNO (inner_x), split,
+ (code == 'H'
+ ? GET_MODE_SIZE (inner_mode) / 2
+ : 0));
+ }
+ break;
+
case 'S':
{
nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
@@ -2506,7 +2557,14 @@ nvptx_print_operand (FILE *file, rtx x, int code)
machine_mode inner_mode = GET_MODE (inner_x);
machine_mode split = maybe_split_mode (inner_mode);
- if (split_mode_p (inner_mode)
+ if (VECTOR_MODE_P (inner_mode)
+ && (GET_MODE_SIZE (mode)
+ <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
+ {
+ output_reg (file, REGNO (inner_x), VOIDmode);
+ fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
+ }
+ else if (split_mode_p (inner_mode)
&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
output_reg (file, REGNO (inner_x), split);
else
@@ -2548,6 +2606,22 @@ nvptx_print_operand (FILE *file, rtx x, int code)
fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
break;
+ case CONST_VECTOR:
+ {
+ unsigned n = CONST_VECTOR_NUNITS (x);
+ fprintf (file, "{ ");
+ for (unsigned i = 0; i < n; ++i)
+ {
+ if (i != 0)
+ fprintf (file, ", ");
+
+ rtx elem = CONST_VECTOR_ELT (x, i);
+ output_addr_const (file, elem);
+ }
+ fprintf (file, " }");
+ }
+ break;
+
default:
output_addr_const (file, x);
}
@@ -3844,6 +3918,24 @@ nvptx_wsync (bool after)
return gen_nvptx_barsync (GEN_INT (after));
}
+#if WORKAROUND_PTXJIT_BUG
+/* Return the first insn in BB for which INSN_P holds, or NULL if BB does
+   not contain any such insn.  */
+
+static rtx_insn *
+bb_first_real_insn (basic_block bb)
+{
+  rtx_insn *insn;
+
+  /* Walk the insns of BB and stop at the first real one.  */
+  FOR_BB_INSNS (bb, insn)
+    if (INSN_P (insn))
+      return insn;
+
+  /* BB holds no real insn.  */
+  return NULL;
+}
+#endif
+
/* Single neutering according to MASK. FROM is the incoming block and
TO is the outgoing block. These may be the same block. Insert at
start of FROM:
@@ -3866,9 +3958,25 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
rtx_insn *tail = BB_END (to);
unsigned skip_mask = mask;
- /* Find first insn of from block */
- while (head != BB_END (from) && !INSN_P (head))
- head = NEXT_INSN (head);
+ while (true)
+ {
+ /* Find first insn of from block. */
+ while (head != BB_END (from) && !INSN_P (head))
+ head = NEXT_INSN (head);
+
+ if (from == to)
+ break;
+
+ if (!(JUMP_P (head) && single_succ_p (from)))
+ break;
+
+ basic_block jump_target = single_succ (from);
+ if (!single_pred_p (jump_target))
+ break;
+
+ from = jump_target;
+ head = BB_HEAD (from);
+ }
/* Find last insn of to block */
rtx_insn *limit = from == to ? head : BB_HEAD (to);
@@ -3958,6 +4066,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
{
/* Vector mode only, do a shuffle. */
+#if WORKAROUND_PTXJIT_BUG
+ /* The branch condition %rcond is propagated like this:
+
+ {
+ .reg .u32 %x;
+ mov.u32 %x,%tid.x;
+ setp.ne.u32 %rnotvzero,%x,0;
+ }
+
+ @%rnotvzero bra Lskip;
+ setp.<op>.<type> %rcond,op1,op2;
+ Lskip:
+ selp.u32 %rcondu32,1,0,%rcond;
+ shfl.idx.b32 %rcondu32,%rcondu32,0,31;
+ setp.ne.u32 %rcond,%rcondu32,0;
+
+ There seems to be a bug in the ptx JIT compiler (observed at driver
+ version 381.22, at -O1 and higher for sm_61), that drops the shfl
+ unless %rcond is initialized to something before 'bra Lskip'. The
+ bug is not observed with ptxas from cuda 8.0.61.
+
+ It is true that the code is non-trivial: at Lskip, %rcond is
+ uninitialized in threads 1-31, and after the selp the same holds
+ for %rcondu32. But shfl propagates the defined value in thread 0
+ to threads 1-31, so after the shfl %rcondu32 is defined in threads
+ 0-31, and after the setp.ne %rcond is defined in threads 0-31.
+
+ There is nothing in the PTX spec to suggest that this is wrong, or
+ to explain why the extra initialization is needed. So, we classify
+ it as a JIT bug, and the extra initialization as workaround. */
+ emit_insn_before (gen_movbi (pvar, const0_rtx),
+ bb_first_real_insn (from));
+#endif
emit_insn_before (nvptx_gen_vcast (pvar), tail);
}
else
@@ -5143,6 +5284,7 @@ nvptx_goacc_reduction_init (gcall *call)
/* Fixup flags from call_bb to init_bb. */
init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+ init_edge->probability = profile_probability::even ();
/* Set the initialization stmts. */
gimple_seq init_seq = NULL;
@@ -5158,6 +5300,7 @@ nvptx_goacc_reduction_init (gcall *call)
/* Create false edge from call_bb to dst_bb. */
edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
+ nop_edge->probability = profile_probability::even ();
/* Create phi node in dst block. */
gphi *phi = create_phi_node (lhs, dst_bb);
@@ -5335,9 +5478,49 @@ nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
return true;
}
+/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Return true if MODE is one of
+   the two vector modes accepted here (V2SImode and V2DImode), and false
+   for every other mode.  */
+
+static bool
+nvptx_vector_mode_supported (machine_mode mode)
+{
+  switch (mode)
+    {
+    case V2SImode:
+    case V2DImode:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.  Return the preferred
+   vector mode for vectorizing scalar MODE: V2DImode for DImode, V2SImode
+   for SImode, and whatever default_preferred_simd_mode picks for any
+   other mode.  */
+
+static machine_mode
+nvptx_preferred_simd_mode (machine_mode mode)
+{
+  if (mode == DImode)
+    return V2DImode;
+
+  if (mode == SImode)
+    return V2SImode;
+
+  /* No nvptx-specific preference; defer to the generic choice.  */
+  return default_preferred_simd_mode (mode);
+}
+
+/* Return the alignment to use for data of type TYPE, given BASIC_ALIGN as
+   the alignment it would otherwise get.  For an INTEGER_TYPE whose size in
+   bytes equals that of TImode, the result is the bit size of the mode that
+   maybe_split_mode yields for TImode; in every other case BASIC_ALIGN is
+   returned unchanged.  NOTE(review): BASIC_ALIGN is presumably in bits,
+   like the GET_MODE_BITSIZE result - confirm against the caller.  */
+
+unsigned int
+nvptx_data_alignment (const_tree type, unsigned int basic_align)
+{
+  /* Only integer types are adjusted.  */
+  if (TREE_CODE (type) != INTEGER_TYPE)
+    return basic_align;
+
+  /* Only integers exactly as large as TImode are adjusted.  */
+  unsigned HOST_WIDE_INT type_bytes = tree_to_uhwi (TYPE_SIZE_UNIT (type));
+  if (type_bytes != GET_MODE_SIZE (TImode))
+    return basic_align;
+
+  return GET_MODE_BITSIZE (maybe_split_mode (TImode));
+}
+
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override
+#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
+#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE nvptx_override_options_after_change
+
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
@@ -5452,6 +5635,13 @@ nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
+#undef TARGET_VECTOR_MODE_SUPPORTED_P
+#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
+
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
+ nvptx_preferred_simd_mode
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-nvptx.h"