diff options
Diffstat (limited to 'gcc/config/nvptx/nvptx.c')
-rw-r--r-- | gcc/config/nvptx/nvptx.c | 206 |
1 files changed, 198 insertions, 8 deletions
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index daeec2733ea..208b11555f2 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -74,6 +74,8 @@ /* This file should be included last. */ #include "target-def.h" +#define WORKAROUND_PTXJIT_BUG 1 + /* The various PTX memory areas an object might reside in. */ enum nvptx_data_area { @@ -205,6 +207,17 @@ nvptx_option_override (void) target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT; } +/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE. */ + +static void +nvptx_override_options_after_change (void) +{ + /* This is a workaround for PR81430 - nvptx acceleration compilation broken + because of running pass_partition_blocks. This should be dealt with in the + common code, not in the target. */ + flag_reorder_blocks_and_partition = 0; +} + /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to deal with ptx ideosyncracies. */ @@ -234,6 +247,11 @@ nvptx_ptx_type_from_mode (machine_mode mode, bool promote) case DFmode: return ".f64"; + case V2SImode: + return ".v2.u32"; + case V2DImode: + return ".v2.u64"; + default: gcc_unreachable (); } @@ -2178,7 +2196,20 @@ nvptx_output_mov_insn (rtx dst, rtx src) ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;"); if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner)) - return "%.\tmov.b%T0\t%0, %1;"; + { + if (GET_MODE_BITSIZE (dst_mode) == 128 + && GET_MODE_BITSIZE (GET_MODE (src)) == 128) + { + /* mov.b128 is not supported. */ + if (dst_inner == V2DImode && src_inner == TImode) + return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;"; + else if (dst_inner == TImode && src_inner == V2DImode) + return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;"; + + gcc_unreachable (); + } + return "%.\tmov.b%T0\t%0, %1;"; + } return "%.\tcvt%t0%t1\t%0, %1;"; } @@ -2403,13 +2434,33 @@ nvptx_print_operand (FILE *file, rtx x, int code) case 'u': if (x_code == SUBREG) { - mode = GET_MODE (SUBREG_REG (x)); - if (split_mode_p (mode)) - mode = maybe_split_mode (mode); + machine_mode inner_mode = GET_MODE (SUBREG_REG (x)); + if (VECTOR_MODE_P (inner_mode) + && (GET_MODE_SIZE (mode) + <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode)))) + mode = GET_MODE_INNER (inner_mode); + else if (split_mode_p (inner_mode)) + mode = maybe_split_mode (inner_mode); + else + mode = inner_mode; } fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't')); break; + case 'H': + case 'L': + { + rtx inner_x = SUBREG_REG (x); + machine_mode inner_mode = GET_MODE (inner_x); + machine_mode split = maybe_split_mode (inner_mode); + + output_reg (file, REGNO (inner_x), split, + (code == 'H' + ? GET_MODE_SIZE (inner_mode) / 2 + : 0)); + } + break; + case 'S': { nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x); @@ -2506,7 +2557,14 @@ nvptx_print_operand (FILE *file, rtx x, int code) machine_mode inner_mode = GET_MODE (inner_x); machine_mode split = maybe_split_mode (inner_mode); - if (split_mode_p (inner_mode) + if (VECTOR_MODE_P (inner_mode) + && (GET_MODE_SIZE (mode) + <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode)))) + { + output_reg (file, REGNO (inner_x), VOIDmode); + fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y"); + } + else if (split_mode_p (inner_mode) && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode))) output_reg (file, REGNO (inner_x), split); else @@ -2548,6 +2606,22 @@ nvptx_print_operand (FILE *file, rtx x, int code) fprintf (file, "0d%08lx%08lx", vals[1], vals[0]); break; + case CONST_VECTOR: + { + unsigned n = CONST_VECTOR_NUNITS (x); + fprintf (file, "{ "); + for (unsigned i = 0; i < n; ++i) + { + if (i != 0) + fprintf (file, ", "); + + rtx elem = CONST_VECTOR_ELT (x, i); + output_addr_const (file, elem); + } + fprintf (file, " }"); + } + break; + default: output_addr_const (file, x); } @@ -3844,6 +3918,24 @@ nvptx_wsync (bool after) return gen_nvptx_barsync (GEN_INT (after)); } +#if WORKAROUND_PTXJIT_BUG +/* Return first real insn in BB, or return NULL_RTX if BB does not contain + real insns. */ + +static rtx_insn * +bb_first_real_insn (basic_block bb) +{ + rtx_insn *insn; + + /* Find first insn of from block. */ + FOR_BB_INSNS (bb, insn) + if (INSN_P (insn)) + return insn; + + return 0; +} +#endif + /* Single neutering according to MASK. FROM is the incoming block and TO is the outgoing block. These may be the same block. Insert at start of FROM: @@ -3866,9 +3958,25 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) rtx_insn *tail = BB_END (to); unsigned skip_mask = mask; - /* Find first insn of from block */ - while (head != BB_END (from) && !INSN_P (head)) - head = NEXT_INSN (head); + while (true) + { + /* Find first insn of from block. */ + while (head != BB_END (from) && !INSN_P (head)) + head = NEXT_INSN (head); + + if (from == to) + break; + + if (!(JUMP_P (head) && single_succ_p (from))) + break; + + basic_block jump_target = single_succ (from); + if (!single_pred_p (jump_target)) + break; + + from = jump_target; + head = BB_HEAD (from); + } /* Find last insn of to block */ rtx_insn *limit = from == to ? head : BB_HEAD (to); @@ -3958,6 +4066,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) { /* Vector mode only, do a shuffle. */ +#if WORKAROUND_PTXJIT_BUG + /* The branch condition %rcond is propagated like this: + + { + .reg .u32 %x; + mov.u32 %x,%tid.x; + setp.ne.u32 %rnotvzero,%x,0; + } + + @%rnotvzero bra Lskip; + setp.<op>.<type> %rcond,op1,op2; + Lskip: + selp.u32 %rcondu32,1,0,%rcond; + shfl.idx.b32 %rcondu32,%rcondu32,0,31; + setp.ne.u32 %rcond,%rcondu32,0; + + There seems to be a bug in the ptx JIT compiler (observed at driver + version 381.22, at -O1 and higher for sm_61), that drops the shfl + unless %rcond is initialized to something before 'bra Lskip'. The + bug is not observed with ptxas from cuda 8.0.61. + + It is true that the code is non-trivial: at Lskip, %rcond is + uninitialized in threads 1-31, and after the selp the same holds + for %rcondu32. But shfl propagates the defined value in thread 0 + to threads 1-31, so after the shfl %rcondu32 is defined in threads + 0-31, and after the setp.ne %rcond is defined in threads 0-31. + + There is nothing in the PTX spec to suggest that this is wrong, or + to explain why the extra initialization is needed. So, we classify + it as a JIT bug, and the extra initialization as workaround. */ + emit_insn_before (gen_movbi (pvar, const0_rtx), + bb_first_real_insn (from)); +#endif emit_insn_before (nvptx_gen_vcast (pvar), tail); } else @@ -5143,6 +5284,7 @@ nvptx_goacc_reduction_init (gcall *call) /* Fixup flags from call_bb to init_bb. */ init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE; + init_edge->probability = profile_probability::even (); /* Set the initialization stmts. */ gimple_seq init_seq = NULL; @@ -5158,6 +5300,7 @@ nvptx_goacc_reduction_init (gcall *call) /* Create false edge from call_bb to dst_bb. */ edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE); + nop_edge->probability = profile_probability::even (); /* Create phi node in dst block. */ gphi *phi = create_phi_node (lhs, dst_bb); @@ -5335,9 +5478,49 @@ nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, return true; } +static bool +nvptx_vector_mode_supported (machine_mode mode) +{ + return (mode == V2SImode + || mode == V2DImode); +} + +/* Return the preferred mode for vectorizing scalar MODE. */ + +static machine_mode +nvptx_preferred_simd_mode (machine_mode mode) +{ + switch (mode) + { + case DImode: + return V2DImode; + case SImode: + return V2SImode; + + default: + return default_preferred_simd_mode (mode); + } +} + +unsigned int +nvptx_data_alignment (const_tree type, unsigned int basic_align) +{ + if (TREE_CODE (type) == INTEGER_TYPE) + { + unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type)); + if (size == GET_MODE_SIZE (TImode)) + return GET_MODE_BITSIZE (maybe_split_mode (TImode)); + } + + return basic_align; +} + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override +#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE +#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE nvptx_override_options_after_change + #undef TARGET_ATTRIBUTE_TABLE #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table @@ -5452,6 +5635,13 @@ nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, #undef TARGET_CANNOT_FORCE_CONST_MEM #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem +#undef TARGET_VECTOR_MODE_SUPPORTED_P +#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported + +#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE +#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \ + nvptx_preferred_simd_mode + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-nvptx.h" |