author     jakub <jakub@138bc75d-0d04-0410-961f-82ee72b054a4>    2013-10-31 19:06:49 +0000
committer  jakub <jakub@138bc75d-0d04-0410-961f-82ee72b054a4>    2013-10-31 19:06:49 +0000
commit     09e640e662399f5e84c6fa1cd9f10c05f19c9777 (patch)
tree       7ea25de19c8a02724d9cc355bb4b34e90e4f4b93 /gcc
parent     23871d0cedfe77078bb09c3be41afd26e4a9f551 (diff)
* optabs.c (expand_vec_perm): Avoid vector mode punning
SUBREGs in SET_DEST.
* expmed.c (store_bit_field_1): Likewise.
* config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
* config/i386/i386.c (ix86_expand_vector_move_misalign,
ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
ix86_expand_sse_unpack, ix86_expand_args_builtin,
ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
expand_vec_perm_vpshufb2_vpermq,
expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
(expand_vec_perm_palignr): Likewise. Modify a copy of *d rather
than *d itself.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@204274 138bc75d-0d04-0410-961f-82ee72b054a4
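Every hunk below applies the same mechanical transformation: where an expander used to emit an instruction whose SET_DEST was a mode-punning SUBREG of the real destination (a gen_lowpart of a register in another vector mode of the same size), it now computes the result into a fresh pseudo in the instruction's natural mode and only afterwards moves that pseudo into the destination, punning on the move's source side. A minimal sketch of the idiom, using real GCC internal helpers (gen_reg_rtx, gen_lowpart, emit_move_insn) but with a hypothetical emitter emit_v4sf_op standing in for whichever patched pattern is being expanded:

    /* Sketch only: emit_v4sf_op is a hypothetical stand-in for any insn
       emitter that works in V4SFmode, while MODE is the caller's
       equal-sized vector mode.  */
    static void
    expand_punned_op (enum machine_mode mode, rtx dest, rtx src)
    {
      rtx t;

      /* Old, problematic shape: pun the destination itself, so the
         SET_DEST becomes a vector-mode-changing SUBREG:
           emit_v4sf_op (gen_lowpart (V4SFmode, dest), src);  */

      /* New shape: compute into a fresh pseudo in the insn's natural
         mode...  */
      if (mode != V4SFmode)
        t = gen_reg_rtx (V4SFmode);
      else
        t = dest;
      emit_v4sf_op (t, src);

      /* ...then pun on the source side of a plain move, where a SUBREG
         of a pseudo is always representable.  */
      if (t != dest)
        emit_move_insn (dest, gen_lowpart (mode, t));
    }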
Diffstat (limited to 'gcc')
 gcc/ChangeLog          |  21
 gcc/config/i386/i386.c | 320
 gcc/config/i386/sse.md |  65
 gcc/expmed.c           |  27
 gcc/optabs.c           |  10
 5 files changed, 300 insertions(+), 143 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 48a6aab2fb5..94fb8ad54b4 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,24 @@
+2013-10-31  Jakub Jelinek  <jakub@redhat.com>
+
+	* optabs.c (expand_vec_perm): Avoid vector mode punning
+	SUBREGs in SET_DEST.
+	* expmed.c (store_bit_field_1): Likewise.
+	* config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
+	vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
+	vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
+	* config/i386/i386.c (ix86_expand_vector_move_misalign,
+	ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
+	ix86_expand_sse_unpack, ix86_expand_args_builtin,
+	ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
+	emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
+	expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
+	expand_vec_perm_vpshufb2_vpermq,
+	expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
+	expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
+	ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
+	(expand_vec_perm_palignr): Likewise.  Modify a copy of *d rather
+	than *d itself.
+
 2013-10-31  Uros Bizjak  <ubizjak@gmail.com>
 
 	* config/i386/i386.c (ix86_expand_sse2_abs): Rename function arguments.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5a2597b01ab..902e1699604 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -16803,6 +16803,8 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 	}
       else
 	{
+	  rtx t;
+
 	  if (TARGET_AVX
 	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
 	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
@@ -16821,18 +16823,22 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 	      return;
 	    }
 
+	  if (mode != V4SFmode)
+	    t = gen_reg_rtx (V4SFmode);
+	  else
+	    t = op0;
+
 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
-	    emit_move_insn (op0, CONST0_RTX (mode));
+	    emit_move_insn (t, CONST0_RTX (V4SFmode));
 	  else
-	    emit_clobber (op0);
-
-	  if (mode != V4SFmode)
-	    op0 = gen_lowpart (V4SFmode, op0);
+	    emit_clobber (t);
 
 	  m = adjust_address (op1, V2SFmode, 0);
-	  emit_insn (gen_sse_loadlps (op0, op0, m));
+	  emit_insn (gen_sse_loadlps (t, t, m));
 	  m = adjust_address (op1, V2SFmode, 8);
-	  emit_insn (gen_sse_loadhps (op0, op0, m));
+	  emit_insn (gen_sse_loadhps (t, t, m));
+	  if (mode != V4SFmode)
+	    emit_move_insn (op0, gen_lowpart (mode, t));
 	}
     }
   else if (MEM_P (op0))
@@ -20473,6 +20479,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
   else
     {
       rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+      rtx d = dest;
 
       if (!nonimmediate_operand (op_true, mode))
 	op_true = force_reg (mode, op_true);
@@ -20496,7 +20503,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 	  if (TARGET_SSE4_1)
 	    {
 	      gen = gen_sse4_1_pblendvb;
-	      dest = gen_lowpart (V16QImode, dest);
+	      if (mode != V16QImode)
+		d = gen_reg_rtx (V16QImode);
 	      op_false = gen_lowpart (V16QImode, op_false);
 	      op_true = gen_lowpart (V16QImode, op_true);
 	      cmp = gen_lowpart (V16QImode, cmp);
@@ -20517,7 +20525,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 	  if (TARGET_AVX2)
 	    {
 	      gen = gen_avx2_pblendvb;
-	      dest = gen_lowpart (V32QImode, dest);
+	      if (mode != V32QImode)
+		d = gen_reg_rtx (V32QImode);
 	      op_false = gen_lowpart (V32QImode, op_false);
 	      op_true = gen_lowpart (V32QImode, op_true);
 	      cmp = gen_lowpart (V32QImode, cmp);
@@ -20528,7 +20537,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 	}
 
       if (gen != NULL)
-	emit_insn (gen (dest, op_false, op_true, cmp));
+	{
+	  emit_insn (gen (d, op_false, op_true, cmp));
+	  if (d != dest)
+	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+	}
       else
 	{
 	  op_true = force_reg (mode, op_true);
@@ -20849,8 +20862,7 @@ ix86_expand_int_vcond (rtx operands[])
   else
     {
       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
-      x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
-			       code, cop0, cop1,
+      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       operands[1+negate], operands[2-negate]);
       x = gen_lowpart (data_mode, x);
     }
@@ -20869,7 +20881,7 @@ ix86_expand_vec_perm (rtx operands[])
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx mask = operands[3];
-  rtx t1, t2, t3, t4, vt, vt2, vec[32];
+  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
   enum machine_mode mode = GET_MODE (op0);
   enum machine_mode maskmode = GET_MODE (mask);
   int w, e, i;
@@ -20937,7 +20949,7 @@ ix86_expand_vec_perm (rtx operands[])
 
       /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
       operands[3] = mask = t1;
-      target = gen_lowpart (mode, target);
+      target = gen_reg_rtx (mode);
       op0 = gen_lowpart (mode, op0);
       op1 = gen_lowpart (mode, op1);
     }
@@ -20949,7 +20961,12 @@ ix86_expand_vec_perm (rtx operands[])
 	 the high bits of the shuffle elements.  No need for us to
 	 perform an AND ourselves.  */
       if (one_operand_shuffle)
-	emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+	{
+	  emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+	  if (target != operands[0])
+	    emit_move_insn (operands[0],
+			    gen_lowpart (GET_MODE (operands[0]), target));
+	}
       else
 	{
 	  t1 = gen_reg_rtx (V8SImode);
@@ -21022,13 +21039,13 @@ ix86_expand_vec_perm (rtx operands[])
 	     stands for other 12 bytes.  */
 	  /* The bit whether element is from the same lane or the other
 	     lane is bit 4, so shift it up by 3 to the MSB position.  */
-	  emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
-				    gen_lowpart (V4DImode, mask),
+	  t5 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
 				    GEN_INT (3)));
 	  /* Clear MSB bits from the mask just in case it had them set.  */
 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
 	  /* After this t1 will have MSB set for elements from other lane.  */
-	  emit_insn (gen_xorv32qi3 (t1, t1, vt2));
+	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
 	  /* Clear bits other than MSB.  */
 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
 	  /* Or in the lower bits from mask into t3.  */
@@ -21037,8 +21054,8 @@ ix86_expand_vec_perm (rtx operands[])
 	     lane.  */
 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
 	  /* Swap 128-bit lanes in t3.  */
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					  gen_lowpart (V4DImode, t3),
+	  t6 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
 					  const2_rtx, GEN_INT (3),
 					  const0_rtx, const1_rtx));
 	  /* And or in the lower bits from mask into t1.  */
@@ -21048,15 +21065,20 @@ ix86_expand_vec_perm (rtx operands[])
 	  /* Each of these shuffles will put 0s in places where
 	     element from the other 128-bit lane is needed, otherwise
 	     will shuffle in the requested value.  */
-	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
+					    gen_lowpart (V32QImode, t6)));
 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
 	  /* For t3 the 128-bit lanes are swapped again.  */
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					  gen_lowpart (V4DImode, t3),
+	  t7 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
 	  /* And oring both together leads to the result.  */
-	  emit_insn (gen_iorv32qi3 (target, t1, t3));
+	  emit_insn (gen_iorv32qi3 (target, t1,
+				    gen_lowpart (V32QImode, t7)));
+	  if (target != operands[0])
+	    emit_move_insn (operands[0],
+			    gen_lowpart (GET_MODE (operands[0]), target));
 	  return;
 	}
@@ -21064,20 +21086,22 @@ ix86_expand_vec_perm (rtx operands[])
 	  /* Similarly to the above one_operand_shuffle code,
 	     just for repeated twice for each operand.  merge_two:
 	     code will merge the two results together.  */
-	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
-	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
+					    gen_lowpart (V32QImode, t6)));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
+					    gen_lowpart (V32QImode, t6)));
 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
-					  gen_lowpart (V4DImode, t4),
+	  t7 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					  gen_lowpart (V4DImode, t3),
+	  t8 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
-	  emit_insn (gen_iorv32qi3 (t4, t2, t4));
-	  emit_insn (gen_iorv32qi3 (t3, t1, t3));
+	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
+	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
 	  t1 = t4;
 	  t2 = t3;
 	  goto merge_two;
@@ -21146,15 +21170,24 @@ ix86_expand_vec_perm (rtx operands[])
 
   /* The actual shuffle operations all operate on V16QImode.  */
   op0 = gen_lowpart (V16QImode, op0);
   op1 = gen_lowpart (V16QImode, op1);
-  target = gen_lowpart (V16QImode, target);
 
   if (TARGET_XOP)
     {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
       emit_insn (gen_xop_pperm (target, op0, op1, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
   else if (one_operand_shuffle)
     {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
   else
     {
@@ -21194,7 +21227,9 @@ ix86_expand_vec_perm (rtx operands[])
       mask = expand_simple_binop (maskmode, AND, mask, vt,
				  NULL_RTX, 0, OPTAB_DIRECT);
 
-      xops[0] = gen_lowpart (mode, operands[0]);
+      if (GET_MODE (target) != mode)
+	target = gen_reg_rtx (mode);
+      xops[0] = target;
       xops[1] = gen_lowpart (mode, t2);
       xops[2] = gen_lowpart (mode, t1);
       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -21202,6 +21237,9 @@
       xops[5] = vt;
       ok = ix86_expand_int_vcond (xops);
       gcc_assert (ok);
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
 }
@@ -21280,10 +21318,10 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
       else if (high_p)
 	{
 	  /* Shift higher 8 bytes to lower 8 bytes.  */
-	  tmp = gen_reg_rtx (imode);
-	  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
-					 gen_lowpart (V1TImode, src),
+	  tmp = gen_reg_rtx (V1TImode);
+	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
					 GEN_INT (64)));
+	  tmp = gen_lowpart (imode, tmp);
 	}
       else
 	tmp = src;
@@ -21324,7 +21362,9 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
      tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
				 src, pc_rtx, pc_rtx);
-      emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
+      rtx tmp2 = gen_reg_rtx (imode);
+      emit_insn (unpack (tmp2, src, tmp));
+      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
     }
 }
@@ -31967,8 +32007,8 @@ ix86_expand_args_builtin (const struct builtin_description *d,
 	}
       else
 	{
-	  target = gen_reg_rtx (rmode);
-	  real_target = simplify_gen_subreg (tmode, target, rmode, 0);
+	  real_target = gen_reg_rtx (tmode);
+	  target = simplify_gen_subreg (rmode, real_target, tmode, 0);
 	}
 
   for (i = 0; i < nargs; i++)
@@ -36691,8 +36731,9 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
	emit_move_insn (tmp1, gen_lowpart (SImode, val));
 
	/* Insert the SImode value as low element of a V4SImode vector.  */
-	tmp2 = gen_lowpart (V4SImode, dperm.op0);
+	tmp2 = gen_reg_rtx (V4SImode);
	emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+	emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
 
	ok = (expand_vec_perm_1 (&dperm)
	      || expand_vec_perm_broadcast_1 (&dperm));
@@ -36722,9 +36763,10 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
				   NULL_RTX, 1, OPTAB_LIB_WIDEN);
	val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
				   OPTAB_LIB_WIDEN);
-	x = gen_lowpart (wvmode, target);
+	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
+	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
       }
@@ -37599,8 +37641,9 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
       else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
-	  ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
-				  gen_lowpart (SFmode, val), elt);
+	  rtx t = gen_reg_rtx (V4SFmode);
+	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
+	  emit_move_insn (target, gen_lowpart (mode, t));
	}
       return;
@@ -37918,7 +37961,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
 static void
 emit_reduc_half (rtx dest, rtx src, int i)
 {
-  rtx tem;
+  rtx tem, d = dest;
   switch (GET_MODE (src))
     {
     case V4SFmode:
@@ -37935,8 +37978,8 @@ emit_reduc_half (rtx dest, rtx src, int i)
     case V8HImode:
     case V4SImode:
     case V2DImode:
-      tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
-				gen_lowpart (V1TImode, src),
+      d = gen_reg_rtx (V1TImode);
+      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
       break;
     case V8SFmode:
@@ -37957,19 +38000,26 @@ emit_reduc_half (rtx dest, rtx src, int i)
     case V8SImode:
     case V4DImode:
       if (i == 256)
-	tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
-				 gen_lowpart (V4DImode, src),
-				 gen_lowpart (V4DImode, src),
-				 const1_rtx);
+	{
+	  if (GET_MODE (dest) != V4DImode)
+	    d = gen_reg_rtx (V4DImode);
+	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
+				   gen_lowpart (V4DImode, src),
+				   const1_rtx);
+	}
       else
-	tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
-				  gen_lowpart (V2TImode, src),
-				  GEN_INT (i / 2));
+	{
+	  d = gen_reg_rtx (V2TImode);
+	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
+				    GEN_INT (i / 2));
+	}
       break;
     default:
       gcc_unreachable ();
     }
   emit_insn (tem);
+  if (d != dest)
+    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
 }
 
 /* Expand a vector reduction.  FN is the binary pattern to reduce;
@@ -39462,6 +39512,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
	emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
       else
	emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+      if (target != d->target)
+	emit_move_insn (d->target, gen_lowpart (d->vmode, target));
       return true;
     }
@@ -39471,7 +39523,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
       /* FALLTHRU */
 
    do_subreg:
-      target = gen_lowpart (vmode, target);
+      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;
@@ -39525,7 +39577,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
	  vmode = V32QImode;
	  nelt = 32;
-	  target = gen_lowpart (vmode, target);
+	  target = gen_reg_rtx (vmode);
	  op0 = gen_lowpart (vmode, op0);
	  op1 = gen_lowpart (vmode, op1);
	  goto finish_pblendvb;
@@ -39558,6 +39610,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
   x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
   x = gen_rtx_SET (VOIDmode, target, x);
   emit_insn (x);
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   return true;
 }
@@ -39663,13 +39717,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
	      /* Use vperm2i128 insn.  The pattern uses
		 V4DImode instead of V2TImode.  */
-	      target = gen_lowpart (V4DImode, d->target);
+	      target = d->target;
+	      if (d->vmode != V4DImode)
+		target = gen_reg_rtx (V4DImode);
	      op0 = gen_lowpart (V4DImode, d->op0);
	      op1 = gen_lowpart (V4DImode, d->op1);
	      rperm[0]
		= GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
			   || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+	      if (target != d->target)
+		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	      return true;
	    }
	  return false;
@@ -39704,9 +39762,15 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
		perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
	      if (d->testing_p)
		return true;
-	      return expand_vselect (gen_lowpart (V4DImode, d->target),
-				     gen_lowpart (V4DImode, d->op0),
-				     perm, 4, false);
+	      target = gen_reg_rtx (V4DImode);
+	      if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
+				  perm, 4, false))
+		{
+		  emit_move_insn (d->target,
+				  gen_lowpart (d->vmode, target));
+		  return true;
+		}
+	      return false;
	    }
 
	  /* Next see if vpermd can be used.  */
@@ -39758,7 +39822,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
			       gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
   vperm = force_reg (vmode, vperm);
 
-  target = gen_lowpart (vmode, d->target);
+  target = d->target;
+  if (d->vmode != vmode)
+    target = gen_reg_rtx (vmode);
   op0 = gen_lowpart (vmode, d->op0);
   if (d->one_operand_p)
     {
@@ -39776,6 +39842,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
      op1 = gen_lowpart (vmode, d->op1);
      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
    }
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   return true;
 }
@@ -39975,7 +40043,8 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   unsigned i, nelt = d->nelt;
   unsigned min, max;
   bool in_order, ok;
-  rtx shift;
+  rtx shift, target;
+  struct expand_vec_perm_d dcopy;
 
   /* Even with AVX, palignr only operates on 128-bit vectors.  */
   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
@@ -39998,29 +40067,33 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
+  dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
-				  gen_lowpart (TImode, d->op1),
+  target = gen_reg_rtx (TImode);
+  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
				  gen_lowpart (TImode, d->op0), shift));
 
-  d->op0 = d->op1 = d->target;
-  d->one_operand_p = true;
+  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
+  dcopy.one_operand_p = true;
 
   in_order = true;
   for (i = 0; i < nelt; ++i)
     {
-      unsigned e = d->perm[i] - min;
+      unsigned e = dcopy.perm[i] - min;
      if (e != i)
	in_order = false;
-      d->perm[i] = e;
+      dcopy.perm[i] = e;
     }
 
   /* Test for the degenerate case where the alignment by itself
      produces the desired permutation.  */
   if (in_order)
-    return true;
+    {
+      emit_move_insn (d->target, dcopy.op0);
+      return true;
+    }
 
-  ok = expand_vec_perm_1 (d);
+  ok = expand_vec_perm_1 (&dcopy);
   gcc_assert (ok);
 
   return ok;
@@ -40274,10 +40347,10 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
      else
	dfinal.perm[i] = e;
    }
-  dfinal.op0 = gen_reg_rtx (dfinal.vmode);
+  dremap.target = gen_reg_rtx (dremap.vmode);
+  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
   dfinal.op1 = dfinal.op0;
   dfinal.one_operand_p = true;
-  dremap.target = dfinal.op0;
 
   /* Test if the final remap can be done with a single insn.  For V4SFmode or
      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
@@ -40294,7 +40367,6 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
 
   if (dremap.vmode != dfinal.vmode)
     {
-      dremap.target = gen_lowpart (dremap.vmode, dremap.target);
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }
@@ -40745,8 +40817,12 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
   op = gen_lowpart (V16QImode, d->op1);
   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
 
-  op = gen_lowpart (V16QImode, d->target);
+  op = d->target;
+  if (d->vmode != V16QImode)
+    op = gen_reg_rtx (V16QImode);
   emit_insn (gen_iorv16qi3 (op, l, h));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40812,8 +40888,12 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
   op = gen_lowpart (V32QImode, d->op0);
   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
 
-  op = gen_lowpart (V32QImode, d->target);
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40889,10 +40969,11 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
   emit_insn (gen_iorv32qi3 (ior, l, h));
 
   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
-  op = gen_lowpart (V4DImode, d->target);
+  op = gen_reg_rtx (V4DImode);
   ior = gen_lowpart (V4DImode, ior);
   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
+  emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40903,7 +40984,7 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
 static bool
 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 {
-  rtx t1, t2, t3;
+  rtx t1, t2, t3, t4, t5;
 
   switch (d->vmode)
     {
@@ -41015,10 +41096,17 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
-	  d_copy.target = gen_lowpart (V4DFmode, d->target);
+	  d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
-	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V4DImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
	}
 
       t1 = gen_reg_rtx (V4DImode);
@@ -41041,44 +41129,51 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
-	  d_copy.target = gen_lowpart (V8SFmode, d->target);
+	  d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
-	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V8SImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
	}
 
       t1 = gen_reg_rtx (V8SImode);
       t2 = gen_reg_rtx (V8SImode);
+      t3 = gen_reg_rtx (V4DImode);
+      t4 = gen_reg_rtx (V4DImode);
+      t5 = gen_reg_rtx (V4DImode);
 
       /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
-				    gen_lowpart (V4DImode, d->op0),
+      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
-				    gen_lowpart (V4DImode, d->op0),
+      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));
 
       /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
-      emit_insn (gen_avx2_pshufdv3 (t1, t1,
+      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
-      emit_insn (gen_avx2_pshufdv3 (t2, t2,
+      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
 
       /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
       if (odd)
-	t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
-					   gen_lowpart (V4DImode, t1),
+	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
       else
-	t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
-					  gen_lowpart (V4DImode, t1),
+	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
       emit_insn (t3);
+      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
       break;
 
     default:
@@ -41116,7 +41211,7 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
   enum machine_mode vmode = d->vmode;
   unsigned char perm2[4];
-  rtx op0 = d->op0;
+  rtx op0 = d->op0, dest;
   bool ok;
 
   switch (vmode)
@@ -41162,9 +41257,11 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
       while (vmode != V4SImode);
 
       memset (perm2, elt, 4);
-      ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
-			   d->testing_p);
+      dest = gen_reg_rtx (V4SImode);
+      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
       gcc_assert (ok);
+      if (!d->testing_p)
+	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
       return true;
 
     case V32QImode:
@@ -41306,8 +41403,12 @@ expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
     }
 
   gcc_assert (l[0] && l[1]);
-  op = gen_lowpart (V32QImode, d->target);
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
   return true;
 }
@@ -41875,7 +41976,9 @@ ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
				 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
       gcc_assert (t1 && t2);
 
-      ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+      t3 = gen_reg_rtx (mode);
+      ix86_expand_vec_interleave (t3, t1, t2, high_p);
+      emit_move_insn (dest, gen_lowpart (wmode, t3));
       break;
 
     case V16QImode:
@@ -41896,14 +41999,14 @@ ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
 void
 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
 {
-  rtx res_1, res_2;
+  rtx res_1, res_2, res_3, res_4;
 
   res_1 = gen_reg_rtx (V4SImode);
   res_2 = gen_reg_rtx (V4SImode);
-  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
-				 op1, op2, true, false);
-  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
-				 op1, op2, true, true);
+  res_3 = gen_reg_rtx (V2DImode);
+  res_4 = gen_reg_rtx (V2DImode);
+  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
+  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
 
   /* Move the results in element 2 down to element 1; we don't care
      what goes in elements 2 and 3.  Then we can merge the parts
@@ -41917,9 +42020,11 @@ ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
      In both cases the cost of the reformatting stall was too high
      and the overall sequence slower.  */
 
-  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
+				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
+				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
 
   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
@@ -42138,12 +42243,17 @@ ix86_expand_pinsr (rtx *operands)
	  return false;
	}
 
-      dst = gen_lowpart (dstmode, dst);
+      rtx d = dst;
+      if (GET_MODE (dst) != dstmode)
+	d = gen_reg_rtx (dstmode);
       src = gen_lowpart (srcmode, src);
 
       pos /= size;
 
-      emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
+      emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
+			GEN_INT (1 << pos)));
+      if (d != dst)
+	emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
       return true;
     }
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 584a01116e6..7187035a6f2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -800,10 +800,13 @@
			gen_rtx_SUBREG (SImode, operands[1], 4)));
       emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
					     operands[2]));
-   }
+    }
   else if (memory_operand (operands[1], DImode))
-    emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]),
-				   operands[1], const0_rtx));
+    {
+      rtx tmp = gen_reg_rtx (V2DImode);
+      emit_insn (gen_vec_concatv2di (tmp, operands[1], const0_rtx));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp));
+    }
   else
     gcc_unreachable ();
 })
@@ -4208,7 +4211,7 @@
    (match_operand:V2DF 2 "nonimmediate_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp0, tmp1;
+  rtx tmp0, tmp1, tmp2;
 
   if (TARGET_AVX && !TARGET_PREFER_AVX128)
     {
@@ -4222,13 +4225,14 @@
     {
      tmp0 = gen_reg_rtx (V4SImode);
      tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
 
      emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1]));
      emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2]));
-      emit_insn
-       (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-				    gen_lowpart (V2DImode, tmp0),
-				    gen_lowpart (V2DImode, tmp1)));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
    }
   DONE;
 })
@@ -4289,7 +4293,7 @@
    (match_operand:V2DF 2 "nonimmediate_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp0, tmp1;
+  rtx tmp0, tmp1, tmp2;
 
   if (TARGET_AVX && !TARGET_PREFER_AVX128)
     {
@@ -4303,13 +4307,14 @@
     {
      tmp0 = gen_reg_rtx (V4SImode);
      tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
 
      emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1]));
      emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2]));
-      emit_insn
-       (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-				    gen_lowpart (V2DImode, tmp0),
-				    gen_lowpart (V2DImode, tmp1)));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
    }
   DONE;
 })
@@ -7328,14 +7333,16 @@
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "vec_shl_<mode>"
-  [(set (match_operand:VI_128 0 "register_operand")
+  [(set (match_dup 3)
	(ashift:V1TI
	 (match_operand:VI_128 1 "register_operand")
-	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))]
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
   "TARGET_SSE2"
 {
-  operands[0] = gen_lowpart (V1TImode, operands[0]);
   operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
 })
 
 (define_insn "<sse2_avx2>_ashl<mode>3"
@@ -7365,14 +7372,16 @@
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "vec_shr_<mode>"
-  [(set (match_operand:VI_128 0 "register_operand")
+  [(set (match_dup 3)
	(lshiftrt:V1TI
	 (match_operand:VI_128 1 "register_operand")
-	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))]
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
   "TARGET_SSE2"
 {
-  operands[0] = gen_lowpart (V1TImode, operands[0]);
   operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
 })
 
 (define_insn "<sse2_avx2>_lshr<mode>3"
@@ -8542,12 +8551,13 @@
 {
   rtx t1 = gen_reg_rtx (<MODE>mode);
   rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
   emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
   emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2]));
-  emit_insn (gen_avx2_permv2ti
-	     (gen_lowpart (V4DImode, operands[0]),
-	      gen_lowpart (V4DImode, t1),
-	      gen_lowpart (V4DImode, t2), GEN_INT (1 + (3 << 4))));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (1 + (3 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
   DONE;
 })
@@ -8559,12 +8569,13 @@
 {
   rtx t1 = gen_reg_rtx (<MODE>mode);
   rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
   emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
   emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2]));
-  emit_insn (gen_avx2_permv2ti
-	     (gen_lowpart (V4DImode, operands[0]),
-	      gen_lowpart (V4DImode, t1),
-	      gen_lowpart (V4DImode, t2), GEN_INT (0 + (2 << 4))));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (0 + (2 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
   DONE;
 })
diff --git a/gcc/expmed.c b/gcc/expmed.c
index 92c293879d5..59f81df5adf 100644
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -624,13 +624,28 @@ store_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
	  || (bitsize % BITS_PER_WORD == 0 && bitnum % BITS_PER_WORD == 0)))
     {
       /* Use the subreg machinery either to narrow OP0 to the required
-	 words or to cope with mode punning between equal-sized modes.  */
-      rtx sub = simplify_gen_subreg (fieldmode, op0, GET_MODE (op0),
-				     bitnum / BITS_PER_UNIT);
-      if (sub)
+	 words or to cope with mode punning between equal-sized modes.
+	 In the latter case, use subreg on the rhs side, not lhs.  */
+      rtx sub;
+
+      if (bitsize == GET_MODE_BITSIZE (GET_MODE (op0)))
	{
-	  emit_move_insn (sub, value);
-	  return true;
+	  sub = simplify_gen_subreg (GET_MODE (op0), value, fieldmode, 0);
+	  if (sub)
+	    {
+	      emit_move_insn (op0, sub);
+	      return true;
+	    }
+	}
+      else
+	{
+	  sub = simplify_gen_subreg (fieldmode, op0, GET_MODE (op0),
+				     bitnum / BITS_PER_UNIT);
+	  if (sub)
+	    {
+	      emit_move_insn (sub, value);
+	      return true;
+	    }
	}
     }
diff --git a/gcc/optabs.c b/gcc/optabs.c
index a8a7e4fb792..3755670af84 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -6624,8 +6624,8 @@ expand_vec_perm (enum machine_mode mode, rtx v0, rtx v1, rtx sel, rtx target)
       icode = direct_optab_handler (vec_perm_const_optab, qimode);
       if (icode != CODE_FOR_nothing)
	{
-	  tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
-				   gen_lowpart (qimode, v0),
+	  tmp = mode != qimode ? gen_reg_rtx (qimode) : target;
+	  tmp = expand_vec_perm_1 (icode, tmp, gen_lowpart (qimode, v0),
				   gen_lowpart (qimode, v1), sel_qi);
	  if (tmp)
	    return gen_lowpart (mode, tmp);
@@ -6674,7 +6674,7 @@ expand_vec_perm (enum machine_mode mode, rtx v0, rtx v1, rtx sel, rtx target)
	}
      tmp = gen_rtx_CONST_VECTOR (qimode, vec);
      sel = gen_lowpart (qimode, sel);
-      sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
+      sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);
      gcc_assert (sel != NULL);
 
      /* Add the byte offset to each byte element.  */
@@ -6689,8 +6689,8 @@ expand_vec_perm (enum machine_mode mode, rtx v0, rtx v1, rtx sel, rtx target)
      gcc_assert (sel_qi != NULL);
    }
 
-  tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
-			   gen_lowpart (qimode, v0),
+  tmp = mode != qimode ? gen_reg_rtx (qimode) : target;
+  tmp = expand_vec_perm_1 (icode, tmp, gen_lowpart (qimode, v0),
			   gen_lowpart (qimode, v1), sel_qi);
   if (tmp)
     tmp = gen_lowpart (mode, tmp);
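The expmed.c hunk states the rule the rest of the patch follows case by case: when a bitfield store is really a full-width mode pun, the subreg belongs on the right-hand side. A hedged sketch of the two shapes, using an assumed concrete example (a V2DImode pseudo OP0 written with a full 128-bit V4SFmode VALUE, i.e. bitsize == 128 and bitnum == 0):

    /* Assumed example: OP0 is a V2DImode pseudo, VALUE a V4SFmode
       register covering all of it.  store_bit_field_1 now puns VALUE,
       the RHS...  */
    sub = simplify_gen_subreg (GET_MODE (op0), value, fieldmode, 0);
    emit_move_insn (op0, sub);   /* (set (reg:V2DI) (subreg:V2DI (reg:V4SF) 0)) */

    /* ...rather than punning OP0, the LHS, as before:
         sub = simplify_gen_subreg (fieldmode, op0, GET_MODE (op0), 0);
         emit_move_insn (sub, value);
       which put (subreg:V4SF (reg:V2DI) 0) in SET_DEST.  */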