author     Richard Henderson <rth@redhat.com>   2009-11-30 10:26:55 -0800
committer  Richard Henderson <rth@gcc.gnu.org>  2009-11-30 10:26:55 -0800
commit     5e04b3b69417b07f66ed919b64e47055d3a2f9db
tree       b588611807250a4b1eed00a81e79ec2d2412b8d5 /gcc/config
parent     9fda11a2ecf9047b64bbe2e92343f1e263ce1509
Implement vec_perm broadcast, and tidy lots of patterns to help.
From-SVN: r154836
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/i386/i386-protos.h |   1
-rw-r--r--  gcc/config/i386/i386.c        | 346
-rw-r--r--  gcc/config/i386/predicates.md |  17
-rw-r--r--  gcc/config/i386/sse.md        | 506
4 files changed, 576 insertions, 294 deletions
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 1e94cde200d..1451e799fa6 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -86,6 +86,7 @@ extern void ix86_expand_binary_operator (enum rtx_code,
enum machine_mode, rtx[]);
extern int ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
extern bool ix86_lea_for_add_ok (enum rtx_code, rtx, rtx[]);
+extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high);
extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index fa84e32a997..21be89fed83 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -13849,6 +13849,19 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
return TRUE;
}
+/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
+ are ok, keeping in mind the possible movddup alternative. */
+
+bool
+ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
+{
+ if (MEM_P (operands[0]))
+ return rtx_equal_p (operands[0], operands[1 + high]);
+ if (MEM_P (operands[1]) && MEM_P (operands[2]))
+ return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
+ return true;
+}
+
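
For illustration, the shapes this predicate accepts map onto the insn alternatives added later in sse.md: a memory destination is only valid when it matches the source half that stays in place (the movhpd/movlpd store forms), and two memory sources are only valid when they are the same location and movddup is available. A standalone C sketch of that logic, with integer ids standing in for rtx_equal_p and a plain flag for TARGET_SSE3 (illustrative names, not GCC code):

#include <assert.h>
#include <stdbool.h>

/* Standalone sketch of the check above: operands reduced to
   (is_mem, id) pairs, with id equality standing in for rtx_equal_p
   and a flag for TARGET_SSE3.  Illustrative names, not GCC code.  */
struct op { bool mem; int id; };
static bool target_sse3 = true;

static bool
interleave_ok (struct op o[3], bool high)
{
  if (o[0].mem)                       /* store form: the half that stays
                                         must already be the dest */
    return o[0].id == o[1 + high].id;
  if (o[1].mem && o[2].mem)           /* two loads: only movddup works */
    return target_sse3 && o[1].id == o[2].id;
  return true;                        /* at most one memory source */
}

int main (void)
{
  struct op reg = { false, 0 }, mem1 = { true, 1 }, mem2 = { true, 2 };
  struct op dup[3] = { reg, mem1, mem1 };   /* same mem twice: ok */
  struct op bad[3] = { reg, mem1, mem2 };   /* different mems: not ok */

  assert (interleave_ok (dup, true));
  assert (!interleave_ok (bad, true));
  return 0;
}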
/* Post-reload splitter for converting an SF or DFmode value in an
SSE register into an unsigned SImode. */
@@ -21480,11 +21493,11 @@ static const struct builtin_description bdesc_special_args[] =
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastsd256, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss256, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_pd256, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_ps256, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
@@ -24597,7 +24610,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
if (!CONST_INT_P (er))
return 0;
ei = INTVAL (er);
- if (ei >= nelt)
+ if (ei >= 2 * nelt)
return 0;
ipar[i] = ei;
}
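
The relaxed bound follows the usual two-operand vec_perm convention: indices 0..nelt-1 select from the first operand and nelt..2*nelt-1 from the second, and since vpermilp is only used here with identical operands, both ranges are acceptable. A minimal sketch of that convention (map_index is a hypothetical helper for illustration, not code from this patch):

#include <assert.h>

/* Hypothetical illustration of the two-operand vec_perm index
   convention: index EI selects lane EI % NELT of operand EI / NELT.
   With identical operands, any EI < 2 * NELT is meaningful.  */
static void
map_index (int ei, int nelt, int *op, int *lane)
{
  *op = ei / nelt;      /* 0 = first operand, 1 = second */
  *lane = ei % nelt;    /* lane within that operand */
}

int main (void)
{
  int op, lane;
  map_index (5, 4, &op, &lane);     /* index 5 in a V4SF permute */
  assert (op == 1 && lane == 1);    /* second operand, lane 1 */
  return 0;
}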
@@ -25713,6 +25726,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
*total = 0;
return false;
+ case VEC_SELECT:
+ case VEC_CONCAT:
+ case VEC_MERGE:
+ case VEC_DUPLICATE:
+ /* ??? Assume all of these vector manipulation patterns are
+ recognizable. In which case they all pretty much have the
+ same cost. */
+ *total = COSTS_N_INSNS (1);
+ return true;
+
default:
return false;
}
@@ -26547,16 +26570,43 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}
+/* AVX does not support 32-byte integer vector operations,
+ thus the longest vector we are faced with is V16QImode. */
+#define MAX_VECT_LEN 16
+
+struct expand_vec_perm_d
+{
+ rtx target, op0, op1;
+ unsigned char perm[MAX_VECT_LEN];
+ enum machine_mode vmode;
+ unsigned char nelt;
+ bool testing_p;
+};
+
+static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+
+/* Get a vector mode of the same size as the original but with elements
+ twice as wide. This is only guaranteed to apply to integral vectors. */
+
+static inline enum machine_mode
+get_mode_wider_vector (enum machine_mode o)
+{
+ /* ??? Rely on the ordering that genmodes.c gives to vectors. */
+ enum machine_mode n = GET_MODE_WIDER_MODE (o);
+ gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
+ gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
+ return n;
+}
+
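
The genmodes.c ordering this relies on places same-size integer vector modes consecutively by decreasing element count, e.g. V16QI, V8HI, V4SI, V2DI. A standalone model of the assumption being asserted (wider_vector and the mode table are illustrative, not GCC code):

#include <assert.h>

/* Standalone model of the assumption asserted above: same-size vector
   modes are ordered by decreasing element count, so the "wider" mode
   has half as many elements of twice the width.  Illustrative only.  */
enum vmode { V16QI, V8HI, V4SI, V2DI };
static const int nunits[] = { 16, 8, 4, 2 };

static enum vmode
wider_vector (enum vmode o)
{
  enum vmode n = (enum vmode) (o + 1);  /* next mode in the ordering */
  assert (nunits[o] == nunits[n] * 2);  /* half the elements... */
  return n;                             /* ...in the same 16 bytes */
}

int main (void)
{
  assert (wider_vector (V16QI) == V8HI);
  assert (wider_vector (V8HI) == V4SI);
  return 0;
}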
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
with all elements equal to VAR. Return true if successful. */
-/* ??? Call into the vec_perm support to implement the broadcast. */
static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
rtx target, rtx val)
{
- enum machine_mode hmode, smode, wsmode, wvmode;
- rtx x;
+ bool ok;
switch (mode)
{
@@ -26566,13 +26616,28 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
return false;
/* FALLTHRU */
+ case V4DFmode:
+ case V4DImode:
+ case V8SFmode:
+ case V8SImode:
case V2DFmode:
case V2DImode:
case V4SFmode:
case V4SImode:
- val = force_reg (GET_MODE_INNER (mode), val);
- x = gen_rtx_VEC_DUPLICATE (mode, val);
- emit_insn (gen_rtx_SET (VOIDmode, target, x));
+ {
+ rtx insn, dup;
+
+ /* First attempt to recognize VAL as-is. */
+ dup = gen_rtx_VEC_DUPLICATE (mode, val);
+ insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
+ if (recog_memoized (insn) < 0)
+ {
+ /* If that fails, force VAL into a register. */
+ XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
+ ok = recog_memoized (insn) >= 0;
+ gcc_assert (ok);
+ }
+ }
return true;
case V4HImode:
@@ -26580,130 +26645,87 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
return false;
if (TARGET_SSE || TARGET_3DNOW_A)
{
+ rtx x;
+
val = gen_lowpart (SImode, val);
x = gen_rtx_TRUNCATE (HImode, val);
x = gen_rtx_VEC_DUPLICATE (mode, x);
emit_insn (gen_rtx_SET (VOIDmode, target, x));
return true;
}
- else
- {
- smode = HImode;
- wsmode = SImode;
- wvmode = V2SImode;
- goto widen;
- }
+ goto widen;
case V8QImode:
if (!mmx_ok)
return false;
- smode = QImode;
- wsmode = HImode;
- wvmode = V4HImode;
goto widen;
+
case V8HImode:
if (TARGET_SSE2)
{
+ struct expand_vec_perm_d dperm;
rtx tmp1, tmp2;
- /* Extend HImode to SImode using a paradoxical SUBREG. */
+
+ permute:
+ memset (&dperm, 0, sizeof (dperm));
+ dperm.target = target;
+ dperm.vmode = mode;
+ dperm.nelt = GET_MODE_NUNITS (mode);
+ dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
+
+ /* Extend to SImode using a paradoxical SUBREG. */
tmp1 = gen_reg_rtx (SImode);
emit_move_insn (tmp1, gen_lowpart (SImode, val));
- /* Insert the SImode value as low element of V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- tmp1 = gen_rtx_VEC_MERGE (V4SImode,
- gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
- CONST0_RTX (V4SImode),
- const1_rtx);
- emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
- /* Cast the V4SImode vector back to a V8HImode vector. */
- tmp1 = gen_reg_rtx (V8HImode);
- emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
- /* Duplicate the low short through the whole low SImode word. */
- emit_insn (gen_vec_interleave_lowv8hi (tmp1, tmp1, tmp1));
- /* Cast the V8HImode vector back to a V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
- /* Replicate the low element of the V4SImode vector. */
- emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
- /* Cast the V2SImode back to V8HImode, and store in target. */
- emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
- return true;
+
+ /* Insert the SImode value as low element of a V4SImode vector. */
+ tmp2 = gen_lowpart (V4SImode, dperm.op0);
+ emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+
+ ok = (expand_vec_perm_1 (&dperm)
+ || expand_vec_perm_broadcast_1 (&dperm));
+ gcc_assert (ok);
+ return ok;
}
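
Since dperm.perm[] is zero-filled, the requested permutation is a broadcast of element 0, which is exactly the lane vec_setv4si_0 filled. A standalone sketch of the net effect for V8HImode (not GCC code; the high half of the inserted SImode lane is garbage in reality but never selected):

#include <assert.h>
#include <stdint.h>

/* Standalone sketch of the V8HImode path above: the scalar is placed
   in 32-bit lane 0, then the all-zero permutation broadcasts 16-bit
   lane 0.  Not GCC code.  */
int main (void)
{
  uint16_t val = 0x1234;
  uint16_t v[8] = { 0 };
  uint16_t r[8];
  int i;

  v[0] = val;                 /* vec_setv4si_0: low half of SI lane 0 */
  for (i = 0; i < 8; ++i)     /* dperm.perm[] == {0,...,0} */
    r[i] = v[0];

  for (i = 0; i < 8; ++i)
    assert (r[i] == val);
  return 0;
}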
- smode = HImode;
- wsmode = SImode;
- wvmode = V4SImode;
goto widen;
+
case V16QImode:
if (TARGET_SSE2)
- {
- rtx tmp1, tmp2;
- /* Extend QImode to SImode using a paradoxical SUBREG. */
- tmp1 = gen_reg_rtx (SImode);
- emit_move_insn (tmp1, gen_lowpart (SImode, val));
- /* Insert the SImode value as low element of V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- tmp1 = gen_rtx_VEC_MERGE (V4SImode,
- gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
- CONST0_RTX (V4SImode),
- const1_rtx);
- emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
- /* Cast the V4SImode vector back to a V16QImode vector. */
- tmp1 = gen_reg_rtx (V16QImode);
- emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
- /* Duplicate the low byte through the whole low SImode word. */
- emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
- emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
- /* Cast the V16QImode vector back to a V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
- /* Replicate the low element of the V4SImode vector. */
- emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
- /* Cast the V2SImode back to V16QImode, and store in target. */
- emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
- return true;
- }
- smode = QImode;
- wsmode = HImode;
- wvmode = V8HImode;
+ goto permute;
goto widen;
+
widen:
/* Replicate the value once into the next wider mode and recurse. */
- val = convert_modes (wsmode, smode, val, true);
- x = expand_simple_binop (wsmode, ASHIFT, val,
- GEN_INT (GET_MODE_BITSIZE (smode)),
- NULL_RTX, 1, OPTAB_LIB_WIDEN);
- val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
-
- x = gen_reg_rtx (wvmode);
- if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
- gcc_unreachable ();
- emit_move_insn (target, gen_lowpart (mode, x));
- return true;
+ {
+ enum machine_mode smode, wsmode, wvmode;
+ rtx x;
+
+ smode = GET_MODE_INNER (mode);
+ wvmode = get_mode_wider_vector (mode);
+ wsmode = GET_MODE_INNER (wvmode);
+
+ val = convert_modes (wsmode, smode, val, true);
+ x = expand_simple_binop (wsmode, ASHIFT, val,
+ GEN_INT (GET_MODE_BITSIZE (smode)),
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
+
+ x = gen_lowpart (wvmode, target);
+ ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
+ gcc_assert (ok);
+ return ok;
+ }
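
The widen step replicates the scalar into a scalar twice as wide before recursing, so a QImode splat becomes an HImode splat and so on. A worked standalone example of the ASHIFT/IOR replication (widen_dup is an illustrative name, not GCC code):

#include <assert.h>
#include <stdint.h>

/* Worked example of the widen step above: replicate an N-bit scalar
   into a 2N-bit scalar via shift-and-or, mirroring the ASHIFT/IOR
   sequence emitted by expand_simple_binop.  */
static uint32_t
widen_dup (uint32_t val, int bits)
{
  uint32_t mask = ((uint32_t) 1 << bits) - 1;
  return ((val & mask) << bits) | (val & mask);
}

int main (void)
{
  assert (widen_dup (0xAB, 8) == 0xABAB);         /* QImode -> HImode */
  assert (widen_dup (0xABAB, 16) == 0xABABABAB);  /* HImode -> SImode */
  return 0;
}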
- case V4DFmode:
- hmode = V2DFmode;
- goto half;
- case V4DImode:
- hmode = V2DImode;
- goto half;
- case V8SFmode:
- hmode = V4SFmode;
- goto half;
- case V8SImode:
- hmode = V4SImode;
- goto half;
case V16HImode:
- hmode = V8HImode;
- goto half;
case V32QImode:
- hmode = V16QImode;
- goto half;
-half:
{
- rtx tmp = gen_reg_rtx (hmode);
- ix86_expand_vector_init_duplicate (mmx_ok, hmode, tmp, val);
- emit_insn (gen_rtx_SET (VOIDmode, target,
- gen_rtx_VEC_CONCAT (mode, tmp, tmp)));
+ enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
+ rtx x = gen_reg_rtx (hvmode);
+
+ ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
+ gcc_assert (ok);
+
+ x = gen_rtx_VEC_CONCAT (mode, x, x);
+ emit_insn (gen_rtx_SET (VOIDmode, target, x));
}
return true;
@@ -29085,19 +29107,6 @@ ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
return ix86_builtins[(int) fcode];
}
-/* AVX does not support 32-byte integer vector operations,
- thus the longest vector we are faced with is V16QImode. */
-#define MAX_VECT_LEN 16
-
-struct expand_vec_perm_d
-{
- rtx target, op0, op1;
- unsigned char perm[MAX_VECT_LEN];
- enum machine_mode vmode;
- unsigned char nelt;
- bool testing_p;
-};
-
/* Return a vector mode with twice as many elements as VMODE. */
/* ??? Consider moving this to a table generated by genmodes.c. */
@@ -29739,8 +29748,8 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
- extract-even and extract-odd permutations. */
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
+ and extract-odd permutations. */
static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
@@ -29855,6 +29864,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
return true;
}
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+ extract-even and extract-odd permutations. */
+
static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
@@ -29871,6 +29883,84 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
return expand_vec_perm_even_odd_1 (d, odd);
}
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
+ permutations. We assume that expand_vec_perm_1 has already failed. */
+
+static bool
+expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
+{
+ unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
+ enum machine_mode vmode = d->vmode;
+ unsigned char perm2[4];
+ rtx op0 = d->op0;
+ bool ok;
+
+ switch (vmode)
+ {
+ case V4DFmode:
+ case V8SFmode:
+ /* These are special-cased in sse.md so that we can optionally
+ use the vbroadcast instruction. They expand to two insns
+ if the input happens to be in a register. */
+ gcc_unreachable ();
+
+ case V2DFmode:
+ case V2DImode:
+ case V4SFmode:
+ case V4SImode:
+ /* These are always implementable using standard shuffle patterns. */
+ gcc_unreachable ();
+
+ case V8HImode:
+ case V16QImode:
+ /* These can be implemented via interleave. We save one insn by
+ stopping once we have promoted to V4SImode and then use pshufd. */
+ do
+ {
+ optab otab = vec_interleave_low_optab;
+
+ if (elt >= nelt2)
+ {
+ otab = vec_interleave_high_optab;
+ elt -= nelt2;
+ }
+ nelt2 /= 2;
+
+ op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
+ vmode = get_mode_wider_vector (vmode);
+ op0 = gen_lowpart (vmode, op0);
+ }
+ while (vmode != V4SImode);
+
+ memset (perm2, elt, 4);
+ ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
+ gcc_assert (ok);
+ return true;
+
+ default:
+ gcc_unreachable ();
+ }
+}
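
To see why the loop terminates with elt indexing a V4SImode lane, it helps to trace the interleave steps: each iteration keeps the half containing the wanted element, doubles the element width, and halves the element count. A standalone simulation of the V16QImode case (illustrative code, not GCC internals):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Standalone simulation of the V16QImode case above: interleave the
   half holding the wanted byte with itself until lanes are 32 bits
   wide, then splat that lane pshufd-style.  Illustrative only.  */
static void
interleave_self (uint8_t *v, int size, int high)
{
  uint8_t r[16];
  const uint8_t *src = v + (high ? 8 : 0);
  int i, half = 16 / (2 * size);

  for (i = 0; i < half; ++i)   /* punpckl/punpckh with both operands
                                  equal duplicates each element */
    {
      memcpy (r + (2 * i) * size, src + i * size, size);
      memcpy (r + (2 * i + 1) * size, src + i * size, size);
    }
  memcpy (v, r, 16);
}

int main (void)
{
  int want, i;
  for (want = 0; want < 16; ++want)
    {
      uint8_t v[16], r[16];
      int elt = want, nelt2 = 8, size = 1;

      for (i = 0; i < 16; ++i)
        v[i] = i;

      while (size < 4)         /* stop once lanes are SImode wide */
        {
          int high = elt >= nelt2;
          if (high)
            elt -= nelt2;
          nelt2 /= 2;
          interleave_self (v, size, high);
          size *= 2;
        }

      for (i = 0; i < 4; ++i)  /* pshufd: splat 32-bit lane ELT */
        memcpy (r + 4 * i, v + 4 * elt, 4);
      for (i = 0; i < 16; ++i)
        assert (r[i] == want);
    }
  return 0;
}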
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+ broadcast permutations. */
+
+static bool
+expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+{
+ unsigned i, elt, nelt = d->nelt;
+
+ if (d->op0 != d->op1)
+ return false;
+
+ elt = d->perm[0];
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != elt)
+ return false;
+
+ return expand_vec_perm_broadcast_1 (d);
+}
+
/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
With all of the interface bits taken care of, perform the expansion
in D and return true on success. */
@@ -29878,8 +29968,7 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
static bool
ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
{
- /* First things first -- check if the instruction is implementable
- with a single instruction. */
+ /* Try a single instruction expansion. */
if (expand_vec_perm_1 (d))
return true;
@@ -29894,13 +29983,16 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_interleave2 (d))
return true;
+ if (expand_vec_perm_broadcast (d))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_pshufb2 (d))
return true;
/* ??? Look for narrow permutations whose element orderings would
- allow the promition to a wider mode. */
+ allow the promotion to a wider mode. */
/* ??? Look for sequences of interleave or a wider permute that place
the data into the correct lanes for a half-vector shuffle like
@@ -29912,8 +30004,6 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_even_odd (d))
return true;
- /* ??? Pattern match broadcast. */
-
return false;
}
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 50a68d97ff8..8f901cd8754 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1241,3 +1241,20 @@
(define_predicate "avx_vperm2f128_v4df_operand"
(and (match_code "parallel")
(match_test "avx_vperm2f128_parallel (op, V4DFmode)")))
+
+;; Return 1 if OP is a parallel for a vbroadcast permute.
+
+(define_predicate "avx_vbroadcast_operand"
+ (and (match_code "parallel")
+ (match_code "const_int" "a"))
+{
+ rtx elt = XVECEXP (op, 0, 0);
+ int i, nelt = XVECLEN (op, 0);
+
+ /* Don't bother checking there are the right number of operands,
+ merely that they're all identical. */
+ for (i = 1; i < nelt; ++i)
+ if (XVECEXP (op, 0, i) != elt)
+ return false;
+ return true;
+})
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b73820bc1d0..08a3b5b5c89 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -54,6 +54,7 @@
(define_mode_iterator AVX256MODEF2P [V8SF V4DF])
(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
+(define_mode_iterator AVX256MODE24P [V8SI V8SF V4DI V4DF])
(define_mode_iterator AVX256MODE4P [V4DI V4DF])
(define_mode_iterator AVX256MODE8P [V8SI V8SF])
(define_mode_iterator AVXMODEF2P [V4SF V2DF V8SF V4DF])
@@ -96,6 +97,8 @@
(define_mode_attr ssemodesuffixf2c [(V4SF "s") (V2DF "d")])
+(define_mode_attr ssescalarmodesuffix2s [(V4SF "ss") (V4SI "d")])
+
;; Mapping of the max integer size for xop rotate immediate constraint
(define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
@@ -125,17 +128,18 @@
[(V16QI "V4SF") (V8HI "V4SF") (V4SI "V4SF") (V2DI "V4SF")
(V32QI "V8SF") (V16HI "V8SF") (V8SI "V8SF") (V4DI "V8SF")])
(define_mode_attr avxhalfvecmode
- [(V4SF "V2SF") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI")
- (V4DI "V2DI") (V8SF "V4SF") (V4DF "V2DF")])
+ [(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
+ (V8SF "V4SF") (V4DF "V2DF")
+ (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V4SF "V2SF")])
(define_mode_attr avxscalarmode
- [(V16QI "QI") (V8HI "HI") (V4SI "SI") (V4SF "SF") (V2DF "DF")
- (V8SF "SF") (V4DF "DF")])
+ [(V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI") (V4SF "SF") (V2DF "DF")
+ (V32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI") (V8SF "SF") (V4DF "DF")])
(define_mode_attr avxcvtvecmode
[(V4SF "V4SI") (V8SF "V8SI") (V4SI "V4SF") (V8SI "V8SF")])
(define_mode_attr avxpermvecmode
[(V2DF "V2DI") (V4SF "V4SI") (V4DF "V4DI") (V8SF "V8SI")])
(define_mode_attr avxmodesuffixf2c
- [(V4SF "s") (V2DF "d") (V8SF "s") (V4DF "d")])
+ [(V4SF "s") (V2DF "d") (V8SI "s") (V8SF "s") (V4DI "d") (V4DF "d")])
(define_mode_attr avxmodesuffixp
[(V2DF "pd") (V4SI "si") (V4SF "ps") (V8SF "ps") (V8SI "si")
(V4DF "pd")])
@@ -4012,14 +4016,27 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "SF")])
+(define_expand "vec_dupv4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "")
+ (vec_duplicate:V4SF
+ (match_operand:SF 1 "nonimmediate_operand" "")))]
+ "TARGET_SSE"
+{
+ if (!TARGET_AVX)
+ operands[1] = force_reg (V4SFmode, operands[1]);
+})
+
(define_insn "*vec_dupv4sf_avx"
- [(set (match_operand:V4SF 0 "register_operand" "=x")
+ [(set (match_operand:V4SF 0 "register_operand" "=x,x")
(vec_duplicate:V4SF
- (match_operand:SF 1 "register_operand" "x")))]
+ (match_operand:SF 1 "nonimmediate_operand" "x,m")))]
"TARGET_AVX"
- "vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}"
- [(set_attr "type" "sselog1")
- (set_attr "length_immediate" "1")
+ "@
+ vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}
+ vbroadcastss\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sselog1,ssemov")
+ (set_attr "length_immediate" "1,0")
+ (set_attr "prefix_extra" "0,1")
(set_attr "prefix" "vex")
(set_attr "mode" "V4SF")])
@@ -4125,35 +4142,78 @@
DONE;
})
-(define_insn "*vec_setv4sf_0_avx"
- [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,x,m")
- (vec_merge:V4SF
- (vec_duplicate:V4SF
- (match_operand:SF 2 "general_operand" " x,m,*r,x*rfF"))
- (match_operand:V4SF 1 "vector_move_operand" " x,C,C ,0")
+(define_insn "*vec_set<mode>_0_avx"
+ [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
+ (vec_merge:SSEMODE4S
+ (vec_duplicate:SSEMODE4S
+ (match_operand:<ssescalarmode> 2
+ "general_operand" " x,m,*r,x,*rm,x*rfF"))
+ (match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,x, x,0")
(const_int 1)))]
"TARGET_AVX"
"@
- vmovss\t{%2, %1, %0|%0, %1, %2}
- vmovss\t{%2, %0|%0, %2}
+ vinsertps\t{$0xe, %2, %2, %0|%0, %2, %2, 0xe}
+ vmov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
vmovd\t{%2, %0|%0, %2}
+ vmovss\t{%2, %1, %0|%0, %1, %2}
+ vpinsrd\t{$0, %2, %1, %0|%0, %1, %2, 0}
+ #"
+ [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*")
+ (set_attr "prefix_extra" "*,*,*,*,1,*")
+ (set_attr "length_immediate" "*,*,*,*,1,*")
+ (set_attr "prefix" "vex")
+ (set_attr "mode" "SF,<ssescalarmode>,SI,SF,TI,*")])
+
+(define_insn "*vec_set<mode>_0_sse4_1"
+ [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
+ (vec_merge:SSEMODE4S
+ (vec_duplicate:SSEMODE4S
+ (match_operand:<ssescalarmode> 2
+ "general_operand" " x,m,*r,x,*rm,*rfF"))
+ (match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,0, 0,0")
+ (const_int 1)))]
+ "TARGET_SSE4_1"
+ "@
+ insertps\t{$0xe, %2, %0|%0, %2, 0xe}
+ mov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
+ movd\t{%2, %0|%0, %2}
+ movss\t{%2, %0|%0, %2}
+ pinsrd\t{$0, %2, %0|%0, %2, 0}
+ #"
+ [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*")
+ (set_attr "prefix_extra" "*,*,*,*,1,*")
+ (set_attr "length_immediate" "*,*,*,*,1,*")
+ (set_attr "mode" "SF,<ssescalarmode>,SI,SF,TI,*")])
+
+(define_insn "*vec_set<mode>_0_sse2"
+ [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x, x,x,m")
+ (vec_merge:SSEMODE4S
+ (vec_duplicate:SSEMODE4S
+ (match_operand:<ssescalarmode> 2
+ "general_operand" " m,*r,x,x*rfF"))
+ (match_operand:SSEMODE4S 1 "vector_move_operand" " C, C,0,0")
+ (const_int 1)))]
+ "TARGET_SSE2"
+ "@
+ mov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
+ movd\t{%2, %0|%0, %2}
+ movss\t{%2, %0|%0, %2}
#"
[(set_attr "type" "ssemov")
- (set_attr "prefix" "vex")
- (set_attr "mode" "SF")])
-
-(define_insn "vec_setv4sf_0"
- [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,Y2,m")
- (vec_merge:V4SF
- (vec_duplicate:V4SF
- (match_operand:SF 2 "general_operand" " x,m,*r,x*rfF"))
- (match_operand:V4SF 1 "vector_move_operand" " 0,C,C ,0")
+ (set_attr "mode" "<ssescalarmode>,SI,SF,*")])
+
+(define_insn "vec_set<mode>_0"
+ [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x,m")
+ (vec_merge:SSEMODE4S
+ (vec_duplicate:SSEMODE4S
+ (match_operand:<ssescalarmode> 2
+ "general_operand" " m,x,x*rfF"))
+ (match_operand:SSEMODE4S 1 "vector_move_operand" " C,0,0")
(const_int 1)))]
"TARGET_SSE"
"@
movss\t{%2, %0|%0, %2}
movss\t{%2, %0|%0, %2}
- movd\t{%2, %0|%0, %2}
#"
[(set_attr "type" "ssemov")
(set_attr "mode" "SF")])
@@ -4484,7 +4544,7 @@
(set_attr "mode" "V4DF")])
(define_expand "vec_interleave_highv2df"
- [(set (match_operand:V2DF 0 "nonimmediate_operand" "")
+ [(set (match_operand:V2DF 0 "register_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "")
@@ -4492,24 +4552,46 @@
(parallel [(const_int 1)
(const_int 3)])))]
"TARGET_SSE2"
- "ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
+{
+ if (!ix86_vec_interleave_v2df_operator_ok (operands, 1))
+ operands[2] = force_reg (V2DFmode, operands[2]);
+})
(define_insn "*avx_interleave_highv2df"
- [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
+ [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m")
(vec_select:V2DF
(vec_concat:V4DF
- (match_operand:V2DF 1 "nonimmediate_operand" " x,o,x")
- (match_operand:V2DF 2 "nonimmediate_operand" " x,x,0"))
+ (match_operand:V2DF 1 "nonimmediate_operand" " x,o,o,x")
+ (match_operand:V2DF 2 "nonimmediate_operand" " x,1,x,0"))
(parallel [(const_int 1)
(const_int 3)])))]
- "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ "TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
"@
vunpckhpd\t{%2, %1, %0|%0, %1, %2}
+ vmovddup\t{%H1, %0|%0, %H1}
vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
vmovhpd\t{%1, %0|%0, %1}"
- [(set_attr "type" "sselog,ssemov,ssemov")
+ [(set_attr "type" "sselog,sselog,ssemov,ssemov")
(set_attr "prefix" "vex")
- (set_attr "mode" "V2DF,V1DF,V1DF")])
+ (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
+
+(define_insn "*sse3_interleave_highv2df"
+ [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m")
+ (vec_select:V2DF
+ (vec_concat:V4DF
+ (match_operand:V2DF 1 "nonimmediate_operand" " 0,o,o,x")
+ (match_operand:V2DF 2 "nonimmediate_operand" " x,1,0,0"))
+ (parallel [(const_int 1)
+ (const_int 3)])))]
+ "TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
+ "@
+ unpckhpd\t{%2, %0|%0, %2}
+ movddup\t{%H1, %0|%0, %H1}
+ movlpd\t{%H1, %0|%0, %H1}
+ movhpd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sselog,sselog,ssemov,ssemov")
+ (set_attr "prefix_data16" "*,*,1,1")
+ (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "*sse2_interleave_highv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
@@ -4519,7 +4601,7 @@
(match_operand:V2DF 2 "nonimmediate_operand" " x,0,0"))
(parallel [(const_int 1)
(const_int 3)])))]
- "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
"@
unpckhpd\t{%2, %0|%0, %2}
movlpd\t{%H1, %0|%0, %H1}
@@ -4528,85 +4610,48 @@
(set_attr "prefix_data16" "*,1,1")
(set_attr "mode" "V2DF,V1DF,V1DF")])
-(define_insn "avx_movddup256"
- [(set (match_operand:V4DF 0 "register_operand" "=x")
+;; Recall that the 256-bit unpck insns only shuffle within their lanes.
+(define_expand "avx_movddup256"
+ [(set (match_operand:V4DF 0 "register_operand" "")
(vec_select:V4DF
(vec_concat:V8DF
- (match_operand:V4DF 1 "nonimmediate_operand" "xm")
+ (match_operand:V4DF 1 "nonimmediate_operand" "")
(match_dup 1))
- (parallel [(const_int 0) (const_int 2)
- (const_int 4) (const_int 6)])))]
+ (parallel [(const_int 0) (const_int 4)
+ (const_int 2) (const_int 6)])))]
"TARGET_AVX"
- "vmovddup\t{%1, %0|%0, %1}"
- [(set_attr "type" "sselog1")
- (set_attr "prefix" "vex")
- (set_attr "mode" "V4DF")])
-
-(define_insn "*avx_movddup"
- [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
- (vec_select:V2DF
- (vec_concat:V4DF
- (match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
- (match_dup 1))
- (parallel [(const_int 0)
- (const_int 2)])))]
- "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
- "@
- vmovddup\t{%1, %0|%0, %1}
- #"
- [(set_attr "type" "sselog1,ssemov")
- (set_attr "prefix" "vex")
- (set_attr "mode" "V2DF")])
-
-(define_insn "*sse3_movddup"
- [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
- (vec_select:V2DF
- (vec_concat:V4DF
- (match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
- (match_dup 1))
- (parallel [(const_int 0)
- (const_int 2)])))]
- "TARGET_SSE3 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
- "@
- movddup\t{%1, %0|%0, %1}
- #"
- [(set_attr "type" "sselog1,ssemov")
- (set_attr "mode" "V2DF")])
-
-(define_split
- [(set (match_operand:V2DF 0 "memory_operand" "")
- (vec_select:V2DF
- (vec_concat:V4DF
- (match_operand:V2DF 1 "register_operand" "")
- (match_dup 1))
- (parallel [(const_int 0)
- (const_int 2)])))]
- "TARGET_SSE3 && reload_completed"
- [(const_int 0)]
-{
- rtx low = gen_rtx_REG (DFmode, REGNO (operands[1]));
- emit_move_insn (adjust_address (operands[0], DFmode, 0), low);
- emit_move_insn (adjust_address (operands[0], DFmode, 8), low);
- DONE;
-})
+ "")
-;; Recall that the 256-bit unpck insns only shuffle within their lanes.
-(define_insn "avx_unpcklpd256"
- [(set (match_operand:V4DF 0 "register_operand" "=x")
+(define_expand "avx_unpcklpd256"
+ [(set (match_operand:V4DF 0 "register_operand" "")
(vec_select:V4DF
(vec_concat:V8DF
- (match_operand:V4DF 1 "register_operand" "x")
- (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+ (match_operand:V4DF 1 "register_operand" "")
+ (match_operand:V4DF 2 "nonimmediate_operand" ""))
(parallel [(const_int 0) (const_int 4)
(const_int 2) (const_int 6)])))]
"TARGET_AVX"
- "vunpcklpd\t{%2, %1, %0|%0, %1, %2}"
+ "")
+
+(define_insn "*avx_unpcklpd256"
+ [(set (match_operand:V4DF 0 "register_operand" "=x,x")
+ (vec_select:V4DF
+ (vec_concat:V8DF
+ (match_operand:V4DF 1 "nonimmediate_operand" "xm,x")
+ (match_operand:V4DF 2 "nonimmediate_operand" " 1,xm"))
+ (parallel [(const_int 0) (const_int 4)
+ (const_int 2) (const_int 6)])))]
+ "TARGET_AVX
+ && (!MEM_P (operands[1]) || rtx_equal_p (operands[1], operands[2]))"
+ "@
+ vmovddup\t{%1, %0|%0, %1}
+ vunpcklpd\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "V4DF")])
(define_expand "vec_interleave_lowv2df"
- [(set (match_operand:V2DF 0 "nonimmediate_operand" "")
+ [(set (match_operand:V2DF 0 "register_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "")
@@ -4614,24 +4659,46 @@
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_SSE2"
- "ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
+{
+ if (!ix86_vec_interleave_v2df_operator_ok (operands, 0))
+ operands[1] = force_reg (V2DFmode, operands[1]);
+})
(define_insn "*avx_interleave_lowv2df"
- [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
+ [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o")
(vec_select:V2DF
(vec_concat:V4DF
- (match_operand:V2DF 1 "nonimmediate_operand" " x,x,0")
- (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
+ (match_operand:V2DF 1 "nonimmediate_operand" " x,m,x,0")
+ (match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x"))
(parallel [(const_int 0)
(const_int 2)])))]
- "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ "TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
"@
vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+ vmovddup\t{%1, %0|%0, %1}
vmovhpd\t{%2, %1, %0|%0, %1, %2}
vmovlpd\t{%2, %H0|%H0, %2}"
- [(set_attr "type" "sselog,ssemov,ssemov")
+ [(set_attr "type" "sselog,sselog,ssemov,ssemov")
(set_attr "prefix" "vex")
- (set_attr "mode" "V2DF,V1DF,V1DF")])
+ (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
+
+(define_insn "*sse3_interleave_lowv2df"
+ [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o")
+ (vec_select:V2DF
+ (vec_concat:V4DF
+ (match_operand:V2DF 1 "nonimmediate_operand" " 0,m,0,0")
+ (match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x"))
+ (parallel [(const_int 0)
+ (const_int 2)])))]
+ "TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
+ "@
+ unpcklpd\t{%2, %0|%0, %2}
+ movddup\t{%1, %0|%0, %1}
+ movhpd\t{%2, %0|%0, %2}
+ movlpd\t{%2, %H0|%H0, %2}"
+ [(set_attr "type" "sselog,sselog,ssemov,ssemov")
+ (set_attr "prefix_data16" "*,*,1,1")
+ (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "*sse2_interleave_lowv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
@@ -4641,7 +4708,7 @@
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
(parallel [(const_int 0)
(const_int 2)])))]
- "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
"@
unpcklpd\t{%2, %0|%0, %2}
movhpd\t{%2, %0|%0, %2}
@@ -4650,6 +4717,37 @@
(set_attr "prefix_data16" "*,1,1")
(set_attr "mode" "V2DF,V1DF,V1DF")])
+(define_split
+ [(set (match_operand:V2DF 0 "memory_operand" "")
+ (vec_select:V2DF
+ (vec_concat:V4DF
+ (match_operand:V2DF 1 "register_operand" "")
+ (match_dup 1))
+ (parallel [(const_int 0)
+ (const_int 2)])))]
+ "TARGET_SSE3 && reload_completed"
+ [(const_int 0)]
+{
+ rtx low = gen_rtx_REG (DFmode, REGNO (operands[1]));
+ emit_move_insn (adjust_address (operands[0], DFmode, 0), low);
+ emit_move_insn (adjust_address (operands[0], DFmode, 8), low);
+ DONE;
+})
+
+(define_split
+ [(set (match_operand:V2DF 0 "register_operand" "")
+ (vec_select:V2DF
+ (vec_concat:V4DF
+ (match_operand:V2DF 1 "memory_operand" "")
+ (match_dup 1))
+ (parallel [(match_operand:SI 2 "const_0_to_1_operand" "")
+ (match_operand:SI 3 "const_int_operand" "")])))]
+ "TARGET_SSE3 && INTVAL (operands[2]) + 2 == INTVAL (operands[3])"
+ [(set (match_dup 0) (vec_duplicate:V2DF (match_dup 1)))]
+{
+ operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
+})
+
(define_expand "avx_shufpd256"
[(match_operand:V4DF 0 "register_operand" "")
(match_operand:V4DF 1 "register_operand" "")
@@ -7408,6 +7506,20 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "V2SF,V4SF,V2SF")])
+(define_insn "*vec_dupv4si_avx"
+ [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+ (vec_duplicate:V4SI
+ (match_operand:SI 1 "register_operand" "x,m")))]
+ "TARGET_AVX"
+ "@
+ vpshufd\t{$0, %1, %0|%0, %1, 0}
+ vbroadcastss\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sselog1,ssemov")
+ (set_attr "length_immediate" "1,0")
+ (set_attr "prefix_extra" "0,1")
+ (set_attr "prefix" "vex")
+ (set_attr "mode" "TI,V4SF")])
+
(define_insn "*vec_dupv4si"
[(set (match_operand:V4SI 0 "register_operand" "=Y2,x")
(vec_duplicate:V4SI
@@ -7417,19 +7529,31 @@
%vpshufd\t{$0, %1, %0|%0, %1, 0}
shufps\t{$0, %0, %0|%0, %0, 0}"
[(set_attr "type" "sselog1")
- (set_attr "prefix" "maybe_vex,orig")
(set_attr "length_immediate" "1")
(set_attr "mode" "TI,V4SF")])
(define_insn "*vec_dupv2di_avx"
- [(set (match_operand:V2DI 0 "register_operand" "=x")
+ [(set (match_operand:V2DI 0 "register_operand" "=x,x")
(vec_duplicate:V2DI
- (match_operand:DI 1 "register_operand" "x")))]
+ (match_operand:DI 1 "nonimmediate_operand" " x,m")))]
"TARGET_AVX"
- "vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}"
+ "@
+ vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}
+ vmovddup\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog1")
(set_attr "prefix" "vex")
- (set_attr "mode" "TI")])
+ (set_attr "mode" "TI,DF")])
+
+(define_insn "*vec_dupv2di_sse3"
+ [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+ (vec_duplicate:V2DI
+ (match_operand:DI 1 "nonimmediate_operand" " 0,m")))]
+ "TARGET_SSE3"
+ "@
+ punpcklqdq\t%0, %0
+ movddup\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sselog1")
+ (set_attr "mode" "TI,DF")])
(define_insn "*vec_dupv2di"
[(set (match_operand:V2DI 0 "register_operand" "=Y2,x")
@@ -11838,6 +11962,108 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
+(define_insn_and_split "vec_dup<mode>"
+ [(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x")
+ (vec_duplicate:AVX256MODE24P
+ (match_operand:<avxscalarmode> 1 "nonimmediate_operand" "m,?x")))]
+ "TARGET_AVX"
+ "@
+ vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}
+ #"
+ "&& reload_completed && REG_P (operands[1])"
+ [(set (match_dup 2) (vec_duplicate:<avxhalfvecmode> (match_dup 1)))
+ (set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))]
+{
+ operands[2] = gen_rtx_REG (<avxhalfvecmode>mode, REGNO (operands[0]));
+}
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix_extra" "1")
+ (set_attr "prefix" "vex")
+ (set_attr "mode" "V8SF")])
+
+(define_insn "avx_vbroadcastf128_<mode>"
+ [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x,x")
+ (vec_concat:AVX256MODE
+ (match_operand:<avxhalfvecmode> 1 "nonimmediate_operand" "m,0,?x")
+ (match_dup 1)))]
+ "TARGET_AVX"
+ "@
+ vbroadcastf128\t{%1, %0|%0, %1}
+ vinsertf128\t{$1, %1, %0, %0|%0, %0, %1, 1}
+ vperm2f128\t{$0, %t1, %t1, %0|%0, %t1, %t1, 0}"
+ [(set_attr "type" "ssemov,sselog1,sselog1")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "0,1,1")
+ (set_attr "prefix" "vex")
+ (set_attr "mode" "V4SF,V8SF,V8SF")])
+
+;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
+;; If it so happens that the input is in memory, use vbroadcast.
+;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
+(define_insn "*avx_vperm_broadcast_v4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
+ (vec_select:V4SF
+ (match_operand:V4SF 1 "nonimmediate_operand" "m,o,x")
+ (match_parallel 2 "avx_vbroadcast_operand"
+ [(match_operand 3 "const_int_operand" "C,n,n")])))]
+ "TARGET_AVX"
+{
+ int elt = INTVAL (operands[3]);
+ switch (which_alternative)
+ {
+ case 0:
+ case 1:
+ operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
+ return "vbroadcastss\t{%1, %0|%0, %1}";
+ case 2:
+ operands[2] = GEN_INT (elt * 0x55);
+ return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set_attr "type" "ssemov,ssemov,sselog1")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "0,0,1")
+ (set_attr "prefix" "vex")
+ (set_attr "mode" "SF,SF,V4SF")])
+
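
The vpermilps immediate packs four 2-bit lane selectors, so multiplying the element index by 0x55 (binary 01010101) replicates it into every field. A quick standalone check of that identity, assuming the standard immediate layout:

#include <assert.h>

/* Standalone check of the immediate trick above: elt * 0x55 puts ELT
   into all four 2-bit fields of the vpermilps selector.  */
int main (void)
{
  int elt, field;
  for (elt = 0; elt < 4; ++elt)
    for (field = 0; field < 4; ++field)
      assert (((elt * 0x55) >> (2 * field) & 3) == elt);
  return 0;
}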
+(define_insn_and_split "*avx_vperm_broadcast_<mode>"
+ [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x,x,x")
+ (vec_select:AVX256MODEF2P
+ (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "m,o,?x")
+ (match_parallel 2 "avx_vbroadcast_operand"
+ [(match_operand 3 "const_int_operand" "C,n,n")])))]
+ "TARGET_AVX"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (vec_duplicate:AVX256MODEF2P (match_dup 1)))]
+{
+ rtx op0 = operands[0], op1 = operands[1];
+ int elt = INTVAL (operands[3]);
+
+ if (REG_P (op1))
+ {
+ int mask;
+
+ /* Shuffle element we care about into all elements of the 128-bit lane.
+ The other lane gets shuffled too, but we don't care. */
+ if (<MODE>mode == V4DFmode)
+ mask = (elt & 1 ? 15 : 0);
+ else
+ mask = (elt & 3) * 0x55;
+ emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
+
+ /* Shuffle the lane we care about into both lanes of the dest. */
+ mask = (elt / (<ssescalarnum> / 2)) * 0x11;
+ emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
+ DONE;
+ }
+
+ operands[1] = adjust_address_nv (op1, <avxscalarmode>mode,
+ elt * GET_MODE_SIZE (<avxscalarmode>mode));
+})
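
For the register path, the two masks compose as follows in V4DFmode: vpermilpd with mask 0 or 15 duplicates the low or high double within each 128-bit lane, and vperm2f128 with 0x00 or 0x11 then copies the relevant lane to both halves. A standalone simulation of that two-insn sequence, assuming the documented immediate encodings (not GCC code):

#include <assert.h>

/* Standalone simulation of the V4DFmode register path above:
   vpermilpd bit I selects the low/high double within lane I's pair,
   and vperm2f128 0x00/0x11 copies lane 0/1 to both halves.  */
int main (void)
{
  int elt, i;
  for (elt = 0; elt < 4; ++elt)
    {
      double v[4] = { 10, 11, 12, 13 }, t[4], r[4];
      int mask = (elt & 1) ? 15 : 0;   /* as computed in the split */
      int lane = elt / 2;              /* from (elt / (nelt/2)) * 0x11 */

      for (i = 0; i < 4; ++i)          /* vpermilpd */
        t[i] = v[(i & ~1) + ((mask >> i) & 1)];
      for (i = 0; i < 4; ++i)          /* vperm2f128 */
        r[i] = t[2 * lane + (i & 1)];

      for (i = 0; i < 4; ++i)
        assert (r[i] == v[elt]);
    }
  return 0;
}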
+
(define_expand "avx_vpermil<mode>"
[(set (match_operand:AVXMODEFDP 0 "register_operand" "")
(vec_select:AVXMODEFDP
@@ -11989,58 +12215,6 @@
(set_attr "prefix" "vex")
(set_attr "mode" "V8SF")])
-(define_insn "avx_vbroadcasts<avxmodesuffixf2c><avxmodesuffix>"
- [(set (match_operand:AVXMODEF4P 0 "register_operand" "=x")
- (vec_concat:AVXMODEF4P
- (vec_concat:<avxhalfvecmode>
- (match_operand:<avxscalarmode> 1 "memory_operand" "m")
- (match_dup 1))
- (vec_concat:<avxhalfvecmode>
- (match_dup 1)
- (match_dup 1))))]
- "TARGET_AVX"
- "vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssemov")
- (set_attr "prefix_extra" "1")
- (set_attr "prefix" "vex")
- (set_attr "mode" "<avxscalarmode>")])
-
-(define_insn "avx_vbroadcastss256"
- [(set (match_operand:V8SF 0 "register_operand" "=x")
- (vec_concat:V8SF
- (vec_concat:V4SF
- (vec_concat:V2SF
- (match_operand:SF 1 "memory_operand" "m")
- (match_dup 1))
- (vec_concat:V2SF
- (match_dup 1)
- (match_dup 1)))
- (vec_concat:V4SF
- (vec_concat:V2SF
- (match_dup 1)
- (match_dup 1))
- (vec_concat:V2SF
- (match_dup 1)
- (match_dup 1)))))]
- "TARGET_AVX"
- "vbroadcastss\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssemov")
- (set_attr "prefix_extra" "1")
- (set_attr "prefix" "vex")
- (set_attr "mode" "SF")])
-
-(define_insn "avx_vbroadcastf128_p<avxmodesuffixf2c>256"
- [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x")
- (vec_concat:AVX256MODEF2P
- (match_operand:<avxhalfvecmode> 1 "memory_operand" "m")
- (match_dup 1)))]
- "TARGET_AVX"
- "vbroadcastf128\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssemov")
- (set_attr "prefix_extra" "1")
- (set_attr "prefix" "vex")
- (set_attr "mode" "V4SF")])
-
(define_expand "avx_vinsertf128<mode>"
[(match_operand:AVX256MODE 0 "register_operand" "")
(match_operand:AVX256MODE 1 "register_operand" "")