-rw-r--r--  gcc/ChangeLog                              |  73
-rw-r--r--  gcc/config/i386/i386.c                     | 673
-rw-r--r--  gcc/config/i386/sse.md                     | 115
-rw-r--r--  gcc/testsuite/ChangeLog                    |  12
-rw-r--r--  gcc/testsuite/gcc.target/i386/pr49002-2.c  |   2
-rw-r--r--  gcc/tree-vect-stmts.c                      |  34
-rw-r--r--  gcc/tree-vectorizer.h                      |   4
7 files changed, 802 insertions, 111 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 33946f28d0e..1bb86a88fc9 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,76 @@
+2013-12-31  Alexander Ivchenko  <alexander.ivchenko@intel.com>
+	    Maxim Kuznetsov  <maxim.kuznetsov@intel.com>
+	    Sergey Lega  <sergey.s.lega@intel.com>
+	    Anna Tikhonova  <anna.tikhonova@intel.com>
+	    Ilya Tocar  <ilya.tocar@intel.com>
+	    Andrey Turetskiy  <andrey.turetskiy@intel.com>
+	    Ilya Verbin  <ilya.verbin@intel.com>
+	    Kirill Yukhin  <kirill.yukhin@intel.com>
+	    Michael Zolotukhin  <michael.v.zolotukhin@intel.com>
+
+	* config/i386/i386.c (MAX_CLASSES): Increase number of classes.
+	(classify_argument): Extend for 512 bit vectors.
+	(construct_container): Ditto.
+	(function_arg_advance_32): Ditto.
+	(function_arg_advance_64): Ditto.
+	(function_arg_32): Ditto.
+	(function_arg_64): Ditto.
+	(function_value_32): Ditto.
+	(return_in_memory_32): Ditto.
+	(ix86_gimplify_va_arg): Ditto.
+	(standard_sse_constant_p): Ditto.
+	(standard_sse_constant_opcode): Ditto.
+	(ix86_expand_vector_convert_uns_vsivsf): Ditto.
+	(ix86_build_const_vector): Ditto.
+	(ix86_build_signbit_mask): Ditto.
+	(ix86_expand_sse_cmp): Extend for AVX512.
+	(ix86_expand_sse_movcc): Ditto.
+	(ix86_expand_int_vcond): Ditto.
+	(ix86_expand_vec_perm): Ditto.
+	(ix86_expand_sse_unpack): Ditto.
+	(ix86_constant_alignment): Ditto.
+	(ix86_builtin_vectorized_function): Ditto.
+	(ix86_vectorize_builtin_gather): Ditto.
+	(avx_vpermilp_parallel): Ditto.
+	(ix86_rtx_costs): Ditto.
+	(ix86_expand_vector_init_duplicate): Ditto.
+	(ix86_expand_vector_init_concat): Ditto.
+	(ix86_expand_vector_init_general): Ditto.
+	(ix86_expand_vector_extract): Ditto.
+	(emit_reduc_half): Ditto.
+	(ix86_vector_mode_supported_p): Ditto.
+	(ix86_emit_swdivsf): Ditto.
+	(ix86_emit_swsqrtsf): Ditto.
+	(expand_vec_perm_1): Ditto.
+	(ix86_vectorize_vec_perm_const_ok): Ditto.
+	(ix86_expand_mul_widen_evenodd): Ditto.
+	(ix86_expand_sse2_mulvxdi3): Ditto.
+	(ix86_preferred_simd_mode): Ditto.
+	(ix86_autovectorize_vector_sizes): Ditto.
+	(ix86_expand_vec_perm_vpermi2): New.
+	(ix86_vector_duplicate_value): Ditto.
+	(IX86_BUILTIN_SQRTPD512, IX86_BUILTIN_EXP2PS, IX86_BUILTIN_SQRTPS_NR512,
+	IX86_BUILTIN_GATHER3ALTDIV16SF, IX86_BUILTIN_GATHER3ALTDIV16SI,
+	IX86_BUILTIN_GATHER3ALTSIV8DF, IX86_BUILTIN_GATHER3ALTSIV8DI,
+	IX86_BUILTIN_GATHER3DIV16SF, IX86_BUILTIN_GATHER3DIV16SI,
+	IX86_BUILTIN_GATHER3DIV8DF, IX86_BUILTIN_GATHER3DIV8DI,
+	IX86_BUILTIN_GATHER3SIV16SF, IX86_BUILTIN_GATHER3SIV16SI,
+	IX86_BUILTIN_GATHER3SIV8DF, IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512,
+	IX86_BUILTIN_CPYSGNPS512, IX86_BUILTIN_CPYSGNPD512,
+	IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512,
+	IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512): Ditto.
+	* config/i386/sse.md (*mov<mode>_internal): Disable SSE typeless
+	stores vectors > 128bit (AVX*).
+	(<sse>_storeu<ssemodesuffix><avxsizesuffix>): Ditto.
+	(<sse2_avx_avx512f>_storedqu<mode>): Extend for AVX-512, disable
+	SSE typeless stores vectors > 128bit (AVX*).
+	(fixuns_trunc<mode><sseintvecmodelower>2): Extend for AVX-512.
+	(vec_pack_ufix_trunc_<mode>): Ditto.
+	(vec_unpacku_float_hi_v16si): New.
+	* tree-vect-stmts.c (vectorizable_load): Support AVX512's gathers.
+	* tree-vectorizer.h (MAX_VECTORIZATION_FACTOR): Extend for 512 bit
+	vectors.
+
 2013-12-31  Chung-Lin Tang  <cltang@codesourcery.com>
 	    Sandra Loosemore  <sandra@codesourcery.com>
 
 	Based on patches from Altera Corporation
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 0a90ead8970..dd48cc51656 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2308,7 +2308,7 @@ enum x86_64_reg_class
   X86_64_MEMORY_CLASS
 };
 
-#define MAX_CLASSES 4
+#define MAX_CLASSES 8
 
 /* Table of constants used by fldpi, fldln2, etc....  */
 static REAL_VALUE_TYPE ext_80387_constants_table [5];
@@ -6242,7 +6242,7 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
    sized containers, classes[0] will be NO_CLASS and 1 is returned.
 
    BIT_OFFSET is used internally for handling records and specifies offset
-   of the offset in bits modulo 256 to avoid overflow cases.
+   of the offset in bits modulo 512 to avoid overflow cases.
 
    See the x86-64 PS ABI for details.  */
 
@@ -6342,7 +6342,7 @@ classify_argument (enum machine_mode mode, const_tree type,
 		  num = classify_argument (TYPE_MODE (type), type,
 					   subclasses,
 					   (int_bit_position (field)
-					    + bit_offset) % 256);
+					    + bit_offset) % 512);
 		  if (!num)
 		    return 0;
 		  pos = (int_bit_position (field)
@@ -6592,6 +6592,21 @@ classify_argument (enum machine_mode mode, const_tree type,
       classes[2] = X86_64_SSEUP_CLASS;
       classes[3] = X86_64_SSEUP_CLASS;
       return 4;
+    case V8DFmode:
+    case V16SFmode:
+    case V8DImode:
+    case V16SImode:
+    case V32HImode:
+    case V64QImode:
+      classes[0] = X86_64_SSE_CLASS;
+      classes[1] = X86_64_SSEUP_CLASS;
+      classes[2] = X86_64_SSEUP_CLASS;
+      classes[3] = X86_64_SSEUP_CLASS;
+      classes[4] = X86_64_SSEUP_CLASS;
+      classes[5] = X86_64_SSEUP_CLASS;
+      classes[6] = X86_64_SSEUP_CLASS;
+      classes[7] = X86_64_SSEUP_CLASS;
+      return 8;
     case V4SFmode:
     case V4SImode:
     case V16QImode:
@@ -6777,6 +6792,18 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode,
 	  && mode != BLKmode)
 	return gen_reg_or_parallel (mode, orig_mode,
 				    SSE_REGNO (sse_regno));
+      if (n == 8
+	  && regclass[0] == X86_64_SSE_CLASS
+	  && regclass[1] == X86_64_SSEUP_CLASS
+	  && regclass[2] == X86_64_SSEUP_CLASS
+	  && regclass[3] == X86_64_SSEUP_CLASS
+	  && regclass[4] == X86_64_SSEUP_CLASS
+	  && regclass[5] == X86_64_SSEUP_CLASS
+	  && regclass[6] == X86_64_SSEUP_CLASS
+	  && regclass[7] == X86_64_SSEUP_CLASS
+	  && mode != BLKmode)
+	return gen_reg_or_parallel (mode, orig_mode,
+				    SSE_REGNO (sse_regno));
       if (n == 2
 	  && regclass[0] == X86_64_X87_CLASS
 	  && regclass[1] == X86_64_X87UP_CLASS)
@@ -6858,6 +6885,18 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode,
 	    tmpmode = OImode;
 	    i += 3;
 	    break;
+	  case 8:
+	    gcc_assert (i == 0
+			&& regclass[1] == X86_64_SSEUP_CLASS
+			&& regclass[2] == X86_64_SSEUP_CLASS
+			&& regclass[3] == X86_64_SSEUP_CLASS
+			&& regclass[4] == X86_64_SSEUP_CLASS
+			&& regclass[5] == X86_64_SSEUP_CLASS
+			&& regclass[6] == X86_64_SSEUP_CLASS
+			&& regclass[7] == X86_64_SSEUP_CLASS);
+	    tmpmode = XImode;
+	    i += 7;
+	    break;
 	  default:
 	    gcc_unreachable ();
 	  }
@@ -6931,6 +6970,12 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 
     case V8SFmode:
     case V8SImode:
+    case V64QImode:
+    case V32HImode:
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
     case V32QImode:
     case V16HImode:
     case V4DFmode:
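Annotation: the classify_argument/construct_container changes above describe a 64-byte vector as one X86_64_SSE_CLASS eightbyte followed by seven X86_64_SSEUP_CLASS eightbytes, so the whole value travels in a single %zmm register (XImode container). A minimal C sketch of what now becomes register-passed; the function and variable names here are illustrative, not from the patch:

#include <immintrin.h>

/* With -mavx512f a __m512 (V16SFmode) argument is classified as
   SSE + 7 x SSEUP and arrives in %zmm0; the return value also uses
   %zmm0 (see the function_value_32 hunk further down).  */
__m512 scale (__m512 v, float f)
{
  return _mm512_mul_ps (v, _mm512_set1_ps (f));
}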
@@ -6982,8 +7027,9 @@ function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 {
   int int_nregs, sse_nregs;
 
-  /* Unnamed 256bit vector mode parameters are passed on stack.  */
-  if (!named && VALID_AVX256_REG_MODE (mode))
+  /* Unnamed 512 and 256bit vector mode parameters are passed on stack.  */
+  if (!named && (VALID_AVX512F_REG_MODE (mode)
+		 || VALID_AVX256_REG_MODE (mode)))
     return;
 
   if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
@@ -7134,9 +7180,16 @@ function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
       break;
 
     case OImode:
-      /* OImode shouldn't be used directly.  */
+    case XImode:
+      /* OImode and XImode shouldn't be used directly.  */
       gcc_unreachable ();
 
+    case V64QImode:
+    case V32HImode:
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
     case V8SFmode:
     case V8SImode:
     case V32QImode:
@@ -7199,7 +7252,13 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
     case V16HImode:
     case V4DFmode:
     case V4DImode:
-      /* Unnamed 256bit vector mode parameters are passed on stack.  */
+    case V16SFmode:
+    case V16SImode:
+    case V64QImode:
+    case V32HImode:
+    case V8DFmode:
+    case V8DImode:
+      /* Unnamed 256 and 512bit vector mode parameters are passed on stack.  */
       if (!named)
 	return NULL;
       break;
@@ -7602,6 +7661,10 @@ function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
   else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
     regno = FIRST_SSE_REG;
 
+  /* 64-byte vector modes in %zmm0.   */
+  else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+    regno = FIRST_SSE_REG;
+
   /* Floating point return values in %st(0)
      (unless -mno-fp-ret-in-387).  */
   else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
     regno = FIRST_FLOAT_REG;
@@ -7809,6 +7872,10 @@ return_in_memory_32 (const_tree type, enum machine_mode mode)
       /* AVX values are returned in YMM0, except when it doesn't exist.  */
       if (size == 32)
 	return !TARGET_AVX;
+
+      /* AVX512F values are returned in ZMM0, except when it doesn't exist.  */
+      if (size == 64)
+	return !TARGET_AVX512F;
     }
 
   if (mode == XFmode)
@@ -8345,7 +8412,13 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
     case V16HImode:
     case V4DFmode:
     case V4DImode:
-      /* Unnamed 256bit vector mode parameters are passed on stack.  */
+    case V16SFmode:
+    case V16SImode:
+    case V64QImode:
+    case V32HImode:
+    case V8DFmode:
+    case V8DImode:
+      /* Unnamed 256 and 512bit vector mode parameters are passed on stack.  */
       if (!TARGET_64BIT_MS_ABI)
 	{
 	  container = NULL;
@@ -8760,6 +8833,12 @@ standard_sse_constant_p (rtx x)
       case V4DImode:
 	if (TARGET_AVX2)
 	  return 2;
+      case V64QImode:
+      case V32HImode:
+      case V16SImode:
+      case V8DImode:
+	if (TARGET_AVX512F)
+	  return 2;
       default:
 	break;
       }
@@ -8778,6 +8857,11 @@ standard_sse_constant_opcode (rtx insn, rtx x)
     case 1:
       switch (get_attr_mode (insn))
 	{
+	case MODE_XI:
+	case MODE_V16SF:
+	  return "vpxord\t%g0, %g0, %g0";
+	case MODE_V8DF:
+	  return "vpxorq\t%g0, %g0, %g0";
 	case MODE_TI:
 	  return "%vpxor\t%0, %d0";
 	case MODE_V2DF:
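Annotation: for 512-bit modes the all-zeros constant is now emitted with the EVEX-encoded integer XORs (vpxord/vpxorq), which can address all 32 zmm registers; %g0 prints the operand as a zmm register. A hedged illustration of the idiom this selects (the exact register depends on allocation):

#include <immintrin.h>

/* _mm512_setzero_si512 () compiles to the register-zeroing idiom the
   opcode table above returns, e.g.  vpxord %zmm0, %zmm0, %zmm0.  */
__m512i zero512 (void)
{
  return _mm512_setzero_si512 ();
}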
@@ -18693,17 +18777,23 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
 
   switch (mode)
     {
+    case V64QImode:
     case V32QImode:
     case V16QImode:
+    case V32HImode:
     case V16HImode:
     case V8HImode:
+    case V16SImode:
     case V8SImode:
     case V4SImode:
+    case V8DImode:
     case V4DImode:
     case V2DImode:
       gcc_assert (vect);
+    case V16SFmode:
     case V8SFmode:
     case V4SFmode:
+    case V8DFmode:
     case V4DFmode:
     case V2DFmode:
       n_elt = GET_MODE_NUNITS (mode);
@@ -18740,6 +18830,8 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
   /* Find the sign bit, sign extended to 2*HWI.  */
   switch (mode)
     {
+    case V16SImode:
+    case V16SFmode:
     case V8SImode:
     case V4SImode:
     case V8SFmode:
@@ -18750,8 +18842,10 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
       lo = 0x80000000, hi = lo < 0;
       break;
 
+    case V8DImode:
     case V4DImode:
     case V2DImode:
+    case V8DFmode:
     case V4DFmode:
     case V2DFmode:
       vec_mode = mode;
@@ -20608,22 +20702,63 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
 		     rtx op_true, rtx op_false)
 {
   enum machine_mode mode = GET_MODE (dest);
-  enum machine_mode cmp_mode = GET_MODE (cmp_op0);
+  enum machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
+
+  /* In general case result of comparison can differ from operands' type.  */
+  enum machine_mode cmp_mode;
+
+  /* In AVX512F the result of comparison is an integer mask.  */
+  bool maskcmp = false;
   rtx x;
 
-  cmp_op0 = force_reg (cmp_mode, cmp_op0);
-  if (!nonimmediate_operand (cmp_op1, cmp_mode))
-    cmp_op1 = force_reg (cmp_mode, cmp_op1);
+  if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+    {
+      cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
+      gcc_assert (cmp_mode != BLKmode);
+
+      maskcmp = true;
+    }
+  else
+    cmp_mode = cmp_ops_mode;
+
+
+  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
+  if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
+    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
 
   if (optimize
       || reg_overlap_mentioned_p (dest, op_true)
      || reg_overlap_mentioned_p (dest, op_false))
-    dest = gen_reg_rtx (mode);
+    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
+
+  /* Compare patterns for int modes are unspec in AVX512F only.  */
+  if (maskcmp && (code == GT || code == EQ))
+    {
+      rtx (*gen)(rtx, rtx, rtx);
+      switch (cmp_ops_mode)
+	{
+	case V16SImode:
+	  gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
+	  break;
+	case V8DImode:
+	  gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
+	  break;
+	default:
+	  gen = NULL;
+	}
+
+      if (gen)
+	{
+	  emit_insn (gen (dest, cmp_op0, cmp_op1));
+	  return dest;
+	}
+    }
   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
-  if (cmp_mode != mode)
+
+  if (cmp_mode != mode && !maskcmp)
     {
-      x = force_reg (cmp_mode, x);
+      x = force_reg (cmp_ops_mode, x);
       convert_move (dest, x, false);
     }
   else
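Annotation: the key AVX-512F difference handled here is that comparing two 512-bit vectors produces a k-register mask (one bit per element, hence mode_for_size over GET_MODE_NUNITS), not a vector of all-ones/all-zeros lanes. The same semantics at the intrinsics level, as a sketch rather than code from the patch:

#include <immintrin.h>

/* dst[i] = a[i] < b[i] ? x[i] : y[i].  The comparison yields a
   16-bit __mmask16; the blend consumes it directly, which is what
   the blendm cases in ix86_expand_sse_movcc below map to.  */
__m512 select_lt (__m512 a, __m512 b, __m512 x, __m512 y)
{
  __mmask16 k = _mm512_cmp_ps_mask (a, b, _CMP_LT_OS);
  return _mm512_mask_blend_ps (k, y, x);
}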
@@ -20639,33 +20774,43 @@ static void
 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 {
   enum machine_mode mode = GET_MODE (dest);
+  enum machine_mode cmpmode = GET_MODE (cmp);
+
+  /* In AVX512F the result of comparison is an integer mask.  */
+  bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+
   rtx t2, t3, x;
 
   if (vector_all_ones_operand (op_true, mode)
-      && rtx_equal_p (op_false, CONST0_RTX (mode)))
+      && rtx_equal_p (op_false, CONST0_RTX (mode))
+      && !maskcmp)
     {
       emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
     }
-  else if (op_false == CONST0_RTX (mode))
+  else if (op_false == CONST0_RTX (mode)
+	   && !maskcmp)
     {
       op_true = force_reg (mode, op_true);
       x = gen_rtx_AND (mode, cmp, op_true);
       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
     }
-  else if (op_true == CONST0_RTX (mode))
+  else if (op_true == CONST0_RTX (mode)
+	   && !maskcmp)
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_NOT (mode, cmp);
       x = gen_rtx_AND (mode, x, op_false);
       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
     }
-  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
+  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
+	   && !maskcmp)
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_IOR (mode, cmp, op_false);
       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
     }
-  else if (TARGET_XOP)
+  else if (TARGET_XOP
+	   && !maskcmp)
     {
       op_true = force_reg (mode, op_true);
@@ -20733,6 +20878,20 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 	    cmp = gen_lowpart (V32QImode, cmp);
 	  }
 	  break;
+
+	case V16SImode:
+	  gen = gen_avx512f_blendmv16si;
+	  break;
+	case V8DImode:
+	  gen = gen_avx512f_blendmv8di;
+	  break;
+	case V8DFmode:
+	  gen = gen_avx512f_blendmv8df;
+	  break;
+	case V16SFmode:
+	  gen = gen_avx512f_blendmv16sf;
+	  break;
+
 	default:
 	  break;
 	}
@@ -21000,6 +21159,8 @@ ix86_expand_int_vcond (rtx operands[])
 
 	  switch (mode)
 	    {
+	    case V16SImode:
+	    case V8DImode:
 	    case V8SImode:
 	    case V4DImode:
 	    case V4SImode:
@@ -21010,6 +21171,8 @@ ix86_expand_int_vcond (rtx operands[])
 
 	      switch (mode)
 		{
+		case V16SImode: gen_sub3 = gen_subv16si3; break;
+		case V8DImode: gen_sub3 = gen_subv8di3; break;
 		case V8SImode: gen_sub3 = gen_subv8si3; break;
 		case V4DImode: gen_sub3 = gen_subv4di3; break;
 		case V4SImode: gen_sub3 = gen_subv4si3; break;
@@ -21065,7 +21228,8 @@ ix86_expand_int_vcond (rtx operands[])
 	  gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
 	  x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
 				   operands[1+negate], operands[2-negate]);
-	  x = gen_lowpart (data_mode, x);
+	  if (GET_MODE (x) == mode)
+	    x = gen_lowpart (data_mode, x);
 	}
 
   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
@@ -21073,6 +21237,35 @@ ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+static bool
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  switch (mode)
+    {
+    case V16SImode:
+      emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
+					       force_reg (V16SImode, mask),
+					       op1));
+      return true;
+    case V16SFmode:
+      emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
+					       force_reg (V16SImode, mask),
+					       op1));
+      return true;
+    case V8DImode:
+      emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
+					      force_reg (V8DImode, mask), op1));
+      return true;
+    case V8DFmode:
+      emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
+					      force_reg (V8DImode, mask), op1));
+      return true;
+    default:
+      return false;
+    }
+}
+
 /* Expand a variable vector permutation.  */
 
 void
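Annotation: the new ix86_expand_vec_perm_vpermi2 maps a two-source variable permutation onto a single vpermi2d/q/ps/pd, whose index vector selects freely across both inputs and all 128-bit lanes; this is also why ix86_vectorize_vec_perm_const_ok (further down) can return true unconditionally for the 512-bit modes. Equivalent intrinsic usage, as a sketch:

#include <immintrin.h>

/* Each idx lane selects one of the 32 float lanes of a (0..15) and
   b (16..31); a single vpermi2ps implements any such shuffle.  */
__m512 perm2 (__m512 a, __m512i idx, __m512 b)
{
  return _mm512_permutex2var_ps (a, idx, b);
}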
@@ -21091,7 +21284,10 @@ ix86_expand_vec_perm (rtx operands[])
   /* Number of elements in the vector.  */
   w = GET_MODE_NUNITS (mode);
   e = GET_MODE_UNIT_SIZE (mode);
-  gcc_assert (w <= 32);
+  gcc_assert (w <= 64);
+
+  if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1))
+    return;
 
   if (TARGET_AVX2)
     {
@@ -21471,6 +21667,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
       extract
	= high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
       break;
+    case V32HImode:
+      if (unsigned_p)
+	unpack = gen_avx512f_zero_extendv16hiv16si2;
+      else
+	unpack = gen_avx512f_sign_extendv16hiv16si2;
+      halfmode = V16HImode;
+      extract
+	= high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
+      break;
     case V16HImode:
       if (unsigned_p)
	unpack = gen_avx2_zero_extendv8hiv8si2;
@@ -21480,6 +21685,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
       extract
	= high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
       break;
+    case V16SImode:
+      if (unsigned_p)
+	unpack = gen_avx512f_zero_extendv8siv8di2;
+      else
+	unpack = gen_avx512f_sign_extendv8siv8di2;
+      halfmode = V8SImode;
+      extract
+	= high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
+      break;
     case V8SImode:
       if (unsigned_p)
	unpack = gen_avx2_zero_extendv4siv4di2;
@@ -21511,7 +21725,7 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
       gcc_unreachable ();
     }
 
-  if (GET_MODE_SIZE (imode) == 32)
+  if (GET_MODE_SIZE (imode) >= 32)
     {
       tmp = gen_reg_rtx (halfmode);
       emit_insn (extract (tmp, src));
@@ -26245,7 +26459,8 @@ ix86_constant_alignment (tree exp, int align)
 int
 ix86_data_alignment (tree type, int align, bool opt)
 {
-  int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
+  int max_align = optimize_size ? BITS_PER_WORD
+				: MIN (512, MAX_OFILE_ALIGNMENT);
 
   if (opt
       && AGGREGATE_TYPE_P (type)
@@ -27707,12 +27922,27 @@ enum ix86_builtins
   IX86_BUILTIN_GATHERDIV4SI,
   IX86_BUILTIN_GATHERDIV8SI,
 
+  IX86_BUILTIN_SQRTPD512,
+  IX86_BUILTIN_EXP2PS,
+  IX86_BUILTIN_SQRTPS_NR512,
+
   /* Alternate 4 element gather for the vectorizer where
      all operands are 32-byte wide.  */
   IX86_BUILTIN_GATHERALTSIV4DF,
   IX86_BUILTIN_GATHERALTDIV8SF,
   IX86_BUILTIN_GATHERALTSIV4DI,
   IX86_BUILTIN_GATHERALTDIV8SI,
 
+  IX86_BUILTIN_GATHER3ALTDIV16SF,
+  IX86_BUILTIN_GATHER3ALTDIV16SI,
+  IX86_BUILTIN_GATHER3ALTSIV8DF,
+  IX86_BUILTIN_GATHER3ALTSIV8DI,
+  IX86_BUILTIN_GATHER3DIV16SF,
+  IX86_BUILTIN_GATHER3DIV16SI,
+  IX86_BUILTIN_GATHER3DIV8DF,
+  IX86_BUILTIN_GATHER3DIV8DI,
+  IX86_BUILTIN_GATHER3SIV16SF,
+  IX86_BUILTIN_GATHER3SIV16SI,
+  IX86_BUILTIN_GATHER3SIV8DF,
+
   /* TFmode support builtins.  */
   IX86_BUILTIN_INFQ,
@@ -27721,10 +27951,16 @@ enum ix86_builtins
   IX86_BUILTIN_COPYSIGNQ,
 
   /* Vectorizer support builtins.  */
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512,
   IX86_BUILTIN_CPYSGNPS,
   IX86_BUILTIN_CPYSGNPD,
   IX86_BUILTIN_CPYSGNPS256,
+  IX86_BUILTIN_CPYSGNPS512,
   IX86_BUILTIN_CPYSGNPD256,
+  IX86_BUILTIN_CPYSGNPD512,
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512,
+
   /* FMA4 instructions.  */
   IX86_BUILTIN_VFMADDSS,
@@ -33902,6 +34138,16 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD);
	  else if (out_n == 4 && in_n == 4)
	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD256);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD512);
+	}
+      break;
+
+    case BUILT_IN_EXP2F:
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
	}
       break;
@@ -33912,6 +34158,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR);
	  else if (out_n == 8 && in_n == 8)
	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR256);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR512);
	}
       break;
@@ -33928,6 +34176,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
	  else if (out_n == 8 && in_n == 4)
	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
	}
       break;
@@ -33960,6 +34210,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
	  else if (out_n == 8 && in_n == 4)
	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
	}
       break;
@@ -34016,6 +34268,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
	  else if (out_n == 8 && in_n == 4)
	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
	}
       break;
@@ -34042,6 +34296,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD);
	  else if (out_n == 4 && in_n == 4)
	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD256);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD512);
	}
       break;
@@ -34052,6 +34308,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS);
	  else if (out_n == 8 && in_n == 8)
	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS256);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS512);
	}
       break;
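Annotation: with the new *512 entries wired into ix86_builtin_vectorized_function, the vectorizer can widen math calls to 8 doubles or 16 floats per iteration. For instance, a loop like this one (illustrative source, not from the patch) becomes a candidate for IX86_BUILTIN_SQRTPD512 under -O3 -mavx512f -ffast-math:

#include <math.h>

/* Eight doubles per vector iteration via vsqrtpd on %zmm registers;
   whether 512-bit code is actually chosen depends on the cost model.  */
void vsqrt (double *restrict dst, const double *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = sqrt (src[i]);
}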
@@ -34487,6 +34745,34 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
     case V8SImode:
       code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
       break;
+#if 0
+      /* FIXME: Commented until vectorizer can work with (mask_type != src_type)
+	 PR59617.  */
+    case V8DFmode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
+      else
+	return NULL_TREE;
+      break;
+    case V8DImode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
+      else
+	return NULL_TREE;
+      break;
+    case V16SFmode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
+      else
+	return NULL_TREE;
+      break;
+    case V16SImode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
+      else
+	return NULL_TREE;
+      break;
+#endif
     default:
       return NULL_TREE;
     }
@@ -34542,7 +34828,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
 {
   unsigned i, nelt = GET_MODE_NUNITS (mode);
   unsigned mask = 0;
-  unsigned char ipar[8] = {};  /* Silence -Wuninitialized warning.  */
+  unsigned char ipar[16] = {};  /* Silence -Wuninitialized warning.  */
 
   if (XVECLEN (par, 0) != (int) nelt)
     return 0;
@@ -34565,6 +34851,24 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
   switch (mode)
     {
+    case V8DFmode:
+      /* In the 512-bit DFmode case, we can only move elements within
+	 a 128-bit lane.  First fill the second part of the mask,
+	 then fallthru.  */
+      for (i = 4; i < 6; ++i)
+	{
+	  if (ipar[i] < 4 || ipar[i] >= 6)
+	    return 0;
+	  mask |= (ipar[i] - 4) << i;
+	}
+      for (i = 6; i < 8; ++i)
+	{
+	  if (ipar[i] < 6)
+	    return 0;
+	  mask |= (ipar[i] - 6) << i;
+	}
+      /* FALLTHRU */
+
     case V4DFmode:
       /* In the 256-bit DFmode case, we can only move elements within
	 a 128-bit lane.  */
@@ -34582,10 +34886,18 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
	}
       break;
 
+    case V16SFmode:
+      /* In 512 bit SFmode case, permutation in the upper 256 bits
+	 must mirror the permutation in the lower 256-bits.  */
+      for (i = 0; i < 8; ++i)
+	if (ipar[i] + 8 != ipar[i + 8])
+	  return 0;
+      /* FALLTHRU */
+
     case V8SFmode:
-      /* In the 256-bit SFmode case, we have full freedom of movement
-	 within the low 128-bit lane, but the high 128-bit lane must
-	 mirror the exact same pattern.  */
+      /* In 256 bit SFmode case, we have full freedom of
+	 movement within the low 128-bit lane, but the high 128-bit
+	 lane must mirror the exact same pattern.  */
       for (i = 0; i < 4; ++i)
	if (ipar[i] + 4 != ipar[i + 4])
	  return 0;
@@ -35536,6 +35848,7 @@ static bool
 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
		bool speed)
 {
+  rtx mask;
   enum rtx_code code = (enum rtx_code) code_i;
   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
   enum machine_mode mode = GET_MODE (x);
@@ -36012,13 +36325,21 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 
     case VEC_SELECT:
     case VEC_CONCAT:
-    case VEC_MERGE:
     case VEC_DUPLICATE:
       /* ??? Assume all of these vector manipulation patterns are
	 recognizable.  In which case they all pretty much have the
	 same cost.  */
      *total = cost->fabs;
      return true;
+    case VEC_MERGE:
+      mask = XEXP (x, 2);
+      /* This is masked instruction, assume the same cost,
+	 as nonmasked variant.  */
+      if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+	*total = rtx_cost (XEXP (x, 0), outer_code, opno, speed);
+      else
+	*total = cost->fabs;
+      return true;
 
     default:
       return false;
@@ -37184,6 +37505,36 @@ get_mode_wider_vector (enum machine_mode o)
   return n;
 }
 
+/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
+   fill target with val via vec_duplicate.  */
+
+static bool
+ix86_vector_duplicate_value (enum machine_mode mode, rtx target, rtx val)
+{
+  bool ok;
+  rtx insn, dup;
+
+  /* First attempt to recognize VAL as-is.  */
+  dup = gen_rtx_VEC_DUPLICATE (mode, val);
+  insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
+  if (recog_memoized (insn) < 0)
+    {
+      rtx seq;
+      /* If that fails, force VAL into a register.  */
+
+      start_sequence ();
+      XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
+      seq = get_insns ();
+      end_sequence ();
+      if (seq)
+	emit_insn_before (seq, insn);
+
+      ok = recog_memoized (insn) >= 0;
+      gcc_assert (ok);
+    }
+  return true;
+}
+
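Annotation: ix86_vector_duplicate_value factors the old inline "try vec_duplicate, then force the scalar into a register" sequence out of ix86_expand_vector_init_duplicate so the 512-bit modes can share it; on AVX-512F the recognized pattern ends up as a vbroadcastss/vbroadcastsd/vpbroadcast-style insn. The intrinsic-level equivalent, as a sketch:

#include <immintrin.h>

/* Splat one float into all 16 lanes; with -mavx512f this is a single
   vbroadcastss into a zmm register.  */
__m512 splat (float x)
{
  return _mm512_set1_ps (x);
}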
 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
    with all elements equal to VAR.  Return true if successful.  */
 
@@ -37209,29 +37560,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
     case V2DImode:
     case V4SFmode:
     case V4SImode:
-      {
-	rtx insn, dup;
-
-	/* First attempt to recognize VAL as-is.  */
-	dup = gen_rtx_VEC_DUPLICATE (mode, val);
-	insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
-	if (recog_memoized (insn) < 0)
-	  {
-	    rtx seq;
-	    /* If that fails, force VAL into a register.  */
-
-	    start_sequence ();
-	    XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
-	    seq = get_insns ();
-	    end_sequence ();
-	    if (seq)
-	      emit_insn_before (seq, insn);
-
-	    ok = recog_memoized (insn) >= 0;
-	    gcc_assert (ok);
-	  }
-      }
-      return true;
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
+      return ix86_vector_duplicate_value (mode, target, val);
 
     case V4HImode:
       if (!mmx_ok)
@@ -37581,8 +37914,8 @@ static void
 ix86_expand_vector_init_concat (enum machine_mode mode,
				rtx target, rtx *ops, int n)
 {
-  enum machine_mode cmode, hmode = VOIDmode;
-  rtx first[8], second[4];
+  enum machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
+  rtx first[16], second[8], third[4];
   rtvec v;
   int i, j;
 
@@ -37591,6 +37924,18 @@ ix86_expand_vector_init_concat (enum machine_mode mode,
     case 2:
       switch (mode)
	{
+	case V16SImode:
+	  cmode = V8SImode;
+	  break;
+	case V16SFmode:
+	  cmode = V8SFmode;
+	  break;
+	case V8DImode:
+	  cmode = V4DImode;
+	  break;
+	case V8DFmode:
+	  cmode = V4DFmode;
+	  break;
	case V8SImode:
	  cmode = V4SImode;
	  break;
@@ -37657,6 +38002,14 @@ ix86_expand_vector_init_concat (enum machine_mode mode,
     case 8:
       switch (mode)
	{
+	case V8DImode:
+	  cmode = V2DImode;
+	  hmode = V4DImode;
+	  break;
+	case V8DFmode:
+	  cmode = V2DFmode;
+	  hmode = V4DFmode;
+	  break;
	case V8SImode:
	  cmode = V2SImode;
	  hmode = V4SImode;
	  break;
@@ -37670,6 +38023,24 @@ ix86_expand_vector_init_concat (enum machine_mode mode,
	}
       goto half;
 
+    case 16:
+      switch (mode)
+	{
+	case V16SImode:
+	  cmode = V2SImode;
+	  hmode = V4SImode;
+	  gmode = V8SImode;
+	  break;
+	case V16SFmode:
+	  cmode = V2SFmode;
+	  hmode = V4SFmode;
+	  gmode = V8SFmode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      goto half;
+
 half:
   /* FIXME: We process inputs backward to help RA.  PR 36222.  */
   i = n - 1;
     }
 
   n >>= 1;
-  if (n > 2)
+  if (n > 4)
+    {
+      gcc_assert (hmode != VOIDmode);
+      gcc_assert (gmode != VOIDmode);
+      for (i = j = 0; i < n; i += 2, j++)
+	{
+	  second[j] = gen_reg_rtx (hmode);
+	  ix86_expand_vector_init_concat (hmode, second [j],
+					  &first [i], 2);
+	}
+      n >>= 1;
+      for (i = j = 0; i < n; i += 2, j++)
+	{
+	  third[j] = gen_reg_rtx (gmode);
+	  ix86_expand_vector_init_concat (gmode, third[j],
+					  &second[i], 2);
+	}
+      n >>= 1;
+      ix86_expand_vector_init_concat (mode, target, third, n);
+    }
+  else if (n > 2)
     {
       gcc_assert (hmode != VOIDmode);
       for (i = j = 0; i < n; i += 2, j++)
@@ -37826,7 +38217,7 @@ static void
 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
				 rtx target, rtx vals)
 {
-  rtx ops[32], op0, op1;
+  rtx ops[64], op0, op1;
   enum machine_mode half_mode = VOIDmode;
   int n, i;
 
@@ -37838,6 +38229,10 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
	break;
       /* FALLTHRU */
 
+    case V16SImode:
+    case V16SFmode:
+    case V8DFmode:
+    case V8DImode:
     case V8SFmode:
     case V8SImode:
     case V4DFmode:
@@ -38463,6 +38858,42 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
	}
       break;
 
+    case V16SFmode:
+      tmp = gen_reg_rtx (V8SFmode);
+      if (elt < 8)
+	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 7);
+      return;
+
+    case V8DFmode:
+      tmp = gen_reg_rtx (V4DFmode);
+      if (elt < 4)
+	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 3);
+      return;
+
+    case V16SImode:
+      tmp = gen_reg_rtx (V8SImode);
+      if (elt < 8)
+	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 7);
+      return;
+
+    case V8DImode:
+      tmp = gen_reg_rtx (V4DImode);
+      if (elt < 4)
+	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 3);
+      return;
+
     case V8QImode:
       /* ??? Could extract the appropriate HImode element and shift.  */
     default:
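Annotation: the new extract cases reduce a 512-bit element extract to a 256-bit one: pull out the half that contains the element, then recurse with the index masked down. The same shape in intrinsics, as a sketch (names are illustrative):

#include <immintrin.h>

/* Extract lane ELT from a V8DF-style vector by first taking the
   256-bit half that contains it, mirroring the new V8DFmode case.  */
double extract_lane (__m512d v, int elt)
{
  __m256d half = (elt < 4) ? _mm512_extractf64x4_pd (v, 0)
			   : _mm512_extractf64x4_pd (v, 1);
  double tmp[4];
  _mm256_storeu_pd (tmp, half);
  return tmp[elt & 3];
}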
@@ -38555,6 +38986,44 @@ emit_reduc_half (rtx dest, rtx src, int i)
		       GEN_INT (i / 2));
	}
       break;
+    case V16SImode:
+    case V16SFmode:
+    case V8DImode:
+    case V8DFmode:
+      if (i > 128)
+	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
+					gen_lowpart (V16SImode, src),
+					gen_lowpart (V16SImode, src),
+					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+					GEN_INT (0xC), GEN_INT (0xD),
+					GEN_INT (0xE), GEN_INT (0xF),
+					GEN_INT (0x10), GEN_INT (0x11),
+					GEN_INT (0x12), GEN_INT (0x13),
+					GEN_INT (0x14), GEN_INT (0x15),
+					GEN_INT (0x16), GEN_INT (0x17));
+      else
+	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
+				    gen_lowpart (V16SImode, src),
+				    GEN_INT (i == 128 ? 0x2 : 0x1),
+				    GEN_INT (0x3),
+				    GEN_INT (0x3),
+				    GEN_INT (0x3),
+				    GEN_INT (i == 128 ? 0x6 : 0x5),
+				    GEN_INT (0x7),
+				    GEN_INT (0x7),
+				    GEN_INT (0x7),
+				    GEN_INT (i == 128 ? 0xA : 0x9),
+				    GEN_INT (0xB),
+				    GEN_INT (0xB),
+				    GEN_INT (0xB),
+				    GEN_INT (i == 128 ? 0xE : 0xD),
+				    GEN_INT (0xF),
+				    GEN_INT (0xF),
+				    GEN_INT (0xF));
+      break;
     default:
       gcc_unreachable ();
     }
@@ -38619,6 +39088,8 @@ ix86_vector_mode_supported_p (enum machine_mode mode)
     return true;
   if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
     return true;
+  if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
+    return true;
   if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
     return true;
   if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
@@ -38932,9 +39403,15 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
   b = force_reg (mode, b);
 
   /* x0 = rcp(b) estimate */
-  emit_insn (gen_rtx_SET (VOIDmode, x0,
-			  gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
-					  UNSPEC_RCP)));
+  if (mode == V16SFmode || mode == V8DFmode)
+    emit_insn (gen_rtx_SET (VOIDmode, x0,
+			    gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+					    UNSPEC_RCP14)));
+  else
+    emit_insn (gen_rtx_SET (VOIDmode, x0,
+			    gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+					    UNSPEC_RCP)));
+
   /* e0 = x0 * b */
   emit_insn (gen_rtx_SET (VOIDmode, e0,
			  gen_rtx_MULT (mode, x0, b)));
@@ -38964,6 +39441,7 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
 {
   rtx x0, e0, e1, e2, e3, mthree, mhalf;
   REAL_VALUE_TYPE r;
+  int unspec;
 
   x0 = gen_reg_rtx (mode);
   e0 = gen_reg_rtx (mode);
@@ -38976,11 +39454,15 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
   mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
+  unspec = UNSPEC_RSQRT;
 
   if (VECTOR_MODE_P (mode))
     {
       mthree = ix86_build_const_vector (mode, true, mthree);
       mhalf = ix86_build_const_vector (mode, true, mhalf);
+      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
+      if (GET_MODE_SIZE (mode) == 64)
+	unspec = UNSPEC_RSQRT14;
     }
 
   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
@@ -38991,7 +39473,7 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
   /* x0 = rsqrt(a) estimate */
   emit_insn (gen_rtx_SET (VOIDmode, x0,
			  gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
-					  UNSPEC_RSQRT)));
+					  unspec)));
 
   /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
   if (!recip)
@@ -39002,11 +39484,23 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
       mask = gen_reg_rtx (mode);
       zero = force_reg (mode, CONST0_RTX(mode));
-      emit_insn (gen_rtx_SET (VOIDmode, mask,
-			      gen_rtx_NE (mode, zero, a)));
 
-      emit_insn (gen_rtx_SET (VOIDmode, x0,
-			      gen_rtx_AND (mode, x0, mask)));
+      /* Handle masked compare.  */
+      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+	{
+	  mask = gen_reg_rtx (HImode);
+	  /* Imm value 0x4 corresponds to not-equal comparison.  */
+	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
+	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
+	}
+      else
+	{
+	  emit_insn (gen_rtx_SET (VOIDmode, mask,
+				  gen_rtx_NE (mode, zero, a)));
+
+	  emit_insn (gen_rtx_SET (VOIDmode, x0,
+				  gen_rtx_AND (mode, x0, mask)));
+	}
     }
 
   /* e0 = x0 * a */
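Annotation: there is no 512-bit rsqrtps, so the software sqrt path substitutes vrsqrt14ps (UNSPEC_RSQRT14) and, for the zero-input filtering, compares into a k-mask and zeroes the estimate with vblendmps instead of building a vector mask and ANDing. One Newton-Raphson step then refines the 14-bit estimate. A C sketch of the emitted sequence, under the stated formula; it is illustrative, not a drop-in replacement:

#include <immintrin.h>

/* sqrt(a) ~= -0.5 * (a*x0) * ((a*x0)*x0 - 3.0), x0 = rsqrt14(a),
   with lanes where a == 0 forced to 0 to avoid NaN for sqrt(0.0).
   0x4 (_CMP_NEQ_UQ) is the not-equal predicate used by the patch.  */
__m512 fast_sqrt (__m512 a)
{
  __m512 x0 = _mm512_rsqrt14_ps (a);
  __mmask16 k = _mm512_cmp_ps_mask (_mm512_setzero_ps (), a, _CMP_NEQ_UQ);
  x0 = _mm512_mask_blend_ps (k, _mm512_setzero_ps (), x0);
  __m512 e0 = _mm512_mul_ps (a, x0);                       /* a * x0      */
  __m512 e1 = _mm512_mul_ps (e0, x0);                      /* a * x0 * x0 */
  __m512 e2 = _mm512_sub_ps (e1, _mm512_set1_ps (3.0f));   /* ... - 3.0   */
  return _mm512_mul_ps (_mm512_mul_ps (e0, e2), _mm512_set1_ps (-0.5f));
}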
@@ -40528,6 +41022,19 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_pshufb (d))
     return true;
 
+  /* Try the AVX512F vpermi2 instructions.  */
+  rtx vec[64];
+  enum machine_mode mode = d->vmode;
+  if (mode == V8DFmode)
+    mode = V8DImode;
+  else if (mode == V16SFmode)
+    mode = V16SImode;
+  for (i = 0; i < nelt; ++i)
+    vec[i] = GEN_INT (d->perm[i]);
+  rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
+  if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1))
+    return true;
+
   return false;
 }
@@ -42135,6 +42642,10 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
 
   /* Given sufficient ISA support we can just return true here
      for selected vector modes.  */
+  if (d.vmode == V16SImode || d.vmode == V16SFmode
+      || d.vmode == V8DFmode || d.vmode == V8DImode)
+    /* All implementable with a single vpermi2 insn.  */
+    return true;
   if (GET_MODE_SIZE (d.vmode) == 16)
     {
       /* All implementable with a single vpperm insn.  */
@@ -42377,7 +42888,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
     op2 = force_reg (mode, op2);
 
   /* We only play even/odd games with vectors of SImode.  */
-  gcc_assert (mode == V4SImode || mode == V8SImode);
+  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
 
   /* If we're looking for the odd results, shift those members down to
      the even slots.  For some cpus this is faster than a PSHUFD.  */
@@ -42403,7 +42914,14 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
       op2 = gen_lowpart (mode, op2);
     }
 
-  if (mode == V8SImode)
+  if (mode == V16SImode)
+    {
+      if (uns_p)
+	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
+      else
+	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
+    }
+  else if (mode == V8SImode)
     {
       if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
@@ -42623,6 +43141,11 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
       umul = gen_vec_widen_umult_even_v8si;
       nmode = V8SImode;
     }
+  else if (mode == V8DImode)
+    {
+      umul = gen_vec_widen_umult_even_v16si;
+      nmode = V16SImode;
+    }
   else
     gcc_unreachable ();
@@ -43769,12 +44292,16 @@ ix86_preferred_simd_mode (enum machine_mode mode)
     case HImode:
       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
     case SImode:
-      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
+      return TARGET_AVX512F ? V16SImode :
+	(TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
     case DImode:
-      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
+      return TARGET_AVX512F ? V8DImode :
+	(TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
     case SFmode:
-      if (TARGET_AVX && !TARGET_PREFER_AVX128)
+      if (TARGET_AVX512F)
+	return V16SFmode;
+      else if (TARGET_AVX && !TARGET_PREFER_AVX128)
	return V8SFmode;
       else
	return V4SFmode;
@@ -43782,6 +44309,8 @@ ix86_preferred_simd_mode (enum machine_mode mode)
     case DFmode:
       if (!TARGET_VECTORIZE_DOUBLE)
	return word_mode;
+      else if (TARGET_AVX512F)
+	return V8DFmode;
       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
	return V4DFmode;
       else if (TARGET_SSE2)
@@ -43794,12 +44323,14 @@ ix86_preferred_simd_mode (enum machine_mode mode)
 }
 
 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
-   vectors.  */
+   vectors.  If AVX512F is enabled then try vectorizing with 512bit,
+   256bit and 128bit vectors.  */
 
 static unsigned int
 ix86_autovectorize_vector_sizes (void)
 {
-  return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
+  return TARGET_AVX512F ? 64 | 32 | 16 :
+    (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
 }
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7beb245d9c7..a3c0e0c2398 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -748,8 +748,9 @@
    (set (attr "mode")
	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
		 (const_string "<ssePSmode>")
-	       (and (eq_attr "alternative" "2")
-		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+	       (and (match_test "GET_MODE_SIZE (<MODE>mode) == 16")
+		    (and (eq_attr "alternative" "2")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
		 (const_string "<ssePSmode>")
	       (match_test "TARGET_AVX")
		 (const_string "<sseinsnmode>")
@@ -986,8 +987,9 @@
   (set_attr "ssememalign" "8")
   (set_attr "prefix" "maybe_vex")
   (set (attr "mode")
-	(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+	(cond [(and (match_test "GET_MODE_SIZE (<MODE>mode) == 16")
+		    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
		 (const_string "<ssePSmode>")
	       (match_test "TARGET_AVX")
		 (const_string "<MODE>")
@@ -1091,6 +1093,7 @@
 {
   switch (get_attr_mode (insn))
     {
+    case MODE_V16SF:
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
@@ -1113,8 +1116,9 @@
	   (const_string "1")))
   (set_attr "prefix" "maybe_vex")
   (set (attr "mode")
-	(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+	(cond [(and (match_test "GET_MODE_SIZE (<MODE>mode) == 16")
+		    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
		 (const_string "<ssePSmode>")
	       (match_test "TARGET_AVX")
		 (const_string "<sseinsnmode>")
@@ -3492,7 +3496,11 @@
	(match_operand:<sseintvecmode> 1 "register_operand")]
  "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)"
 {
-  ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]);
+  if (<MODE>mode == V16SFmode)
+    emit_insn (gen_ufloatv16siv16sf2 (operands[0], operands[1]));
+  else
+    ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]);
+
   DONE;
 })
@@ -3583,11 +3591,17 @@
	(match_operand:VF1 1 "register_operand")]
  "TARGET_SSE2"
 {
-  rtx tmp[3];
-  tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
-  tmp[1] = gen_reg_rtx (<sseintvecmode>mode);
-  emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0]));
-  emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2]));
+  if (<MODE>mode == V16SFmode)
+    emit_insn (gen_ufix_truncv16sfv16si2 (operands[0],
+					  operands[1]));
+  else
+    {
+      rtx tmp[3];
+      tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
+      tmp[1] = gen_reg_rtx (<sseintvecmode>mode);
+      emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0]));
+      emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2]));
+    }
   DONE;
 })
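Annotation: AVX-512F has native unsigned conversions (vcvtudq2ps, vcvttps2udq), so the V16SF paths above bypass ix86_expand_vector_convert_uns_vsivsf and the adjust-to-signed fixup used on the narrower modes. For unsigned SImode to DFmode, where no direct instruction exists, the vec_unpacku_float_hi_v16si expander that follows converts as signed and then adds 2^32 to the negative lanes under a k-mask. Intrinsics equivalents, as a sketch (names are illustrative):

#include <immintrin.h>

/* Direct unsigned conversions, new with AVX-512F.  */
__m512  u2f (__m512i u) { return _mm512_cvtepu32_ps (u); }
__m512i f2u (__m512 f)  { return _mm512_cvttps_epu32 (f); }

/* Unsigned SI -> DF via signed convert plus masked 2^32 fixup,
   mirroring the expander below.  */
__m512d u2d_hi (__m512i u)
{
  __m256i hi = _mm512_extracti64x4_epi64 (u, 1);
  __m512d d  = _mm512_cvtepi32_pd (hi);            /* signed convert */
  __mmask8 k = _mm512_cmp_pd_mask (d, _mm512_setzero_pd (), _CMP_LT_OS);
  return _mm512_mask_add_pd (d, k, d, _mm512_set1_pd (4294967296.0));
}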
@@ -4514,6 +4528,32 @@
   DONE;
 })
 
+(define_expand "vec_unpacku_float_hi_v16si"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V16SI 1 "register_operand")]
+  "TARGET_AVX512F"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx k, x, tmp[4];
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  tmp[0] = force_reg (V8DFmode, CONST0_RTX (V8DFmode));
+  tmp[1] = force_reg (V8DFmode, ix86_build_const_vector (V8DFmode, 1, x));
+  tmp[2] = gen_reg_rtx (V8DFmode);
+  tmp[3] = gen_reg_rtx (V8SImode);
+  k = gen_reg_rtx (QImode);
+
+  emit_insn (gen_vec_extract_hi_v16si (tmp[3], operands[1]));
+  emit_insn (gen_floatv8siv8df2 (tmp[2], tmp[3]));
+  emit_insn (gen_rtx_SET (VOIDmode, k,
+			  gen_rtx_LT (QImode, tmp[2], tmp[0])));
+  emit_insn (gen_addv8df3_mask (tmp[2], tmp[2], tmp[1], tmp[2], k));
+  emit_move_insn (operands[0], tmp[2]);
+  DONE;
+})
+
 (define_expand "vec_unpacku_float_lo_v8si"
   [(match_operand:V4DF 0 "register_operand")
    (match_operand:V8SI 1 "nonimmediate_operand")]
@@ -4679,31 +4719,46 @@
 
 (define_expand "vec_pack_ufix_trunc_<mode>"
   [(match_operand:<ssepackfltmode> 0 "register_operand")
-   (match_operand:VF2_128_256 1 "register_operand")
-   (match_operand:VF2_128_256 2 "register_operand")]
+   (match_operand:VF2 1 "register_operand")
+   (match_operand:VF2 2 "register_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp[7];
-  tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
-  tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]);
-  tmp[4] = gen_reg_rtx (<ssepackfltmode>mode);
-  emit_insn (gen_vec_pack_sfix_trunc_<mode> (tmp[4], tmp[0], tmp[1]));
-  if (<ssepackfltmode>mode == V4SImode || TARGET_AVX2)
+  if (<MODE>mode == V8DFmode)
     {
-      tmp[5] = gen_reg_rtx (<ssepackfltmode>mode);
-      ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0);
+      rtx r1, r2;
+
+      r1 = gen_reg_rtx (V8SImode);
+      r2 = gen_reg_rtx (V8SImode);
+
+      emit_insn (gen_ufix_truncv8dfv8si2 (r1, operands[1]));
+      emit_insn (gen_ufix_truncv8dfv8si2 (r2, operands[2]));
+      emit_insn (gen_avx_vec_concatv16si (operands[0], r1, r2));
     }
   else
     {
-      tmp[5] = gen_reg_rtx (V8SFmode);
-      ix86_expand_vec_extract_even_odd (tmp[5], gen_lowpart (V8SFmode, tmp[2]),
-					gen_lowpart (V8SFmode, tmp[3]), 0);
-      tmp[5] = gen_lowpart (V8SImode, tmp[5]);
+      rtx tmp[7];
+      tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
+      tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]);
+      tmp[4] = gen_reg_rtx (<ssepackfltmode>mode);
+      emit_insn (gen_vec_pack_sfix_trunc_<mode> (tmp[4], tmp[0], tmp[1]));
+      if (<ssepackfltmode>mode == V4SImode || TARGET_AVX2)
+	{
+	  tmp[5] = gen_reg_rtx (<ssepackfltmode>mode);
+	  ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0);
+	}
+      else
+	{
+	  tmp[5] = gen_reg_rtx (V8SFmode);
+	  ix86_expand_vec_extract_even_odd (tmp[5],
					    gen_lowpart (V8SFmode, tmp[2]),
+					    gen_lowpart (V8SFmode, tmp[3]), 0);
+	  tmp[5] = gen_lowpart (V8SImode, tmp[5]);
+	}
+      tmp[6] = expand_simple_binop (<ssepackfltmode>mode, XOR, tmp[4], tmp[5],
+				    operands[0], 0, OPTAB_DIRECT);
+      if (tmp[6] != operands[0])
+	emit_move_insn (operands[0], tmp[6]);
     }
-  tmp[6] = expand_simple_binop (<ssepackfltmode>mode, XOR, tmp[4], tmp[5],
-				operands[0], 0, OPTAB_DIRECT);
-  if (tmp[6] != operands[0])
-    emit_move_insn (operands[0], tmp[6]);
+
   DONE;
 })
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 1060e569aaa..ed9467b04bb 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,15 @@
+2013-12-31  Alexander Ivchenko  <alexander.ivchenko@intel.com>
+	    Maxim Kuznetsov  <maxim.kuznetsov@intel.com>
+	    Sergey Lega  <sergey.s.lega@intel.com>
+	    Anna Tikhonova  <anna.tikhonova@intel.com>
+	    Ilya Tocar  <ilya.tocar@intel.com>
+	    Andrey Turetskiy  <andrey.turetskiy@intel.com>
+	    Ilya Verbin  <ilya.verbin@intel.com>
+	    Kirill Yukhin  <kirill.yukhin@intel.com>
+	    Michael Zolotukhin  <michael.v.zolotukhin@intel.com>
+
+	* gcc.target/i386/pr49002-2.c: allow vmovapd generation.
+
 2013-12-31  Sandra Loosemore  <sandra@codesourcery.com>
 	    Chung-Lin Tang  <cltang@codesourcery.com>
 
 	Based on patches from Altera Corporation
diff --git a/gcc/testsuite/gcc.target/i386/pr49002-2.c b/gcc/testsuite/gcc.target/i386/pr49002-2.c
index 9f21a2d17d9..dfb83b4a75d 100644
--- a/gcc/testsuite/gcc.target/i386/pr49002-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr49002-2.c
@@ -12,4 +12,4 @@ void foo(const __m128d from, __m256d *to)
 /* Ensure we store ymm, not xmm.  */
 /* { dg-final { scan-assembler-not "vmovapd\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */
 /* { dg-final { scan-assembler-not "vmovaps\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */
-/* { dg-final { scan-assembler "vmovaps\[\t \]*%ymm\[0-9\]\+,\[^,\]*" } } */
+/* { dg-final { scan-assembler "vmovap\[sd\]\[\t \]*%ymm\[0-9\]\+,\[^,\]*" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index a07c14d153e..e4f04c44760 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -5699,7 +5699,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
       tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
-      tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE;
+      tree ptr, mask, var, scale, merge, perm_mask = NULL_TREE, prev_res = NULL_TREE;
       edge pe = loop_preheader_edge (loop);
       gimple_seq seq;
       basic_block new_bb;
@@ -5741,8 +5741,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
       masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
       scaletype = TREE_VALUE (arglist);
-      gcc_checking_assert (types_compatible_p (srctype, rettype)
-			   && types_compatible_p (srctype, masktype));
+      gcc_checking_assert (types_compatible_p (srctype, rettype));
 
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
 
@@ -5756,8 +5755,13 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
       /* Currently we support only unconditional gather loads,
	 so mask should be all ones.  */
-      if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
-	mask = build_int_cst (TREE_TYPE (masktype), -1);
+      if (TREE_CODE (masktype) == INTEGER_TYPE)
+	mask = build_int_cst (masktype, -1);
+      else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+	{
+	  mask = build_int_cst (TREE_TYPE (masktype), -1);
+	  mask = build_vector_from_val (masktype, mask);
+	}
       else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
	{
	  REAL_VALUE_TYPE r;
@@ -5766,14 +5770,30 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
	    tmp[j] = -1;
	  real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
	  mask = build_real (TREE_TYPE (masktype), r);
+	  mask = build_vector_from_val (masktype, mask);
	}
       else
	gcc_unreachable ();
-      mask = build_vector_from_val (masktype, mask);
       mask = vect_init_vector (stmt, mask, masktype, NULL);
 
       scale = build_int_cst (scaletype, gather_scale);
 
+      if (TREE_CODE (TREE_TYPE (rettype)) == INTEGER_TYPE)
+	merge = build_int_cst (TREE_TYPE (rettype), 0);
+      else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (rettype)))
+	{
+	  REAL_VALUE_TYPE r;
+	  long tmp[6];
+	  for (j = 0; j < 6; ++j)
+	    tmp[j] = 0;
+	  real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (rettype)));
+	  merge = build_real (TREE_TYPE (rettype), r);
+	}
+      else
+	gcc_unreachable ();
+      merge = build_vector_from_val (rettype, merge);
+      merge = vect_init_vector (stmt, merge, rettype, NULL);
+
       prev_stmt_info = NULL;
       for (j = 0; j < ncopies; ++j)
	{
@@ -5802,7 +5822,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
	    }
 
	  new_stmt
-	    = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale);
+	    = gimple_build_call (gather_decl, 5, merge, ptr, op, mask, scale);
 
	  if (!useless_type_conversion_p (vectype, rettype))
	    {
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 54e73c8c9a0..00e56dcb388 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -683,8 +683,8 @@ struct dataref_aux {
    conversion.  */
 #define MAX_INTERM_CVT_STEPS 3
 
-/* The maximum vectorization factor supported by any target (V32QI).  */
-#define MAX_VECTORIZATION_FACTOR 32
+/* The maximum vectorization factor supported by any target (V64QI).  */
+#define MAX_VECTORIZATION_FACTOR 64
 
 /* Avoid GTY(()) on stmt_vec_info.  */
 typedef void *vec_void_p;
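Annotation: the vectorizable_load changes reflect the AVX-512 gather builtin shape: the mask argument may be a scalar integer k-mask rather than a vector of the source type (hence the relaxed assert and the new INTEGER_TYPE masktype case), and the old pass-through source operand becomes a separate all-zeros "merge" input. The kind of loop this supports at 16 lanes, plus a hand-written equivalent of one iteration (illustrative; note the ix86_vectorize_builtin_gather cases are still #if 0'd pending PR59617, so the auto-vectorized 512-bit form depends on that fix):

#include <immintrin.h>

/* With -O3 -mavx512f this indexed load is gather-shaped and can map
   to vgatherdps once the vectorizer-side FIXME is resolved.  */
void gather_loop (float *restrict dst, const float *base,
		  const int *restrict idx, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = base[idx[i]];
}

/* Hand-written equivalent of one 16-lane iteration.  */
__m512 gather16 (const float *base, __m512i idx)
{
  return _mm512_i32gather_ps (idx, base, 4);
}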