Diffstat (limited to 'gcc/config/i386')
-rw-r--r-- | gcc/config/i386/avx512fintrin.h | 76
-rw-r--r-- | gcc/config/i386/i386-expand.cc  | 77
-rw-r--r-- | gcc/config/i386/i386.cc         | 10
-rw-r--r-- | gcc/config/i386/smmintrin.h     | 25
-rw-r--r-- | gcc/config/i386/sse.md          | 34
5 files changed, 138 insertions, 84 deletions
diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index 29511fd2831..77d6249c2bc 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -3286,31 +3286,67 @@ _mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
 					      (__mmask8) __U, __R);
 }
 #else
-#define _mm512_scalef_round_pd(A, B, C) \
-  (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
-  (__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_scalef_round_pd(U, A, B, C) \
-  (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_scalef_round_pd(A, B, C) \
+  ((__m512d) \
+   __builtin_ia32_scalefpd512_mask((A), (B), \
+				   (__v8df) _mm512_undefined_pd(), \
+				   -1, (C)))
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
+  ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, C) \
+  ((__m512d) \
+   __builtin_ia32_scalefpd512_mask((A), (B), \
+				   (__v8df) _mm512_setzero_pd(), \
+				   (U), (C)))
+
+#define _mm512_scalef_round_ps(A, B, C) \
+  ((__m512) \
+   __builtin_ia32_scalefps512_mask((A), (B), \
+				   (__v16sf) _mm512_undefined_ps(), \
+				   -1, (C)))
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
+  ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, C) \
+  ((__m512) \
+   __builtin_ia32_scalefps512_mask((A), (B), \
+				   (__v16sf) _mm512_setzero_ps(), \
+				   (U), (C)))
+
+#define _mm_scalef_round_sd(A, B, C) \
+  ((__m128d) \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), \
+				       (__v2df) _mm_undefined_pd (), \
+				       -1, (C)))
 
-#define _mm512_scalef_round_ps(A, B, C) \
-  (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm_scalef_round_ss(A, B, C) \
+  ((__m128) \
+   __builtin_ia32_scalefss_mask_round ((A), (B), \
+				       (__v4sf) _mm_undefined_ps (), \
+				       -1, (C)))
 
-#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
-  (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C)
+#define _mm_mask_scalef_round_sd(W, U, A, B, C) \
+  ((__m128d) \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C)))
 
-#define _mm512_maskz_scalef_round_ps(U, A, B, C) \
-  (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm_mask_scalef_round_ss(W, U, A, B, C) \
+  ((__m128) \
+   __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C)))
 
-#define _mm_scalef_round_sd(A, B, C) \
-  (__m128d)__builtin_ia32_scalefsd_mask_round (A, B, \
-					       (__v2df)_mm_setzero_pd (), -1, C)
+#define _mm_maskz_scalef_round_sd(U, A, B, C) \
+  ((__m128d) \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), \
+				       (__v2df) _mm_setzero_pd (), \
+				       (U), (C)))
 
-#define _mm_scalef_round_ss(A, B, C) \
-  (__m128)__builtin_ia32_scalefss_mask_round (A, B, \
-					      (__v4sf)_mm_setzero_ps (), -1, C)
+#define _mm_maskz_scalef_round_ss(U, A, B, C) \
+  ((__m128) \
+   __builtin_ia32_scalefss_mask_round ((A), (B), \
+					(__v4sf) _mm_setzero_ps (), \
+					(U), (C)))
 #endif
 
 #define _mm_mask_scalef_sd(W, U, A, B) \
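The rewritten macros parenthesize every argument and the whole expansion, and the scalar scalef variants gain the previously missing mask/maskz forms. A minimal generic C sketch (not from the patch) of the precedence bug that unparenthesized macro bodies invite:

    #include <stdio.h>

    /* Unhygienic: neither the arguments nor the expansion are
       parenthesized, so precedence leaks across the macro boundary.  */
    #define SUM_BAD(A, B)   A + B
    /* Hygienic, in the style the patch adopts.  */
    #define SUM_GOOD(A, B)  ((A) + (B))

    int
    main (void)
    {
      printf ("%d\n", SUM_BAD (1, 2) * 3);   /* expands to 1 + 2 * 3, prints 7 */
      printf ("%d\n", SUM_GOOD (1, 2) * 3);  /* prints 9 */
      return 0;
    }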
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 794315ee2f7..31780b6daf7 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3136,6 +3136,8 @@ ix86_expand_int_movcc (rtx operands[])
   bool sign_bit_compare_p = false;
   rtx op0 = XEXP (operands[1], 0);
   rtx op1 = XEXP (operands[1], 1);
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
 
   if (GET_MODE (op0) == TImode
       || (GET_MODE (op0) == DImode
@@ -3153,17 +3155,29 @@ ix86_expand_int_movcc (rtx operands[])
 	  || (op1 == constm1_rtx && (code == GT || code == LE)))
 	sign_bit_compare_p = true;
 
+  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
+     but if op1 is a constant, the latter form allows more optimizations,
+     either through the last 2 ops being constant handling, or the one
+     constant and one variable cases.  On the other side, for cmov the
+     former might be better as we don't need to load the constant into
+     another register.  */
+  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
+    op2 = op1;
+  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
+  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
+    op3 = op1;
+
   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
      HImode insns, we'd be swallowed in word prefix ops.  */
 
   if ((mode != HImode || TARGET_FAST_PREFIX)
       && (mode != (TARGET_64BIT ? TImode : DImode))
-      && CONST_INT_P (operands[2])
-      && CONST_INT_P (operands[3]))
+      && CONST_INT_P (op2)
+      && CONST_INT_P (op3))
     {
       rtx out = operands[0];
-      HOST_WIDE_INT ct = INTVAL (operands[2]);
-      HOST_WIDE_INT cf = INTVAL (operands[3]);
+      HOST_WIDE_INT ct = INTVAL (op2);
+      HOST_WIDE_INT cf = INTVAL (op3);
       HOST_WIDE_INT diff;
 
       diff = ct - cf;
@@ -3559,6 +3573,9 @@ ix86_expand_int_movcc (rtx operands[])
       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
 	return false;
 
+      operands[2] = op2;
+      operands[3] = op3;
+
       /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */
 
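In source terms, the new EQ/NE canonicalization makes the expander treat the following two functions alike; this is a hypothetical illustration, not code from the patch or its testsuite:

    /* With the change, x == 5 ? x : y is handled internally as
       x == 5 ? 5 : y, so the two-constants and one-constant movcc
       optimizations can fire; for cmov the register form may still be
       preferred, as the comment in the patch notes.  */
    int f (int x, int y) { return x == 5 ? x : y; }  /* op2 equals op0 */
    int g (int x, int y) { return x == 5 ? 5 : y; }  /* op2 equals op1 */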
@@ -17036,7 +17053,8 @@ ix86_emit_fp_unordered_jump (rtx label)
 
 /* Output code to perform an sinh XFmode calculation.  */
 
-void ix86_emit_i387_sinh (rtx op0, rtx op1)
+void
+ix86_emit_i387_sinh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17084,7 +17102,8 @@ void ix86_emit_i387_sinh (rtx op0, rtx op1)
 
 /* Output code to perform an cosh XFmode calculation.  */
 
-void ix86_emit_i387_cosh (rtx op0, rtx op1)
+void
+ix86_emit_i387_cosh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17106,7 +17125,8 @@ void ix86_emit_i387_cosh (rtx op0, rtx op1)
 
 /* Output code to perform an tanh XFmode calculation.  */
 
-void ix86_emit_i387_tanh (rtx op0, rtx op1)
+void
+ix86_emit_i387_tanh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17152,7 +17172,8 @@ void ix86_emit_i387_tanh (rtx op0, rtx op1)
 
 /* Output code to perform an asinh XFmode calculation.  */
 
-void ix86_emit_i387_asinh (rtx op0, rtx op1)
+void
+ix86_emit_i387_asinh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17204,7 +17225,8 @@ void ix86_emit_i387_asinh (rtx op0, rtx op1)
 
 /* Output code to perform an acosh XFmode calculation.  */
 
-void ix86_emit_i387_acosh (rtx op0, rtx op1)
+void
+ix86_emit_i387_acosh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17230,7 +17252,8 @@ void ix86_emit_i387_acosh (rtx op0, rtx op1)
 
 /* Output code to perform an atanh XFmode calculation.  */
 
-void ix86_emit_i387_atanh (rtx op0, rtx op1)
+void
+ix86_emit_i387_atanh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17281,7 +17304,8 @@ void ix86_emit_i387_atanh (rtx op0, rtx op1)
 
 /* Output code to perform a log1p XFmode calculation.  */
 
-void ix86_emit_i387_log1p (rtx op0, rtx op1)
+void
+ix86_emit_i387_log1p (rtx op0, rtx op1)
 {
   rtx_code_label *label1 = gen_label_rtx ();
   rtx_code_label *label2 = gen_label_rtx ();
@@ -17291,6 +17315,11 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
   rtx cst, cstln2, cst1;
   rtx_insn *insn;
 
+  /* The emit_jump call emits pending stack adjust, make sure it is emitted
+     before the conditional jump, otherwise the stack adjustment will be
+     only conditional.  */
+  do_pending_stack_adjust ();
+
   cst = const_double_from_real_value
     (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
   cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
@@ -17320,7 +17349,8 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
 }
 
 /* Emit code for round calculation.  */
-void ix86_emit_i387_round (rtx op0, rtx op1)
+void
+ix86_emit_i387_round (rtx op0, rtx op1)
 {
   machine_mode inmode = GET_MODE (op1);
   machine_mode outmode = GET_MODE (op0);
@@ -17434,7 +17464,8 @@ void ix86_emit_i387_round (rtx op0, rtx op1)
 /* Output code to perform a Newton-Rhapson approximation of a
    single precision floating point divide
    [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
-void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
+void
+ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
 {
   rtx x0, x1, e0, e1;
 
@@ -17485,7 +17516,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
 
 /* Output code to perform a Newton-Rhapson approximation of a
    single precision floating point [reciprocal] square root.  */
-void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
+void
+ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
 {
   rtx x0, e0, e1, e2, e3, mthree, mhalf;
   REAL_VALUE_TYPE r;
@@ -23240,9 +23272,10 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
     *rem_p = rem;
 }
 
-void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
-				       enum rtx_code code, bool after,
-				       bool doubleword)
+void
+ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
+				  enum rtx_code code, bool after,
+				  bool doubleword)
 {
   rtx old_reg, new_reg, old_mem, success;
   machine_mode mode = GET_MODE (target);
@@ -23286,10 +23319,11 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
    it will be relaxed to an atomic load + compare, and skip
    cmpxchg instruction if mem != exp_input.  */
 
-void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
-			       rtx mem, rtx exp_input, rtx new_input,
-			       rtx mem_model, bool doubleword,
-			       rtx_code_label *loop_label)
+void
+ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
+			  rtx mem, rtx exp_input, rtx new_input,
+			  rtx mem_model, bool doubleword,
+			  rtx_code_label *loop_label)
 {
   rtx_code_label *cmp_label = NULL;
   rtx_code_label *done_label = NULL;
@@ -23388,6 +23422,7 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
 
   /* If mem is not expected, pause and loop back.  */
   emit_label (cmp_label);
+  emit_move_insn (target_val, new_mem);
  emit_insn (gen_pause ());
  emit_jump_insn (gen_jump (loop_label));
  emit_barrier ();
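The one-line emit_move_insn (target_val, new_mem) addition refreshes the expected value each time the cmpxchg loop retries. The C11 compare-exchange idiom has the same requirement; a sketch of the analogous source-level loop (an analogy, not the RTL the expander emits):

    #include <stdatomic.h>

    /* After a failed compare-exchange, the expected value must be
       reloaded from memory or the retry loop can spin with a stale
       value.  atomic_compare_exchange_weak refreshes it via &expected;
       the patch adds the equivalent reload to the emitted loop.  */
    static int
    fetch_add_like (atomic_int *mem, int val)
    {
      int expected = atomic_load (mem);
      while (!atomic_compare_exchange_weak (mem, &expected, expected + val))
	;  /* 'expected' now holds the current value of *mem again */
      return expected;
    }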
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index c959b7144de..b16df5b183e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -4891,6 +4891,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
 	{
 	  int i, prev_size = 0;
 	  tree temp = create_tmp_var (type, "va_arg_tmp");
+	  TREE_ADDRESSABLE (temp) = 1;
 
 	  /* addr = &temp; */
 	  t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
@@ -6524,7 +6525,8 @@ ix86_initial_elimination_offset (int from, int to)
 }
 
 /* Emits a warning for unsupported msabi to sysv pro/epilogues.  */
-void warn_once_call_ms2sysv_xlogues (const char *feature)
+void
+warn_once_call_ms2sysv_xlogues (const char *feature)
 {
   static bool warned_once = false;
   if (!warned_once)
@@ -18806,7 +18808,8 @@ ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
       return NULL_TREE;
     }
 
-  tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
+  tree fndecl = mathfn_built_in (el_mode == DFmode
+				 ? double_type_node : float_type_node, fn);
   bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
 
   if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
@@ -18898,7 +18901,8 @@ ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
      return NULL_TREE;
    }
 
-  tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
+  tree fndecl = mathfn_built_in (el_mode == DFmode
+				 ? double_type_node : float_type_node, fn);
   bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
 
   sprintf (name + 7, "%s", bname+10);
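The TREE_ADDRESSABLE fix concerns va_arg on aggregates, where the address of the temporary is taken for the copy. A small user-level example of code that reaches this path (illustrative only):

    #include <stdarg.h>

    struct big { double a, b, c, d; };  /* passed in memory on x86-64 */

    /* Fetching a struct from a va_list materializes a temporary (the
       "va_arg_tmp" above) whose address is taken, hence the need to
       mark it TREE_ADDRESSABLE.  */
    double
    first_vararg_fields (int n, ...)
    {
      va_list ap;
      va_start (ap, n);
      struct big v = va_arg (ap, struct big);
      va_end (ap);
      return v.a + v.d;
    }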
diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h
index b42b212300f..eb6a451c10a 100644
--- a/gcc/config/i386/smmintrin.h
+++ b/gcc/config/i386/smmintrin.h
@@ -810,17 +810,11 @@ _mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
 
 #include <popcntintrin.h>
 
-#ifndef __SSE4_1__
+#ifndef __CRC32__
 #pragma GCC push_options
-#pragma GCC target("sse4.1")
-#define __DISABLE_SSE4_1__
-#endif /* __SSE4_1__ */
-
-#ifndef __SSE4_2__
-#pragma GCC push_options
-#pragma GCC target("sse4.2")
-#define __DISABLE_SSE4_2__
-#endif /* __SSE4_1__ */
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
 
 /* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
 extern __inline unsigned int
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -849,14 +843,9 @@ _mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
 }
 #endif
 
-#ifdef __DISABLE_SSE4_2__
-#undef __DISABLE_SSE4_2__
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_2__ */
-
-#ifdef __DISABLE_SSE4_1__
-#undef __DISABLE_SSE4_1__
-#pragma GCC pop_options
-#endif /* __DISABLE_SSE4_1__ */
+#endif /* __DISABLE_CRC32__ */
 
 #endif /* _SMMINTRIN_H_INCLUDED */
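With the guard switched from SSE4.1/SSE4.2 to the crc32 feature, the intrinsics should be usable with -mcrc32 alone (options such as -msse4.2 imply it). A minimal usage sketch:

    #include <smmintrin.h>
    #include <stddef.h>

    /* Byte-at-a-time CRC32C checksum; compile with -mcrc32 (or any
       option that enables the crc32 feature).  */
    unsigned int
    crc32c_bytes (const unsigned char *p, size_t n)
    {
      unsigned int crc = 0xffffffffu;
      for (size_t i = 0; i < n; i++)
	crc = _mm_crc32_u8 (crc, p[i]);
      return ~crc;
    }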
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a852c166a58..5e93aa23b47 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -327,9 +327,7 @@
 
 ;; 128-, 256- and 512-bit float vector modes for bitwise operations
 (define_mode_iterator VFB
-  [(V32HF "TARGET_AVX512FP16")
-   (V16HF "TARGET_AVX512FP16")
-   (V8HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
    (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
    (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
@@ -340,8 +338,7 @@
 
 ;; 128- and 256-bit float vector modes for bitwise operations
 (define_mode_iterator VFB_128_256
-  [(V16HF "TARGET_AVX512FP16")
-   (V8HF "TARGET_AVX512FP16")
+  [(V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
    (V8SF "TARGET_AVX") V4SF
    (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
@@ -399,7 +396,7 @@
 
 ;; All 512bit vector float modes for bitwise operations
 (define_mode_iterator VFB_512
-  [(V32HF "TARGET_AVX512FP16") V16SF V8DF])
+  [V32HF V16SF V8DF])
 
 (define_mode_iterator VI48_AVX512VL
   [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
@@ -4581,7 +4578,8 @@
 	  (not:VFB_128_256
 	    (match_operand:VFB_128_256 1 "register_operand" "0,x,v,v"))
 	  (match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))]
-  "TARGET_SSE && <mask_avx512vl_condition>"
+  "TARGET_SSE && <mask_avx512vl_condition>
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
 {
   char buf[128];
   const char *ops;
@@ -4648,7 +4646,7 @@
 	  (not:VFB_512
 	    (match_operand:VFB_512 1 "register_operand" "v"))
 	  (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
-  "TARGET_AVX512F"
+  "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
 {
   char buf[128];
   const char *ops;
@@ -4683,7 +4681,8 @@
 	(any_logic:VFB_128_256
 	  (match_operand:VFB_128_256 1 "vector_operand")
 	  (match_operand:VFB_128_256 2 "vector_operand")))]
-  "TARGET_SSE && <mask_avx512vl_condition>"
+  "TARGET_SSE && <mask_avx512vl_condition>
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_expand "<code><mode>3<mask_name>"
@@ -4691,7 +4690,7 @@
 	(any_logic:VFB_512
 	  (match_operand:VFB_512 1 "nonimmediate_operand")
 	  (match_operand:VFB_512 2 "nonimmediate_operand")))]
-  "TARGET_AVX512F"
+  "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_insn "*<code><mode>3<mask_name>"
@@ -4700,6 +4699,7 @@
 	(any_logic:VFB_128_256
 	  (match_operand:VFB_128_256 1 "vector_operand" "%0,x,v,v")
 	  (match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))]
   "TARGET_SSE && <mask_avx512vl_condition>
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)
    && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
 {
   char buf[128];
@@ -4766,7 +4766,8 @@
 	(any_logic:VFB_512
 	  (match_operand:VFB_512 1 "nonimmediate_operand" "%v")
 	  (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
-  "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
 {
   char buf[128];
   const char *ops;
@@ -16741,17 +16742,6 @@
 	  (match_operand:<avx512fmaskmode> 4 "register_operand")))]
   "TARGET_AVX512F")
 
-(define_expand "<sse2_avx2>_andnot<mode>3_mask"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
-	(vec_merge:VI12_AVX512VL
-	  (and:VI12_AVX512VL
-	    (not:VI12_AVX512VL
-	      (match_operand:VI12_AVX512VL 1 "register_operand"))
-	    (match_operand:VI12_AVX512VL 2 "nonimmediate_operand"))
-	  (match_operand:VI12_AVX512VL 3 "nonimm_or_0_operand")
-	  (match_operand:<avx512fmaskmode> 4 "register_operand")))]
-  "TARGET_AVX512BW")
-
 (define_insn "*andnot<mode>3"
   [(set (match_operand:VI 0 "register_operand" "=x,x,v")
 	(and:VI
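The !<mask_applied> || <ssescalarmode>mode != HFmode conditions keep the masked forms away from HF modes while still allowing unmasked HF bitwise logic, which is plain bit manipulation as used for sign-bit operations like neg, abs and copysign. A scalar analogue on raw binary16 bit patterns (an illustration, not GCC code):

    #include <stdint.h>

    /* copysign on IEEE binary16 values stored as raw 16-bit patterns:
       keep the magnitude bits of one operand, OR in the sign bit of
       the other.  The vector patterns above do the same with AND/OR.  */
    static inline uint16_t
    half_copysign (uint16_t mag, uint16_t sgn)
    {
      return (uint16_t) ((mag & 0x7fff) | (sgn & 0x8000));
    }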