Diffstat (limited to 'gcc/config/i386')
-rw-r--r-- | gcc/config/i386/avx512fintrin.h | 76
-rw-r--r-- | gcc/config/i386/i386-expand.cc  | 77
-rw-r--r-- | gcc/config/i386/i386.cc         | 10
-rw-r--r-- | gcc/config/i386/smmintrin.h     | 25
-rw-r--r-- | gcc/config/i386/sse.md          | 34
5 files changed, 138 insertions, 84 deletions
diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index 29511fd2831..77d6249c2bc 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -3286,31 +3286,67 @@ _mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
 					      (__mmask8) __U, __R);
 }
 #else
-#define _mm512_scalef_round_pd(A, B, C) \
-  (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
-  (__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_scalef_round_pd(U, A, B, C) \
-  (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_scalef_round_pd(A, B, C) \
+  ((__m512d) \
+   __builtin_ia32_scalefpd512_mask((A), (B), \
+				   (__v8df) _mm512_undefined_pd(), \
+				   -1, (C)))
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
+  ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, C) \
+  ((__m512d) \
+   __builtin_ia32_scalefpd512_mask((A), (B), \
+				   (__v8df) _mm512_setzero_pd(), \
+				   (U), (C)))
+
+#define _mm512_scalef_round_ps(A, B, C) \
+  ((__m512) \
+   __builtin_ia32_scalefps512_mask((A), (B), \
+				   (__v16sf) _mm512_undefined_ps(), \
+				   -1, (C)))
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
+  ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, C) \
+  ((__m512) \
+   __builtin_ia32_scalefps512_mask((A), (B), \
+				   (__v16sf) _mm512_setzero_ps(), \
+				   (U), (C)))
+
+#define _mm_scalef_round_sd(A, B, C) \
+  ((__m128d) \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), \
+				       (__v2df) _mm_undefined_pd (), \
+				       -1, (C)))
 
-#define _mm512_scalef_round_ps(A, B, C) \
-  (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm_scalef_round_ss(A, B, C) \
+  ((__m128) \
+   __builtin_ia32_scalefss_mask_round ((A), (B), \
+				       (__v4sf) _mm_undefined_ps (), \
+				       -1, (C)))
 
-#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
-  (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C)
+#define _mm_mask_scalef_round_sd(W, U, A, B, C) \
+  ((__m128d) \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C)))
 
-#define _mm512_maskz_scalef_round_ps(U, A, B, C) \
-  (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm_mask_scalef_round_ss(W, U, A, B, C) \
+  ((__m128) \
+   __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C)))
 
-#define _mm_scalef_round_sd(A, B, C) \
-  (__m128d)__builtin_ia32_scalefsd_mask_round (A, B, \
-					       (__v2df)_mm_setzero_pd (), -1, C)
+#define _mm_maskz_scalef_round_sd(U, A, B, C) \
+  ((__m128d) \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), \
+				       (__v2df) _mm_setzero_pd (), \
+				       (U), (C)))
 
-#define _mm_scalef_round_ss(A, B, C) \
-  (__m128)__builtin_ia32_scalefss_mask_round (A, B, \
-					      (__v4sf)_mm_setzero_ps (), -1, C)
+#define _mm_maskz_scalef_round_ss(U, A, B, C) \
+  ((__m128) \
+   __builtin_ia32_scalefss_mask_round ((A), (B), \
+					(__v4sf) _mm_setzero_ps (), \
+					(U), (C)))
 #endif
 
 #define _mm_mask_scalef_sd(W, U, A, B) \
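The rewritten macros parenthesize every argument and the whole expansion, and the scalar scalef variants gain the previously missing mask/maskz forms. A minimal generic C sketch (not from the patch) of the precedence bug that unparenthesized macro bodies invite:

    #include <stdio.h>

    /* Unhygienic: neither the arguments nor the expansion are
       parenthesized, so precedence leaks across the macro boundary.  */
    #define SUM_BAD(A, B)   A + B
    /* Hygienic, in the style the patch adopts.  */
    #define SUM_GOOD(A, B)  ((A) + (B))

    int
    main (void)
    {
      printf ("%d\n", SUM_BAD (1, 2) * 3);   /* expands to 1 + 2 * 3, prints 7 */
      printf ("%d\n", SUM_GOOD (1, 2) * 3);  /* prints 9 */
      return 0;
    }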
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 794315ee2f7..31780b6daf7 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3136,6 +3136,8 @@ ix86_expand_int_movcc (rtx operands[])
   bool sign_bit_compare_p = false;
   rtx op0 = XEXP (operands[1], 0);
   rtx op1 = XEXP (operands[1], 1);
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
 
   if (GET_MODE (op0) == TImode
       || (GET_MODE (op0) == DImode
@@ -3153,17 +3155,29 @@ ix86_expand_int_movcc (rtx operands[])
 	  || (op1 == constm1_rtx && (code == GT || code == LE)))
 	sign_bit_compare_p = true;
 
+  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
+     but if op1 is a constant, the latter form allows more optimizations,
+     either through the last 2 ops being constant handling, or the one
+     constant and one variable cases.  On the other side, for cmov the
+     former might be better as we don't need to load the constant into
+     another register.  */
+  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
+    op2 = op1;
+  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
+  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
+    op3 = op1;
+
   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
      HImode insns, we'd be swallowed in word prefix ops.  */
 
   if ((mode != HImode || TARGET_FAST_PREFIX)
       && (mode != (TARGET_64BIT ? TImode : DImode))
-      && CONST_INT_P (operands[2])
-      && CONST_INT_P (operands[3]))
+      && CONST_INT_P (op2)
+      && CONST_INT_P (op3))
     {
       rtx out = operands[0];
-      HOST_WIDE_INT ct = INTVAL (operands[2]);
-      HOST_WIDE_INT cf = INTVAL (operands[3]);
+      HOST_WIDE_INT ct = INTVAL (op2);
+      HOST_WIDE_INT cf = INTVAL (op3);
       HOST_WIDE_INT diff;
 
       diff = ct - cf;
@@ -3559,6 +3573,9 @@ ix86_expand_int_movcc (rtx operands[])
       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
 	return false;
 
+      operands[2] = op2;
+      operands[3] = op3;
+
       /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */
 
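In source terms, the new EQ/NE canonicalization makes the expander treat the following two functions alike; this is a hypothetical illustration, not code from the patch or its testsuite:

    /* With the change, x == 5 ? x : y is handled internally as
       x == 5 ? 5 : y, so the two-constants and one-constant movcc
       optimizations can fire; for cmov the register form may still be
       preferred, as the comment in the patch notes.  */
    int f (int x, int y) { return x == 5 ? x : y; }  /* op2 equals op0 */
    int g (int x, int y) { return x == 5 ? 5 : y; }  /* op2 equals op1 */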
@@ -17036,7 +17053,8 @@ ix86_emit_fp_unordered_jump (rtx label)
 
 /* Output code to perform an sinh XFmode calculation.  */
 
-void ix86_emit_i387_sinh (rtx op0, rtx op1)
+void
+ix86_emit_i387_sinh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17084,7 +17102,8 @@ void ix86_emit_i387_sinh (rtx op0, rtx op1)
 
 /* Output code to perform an cosh XFmode calculation.  */
 
-void ix86_emit_i387_cosh (rtx op0, rtx op1)
+void
+ix86_emit_i387_cosh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17106,7 +17125,8 @@ void ix86_emit_i387_cosh (rtx op0, rtx op1)
 
 /* Output code to perform an tanh XFmode calculation.  */
 
-void ix86_emit_i387_tanh (rtx op0, rtx op1)
+void
+ix86_emit_i387_tanh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17152,7 +17172,8 @@ void ix86_emit_i387_tanh (rtx op0, rtx op1)
 
 /* Output code to perform an asinh XFmode calculation.  */
 
-void ix86_emit_i387_asinh (rtx op0, rtx op1)
+void
+ix86_emit_i387_asinh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17204,7 +17225,8 @@ void ix86_emit_i387_asinh (rtx op0, rtx op1)
 
 /* Output code to perform an acosh XFmode calculation.  */
 
-void ix86_emit_i387_acosh (rtx op0, rtx op1)
+void
+ix86_emit_i387_acosh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17230,7 +17252,8 @@ void ix86_emit_i387_acosh (rtx op0, rtx op1)
 
 /* Output code to perform an atanh XFmode calculation.  */
 
-void ix86_emit_i387_atanh (rtx op0, rtx op1)
+void
+ix86_emit_i387_atanh (rtx op0, rtx op1)
 {
   rtx e1 = gen_reg_rtx (XFmode);
   rtx e2 = gen_reg_rtx (XFmode);
@@ -17281,7 +17304,8 @@ void ix86_emit_i387_atanh (rtx op0, rtx op1)
 
 /* Output code to perform a log1p XFmode calculation.  */
 
-void ix86_emit_i387_log1p (rtx op0, rtx op1)
+void
+ix86_emit_i387_log1p (rtx op0, rtx op1)
 {
   rtx_code_label *label1 = gen_label_rtx ();
   rtx_code_label *label2 = gen_label_rtx ();
@@ -17291,6 +17315,11 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
   rtx cst, cstln2, cst1;
   rtx_insn *insn;
 
+  /* The emit_jump call emits pending stack adjust, make sure it is emitted
+     before the conditional jump, otherwise the stack adjustment will be
+     only conditional.  */
+  do_pending_stack_adjust ();
+
   cst = const_double_from_real_value
     (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
   cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
@@ -17320,7 +17349,8 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
 }
 
 /* Emit code for round calculation.  */
-void ix86_emit_i387_round (rtx op0, rtx op1)
+void
+ix86_emit_i387_round (rtx op0, rtx op1)
 {
   machine_mode inmode = GET_MODE (op1);
   machine_mode outmode = GET_MODE (op0);
@@ -17434,7 +17464,8 @@ void ix86_emit_i387_round (rtx op0, rtx op1)
 /* Output code to perform a Newton-Rhapson approximation of a
    single precision floating point divide
    [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
-void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
+void
+ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
 {
   rtx x0, x1, e0, e1;
 
@@ -17485,7 +17516,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
 
 /* Output code to perform a Newton-Rhapson approximation of a
    single precision floating point [reciprocal] square root.  */
-void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
+void
+ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
 {
   rtx x0, e0, e1, e2, e3, mthree, mhalf;
   REAL_VALUE_TYPE r;
@@ -23240,9 +23272,10 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
     *rem_p = rem;
 }
 
-void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
-				       enum rtx_code code, bool after,
-				       bool doubleword)
+void
+ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
+				  enum rtx_code code, bool after,
+				  bool doubleword)
 {
   rtx old_reg, new_reg, old_mem, success;
   machine_mode mode = GET_MODE (target);
@@ -23286,10 +23319,11 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
    it will be relaxed to an atomic load + compare, and skip
    cmpxchg instruction if mem != exp_input.  */
 
-void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
-			       rtx mem, rtx exp_input, rtx new_input,
-			       rtx mem_model, bool doubleword,
-			       rtx_code_label *loop_label)
+void
+ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
+			  rtx mem, rtx exp_input, rtx new_input,
+			  rtx mem_model, bool doubleword,
+			  rtx_code_label *loop_label)
 {
   rtx_code_label *cmp_label = NULL;
   rtx_code_label *done_label = NULL;
@@ -23388,6 +23422,7 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
 
   /* If mem is not expected, pause and loop back.  */
   emit_label (cmp_label);
+  emit_move_insn (target_val, new_mem);
  emit_insn (gen_pause ());
  emit_jump_insn (gen_jump (loop_label));
  emit_barrier ();
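The one-line emit_move_insn (target_val, new_mem) addition refreshes the expected value each time the cmpxchg loop retries. The C11 compare-exchange idiom has the same requirement; a sketch of the analogous source-level loop (an analogy, not the RTL the expander emits):

    #include <stdatomic.h>

    /* After a failed compare-exchange, the expected value must be
       reloaded from memory or the retry loop can spin with a stale
       value.  atomic_compare_exchange_weak refreshes it via &expected;
       the patch adds the equivalent reload to the emitted loop.  */
    static int
    fetch_add_like (atomic_int *mem, int val)
    {
      int expected = atomic_load (mem);
      while (!atomic_compare_exchange_weak (mem, &expected, expected + val))
	;  /* 'expected' now holds the current value of *mem again */
      return expected;
    }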
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index c959b7144de..b16df5b183e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -4891,6 +4891,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
 	{
 	  int i, prev_size = 0;
 	  tree temp = create_tmp_var (type, "va_arg_tmp");
+	  TREE_ADDRESSABLE (temp) = 1;
 
 	  /* addr = &temp; */
 	  t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
@@ -6524,7 +6525,8 @@ ix86_initial_elimination_offset (int from, int to)
 }
 
 /* Emits a warning for unsupported msabi to sysv pro/epilogues.  */
-void warn_once_call_ms2sysv_xlogues (const char *feature)
+void
+warn_once_call_ms2sysv_xlogues (const char *feature)
 {
   static bool warned_once = false;
   if (!warned_once)
@@ -18806,7 +18808,8 @@ ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
       return NULL_TREE;
     }
 
-  tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
+  tree fndecl = mathfn_built_in (el_mode == DFmode
+				 ? double_type_node : float_type_node, fn);
   bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
 
   if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
@@ -18898,7 +18901,8 @@ ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
      return NULL_TREE;
    }
 
-  tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
+  tree fndecl = mathfn_built_in (el_mode == DFmode
+				 ? double_type_node : float_type_node, fn);
   bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
 
   sprintf (name + 7, "%s", bname+10);
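The TREE_ADDRESSABLE fix concerns va_arg on aggregates, where the address of the temporary is taken for the copy. A small user-level example of code that reaches this path (illustrative only):

    #include <stdarg.h>

    struct big { double a, b, c, d; };  /* passed in memory on x86-64 */

    /* Fetching a struct from a va_list materializes a temporary (the
       "va_arg_tmp" above) whose address is taken, hence the need to
       mark it TREE_ADDRESSABLE.  */
    double
    first_vararg_fields (int n, ...)
    {
      va_list ap;
      va_start (ap, n);
      struct big v = va_arg (ap, struct big);
      va_end (ap);
      return v.a + v.d;
    }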
diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h
index b42b212300f..eb6a451c10a 100644
--- a/gcc/config/i386/smmintrin.h
+++ b/gcc/config/i386/smmintrin.h
@@ -810,17 +810,11 @@ _mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
 
 #include <popcntintrin.h>
 
-#ifndef __SSE4_1__
+#ifndef __CRC32__
 #pragma GCC push_options
-#pragma GCC target("sse4.1")
-#define __DISABLE_SSE4_1__
-#endif /* __SSE4_1__ */
-
-#ifndef __SSE4_2__
-#pragma GCC push_options
-#pragma GCC target("sse4.2")
-#define __DISABLE_SSE4_2__
-#endif /* __SSE4_1__ */
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
 
 /* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
 extern __inline unsigned int
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -849,14 +843,9 @@ _mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
 }
 #endif
 
-#ifdef __DISABLE_SSE4_2__
-#undef __DISABLE_SSE4_2__
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_2__ */
-
-#ifdef __DISABLE_SSE4_1__
-#undef __DISABLE_SSE4_1__
-#pragma GCC pop_options
-#endif /* __DISABLE_SSE4_1__ */
+#endif /* __DISABLE_CRC32__ */
 
 #endif /* _SMMINTRIN_H_INCLUDED */
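With the guard switched from SSE4.1/SSE4.2 to the crc32 feature, the intrinsics should be usable with -mcrc32 alone (options such as -msse4.2 imply it). A minimal usage sketch:

    #include <smmintrin.h>
    #include <stddef.h>

    /* Byte-at-a-time CRC32C checksum; compile with -mcrc32 (or any
       option that enables the crc32 feature).  */
    unsigned int
    crc32c_bytes (const unsigned char *p, size_t n)
    {
      unsigned int crc = 0xffffffffu;
      for (size_t i = 0; i < n; i++)
	crc = _mm_crc32_u8 (crc, p[i]);
      return ~crc;
    }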
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a852c166a58..5e93aa23b47 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -327,9 +327,7 @@
 
 ;; 128-, 256- and 512-bit float vector modes for bitwise operations
 (define_mode_iterator VFB
-  [(V32HF "TARGET_AVX512FP16")
-   (V16HF "TARGET_AVX512FP16")
-   (V8HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
    (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
    (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
@@ -340,8 +338,7 @@
 
 ;; 128- and 256-bit float vector modes for bitwise operations
 (define_mode_iterator VFB_128_256
-  [(V16HF "TARGET_AVX512FP16")
-   (V8HF "TARGET_AVX512FP16")
+  [(V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
    (V8SF "TARGET_AVX") V4SF
    (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
@@ -399,7 +396,7 @@
 
 ;; All 512bit vector float modes for bitwise operations
 (define_mode_iterator VFB_512
-  [(V32HF "TARGET_AVX512FP16") V16SF V8DF])
+  [V32HF V16SF V8DF])
 
 (define_mode_iterator VI48_AVX512VL
   [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
@@ -4581,7 +4578,8 @@
 	  (not:VFB_128_256
 	    (match_operand:VFB_128_256 1 "register_operand" "0,x,v,v"))
 	  (match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))]
-  "TARGET_SSE && <mask_avx512vl_condition>"
+  "TARGET_SSE && <mask_avx512vl_condition>
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
 {
   char buf[128];
   const char *ops;
@@ -4648,7 +4646,7 @@
 	  (not:VFB_512
 	    (match_operand:VFB_512 1 "register_operand" "v"))
 	  (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
-  "TARGET_AVX512F"
+  "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
 {
   char buf[128];
   const char *ops;
@@ -4683,7 +4681,8 @@
 	(any_logic:VFB_128_256
 	  (match_operand:VFB_128_256 1 "vector_operand")
 	  (match_operand:VFB_128_256 2 "vector_operand")))]
-  "TARGET_SSE && <mask_avx512vl_condition>"
+  "TARGET_SSE && <mask_avx512vl_condition>
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_expand "<code><mode>3<mask_name>"
@@ -4691,7 +4690,7 @@
 	(any_logic:VFB_512
 	  (match_operand:VFB_512 1 "nonimmediate_operand")
 	  (match_operand:VFB_512 2 "nonimmediate_operand")))]
-  "TARGET_AVX512F"
+  "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_insn "*<code><mode>3<mask_name>"
@@ -4700,6 +4699,7 @@
 	(any_logic:VFB_128_256
 	  (match_operand:VFB_128_256 1 "vector_operand" "%0,x,v,v")
 	  (match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))]
   "TARGET_SSE && <mask_avx512vl_condition>
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)
    && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
 {
   char buf[128];
@@ -4766,7 +4766,8 @@
 	(any_logic:VFB_512
 	  (match_operand:VFB_512 1 "nonimmediate_operand" "%v")
 	  (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
-  "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
 {
   char buf[128];
   const char *ops;
@@ -16741,17 +16742,6 @@
 	  (match_operand:<avx512fmaskmode> 4 "register_operand")))]
   "TARGET_AVX512F")
 
-(define_expand "<sse2_avx2>_andnot<mode>3_mask"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
-	(vec_merge:VI12_AVX512VL
-	  (and:VI12_AVX512VL
-	    (not:VI12_AVX512VL
-	      (match_operand:VI12_AVX512VL 1 "register_operand"))
-	    (match_operand:VI12_AVX512VL 2 "nonimmediate_operand"))
-	  (match_operand:VI12_AVX512VL 3 "nonimm_or_0_operand")
-	  (match_operand:<avx512fmaskmode> 4 "register_operand")))]
-  "TARGET_AVX512BW")
-
 (define_insn "*andnot<mode>3"
   [(set (match_operand:VI 0 "register_operand" "=x,x,v")
 	(and:VI
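The !<mask_applied> || <ssescalarmode>mode != HFmode conditions keep the masked forms away from HF modes while still allowing unmasked HF bitwise logic, which is plain bit manipulation as used for sign-bit operations like neg, abs and copysign. A scalar analogue on raw binary16 bit patterns (an illustration, not GCC code):

    #include <stdint.h>

    /* copysign on IEEE binary16 values stored as raw 16-bit patterns:
       keep the magnitude bits of one operand, OR in the sign bit of
       the other.  The vector patterns above do the same with AND/OR.  */
    static inline uint16_t
    half_copysign (uint16_t mag, uint16_t sgn)
    {
      return (uint16_t) ((mag & 0x7fff) | (sgn & 0x8000));
    }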