author     Lorry Tar Creator <lorry-tar-importer@lorry>   2016-08-22 10:27:46 +0000
committer  Lorry Tar Creator <lorry-tar-importer@lorry>   2016-08-22 10:27:46 +0000
commit     f733cf303bcdc952c92b81dd62199a40a1f555ec (patch)
tree       0a9a9e0f28aa7c7f5bc4d1d1d0e9647163cac4f7 /gcc/config
parent     e0e4357b88efe5dc53e50d341a09de4d02331200 (diff)
download   gcc-tarball-51b577d7e597bf27e0277cae6b3b54bbf2e90ee2.tar.gz
gcc-6.2.0
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/aarch64/aarch64-builtins.c | 29
-rw-r--r--  gcc/config/aarch64/aarch64-simd-builtins.def | 8
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md | 10
-rw-r--r--  gcc/config/aarch64/aarch64.c | 15
-rw-r--r--  gcc/config/aarch64/aarch64.h | 5
-rw-r--r--  gcc/config/aarch64/arm_neon.h | 12
-rw-r--r--  gcc/config/alpha/alpha.md | 5
-rw-r--r--  gcc/config/arm/arm-builtins.c | 6
-rw-r--r--  gcc/config/arm/arm.c | 37
-rw-r--r--  gcc/config/arm/freebsd.h | 4
-rw-r--r--  gcc/config/avr/avr.c | 87
-rw-r--r--  gcc/config/avr/avr.h | 2
-rw-r--r--  gcc/config/avr/avr.md | 20
-rw-r--r--  gcc/config/avr/gen-avr-mmcu-specs.c | 7
-rw-r--r--  gcc/config/darwin.h | 1
-rw-r--r--  gcc/config/i386/avx512bwintrin.h | 44
-rw-r--r--  gcc/config/i386/avx512dqintrin.h | 39
-rw-r--r--  gcc/config/i386/avx512fintrin.h | 261
-rw-r--r--  gcc/config/i386/avx512ifmaintrin.h | 4
-rw-r--r--  gcc/config/i386/avx512ifmavlintrin.h | 8
-rw-r--r--  gcc/config/i386/avx512vbmiintrin.h | 7
-rw-r--r--  gcc/config/i386/avx512vbmivlintrin.h | 6
-rw-r--r--  gcc/config/i386/avx512vlbwintrin.h | 70
-rw-r--r--  gcc/config/i386/avx512vldqintrin.h | 27
-rw-r--r--  gcc/config/i386/avx512vlintrin.h | 97
-rw-r--r--  gcc/config/i386/constraints.md | 6
-rw-r--r--  gcc/config/i386/driver-i386.c | 73
-rw-r--r--  gcc/config/i386/i386-builtin-types.def | 4
-rw-r--r--  gcc/config/i386/i386.c | 132
-rw-r--r--  gcc/config/i386/i386.h | 2
-rw-r--r--  gcc/config/i386/i386.md | 65
-rw-r--r--  gcc/config/i386/predicates.md | 2
-rw-r--r--  gcc/config/i386/sse.md | 58
-rw-r--r--  gcc/config/microblaze/rtems.h | 7
-rw-r--r--  gcc/config/nvptx/nvptx.c | 3
-rw-r--r--  gcc/config/pa/pa.c | 103
-rw-r--r--  gcc/config/pa/pa.md | 164
-rw-r--r--  gcc/config/rl78/rl78-expand.md | 2
-rw-r--r--  gcc/config/rl78/rl78-real.md | 4
-rw-r--r--  gcc/config/rl78/rl78-virt.md | 4
-rw-r--r--  gcc/config/rs6000/40x.md | 2
-rw-r--r--  gcc/config/rs6000/440.md | 2
-rw-r--r--  gcc/config/rs6000/476.md | 2
-rw-r--r--  gcc/config/rs6000/601.md | 2
-rw-r--r--  gcc/config/rs6000/603.md | 2
-rw-r--r--  gcc/config/rs6000/6xx.md | 2
-rw-r--r--  gcc/config/rs6000/7450.md | 6
-rw-r--r--  gcc/config/rs6000/7xx.md | 4
-rw-r--r--  gcc/config/rs6000/8540.md | 4
-rw-r--r--  gcc/config/rs6000/a2.md | 2
-rw-r--r--  gcc/config/rs6000/altivec.h | 29
-rw-r--r--  gcc/config/rs6000/altivec.md | 222
-rw-r--r--  gcc/config/rs6000/cell.md | 6
-rw-r--r--  gcc/config/rs6000/constraints.md | 28
-rw-r--r--  gcc/config/rs6000/crypto.md | 2
-rw-r--r--  gcc/config/rs6000/dfp.md | 101
-rw-r--r--  gcc/config/rs6000/e300c2c3.md | 2
-rw-r--r--  gcc/config/rs6000/e6500.md | 2
-rw-r--r--  gcc/config/rs6000/htm.md | 14
-rw-r--r--  gcc/config/rs6000/mpc.md | 2
-rw-r--r--  gcc/config/rs6000/power4.md | 6
-rw-r--r--  gcc/config/rs6000/power5.md | 2
-rw-r--r--  gcc/config/rs6000/power6.md | 6
-rw-r--r--  gcc/config/rs6000/power7.md | 4
-rw-r--r--  gcc/config/rs6000/power8.md | 5
-rw-r--r--  gcc/config/rs6000/power9.md | 477
-rw-r--r--  gcc/config/rs6000/predicates.md | 110
-rw-r--r--  gcc/config/rs6000/rs6000-builtin.def | 295
-rw-r--r--  gcc/config/rs6000/rs6000-c.c | 234
-rw-r--r--  gcc/config/rs6000/rs6000-cpus.def | 22
-rw-r--r--  gcc/config/rs6000/rs6000-protos.h | 4
-rw-r--r--  gcc/config/rs6000/rs6000.c | 1950
-rw-r--r--  gcc/config/rs6000/rs6000.h | 69
-rw-r--r--  gcc/config/rs6000/rs6000.md | 505
-rw-r--r--  gcc/config/rs6000/rs6000.opt | 22
-rw-r--r--  gcc/config/rs6000/rs64.md | 2
-rw-r--r--  gcc/config/rs6000/sysv4.h | 17
-rw-r--r--  gcc/config/rs6000/t-rs6000 | 1
-rw-r--r--  gcc/config/rs6000/titan.md | 2
-rw-r--r--  gcc/config/rs6000/vector.md | 19
-rw-r--r--  gcc/config/rs6000/vsx.md | 501
-rw-r--r--  gcc/config/rs6000/xfpu.md | 6
-rw-r--r--  gcc/config/rtems.h | 2
-rw-r--r--  gcc/config/s390/s390-builtin-types.def | 423
-rw-r--r--  gcc/config/s390/s390-builtins.def | 69
-rw-r--r--  gcc/config/s390/s390.c | 61
-rw-r--r--  gcc/config/s390/s390.md | 17
-rw-r--r--  gcc/config/s390/vecintrin.h | 18
-rw-r--r--  gcc/config/s390/vx-builtins.md | 113
-rw-r--r--  gcc/config/sparc/driver-sparc.c | 2
-rw-r--r--  gcc/config/sparc/linux64.h | 26
-rw-r--r--  gcc/config/sparc/niagara4.md | 7
-rw-r--r--  gcc/config/sparc/niagara7.md | 136
-rw-r--r--  gcc/config/sparc/sol2.h | 40
-rw-r--r--  gcc/config/sparc/sparc-c.c | 7
-rw-r--r--  gcc/config/sparc/sparc-opts.h | 1
-rw-r--r--  gcc/config/sparc/sparc.c | 232
-rw-r--r--  gcc/config/sparc/sparc.h | 16
-rw-r--r--  gcc/config/sparc/sparc.md | 164
-rw-r--r--  gcc/config/sparc/sparc.opt | 7
-rw-r--r--  gcc/config/sparc/visintrin.h | 186
-rw-r--r--  gcc/config/visium/visium-protos.h | 3
-rw-r--r--  gcc/config/visium/visium.c | 107
-rw-r--r--  gcc/config/visium/visium.md | 98
104 files changed, 5984 insertions, 1886 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 5573903fe0..2c4eac08e4 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -434,13 +434,15 @@ static struct aarch64_simd_type_info aarch64_simd_types [] = {
};
#undef ENTRY
-/* This type is not SIMD-specific; it is the user-visible __fp16. */
-static tree aarch64_fp16_type_node = NULL_TREE;
-
static tree aarch64_simd_intOI_type_node = NULL_TREE;
static tree aarch64_simd_intCI_type_node = NULL_TREE;
static tree aarch64_simd_intXI_type_node = NULL_TREE;
+/* The user-visible __fp16 type, and a pointer to that type. Used
+ across the back-end. */
+tree aarch64_fp16_type_node = NULL_TREE;
+tree aarch64_fp16_ptr_type_node = NULL_TREE;
+
static const char *
aarch64_mangle_builtin_scalar_type (const_tree type)
{
@@ -874,6 +876,21 @@ aarch64_init_builtin_rsqrt (void)
}
}
+/* Initialize the backend types that support the user-visible __fp16
+ type, also initialize a pointer to that type, to be used when
+ forming HFAs. */
+
+static void
+aarch64_init_fp16_types (void)
+{
+ aarch64_fp16_type_node = make_node (REAL_TYPE);
+ TYPE_PRECISION (aarch64_fp16_type_node) = 16;
+ layout_type (aarch64_fp16_type_node);
+
+ (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, "__fp16");
+ aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
+}
+
void
aarch64_init_builtins (void)
{
@@ -895,11 +912,7 @@ aarch64_init_builtins (void)
= add_builtin_function ("__builtin_aarch64_set_fpsr", ftype_set_fpr,
AARCH64_BUILTIN_SET_FPSR, BUILT_IN_MD, NULL, NULL_TREE);
- aarch64_fp16_type_node = make_node (REAL_TYPE);
- TYPE_PRECISION (aarch64_fp16_type_node) = 16;
- layout_type (aarch64_fp16_type_node);
-
- (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, "__fp16");
+ aarch64_init_fp16_types ();
if (TARGET_SIMD)
aarch64_init_simd_builtins ();
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index dd045792b2..f440907aed 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -244,13 +244,17 @@
/* Implemented by <maxmin><mode>3.
smax variants map to fmaxnm,
smax_nan variants map to fmax. */
- BUILTIN_VDQIF (BINOP, smax, 3)
- BUILTIN_VDQIF (BINOP, smin, 3)
+ BUILTIN_VDQ_BHSI (BINOP, smax, 3)
+ BUILTIN_VDQ_BHSI (BINOP, smin, 3)
BUILTIN_VDQ_BHSI (BINOP, umax, 3)
BUILTIN_VDQ_BHSI (BINOP, umin, 3)
BUILTIN_VDQF (BINOP, smax_nan, 3)
BUILTIN_VDQF (BINOP, smin_nan, 3)
+ /* Implemented by <fmaxmin><mode>3. */
+ BUILTIN_VDQF (BINOP, fmax, 3)
+ BUILTIN_VDQF (BINOP, fmin, 3)
+
/* Implemented by aarch64_<maxmin_uns>p<mode>. */
BUILTIN_VDQ_BHSI (BINOP, smaxp, 0)
BUILTIN_VDQ_BHSI (BINOP, sminp, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce644..ded8bff097 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1919,16 +1919,6 @@
}
)
-(define_insn "aarch64_vmls<mode>"
- [(set (match_operand:VDQF 0 "register_operand" "=w")
- (minus:VDQF (match_operand:VDQF 1 "register_operand" "0")
- (mult:VDQF (match_operand:VDQF 2 "register_operand" "w")
- (match_operand:VDQF 3 "register_operand" "w"))))]
- "TARGET_SIMD"
- "fmls\\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>"
- [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
-)
-
;; FP Max/Min
;; Max/Min are introduced by idiom recognition by GCC's mid-end. An
;; expression like:
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 999549426e..199147adb9 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9771,15 +9771,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
field_t = long_double_type_node;
field_ptr_t = long_double_ptr_type_node;
break;
-/* The half precision and quad precision are not fully supported yet. Enable
- the following code after the support is complete. Need to find the correct
- type node for __fp16 *. */
-#if 0
case HFmode:
- field_t = float_type_node;
- field_ptr_t = float_ptr_type_node;
+ field_t = aarch64_fp16_type_node;
+ field_ptr_t = aarch64_fp16_ptr_type_node;
break;
-#endif
case V2SImode:
case V4SImode:
{
@@ -9934,7 +9929,8 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
case REAL_TYPE:
mode = TYPE_MODE (type);
- if (mode != DFmode && mode != SFmode && mode != TFmode)
+ if (mode != DFmode && mode != SFmode
+ && mode != TFmode && mode != HFmode)
return -1;
if (*modep == VOIDmode)
@@ -9947,7 +9943,8 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
case COMPLEX_TYPE:
mode = TYPE_MODE (TREE_TYPE (type));
- if (mode != DFmode && mode != SFmode && mode != TFmode)
+ if (mode != DFmode && mode != SFmode
+ && mode != TFmode && mode != HFmode)
return -1;
if (*modep == VOIDmode)
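A minimal user-level sketch of what the HFmode changes above enable (illustrative only, not part of the patch; names are made up): a struct whose fields are all __fp16 can now count as a homogeneous floating-point aggregate in the AArch64 PCS code, including when it is read back through va_arg via the new aarch64_fp16_type_node / aarch64_fp16_ptr_type_node path.

#include <stdarg.h>

struct hfa4 { __fp16 a, b, c, d; };   /* all-__fp16 aggregate: an HFA candidate */

__fp16
first_field_of_vararg (int n, ...)
{
  va_list ap;
  va_start (ap, n);
  struct hfa4 s = va_arg (ap, struct hfa4);  /* exercises the HFmode va_arg case */
  va_end (ap);
  return s.a;
}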
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 15d7e4019a..b9bf9793e4 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -930,4 +930,9 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define ASM_OUTPUT_POOL_EPILOGUE aarch64_asm_output_pool_epilogue
+/* This type is the user-visible __fp16, and a pointer to that type. We
+ need it in many places in the backend. Defined in aarch64-builtins.c. */
+extern tree aarch64_fp16_type_node;
+extern tree aarch64_fp16_ptr_type_node;
+
#endif /* GCC_AARCH64_H */
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2612a32571..ec543684ed 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17856,19 +17856,19 @@ vpminnms_f32 (float32x2_t a)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
{
- return __builtin_aarch64_smaxv2sf (__a, __b);
+ return __builtin_aarch64_fmaxv2sf (__a, __b);
}
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
{
- return __builtin_aarch64_smaxv4sf (__a, __b);
+ return __builtin_aarch64_fmaxv4sf (__a, __b);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
{
- return __builtin_aarch64_smaxv2df (__a, __b);
+ return __builtin_aarch64_fmaxv2df (__a, __b);
}
/* vmaxv */
@@ -18086,19 +18086,19 @@ vminq_u32 (uint32x4_t __a, uint32x4_t __b)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vminnm_f32 (float32x2_t __a, float32x2_t __b)
{
- return __builtin_aarch64_sminv2sf (__a, __b);
+ return __builtin_aarch64_fminv2sf (__a, __b);
}
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vminnmq_f32 (float32x4_t __a, float32x4_t __b)
{
- return __builtin_aarch64_sminv4sf (__a, __b);
+ return __builtin_aarch64_fminv4sf (__a, __b);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vminnmq_f64 (float64x2_t __a, float64x2_t __b)
{
- return __builtin_aarch64_sminv2df (__a, __b);
+ return __builtin_aarch64_fminv2df (__a, __b);
}
/* vminv */
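Illustrative only (not part of the patch): after the retargeting above, vmaxnm/vminnm expand through the new fmax/fmin builtins, i.e. to FMAXNM/FMINNM, which follow the IEEE maxNum/minNum rule that a quiet NaN in one operand yields the other operand.

#include <arm_neon.h>

float32x2_t
clamp_hi (float32x2_t v, float32x2_t hi)
{
  /* Expands via __builtin_aarch64_fminv2sf (FMINNM); a NaN lane in v
     yields the corresponding lane of hi rather than NaN.  */
  return vminnm_f32 (v, hi);
}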
diff --git a/gcc/config/alpha/alpha.md b/gcc/config/alpha/alpha.md
index 932608b0fe..3e4594bf4c 100644
--- a/gcc/config/alpha/alpha.md
+++ b/gcc/config/alpha/alpha.md
@@ -3738,7 +3738,8 @@
;; BUGCHK is documented common to OSF/1 and VMS PALcode.
(define_insn "trap"
- [(trap_if (const_int 1) (const_int 0))]
+ [(trap_if (const_int 1) (const_int 0))
+ (use (reg:DI 29))]
""
"call_pal 0x81"
[(set_attr "type" "callpal")])
@@ -5157,7 +5158,7 @@
"TARGET_ABI_OSF"
{
if (TARGET_EXPLICIT_RELOCS)
- return "ldah $29,0($26)\t\t!gpdisp!%*\;lda $29,0($29)\t\t!gpdisp!%*";
+ return "#";
else
return "ldgp $29,0($26)";
}
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 90fb40fed2..68b2839879 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -2861,6 +2861,10 @@ arm_builtin_vectorized_function (unsigned int fn, tree type_out, tree type_in)
int in_n, out_n;
bool out_unsigned_p = TYPE_UNSIGNED (type_out);
+ /* Can't provide any vectorized builtins when we can't use NEON. */
+ if (!TARGET_NEON)
+ return NULL_TREE;
+
if (TREE_CODE (type_out) != VECTOR_TYPE
|| TREE_CODE (type_in) != VECTOR_TYPE)
return NULL_TREE;
@@ -2875,7 +2879,7 @@ arm_builtin_vectorized_function (unsigned int fn, tree type_out, tree type_in)
NULL_TREE is returned if no such builtin is available. */
#undef ARM_CHECK_BUILTIN_MODE
#define ARM_CHECK_BUILTIN_MODE(C) \
- (TARGET_NEON && TARGET_FPU_ARMV8 \
+ (TARGET_FPU_ARMV8 \
&& flag_unsafe_math_optimizations \
&& ARM_CHECK_BUILTIN_MODE_1 (C))
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 5974c65d31..24b204a1d4 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -6704,7 +6704,7 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
/* The PIC register is live on entry to VxWorks PLT entries, so we
must make the call before restoring the PIC register. */
- if (TARGET_VXWORKS_RTP && flag_pic && !targetm.binds_local_p (decl))
+ if (TARGET_VXWORKS_RTP && flag_pic && decl && !targetm.binds_local_p (decl))
return false;
/* If we are interworking and the function is not declared static
@@ -17755,6 +17755,7 @@ arm_output_multireg_pop (rtx *operands, bool return_pc, rtx cond, bool reverse,
int num_saves = XVECLEN (operands[0], 0);
unsigned int regno;
unsigned int regno_base = REGNO (operands[1]);
+ bool interrupt_p = IS_INTERRUPT (arm_current_func_type ());
offset = 0;
offset += update ? 1 : 0;
@@ -17772,20 +17773,16 @@ arm_output_multireg_pop (rtx *operands, bool return_pc, rtx cond, bool reverse,
}
conditional = reverse ? "%?%D0" : "%?%d0";
- if ((regno_base == SP_REGNUM) && update)
- {
- sprintf (pattern, "pop%s\t{", conditional);
- }
+ /* Can't use POP if returning from an interrupt. */
+ if ((regno_base == SP_REGNUM) && update && !(interrupt_p && return_pc))
+ sprintf (pattern, "pop%s\t{", conditional);
else
{
/* Output ldmfd when the base register is SP, otherwise output ldmia.
It's just a convention, their semantics are identical. */
if (regno_base == SP_REGNUM)
- /* update is never true here, hence there is no need to handle
- pop here. */
- sprintf (pattern, "ldmfd%s", conditional);
-
- if (update)
+ sprintf (pattern, "ldmfd%s\t", conditional);
+ else if (update)
sprintf (pattern, "ldmia%s\t", conditional);
else
sprintf (pattern, "ldm%s\t", conditional);
@@ -17811,7 +17808,7 @@ arm_output_multireg_pop (rtx *operands, bool return_pc, rtx cond, bool reverse,
strcat (pattern, "}");
- if (IS_INTERRUPT (arm_current_func_type ()) && return_pc)
+ if (interrupt_p && return_pc)
strcat (pattern, "^");
output_asm_insn (pattern, &cond);
@@ -19622,8 +19619,12 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
sprintf (instr, "ldmfd%s\t%%|sp, {", conditional);
}
}
+ /* For interrupt returns we have to use an LDM rather than
+ a POP so that we can use the exception return variant. */
+ else if (IS_INTERRUPT (func_type))
+ sprintf (instr, "ldmfd%s\t%%|sp!, {", conditional);
else
- sprintf (instr, "pop%s\t{", conditional);
+ sprintf (instr, "pop%s\t{", conditional);
p = instr + strlen (instr);
@@ -21461,7 +21462,11 @@ arm_expand_prologue (void)
/* Naked functions don't have prologues. */
if (IS_NAKED (func_type))
- return;
+ {
+ if (flag_stack_usage_info)
+ current_function_static_stack_size = 0;
+ return;
+ }
/* Make a copy of c_f_p_a_s as we may need to modify it locally. */
args_to_push = crtl->args.pretend_args_size;
@@ -24715,7 +24720,11 @@ thumb1_expand_prologue (void)
/* Naked functions don't have prologues. */
if (IS_NAKED (func_type))
- return;
+ {
+ if (flag_stack_usage_info)
+ current_function_static_stack_size = 0;
+ return;
+ }
if (IS_INTERRUPT (func_type))
{
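Illustrative only (not part of the patch): the epilogue changes above matter for handlers like the one below, where the exception-return form "ldmfd sp!, {..., pc}^" must be emitted instead of "pop {..., pc}" so that SPSR is copied back to CPSR on return.

void __attribute__ ((interrupt ("IRQ")))
irq_handler (void)
{
  /* handler body; the epilogue now uses LDMFD with the ^ suffix
     rather than POP when returning to PC from an interrupt.  */
}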
diff --git a/gcc/config/arm/freebsd.h b/gcc/config/arm/freebsd.h
index 948fdd6843..0ade4e99be 100644
--- a/gcc/config/arm/freebsd.h
+++ b/gcc/config/arm/freebsd.h
@@ -120,6 +120,9 @@
#define SUBTARGET_CPU_DEFAULT TARGET_CPU_arm9
#endif
+/* FreeBSD 10 does not support unaligned access for armv6 and up.
+ Unaligned access support was added in FreeBSD 11. */
+#if FBSD_MAJOR < 11
#define SUBTARGET_OVERRIDE_INTERNAL_OPTIONS \
do { \
if (opts_set->x_unaligned_access == 1) \
@@ -127,6 +130,7 @@ do { \
if (opts->x_unaligned_access) \
opts->x_unaligned_access = 0; \
} while (0)
+#endif
#undef MAX_SYNC_LIBFUNC_SIZE
#define MAX_SYNC_LIBFUNC_SIZE 4 /* UNITS_PER_WORD not defined yet. */
diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c
index a7728e3b10..0ae913c73b 100644
--- a/gcc/config/avr/avr.c
+++ b/gcc/config/avr/avr.c
@@ -203,9 +203,6 @@ static GTY(()) rtx xstring_e;
/* Current architecture. */
const avr_arch_t *avr_arch;
-/* Section to put switch tables in. */
-static GTY(()) section *progmem_swtable_section;
-
/* Unnamed sections associated to __attribute__((progmem)) aka. PROGMEM
or to address space __flash* or __memx. Only used as singletons inside
avr_asm_select_section, but it must not be local there because of GTY. */
@@ -9461,24 +9458,6 @@ avr_output_progmem_section_asm_op (const void *data)
static void
avr_asm_init_sections (void)
{
- /* Set up a section for jump tables. Alignment is handled by
- ASM_OUTPUT_BEFORE_CASE_LABEL. */
-
- if (AVR_HAVE_JMP_CALL)
- {
- progmem_swtable_section
- = get_unnamed_section (0, output_section_asm_op,
- "\t.section\t.progmem.gcc_sw_table"
- ",\"a\",@progbits");
- }
- else
- {
- progmem_swtable_section
- = get_unnamed_section (SECTION_CODE, output_section_asm_op,
- "\t.section\t.progmem.gcc_sw_table"
- ",\"ax\",@progbits");
- }
-
/* Override section callbacks to keep track of `avr_need_clear_bss_p'
resp. `avr_need_copy_data_p'. */
@@ -9488,65 +9467,6 @@ avr_asm_init_sections (void)
}
-/* Implement `TARGET_ASM_FUNCTION_RODATA_SECTION'. */
-
-static section*
-avr_asm_function_rodata_section (tree decl)
-{
- /* If a function is unused and optimized out by -ffunction-sections
- and --gc-sections, ensure that the same will happen for its jump
- tables by putting them into individual sections. */
-
- unsigned int flags;
- section * frodata;
-
- /* Get the frodata section from the default function in varasm.c
- but treat function-associated data-like jump tables as code
- rather than as user defined data. AVR has no constant pools. */
- {
- int fdata = flag_data_sections;
-
- flag_data_sections = flag_function_sections;
- frodata = default_function_rodata_section (decl);
- flag_data_sections = fdata;
- flags = frodata->common.flags;
- }
-
- if (frodata != readonly_data_section
- && flags & SECTION_NAMED)
- {
- /* Adjust section flags and replace section name prefix. */
-
- unsigned int i;
-
- static const char* const prefix[] =
- {
- ".rodata", ".progmem.gcc_sw_table",
- ".gnu.linkonce.r.", ".gnu.linkonce.t."
- };
-
- for (i = 0; i < sizeof (prefix) / sizeof (*prefix); i += 2)
- {
- const char * old_prefix = prefix[i];
- const char * new_prefix = prefix[i+1];
- const char * name = frodata->named.name;
-
- if (STR_PREFIX_P (name, old_prefix))
- {
- const char *rname = ACONCAT ((new_prefix,
- name + strlen (old_prefix), NULL));
- flags &= ~SECTION_CODE;
- flags |= AVR_HAVE_JMP_CALL ? 0 : SECTION_CODE;
-
- return get_section (rname, flags, frodata->named.decl);
- }
- }
- }
-
- return progmem_swtable_section;
-}
-
-
/* Implement `TARGET_ASM_NAMED_SECTION'. */
/* Track need of __do_clear_bss, __do_copy_data for named sections. */
@@ -9721,7 +9641,9 @@ avr_asm_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align)
{
const char *sname = ACONCAT ((new_prefix,
name + strlen (old_prefix), NULL));
- return get_section (sname, sect->common.flags, sect->named.decl);
+ return get_section (sname,
+ sect->common.flags & ~SECTION_DECLARED,
+ sect->named.decl);
}
}
@@ -13747,9 +13669,6 @@ avr_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *arg,
#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN avr_fold_builtin
-#undef TARGET_ASM_FUNCTION_RODATA_SECTION
-#define TARGET_ASM_FUNCTION_RODATA_SECTION avr_asm_function_rodata_section
-
#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P avr_scalar_mode_supported_p
diff --git a/gcc/config/avr/avr.h b/gcc/config/avr/avr.h
index 01da70867e..ab5e465114 100644
--- a/gcc/config/avr/avr.h
+++ b/gcc/config/avr/avr.h
@@ -391,7 +391,7 @@ typedef struct avr_args
#define SUPPORTS_INIT_PRIORITY 0
-#define JUMP_TABLES_IN_TEXT_SECTION 0
+#define JUMP_TABLES_IN_TEXT_SECTION 1
#define ASM_COMMENT_START " ; "
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index c988446184..2a2b620cd1 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -112,12 +112,12 @@
(define_attr "length" ""
(cond [(eq_attr "type" "branch")
(if_then_else (and (ge (minus (pc) (match_dup 0))
- (const_int -63))
+ (const_int -62))
(le (minus (pc) (match_dup 0))
(const_int 62)))
(const_int 1)
(if_then_else (and (ge (minus (pc) (match_dup 0))
- (const_int -2045))
+ (const_int -2044))
(le (minus (pc) (match_dup 0))
(const_int 2045)))
(const_int 2)
@@ -641,6 +641,22 @@
if (avr_mem_flash_p (dest))
DONE;
+ if (QImode == <MODE>mode
+ && SUBREG_P (src)
+ && CONSTANT_ADDRESS_P (SUBREG_REG (src))
+ && can_create_pseudo_p())
+ {
+ // store_bitfield may want to store a SYMBOL_REF or CONST in a
+ // structure that's represented as PSImode. As the upper 16 bits
+ // of PSImode cannot be expressed as an HImode subreg, the rhs is
+ // decomposed into QImode (word_mode) subregs of SYMBOL_REF,
+ // CONST or LABEL_REF; cf. PR71103.
+
+ rtx const_addr = SUBREG_REG (src);
+ operands[1] = src = copy_rtx (src);
+ SUBREG_REG (src) = copy_to_mode_reg (GET_MODE (const_addr), const_addr);
+ }
+
/* One of the operands has to be in a register. */
if (!register_operand (dest, <MODE>mode)
&& !reg_or_0_operand (src, <MODE>mode))
diff --git a/gcc/config/avr/gen-avr-mmcu-specs.c b/gcc/config/avr/gen-avr-mmcu-specs.c
index de8680a8a1..fabe8c2c59 100644
--- a/gcc/config/avr/gen-avr-mmcu-specs.c
+++ b/gcc/config/avr/gen-avr-mmcu-specs.c
@@ -27,7 +27,7 @@
#include "avr-devices.c"
-// Get rid of "defaults.h". We just need tm.h for `WITH_AVRLIBS' and
+// Get rid of "defaults.h". We just need tm.h for `WITH_AVRLIBC' and
// and `WITH_RTEMS'. */
#define GCC_DEFAULTS_H
@@ -242,12 +242,13 @@ print_mcu (const avr_mcu_t *mcu)
fprintf (f, "*link_data_start:\n");
if (mcu->data_section_start
!= arch->default_data_section_start)
- fprintf (f, "\t-Tdata 0x%lX", 0x800000UL + mcu->data_section_start);
+ fprintf (f, "\t%%{!Tdata:-Tdata 0x%lX}",
+ 0x800000UL + mcu->data_section_start);
fprintf (f, "\n\n");
fprintf (f, "*link_text_start:\n");
if (mcu->text_section_start != 0x0)
- fprintf (f, "\t-Ttext 0x%lX", 0UL + mcu->text_section_start);
+ fprintf (f, "\t%%{!Ttext:-Ttext 0x%lX}", 0UL + mcu->text_section_start);
fprintf (f, "\n\n");
}
diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h
index c9981b815c..0277cf2238 100644
--- a/gcc/config/darwin.h
+++ b/gcc/config/darwin.h
@@ -179,6 +179,7 @@ extern GTY(()) int darwin_ms_struct;
%{L*} %(link_libgcc) %o %{fprofile-arcs|fprofile-generate*|coverage:-lgcov} \
%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1): \
%{static|static-libgcc|static-libstdc++|static-libgfortran: libgomp.a%s; : -lgomp } } \
+ %{fcilkplus:%:include(libcilkrts.spec)%(link_cilkrts)} \
%{fgnu-tm: \
%{static|static-libgcc|static-libstdc++|static-libgfortran: libitm.a%s; : -litm } } \
%{!nostdlib:%{!nodefaultlibs:\
diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h
index f40a7d91df..6ef63e9fa4 100644
--- a/gcc/config/i386/avx512bwintrin.h
+++ b/gcc/config/i386/avx512bwintrin.h
@@ -270,9 +270,8 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastb_epi8 (__m128i __A)
{
return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
- (__v64qi)_mm512_undefined_si512(),
- (__mmask64) -
- 1);
+ (__v64qi)_mm512_undefined_epi32(),
+ (__mmask64) -1);
}
extern __inline __m512i
@@ -318,8 +317,8 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastw_epi16 (__m128i __A)
{
return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
- (__v32hi)_mm512_undefined_si512(),
- (__mmask32)-1);
+ (__v32hi)_mm512_undefined_epi32(),
+ (__mmask32) -1);
}
extern __inline __m512i
@@ -588,8 +587,7 @@ _mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
/* idx */ ,
(__v32hi) __A,
(__v32hi) __B,
- (__mmask32) -
- 1);
+ (__mmask32) -1);
}
extern __inline __m512i
@@ -2284,7 +2282,7 @@ _mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 4,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask64
@@ -2293,7 +2291,7 @@ _mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 1,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask64
@@ -2302,7 +2300,7 @@ _mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 5,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask64
@@ -2311,7 +2309,7 @@ _mm512_cmple_epu8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 2,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask32
@@ -2320,7 +2318,7 @@ _mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 4,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2329,7 +2327,7 @@ _mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 1,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2338,7 +2336,7 @@ _mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 5,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2347,7 +2345,7 @@ _mm512_cmple_epu16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 2,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask64
@@ -2356,7 +2354,7 @@ _mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 4,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask64
@@ -2365,7 +2363,7 @@ _mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 1,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask64
@@ -2374,7 +2372,7 @@ _mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 5,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask64
@@ -2383,7 +2381,7 @@ _mm512_cmple_epi8_mask (__m512i __X, __m512i __Y)
{
return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
(__v64qi) __Y, 2,
- (__mmask64) - 1);
+ (__mmask64) -1);
}
extern __inline __mmask32
@@ -2392,7 +2390,7 @@ _mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 4,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2401,7 +2399,7 @@ _mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 1,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2410,7 +2408,7 @@ _mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 5,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2419,7 +2417,7 @@ _mm512_cmple_epi16_mask (__m512i __X, __m512i __Y)
{
return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
(__v32hi) __Y, 2,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
#ifdef __OPTIMIZE__
diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 14a4e8869a..1dbb6b04e4 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -41,8 +41,7 @@ _mm512_broadcast_f64x2 (__m128d __A)
return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
__A,
_mm512_undefined_pd(),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m512d
@@ -72,9 +71,8 @@ _mm512_broadcast_i64x2 (__m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
__A,
- _mm512_undefined_si512(),
- (__mmask8) -
- 1);
+ _mm512_undefined_epi32(),
+ (__mmask8) -1);
}
extern __inline __m512i
@@ -104,8 +102,7 @@ _mm512_broadcast_f32x2 (__m128 __A)
{
return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
(__v16sf)_mm512_undefined_ps(),
- (__mmask16) -
- 1);
+ (__mmask16) -1);
}
extern __inline __m512
@@ -133,9 +130,8 @@ _mm512_broadcast_i32x2 (__m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si)
__A,
- (__v16si)_mm512_undefined_si512(),
- (__mmask16)
- -1);
+ (__v16si)_mm512_undefined_epi32(),
+ (__mmask16) -1);
}
extern __inline __m512i
@@ -165,8 +161,7 @@ _mm512_broadcast_f32x8 (__m256 __A)
{
return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
_mm512_undefined_ps(),
- (__mmask16) -
- 1);
+ (__mmask16) -1);
}
extern __inline __m512
@@ -194,9 +189,8 @@ _mm512_broadcast_i32x8 (__m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si)
__A,
- (__v16si)_mm512_undefined_si512(),
- (__mmask16)
- -1);
+ (__v16si)_mm512_undefined_epi32(),
+ (__mmask16) -1);
}
extern __inline __m512i
@@ -1569,8 +1563,7 @@ _mm512_extractf64x2_pd (__m512d __A, const int __imm)
__imm,
(__v2df)
_mm_setzero_pd (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128d
@@ -1640,8 +1633,7 @@ _mm512_extracti64x2_epi64 (__m512i __A, const int __imm)
__imm,
(__v2di)
_mm_setzero_di (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -1829,8 +1821,7 @@ _mm512_inserti64x2 (__m512i __A, __m128i __B, const int __imm)
__imm,
(__v8di)
_mm512_setzero_si512 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m512i
@@ -1869,8 +1860,7 @@ _mm512_insertf64x2 (__m512d __A, __m128d __B, const int __imm)
__imm,
(__v8df)
_mm512_setzero_pd (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m512d
@@ -1933,8 +1923,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
{
return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A,
__imm,
- (__mmask16) -
- 1);
+ (__mmask16) -1);
}
#else
diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index e009d8c55b..305ed555d4 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -130,12 +130,14 @@ _mm512_undefined_pd (void)
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_undefined_si512 (void)
+_mm512_undefined_epi32 (void)
{
__m512i __Y = __Y;
return __Y;
}
+#define _mm512_undefined_si512 _mm512_undefined_epi32
+
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_epi8 (char __A)
@@ -549,7 +551,7 @@ _mm512_sllv_epi32 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
(__v16si) __Y,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -581,7 +583,7 @@ _mm512_srav_epi32 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
(__v16si) __Y,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -613,7 +615,7 @@ _mm512_srlv_epi32 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
(__v16si) __Y,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -733,7 +735,7 @@ _mm512_srav_epi64 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -765,7 +767,7 @@ _mm512_srlv_epi64 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -825,7 +827,7 @@ _mm512_mul_epi32 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
(__v16si) __Y,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -884,7 +886,7 @@ _mm512_mul_epu32 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
(__v16si) __Y,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -915,7 +917,7 @@ _mm512_slli_epi64 (__m512i __A, unsigned int __B)
{
return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -941,7 +943,7 @@ _mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
#else
#define _mm512_slli_epi64(X, C) \
((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_undefined_si512 (),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
(__mmask8)-1))
#define _mm512_mask_slli_epi64(W, U, X, C) \
@@ -962,7 +964,7 @@ _mm512_sll_epi64 (__m512i __A, __m128i __B)
return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
(__v2di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -994,7 +996,7 @@ _mm512_srli_epi64 (__m512i __A, unsigned int __B)
{
return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1020,7 +1022,7 @@ _mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
#else
#define _mm512_srli_epi64(X, C) \
((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_undefined_si512 (),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
(__mmask8)-1))
#define _mm512_mask_srli_epi64(W, U, X, C) \
@@ -1041,7 +1043,7 @@ _mm512_srl_epi64 (__m512i __A, __m128i __B)
return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
(__v2di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1073,7 +1075,7 @@ _mm512_srai_epi64 (__m512i __A, unsigned int __B)
{
return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1099,7 +1101,7 @@ _mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
#else
#define _mm512_srai_epi64(X, C) \
((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_undefined_si512 (),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
(__mmask8)-1))
#define _mm512_mask_srai_epi64(W, U, X, C) \
@@ -1120,7 +1122,7 @@ _mm512_sra_epi64 (__m512i __A, __m128i __B)
return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
(__v2di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1152,7 +1154,7 @@ _mm512_slli_epi32 (__m512i __A, unsigned int __B)
{
return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1178,7 +1180,7 @@ _mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
#else
#define _mm512_slli_epi32(X, C) \
((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_undefined_si512 (),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
(__mmask16)-1))
#define _mm512_mask_slli_epi32(W, U, X, C) \
@@ -1199,7 +1201,7 @@ _mm512_sll_epi32 (__m512i __A, __m128i __B)
return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
(__v4si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1231,7 +1233,7 @@ _mm512_srli_epi32 (__m512i __A, unsigned int __B)
{
return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1257,7 +1259,7 @@ _mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
#else
#define _mm512_srli_epi32(X, C) \
((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_undefined_si512 (),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
(__mmask16)-1))
#define _mm512_mask_srli_epi32(W, U, X, C) \
@@ -1278,7 +1280,7 @@ _mm512_srl_epi32 (__m512i __A, __m128i __B)
return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
(__v4si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1310,7 +1312,7 @@ _mm512_srai_epi32 (__m512i __A, unsigned int __B)
{
return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1336,7 +1338,7 @@ _mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
#else
#define _mm512_srai_epi32(X, C) \
((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_undefined_si512 (),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
(__mmask16)-1))
#define _mm512_mask_srai_epi32(W, U, X, C) \
@@ -1357,7 +1359,7 @@ _mm512_sra_epi32 (__m512i __A, __m128i __B)
return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
(__v4si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1778,7 +1780,7 @@ _mm512_cvtepi8_epi32 (__m128i __A)
{
return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1807,7 +1809,7 @@ _mm512_cvtepi8_epi64 (__m128i __A)
{
return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1836,7 +1838,7 @@ _mm512_cvtepi16_epi32 (__m256i __A)
{
return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1865,7 +1867,7 @@ _mm512_cvtepi16_epi64 (__m128i __A)
{
return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1894,7 +1896,7 @@ _mm512_cvtepi32_epi64 (__m256i __X)
{
return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1923,7 +1925,7 @@ _mm512_cvtepu8_epi32 (__m128i __A)
{
return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -1952,7 +1954,7 @@ _mm512_cvtepu8_epi64 (__m128i __A)
{
return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -1981,7 +1983,7 @@ _mm512_cvtepu16_epi32 (__m256i __A)
{
return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -2010,7 +2012,7 @@ _mm512_cvtepu16_epi64 (__m128i __A)
{
return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -2039,7 +2041,7 @@ _mm512_cvtepu32_epi64 (__m256i __X)
{
return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -3407,7 +3409,7 @@ _mm512_abs_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -3436,7 +3438,7 @@ _mm512_abs_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -3521,7 +3523,7 @@ _mm512_broadcastd_epi32 (__m128i __A)
{
return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -3549,7 +3551,7 @@ _mm512_set1_epi32 (int __A)
{
return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16)(-1));
}
@@ -3577,7 +3579,7 @@ _mm512_broadcastq_epi64 (__m128i __A)
{
return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -3605,7 +3607,7 @@ _mm512_set1_epi64 (long long __A)
{
return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8)(-1));
}
@@ -3662,7 +3664,7 @@ _mm512_broadcast_i32x4 (__m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -3720,7 +3722,7 @@ _mm512_broadcast_i64x4 (__m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -3841,7 +3843,7 @@ _mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask)
return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
__mask,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -3874,7 +3876,7 @@ _mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm)
return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
(__v8di) __B, __imm,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -3909,7 +3911,7 @@ _mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm)
(__v16si) __B,
__imm,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -4009,7 +4011,7 @@ _mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B,
#else
#define _mm512_shuffle_epi32(X, C) \
((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_undefined_si512 (),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
(__mmask16)-1))
#define _mm512_mask_shuffle_epi32(W, U, X, C) \
@@ -4025,7 +4027,7 @@ _mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B,
#define _mm512_shuffle_i64x2(X, Y, C) \
((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \
(__v8di)(__m512i)(Y), (int)(C),\
- (__v8di)(__m512i)_mm512_undefined_si512 (),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
(__mmask8)-1))
#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C) \
@@ -4043,7 +4045,7 @@ _mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B,
#define _mm512_shuffle_i32x4(X, Y, C) \
((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \
(__v16si)(__m512i)(Y), (int)(C),\
- (__v16si)(__m512i)_mm512_undefined_si512 (),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
(__mmask16)-1))
#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C) \
@@ -4102,7 +4104,7 @@ _mm512_rolv_epi32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -4134,7 +4136,7 @@ _mm512_rorv_epi32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -4166,7 +4168,7 @@ _mm512_rolv_epi64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -4198,7 +4200,7 @@ _mm512_rorv_epi64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -4390,7 +4392,7 @@ _mm512_cvtt_roundps_epi32 (__m512 __A, const int __R)
{
return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1, __R);
}
@@ -4420,7 +4422,7 @@ _mm512_cvtt_roundps_epu32 (__m512 __A, const int __R)
{
return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1, __R);
}
@@ -4445,7 +4447,7 @@ _mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
}
#else
#define _mm512_cvtt_roundps_epi32(A, B) \
- ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+ ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B) \
((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B))
@@ -4454,7 +4456,7 @@ _mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
#define _mm512_cvtt_roundps_epu32(A, B) \
- ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+ ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B) \
((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B))
@@ -4470,7 +4472,7 @@ _mm512_cvt_roundps_epi32 (__m512 __A, const int __R)
{
return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1, __R);
}
@@ -4500,7 +4502,7 @@ _mm512_cvt_roundps_epu32 (__m512 __A, const int __R)
{
return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1, __R);
}
@@ -4525,7 +4527,7 @@ _mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
}
#else
#define _mm512_cvt_roundps_epi32(A, B) \
- ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+ ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
#define _mm512_mask_cvt_roundps_epi32(W, U, A, B) \
((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B))
@@ -4534,7 +4536,7 @@ _mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
#define _mm512_cvt_roundps_epu32(A, B) \
- ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+ ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
#define _mm512_mask_cvt_roundps_epu32(W, U, A, B) \
((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B))
@@ -4903,7 +4905,6 @@ extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsepi64_epi32 (__m512i __A)
{
- __v8si __O;
return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
(__v8si)
_mm256_undefined_si256 (),
@@ -5556,7 +5557,7 @@ _mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm)
(__v4di) __B,
__imm,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -5651,7 +5652,7 @@ _mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B,
#define _mm512_inserti64x4(X, Y, C) \
((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \
(__v4di)(__m256i) (Y), (int) (C), \
- (__v8di)(__m512i)_mm512_undefined_si512 (), \
+ (__v8di)(__m512i)_mm512_undefined_epi32 (), \
(__mmask8)-1))
#define _mm512_mask_inserti64x4(W, U, X, Y, C) \
@@ -6177,7 +6178,7 @@ _mm512_permutex_epi64 (__m512i __X, const int __I)
{
return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) (-1));
}
@@ -6248,7 +6249,7 @@ _mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M)
((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
(int)(I), \
(__v8di)(__m512i) \
- (_mm512_undefined_si512 ()),\
+ (_mm512_undefined_epi32 ()),\
(__mmask8)(-1)))
#define _mm512_maskz_permutex_epi64(M, X, I) \
@@ -6283,7 +6284,7 @@ _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
(__v8di) __X,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -6316,7 +6317,7 @@ _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
(__v16si) __X,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -6891,7 +6892,7 @@ _mm512_rol_epi32 (__m512i __A, const int __B)
{
return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -6920,7 +6921,7 @@ _mm512_ror_epi32 (__m512i __A, int __B)
{
return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -6949,7 +6950,7 @@ _mm512_rol_epi64 (__m512i __A, const int __B)
{
return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -6978,7 +6979,7 @@ _mm512_ror_epi64 (__m512i __A, int __B)
{
return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -7005,7 +7006,7 @@ _mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
#define _mm512_rol_epi32(A, B) \
((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \
(int)(B), \
- (__v16si)_mm512_undefined_si512 (), \
+ (__v16si)_mm512_undefined_epi32 (), \
(__mmask16)(-1)))
#define _mm512_mask_rol_epi32(W, U, A, B) \
((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \
@@ -7020,7 +7021,7 @@ _mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
#define _mm512_ror_epi32(A, B) \
((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \
(int)(B), \
- (__v16si)_mm512_undefined_si512 (), \
+ (__v16si)_mm512_undefined_epi32 (), \
(__mmask16)(-1)))
#define _mm512_mask_ror_epi32(W, U, A, B) \
((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \
@@ -7035,7 +7036,7 @@ _mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
#define _mm512_rol_epi64(A, B) \
((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \
(int)(B), \
- (__v8di)_mm512_undefined_si512 (), \
+ (__v8di)_mm512_undefined_epi32 (), \
(__mmask8)(-1)))
#define _mm512_mask_rol_epi64(W, U, A, B) \
((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \
@@ -7051,7 +7052,7 @@ _mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
#define _mm512_ror_epi64(A, B) \
((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \
(int)(B), \
- (__v8di)_mm512_undefined_si512 (), \
+ (__v8di)_mm512_undefined_epi32 (), \
(__mmask8)(-1)))
#define _mm512_mask_ror_epi64(W, U, A, B) \
((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \
@@ -7134,7 +7135,7 @@ _mm512_andnot_si512 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -7145,7 +7146,7 @@ _mm512_andnot_epi32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -7177,7 +7178,7 @@ _mm512_andnot_epi64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -7275,7 +7276,7 @@ _mm512_unpackhi_epi32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -7308,7 +7309,7 @@ _mm512_unpackhi_epi64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -7340,7 +7341,7 @@ _mm512_unpacklo_epi32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -7373,7 +7374,7 @@ _mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -8512,7 +8513,7 @@ _mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm)
return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
(__v16si) __B, __imm,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -8546,7 +8547,7 @@ _mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm)
return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
(__v8di) __B, __imm,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -8575,7 +8576,7 @@ _mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
#else
#define _mm512_alignr_epi32(X, Y, C) \
((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \
- (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_si512 (),\
+ (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_epi32 (),\
(__mmask16)-1))
#define _mm512_mask_alignr_epi32(W, U, X, Y, C) \
@@ -8590,7 +8591,7 @@ _mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
#define _mm512_alignr_epi64(X, Y, C) \
((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \
- (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_si512 (), \
+ (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_epi32 (), \
(__mmask8)-1))
#define _mm512_mask_alignr_epi64(W, U, X, Y, C) \
@@ -9130,9 +9131,9 @@ _mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
(__mmask8)-1))
#define _mm512_cmp_epi32_mask(X, Y, P) \
- ((__mmask8) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \
- (__v16si)(__m512i)(Y), (int)(P),\
- (__mmask16)-1))
+ ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)-1))
#define _mm512_cmp_epu64_mask(X, Y, P) \
((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \
@@ -9140,66 +9141,66 @@ _mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
(__mmask8)-1))
#define _mm512_cmp_epu32_mask(X, Y, P) \
- ((__mmask8) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \
- (__v16si)(__m512i)(Y), (int)(P),\
- (__mmask16)-1))
+ ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)-1))
-#define _mm512_cmp_round_pd_mask(X, Y, P, R) \
+#define _mm512_cmp_round_pd_mask(X, Y, P, R) \
((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \
(__v8df)(__m512d)(Y), (int)(P),\
(__mmask8)-1, R))
-#define _mm512_cmp_round_ps_mask(X, Y, P, R) \
+#define _mm512_cmp_round_ps_mask(X, Y, P, R) \
((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \
(__v16sf)(__m512)(Y), (int)(P),\
(__mmask16)-1, R))
-#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) \
+#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) \
((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \
(__v8di)(__m512i)(Y), (int)(P),\
(__mmask8)M))
-#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) \
- ((__mmask8) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \
- (__v16si)(__m512i)(Y), (int)(P),\
- (__mmask16)M))
+#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)M))
-#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) \
+#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) \
((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \
(__v8di)(__m512i)(Y), (int)(P),\
(__mmask8)M))
-#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) \
- ((__mmask8) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \
- (__v16si)(__m512i)(Y), (int)(P),\
- (__mmask16)M))
+#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)M))
-#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) \
+#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) \
((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \
(__v8df)(__m512d)(Y), (int)(P),\
(__mmask8)M, R))
-#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) \
+#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) \
((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \
(__v16sf)(__m512)(Y), (int)(P),\
(__mmask16)M, R))
-#define _mm_cmp_round_sd_mask(X, Y, P, R) \
+#define _mm_cmp_round_sd_mask(X, Y, P, R) \
((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (int)(P),\
(__mmask8)-1, R))
-#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (int)(P),\
(M), R))
-#define _mm_cmp_round_ss_mask(X, Y, P, R) \
+#define _mm_cmp_round_ss_mask(X, Y, P, R) \
((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (int)(P), \
(__mmask8)-1, R))
-#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (int)(P), \
(M), R))
@@ -9306,7 +9307,7 @@ extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32gather_epi32 (__m512i __index, int const *__addr, int __scale)
{
- __m512i v1_old = _mm512_undefined_si512 ();
+ __m512i v1_old = _mm512_undefined_epi32 ();
__mmask16 mask = 0xFFFF;
return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) v1_old,
@@ -9330,7 +9331,7 @@ extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32gather_epi64 (__m256i __index, long long const *__addr, int __scale)
{
- __m512i v1_old = _mm512_undefined_si512 ();
+ __m512i v1_old = _mm512_undefined_epi32 ();
__mmask8 mask = 0xFF;
return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) v1_old,
@@ -9379,7 +9380,7 @@ extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64gather_epi64 (__m512i __index, long long const *__addr, int __scale)
{
- __m512i v1_old = _mm512_undefined_si512 ();
+ __m512i v1_old = _mm512_undefined_epi32 ();
__mmask8 mask = 0xFF;
return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) v1_old,
@@ -9591,7 +9592,7 @@ _mm512_mask_i64scatter_epi64 (long long *__addr, __mmask8 __mask,
(__mmask8)MASK, (int)SCALE)
#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE) \
- (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_si512 (), \
+ (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_epi32 (), \
(int const *)ADDR, \
(__v16si)(__m512i)INDEX, \
(__mmask16)0xFFFF, (int)SCALE)
@@ -9603,7 +9604,7 @@ _mm512_mask_i64scatter_epi64 (long long *__addr, __mmask8 __mask,
(__mmask16)MASK, (int)SCALE)
#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE) \
- (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_si512 (), \
+ (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_epi32 (), \
(long long const *)ADDR, \
(__v8si)(__m256i)INDEX, \
(__mmask8)0xFF, (int)SCALE)
@@ -9627,7 +9628,7 @@ _mm512_mask_i64scatter_epi64 (long long *__addr, __mmask8 __mask,
(__mmask8)MASK, (int)SCALE)
#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE) \
- (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_si512 (), \
+ (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_epi32 (), \
(long long const *)ADDR, \
(__v8di)(__m512i)INDEX, \
(__mmask8)0xFF, (int)SCALE)
@@ -10123,7 +10124,7 @@ _mm512_max_epi64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -10154,7 +10155,7 @@ _mm512_min_epi64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -10185,7 +10186,7 @@ _mm512_max_epu64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -10216,7 +10217,7 @@ _mm512_min_epu64 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask8) -1);
}
@@ -10247,7 +10248,7 @@ _mm512_max_epi32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -10278,7 +10279,7 @@ _mm512_min_epi32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -10309,7 +10310,7 @@ _mm512_max_epu32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -10340,7 +10341,7 @@ _mm512_min_epu32 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1);
}
@@ -11804,7 +11805,7 @@ _mm512_cvttps_epi32 (__m512 __A)
{
return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1,
_MM_FROUND_CUR_DIRECTION);
}
@@ -11836,7 +11837,7 @@ _mm512_cvttps_epu32 (__m512 __A)
{
return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1,
_MM_FROUND_CUR_DIRECTION);
}
@@ -11868,7 +11869,7 @@ _mm512_cvtps_epi32 (__m512 __A)
{
return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1,
_MM_FROUND_CUR_DIRECTION);
}
@@ -11900,7 +11901,7 @@ _mm512_cvtps_epu32 (__m512 __A)
{
return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
(__v16si)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask16) -1,
_MM_FROUND_CUR_DIRECTION);
}
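
Editorial note, not part of the patch: the hunks above only swap the don't-care source operand from _mm512_undefined_si512 () to the equivalent _mm512_undefined_epi32 (). A minimal usage sketch of that idiom, assuming <immintrin.h> and -mavx512f:

    #include <immintrin.h>

    /* With an all-ones mask every lane is written, so the pass-through
       source may be an undefined vector; its contents never matter.  */
    __m512i
    add_all_lanes (__m512i a, __m512i b)
    {
      return _mm512_mask_add_epi32 (_mm512_undefined_epi32 (),
                                    (__mmask16) -1, a, b);
    }
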
diff --git a/gcc/config/i386/avx512ifmaintrin.h b/gcc/config/i386/avx512ifmaintrin.h
index c2f43111e1..c50aa65d02 100644
--- a/gcc/config/i386/avx512ifmaintrin.h
+++ b/gcc/config/i386/avx512ifmaintrin.h
@@ -41,7 +41,7 @@ _mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __m512i
@@ -51,7 +51,7 @@ _mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __m512i
diff --git a/gcc/config/i386/avx512ifmavlintrin.h b/gcc/config/i386/avx512ifmavlintrin.h
index 9091f899ca..6c496f78ce 100644
--- a/gcc/config/i386/avx512ifmavlintrin.h
+++ b/gcc/config/i386/avx512ifmavlintrin.h
@@ -41,7 +41,7 @@ _mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -51,7 +51,7 @@ _mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -61,7 +61,7 @@ _mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -71,7 +71,7 @@ _mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __m128i
diff --git a/gcc/config/i386/avx512vbmiintrin.h b/gcc/config/i386/avx512vbmiintrin.h
index a00cf70f52..a2ad07a6a2 100644
--- a/gcc/config/i386/avx512vbmiintrin.h
+++ b/gcc/config/i386/avx512vbmiintrin.h
@@ -62,7 +62,7 @@ _mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
(__v64qi) __Y,
(__v64qi)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask64) -1);
}
@@ -73,7 +73,7 @@ _mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
(__v64qi) __A,
(__v64qi)
- _mm512_undefined_si512 (),
+ _mm512_undefined_epi32 (),
(__mmask64) -1);
}
@@ -108,8 +108,7 @@ _mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
/* idx */ ,
(__v64qi) __A,
(__v64qi) __B,
- (__mmask64) -
- 1);
+ (__mmask64) -1);
}
extern __inline __m512i
diff --git a/gcc/config/i386/avx512vbmivlintrin.h b/gcc/config/i386/avx512vbmivlintrin.h
index 4af9fb9b01..04308ead42 100644
--- a/gcc/config/i386/avx512vbmivlintrin.h
+++ b/gcc/config/i386/avx512vbmivlintrin.h
@@ -173,8 +173,7 @@ _mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
/* idx */ ,
(__v32qi) __A,
(__v32qi) __B,
- (__mmask32) -
- 1);
+ (__mmask32) -1);
}
extern __inline __m256i
@@ -224,8 +223,7 @@ _mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
/* idx */ ,
(__v16qi) __A,
(__v16qi) __B,
- (__mmask16) -
- 1);
+ (__mmask16) -1);
}
extern __inline __m128i
diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h
index f260526d06..cf94755ea3 100644
--- a/gcc/config/i386/avx512vlbwintrin.h
+++ b/gcc/config/i386/avx512vlbwintrin.h
@@ -575,8 +575,7 @@ _mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
/* idx */ ,
(__v16hi) __A,
(__v16hi) __B,
- (__mmask16) -
- 1);
+ (__mmask16) -1);
}
extern __inline __m256i
@@ -626,8 +625,7 @@ _mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
/* idx */ ,
(__v8hi) __A,
(__v8hi) __B,
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -2009,7 +2007,7 @@ _mm256_cmpneq_epi8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 4,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2018,7 +2016,7 @@ _mm256_cmplt_epi8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 1,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2027,7 +2025,7 @@ _mm256_cmpge_epi8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 5,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -2036,7 +2034,7 @@ _mm256_cmple_epi8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 2,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask16
@@ -2045,7 +2043,7 @@ _mm256_cmpneq_epi16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 4,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2054,7 +2052,7 @@ _mm256_cmplt_epi16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 1,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2063,7 +2061,7 @@ _mm256_cmpge_epi16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 5,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2072,7 +2070,7 @@ _mm256_cmple_epi16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 2,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2081,7 +2079,7 @@ _mm_cmpneq_epu8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 4,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2090,7 +2088,7 @@ _mm_cmplt_epu8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 1,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2099,7 +2097,7 @@ _mm_cmpge_epu8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 5,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2108,7 +2106,7 @@ _mm_cmple_epu8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 2,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask8
@@ -2117,7 +2115,7 @@ _mm_cmpneq_epu16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -2126,7 +2124,7 @@ _mm_cmplt_epu16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -2135,7 +2133,7 @@ _mm_cmpge_epu16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -2144,7 +2142,7 @@ _mm_cmple_epu16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask16
@@ -2153,7 +2151,7 @@ _mm_cmpneq_epi8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 4,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2162,7 +2160,7 @@ _mm_cmplt_epi8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 1,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2171,7 +2169,7 @@ _mm_cmpge_epi8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 5,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -2180,7 +2178,7 @@ _mm_cmple_epi8_mask (__m128i __X, __m128i __Y)
{
return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
(__v16qi) __Y, 2,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask8
@@ -2189,7 +2187,7 @@ _mm_cmpneq_epi16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -2198,7 +2196,7 @@ _mm_cmplt_epi16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -2207,7 +2205,7 @@ _mm_cmpge_epi16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -2216,7 +2214,7 @@ _mm_cmple_epi16_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
(__v8hi) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -3609,7 +3607,7 @@ _mm256_cmpneq_epu8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 4,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -3618,7 +3616,7 @@ _mm256_cmplt_epu8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 1,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -3627,7 +3625,7 @@ _mm256_cmpge_epu8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 5,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -3636,7 +3634,7 @@ _mm256_cmple_epu8_mask (__m256i __X, __m256i __Y)
{
return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
(__v32qi) __Y, 2,
- (__mmask32) - 1);
+ (__mmask32) -1);
}
extern __inline __mmask16
@@ -3645,7 +3643,7 @@ _mm256_cmpneq_epu16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 4,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -3654,7 +3652,7 @@ _mm256_cmplt_epu16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 1,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -3663,7 +3661,7 @@ _mm256_cmpge_epu16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 5,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -3672,7 +3670,7 @@ _mm256_cmple_epu16_mask (__m256i __X, __m256i __Y)
{
return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
(__v16hi) __Y, 2,
- (__mmask16) - 1);
+ (__mmask16) -1);
}
extern __inline void
diff --git a/gcc/config/i386/avx512vldqintrin.h b/gcc/config/i386/avx512vldqintrin.h
index 697b81c401..5ff0a52639 100644
--- a/gcc/config/i386/avx512vldqintrin.h
+++ b/gcc/config/i386/avx512vldqintrin.h
@@ -389,8 +389,7 @@ _mm256_broadcast_f64x2 (__m128d __A)
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
__A,
(__v4df)_mm256_undefined_pd(),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256d
@@ -421,8 +420,7 @@ _mm256_broadcast_i64x2 (__m128i __A)
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
__A,
(__v4di)_mm256_undefined_si256(),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -452,8 +450,7 @@ _mm256_broadcast_f32x2 (__m128 __A)
{
return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
(__v8sf)_mm256_undefined_ps(),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256
@@ -482,8 +479,7 @@ _mm256_broadcast_i32x2 (__m128i __A)
return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
__A,
(__v8si)_mm256_undefined_si256(),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -514,8 +510,7 @@ _mm_broadcast_i32x2 (__m128i __A)
return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
__A,
(__v4si)_mm_undefined_si128(),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -1351,8 +1346,7 @@ _mm256_extractf64x2_pd (__m256d __A, const int __imm)
__imm,
(__v2df)
_mm_setzero_pd (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128d
@@ -1388,8 +1382,7 @@ _mm256_extracti64x2_epi64 (__m256i __A, const int __imm)
__imm,
(__v2di)
_mm_setzero_di (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -1743,8 +1736,7 @@ _mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm)
__imm,
(__v4di)
_mm256_setzero_si256 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -1783,8 +1775,7 @@ _mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm)
__imm,
(__v4df)
_mm256_setzero_pd (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256d
diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h
index d0ffb2b4d3..d627ad86da 100644
--- a/gcc/config/i386/avx512vlintrin.h
+++ b/gcc/config/i386/avx512vlintrin.h
@@ -2698,8 +2698,7 @@ _mm256_broadcast_f32x4 (__m128 __A)
{
return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
(__v8sf)_mm256_undefined_pd (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256
@@ -2728,8 +2727,7 @@ _mm256_broadcast_i32x4 (__m128i __A)
return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
__A,
(__v8si)_mm256_undefined_si256 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -6293,8 +6291,7 @@ _mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B)
/* idx */ ,
(__v4df) __A,
(__v4df) __B,
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256d
@@ -6584,8 +6581,7 @@ _mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B)
/* idx */ ,
(__v2df) __A,
(__v2df) __B,
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128d
@@ -8259,8 +8255,7 @@ _mm256_conflict_epi64 (__m256i __A)
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
(__v4di)
_mm256_setzero_si256 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -8291,8 +8286,7 @@ _mm256_conflict_epi32 (__m256i __A)
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
(__v8si)
_mm256_setzero_si256 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -8381,8 +8375,7 @@ _mm_conflict_epi64 (__m128i __A)
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_di (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -8413,8 +8406,7 @@ _mm_conflict_epi32 (__m128i __A)
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
(__v4si)
_mm_setzero_si128 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -9291,8 +9283,7 @@ _mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm)
__imm,
(__v8si)
_mm256_setzero_si256 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m256i
@@ -9367,8 +9358,7 @@ _mm256_extracti32x4_epi32 (__m256i __A, const int __imm)
__imm,
(__v4si)
_mm_setzero_si128 (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128i
@@ -9404,8 +9394,7 @@ _mm256_extractf32x4_ps (__m256 __A, const int __imm)
__imm,
(__v4sf)
_mm_setzero_ps (),
- (__mmask8) -
- 1);
+ (__mmask8) -1);
}
extern __inline __m128
@@ -11797,7 +11786,7 @@ _mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
(__v8si) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11815,7 +11804,7 @@ _mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
(__v8si) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11833,7 +11822,7 @@ _mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
(__v8si) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11851,7 +11840,7 @@ _mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
(__v8si) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11869,7 +11858,7 @@ _mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
(__v4di) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11887,7 +11876,7 @@ _mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
(__v4di) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11905,7 +11894,7 @@ _mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
(__v4di) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11923,7 +11912,7 @@ _mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
(__v4di) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11941,7 +11930,7 @@ _mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
(__v8si) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11959,7 +11948,7 @@ _mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
(__v8si) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11977,7 +11966,7 @@ _mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
(__v8si) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -11995,7 +11984,7 @@ _mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
(__v8si) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12013,7 +12002,7 @@ _mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
(__v4di) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12031,7 +12020,7 @@ _mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
(__v4di) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12049,7 +12038,7 @@ _mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
(__v4di) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12067,7 +12056,7 @@ _mm256_cmple_epi64_mask (__m256i __X, __m256i __Y)
{
return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
(__v4di) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12085,7 +12074,7 @@ _mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
(__v4si) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12103,7 +12092,7 @@ _mm_cmplt_epu32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
(__v4si) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12121,7 +12110,7 @@ _mm_cmpge_epu32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
(__v4si) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12139,7 +12128,7 @@ _mm_cmple_epu32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
(__v4si) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12157,7 +12146,7 @@ _mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
(__v2di) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12175,7 +12164,7 @@ _mm_cmplt_epu64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
(__v2di) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12193,7 +12182,7 @@ _mm_cmpge_epu64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
(__v2di) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12211,7 +12200,7 @@ _mm_cmple_epu64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
(__v2di) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12229,7 +12218,7 @@ _mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
(__v4si) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12247,7 +12236,7 @@ _mm_cmplt_epi32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
(__v4si) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12265,7 +12254,7 @@ _mm_cmpge_epi32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
(__v4si) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12283,7 +12272,7 @@ _mm_cmple_epi32_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
(__v4si) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12301,7 +12290,7 @@ _mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
(__v2di) __Y, 4,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12319,7 +12308,7 @@ _mm_cmplt_epi64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
(__v2di) __Y, 1,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12337,7 +12326,7 @@ _mm_cmpge_epi64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
(__v2di) __Y, 5,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -12355,7 +12344,7 @@ _mm_cmple_epi64_mask (__m128i __X, __m128i __Y)
{
return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
(__v2di) __Y, 2,
- (__mmask8) - 1);
+ (__mmask8) -1);
}
#else
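
Editorial note, not part of the patch: the unsigned/signed compare wrappers touched above pass the VPCMP predicate straight through to the builtin (0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = not-LT i.e. >=, as the constants in the hunks show). A hedged sketch of one of them at the user level, assuming <immintrin.h> and -mavx512f -mavx512vl:

    #include <immintrin.h>

    /* _mm256_cmpge_epu32_mask expands to __builtin_ia32_ucmpd256_mask
       with predicate 5, as in the hunk above.  */
    __mmask8
    ge_epu32 (__m256i x, __m256i y)
    {
      return _mm256_cmpge_epu32_mask (x, y);
    }
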
diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index afdc546c06..1a4c701ad1 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -141,8 +141,12 @@
"(ix86_fpmath & FPMATH_387) ? FLOAT_REGS : NO_REGS"
"@internal Any x87 register when 80387 FP arithmetic is enabled.")
+;; Yr constraint is meant to be used in noavx contexts only, for VEX and EVEX
+;; the lower register numbers need the same instruction sizes as any other.
+;; In case Yr constraint is misused, try to limit the damage, by treating
+;; it as x constraint in avx mode, not v constraint.
(define_register_constraint "Yr"
- "TARGET_SSE ? (X86_TUNE_AVOID_4BYTE_PREFIXES ? NO_REX_SSE_REGS : ALL_SSE_REGS) : NO_REGS"
+ "TARGET_SSE ? ((TARGET_AVOID_4BYTE_PREFIXES && !TARGET_AVX) ? NO_REX_SSE_REGS : SSE_REGS) : NO_REGS"
"@internal Lower SSE register when avoiding REX prefix and all SSE registers otherwise.")
;; We use the B prefix to denote any number of internal operands:
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index b12146663e..a9d5135f67 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -637,33 +637,27 @@ const char *host_detect_local_cpu (int argc, const char **argv)
}
else if (vendor == signature_CENTAUR_ebx)
{
- if (arch)
+ processor = PROCESSOR_GENERIC;
+
+ switch (family)
{
- switch (family)
- {
- case 6:
- if (model > 9)
- /* Use the default detection procedure. */
- processor = PROCESSOR_GENERIC;
- else if (model == 9)
- cpu = "c3-2";
- else if (model >= 6)
- cpu = "c3";
- else
- processor = PROCESSOR_GENERIC;
- break;
- case 5:
- if (has_3dnow)
- cpu = "winchip2";
- else if (has_mmx)
- cpu = "winchip2-c6";
- else
- processor = PROCESSOR_GENERIC;
- break;
- default:
- /* We have no idea. */
- processor = PROCESSOR_GENERIC;
- }
+ default:
+ /* We have no idea. */
+ break;
+
+ case 5:
+ if (has_3dnow || has_mmx)
+ processor = PROCESSOR_I486;
+ break;
+
+ case 6:
+ if (model > 9 || has_longmode)
+ /* Use the default detection procedure. */
+ ;
+ else if (model == 9)
+ processor = PROCESSOR_PENTIUMPRO;
+ else if (model >= 6)
+ processor = PROCESSOR_I486;
}
}
else
@@ -694,7 +688,18 @@ const char *host_detect_local_cpu (int argc, const char **argv)
/* Default. */
break;
case PROCESSOR_I486:
- cpu = "i486";
+ if (arch && vendor == signature_CENTAUR_ebx)
+ {
+ if (model >= 6)
+ cpu = "c3";
+ else if (has_3dnow)
+ cpu = "winchip2";
+ else
+ /* Assume WinChip C6. */
+ cpu = "winchip-c6";
+ }
+ else
+ cpu = "i486";
break;
case PROCESSOR_PENTIUM:
if (arch && has_mmx)
@@ -817,8 +822,13 @@ const char *host_detect_local_cpu (int argc, const char **argv)
/* It is Pentium M. */
cpu = "pentium-m";
else if (has_sse)
- /* It is Pentium III. */
- cpu = "pentium3";
+ {
+ if (vendor == signature_CENTAUR_ebx)
+ cpu = "c3-2";
+ else
+ /* It is Pentium III. */
+ cpu = "pentium3";
+ }
else if (has_mmx)
/* It is Pentium II. */
cpu = "pentium2";
@@ -902,6 +912,11 @@ const char *host_detect_local_cpu (int argc, const char **argv)
else
cpu = "prescott";
}
+ else if (has_longmode)
+ /* Perhaps some emulator? Assume x86-64, otherwise gcc
+ -march=native would be unusable for 64-bit compilations,
+ as all the CPUs below are 32-bit only. */
+ cpu = "x86-64";
else if (has_sse2)
cpu = "pentium4";
else if (has_cmov)
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index b892f08679..86139db4ab 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -292,6 +292,7 @@ DEF_FUNCTION_TYPE (V8DF, V4DF)
DEF_FUNCTION_TYPE (V8DF, V2DF)
DEF_FUNCTION_TYPE (V16SI, V4SI)
DEF_FUNCTION_TYPE (V16SI, V8SI)
+DEF_FUNCTION_TYPE (V16SI, V16SF)
DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, UHI)
DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, UQI)
DEF_FUNCTION_TYPE (V8DI, PV8DI)
@@ -1035,14 +1036,17 @@ DEF_FUNCTION_TYPE (VOID, QI, V8DI, PCINT, INT, INT)
DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V8DF_FTYPE_V8DF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V8SF_FTYPE_V8SF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V16SF_FTYPE_V16SF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V2DF_V2DF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V4DF_V4DF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V16SI_FTYPE_V8DF_V8DF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V16SI_FTYPE_V16SF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DF_V2DF, PTEST)
DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DI_V2DI, PTEST)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 3d044e8bd6..c1bdcc7ee2 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18787,12 +18787,29 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
of the register, once we have that information we may be able
to handle some of them more efficiently. */
if (can_create_pseudo_p ()
- && register_operand (op0, mode)
&& (CONSTANT_P (op1)
|| (SUBREG_P (op1)
&& CONSTANT_P (SUBREG_REG (op1))))
- && !standard_sse_constant_p (op1))
- op1 = validize_mem (force_const_mem (mode, op1));
+ && ((register_operand (op0, mode)
+ && !standard_sse_constant_p (op1))
+ /* ix86_expand_vector_move_misalign() does not like constants. */
+ || (SSE_REG_MODE_P (mode)
+ && MEM_P (op0)
+ && MEM_ALIGN (op0) < align)))
+ {
+ if (SUBREG_P (op1))
+ {
+ machine_mode imode = GET_MODE (SUBREG_REG (op1));
+ rtx r = force_const_mem (imode, SUBREG_REG (op1));
+ if (r)
+ r = validize_mem (r);
+ else
+ r = force_reg (imode, SUBREG_REG (op1));
+ op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
+ }
+ else
+ op1 = validize_mem (force_const_mem (mode, op1));
+ }
/* We need to check memory alignment for SSE mode since attribute
can make operands unaligned. */
@@ -18803,13 +18820,8 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
rtx tmp[2];
- /* ix86_expand_vector_move_misalign() does not like constants ... */
- if (CONSTANT_P (op1)
- || (SUBREG_P (op1)
- && CONSTANT_P (SUBREG_REG (op1))))
- op1 = validize_mem (force_const_mem (mode, op1));
-
- /* ... nor both arguments in memory. */
+ /* ix86_expand_vector_move_misalign() does not like both
+ arguments in memory. */
if (!register_operand (op0, mode)
&& !register_operand (op1, mode))
op1 = force_reg (mode, op1);
@@ -18895,7 +18907,7 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
m = adjust_address (op0, mode, 0);
emit_insn (extract (m, op1, const0_rtx));
m = adjust_address (op0, mode, 16);
- emit_insn (extract (m, op1, const1_rtx));
+ emit_insn (extract (m, copy_rtx (op1), const1_rtx));
}
else
emit_insn (store_unaligned (op0, op1));
@@ -19203,7 +19215,7 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
m = adjust_address (op0, V2SFmode, 0);
emit_insn (gen_sse_storelps (m, op1));
m = adjust_address (op0, V2SFmode, 8);
- emit_insn (gen_sse_storehps (m, op1));
+ emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
}
}
}
@@ -23027,17 +23039,33 @@ ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
switch (code)
{
case EQ:
- return 0x08;
+ return 0x00;
case NE:
return 0x04;
case GT:
- return 0x16;
+ return 0x0e;
case LE:
- return 0x1a;
+ return 0x02;
case GE:
- return 0x15;
+ return 0x0d;
case LT:
- return 0x19;
+ return 0x01;
+ case UNLE:
+ return 0x0a;
+ case UNLT:
+ return 0x09;
+ case UNGE:
+ return 0x05;
+ case UNGT:
+ return 0x06;
+ case UNEQ:
+ return 0x18;
+ case LTGT:
+ return 0x0c;
+ case ORDERED:
+ return 0x07;
+ case UNORDERED:
+ return 0x03;
default:
gcc_unreachable ();
}
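
Editorial note, not part of the patch: the rewritten table maps GCC's RTL comparison codes onto the documented VCMP predicate immediates (0x00 = EQ_OQ, 0x01 = LT_OS, 0x0e = GT_OS, and so on). An illustrative sketch of what the GT entry selects at the intrinsics level, assuming <immintrin.h> and -mavx512f:

    #include <immintrin.h>

    /* GT maps to immediate 0x0e, i.e. the _CMP_GT_OS predicate.  */
    __mmask8
    gt_mask (__m512d x, __m512d y)
    {
      return _mm512_cmp_pd_mask (x, y, _CMP_GT_OS);
    }
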
@@ -30686,7 +30714,7 @@ enum ix86_builtins
IX86_BUILTIN_CVTPD2PS512,
IX86_BUILTIN_CVTPD2UDQ512,
IX86_BUILTIN_CVTPH2PS512,
- IX86_BUILTIN_CVTPS2DQ512,
+ IX86_BUILTIN_CVTPS2DQ512_MASK,
IX86_BUILTIN_CVTPS2PD512,
IX86_BUILTIN_CVTPS2PH512,
IX86_BUILTIN_CVTPS2UDQ512,
@@ -32126,14 +32154,25 @@ enum ix86_builtins
IX86_BUILTIN_COPYSIGNQ,
/* Vectorizer support builtins. */
- IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512,
IX86_BUILTIN_CPYSGNPS,
IX86_BUILTIN_CPYSGNPD,
IX86_BUILTIN_CPYSGNPS256,
IX86_BUILTIN_CPYSGNPS512,
IX86_BUILTIN_CPYSGNPD256,
IX86_BUILTIN_CPYSGNPD512,
+ IX86_BUILTIN_FLOORPS512,
+ IX86_BUILTIN_FLOORPD512,
+ IX86_BUILTIN_CEILPS512,
+ IX86_BUILTIN_CEILPD512,
+ IX86_BUILTIN_TRUNCPS512,
+ IX86_BUILTIN_TRUNCPD512,
+ IX86_BUILTIN_CVTPS2DQ512,
+ IX86_BUILTIN_VEC_PACK_SFIX512,
+ IX86_BUILTIN_FLOORPS_SFIX512,
IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512,
+ IX86_BUILTIN_CEILPS_SFIX512,
+ IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512,
+ IX86_BUILTIN_ROUNDPS_AZ_SFIX512,
IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512,
@@ -32787,9 +32826,9 @@ static const struct builtin_description bdesc_special_args[] =
{ OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
{ OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
{ OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
+ { OPTION_MASK_ISA_LWP | OPTION_MASK_ISA_64BIT, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
{ OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
+ { OPTION_MASK_ISA_LWP | OPTION_MASK_ISA_64BIT, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
/* FSGSBASE */
{ OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
@@ -33724,12 +33763,12 @@ static const struct builtin_description bdesc_args[] =
/* BMI */
{ OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
- { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+ { OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
{ OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
/* TBM */
{ OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
- { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+ { OPTION_MASK_ISA_TBM | OPTION_MASK_ISA_64BIT, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
/* F16C */
{ OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
@@ -33739,11 +33778,11 @@ static const struct builtin_description bdesc_args[] =
/* BMI2 */
{ OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
- { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+ { OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
{ OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
- { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+ { OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
{ OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
- { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+ { OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
/* AVX512F */
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_si512_256si, "__builtin_ia32_si512_256si", IX86_BUILTIN_SI512_SI256, UNKNOWN, (int) V16SI_FTYPE_V8SI },
@@ -33948,6 +33987,17 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF },
{ OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) V16SF_FTYPE_V16SF_ROUND },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) V16SF_FTYPE_V16SF_ROUND },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) V16SF_FTYPE_V16SF_ROUND },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd512, "__builtin_ia32_floorpd512", IX86_BUILTIN_FLOORPD512, (enum rtx_code) ROUND_FLOOR, (int) V8DF_FTYPE_V8DF_ROUND },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd512, "__builtin_ia32_ceilpd512", IX86_BUILTIN_CEILPD512, (enum rtx_code) ROUND_CEIL, (int) V8DF_FTYPE_V8DF_ROUND },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd512, "__builtin_ia32_truncpd512", IX86_BUILTIN_TRUNCPD512, (enum rtx_code) ROUND_TRUNC, (int) V8DF_FTYPE_V8DF_ROUND },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fix_notruncv16sfv16si, "__builtin_ia32_cvtps2dq512", IX86_BUILTIN_CVTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_pack_sfix_v8df, "__builtin_ia32_vec_pack_sfix512", IX86_BUILTIN_VEC_PACK_SFIX512, UNKNOWN, (int) V16SI_FTYPE_V8DF_V8DF },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_roundv16sf2_sfix, "__builtin_ia32_roundps_az_sfix512", IX86_BUILTIN_ROUNDPS_AZ_SFIX512, UNKNOWN, (int) V16SI_FTYPE_V16SF },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundps512_sfix, "__builtin_ia32_floorps_sfix512", IX86_BUILTIN_FLOORPS_SFIX512, (enum rtx_code) ROUND_FLOOR, (int) V16SI_FTYPE_V16SF_ROUND },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundps512_sfix, "__builtin_ia32_ceilps_sfix512", IX86_BUILTIN_CEILPS_SFIX512, (enum rtx_code) ROUND_CEIL, (int) V16SI_FTYPE_V16SF_ROUND },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_roundv8df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix512", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512, UNKNOWN, (int) V16SI_FTYPE_V8DF_V8DF },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__builtin_ia32_floorpd_vec_pack_sfix512", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512, (enum rtx_code) ROUND_FLOOR, (int) V16SI_FTYPE_V8DF_V8DF_ROUND },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__builtin_ia32_ceilpd_vec_pack_sfix512", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512, (enum rtx_code) ROUND_CEIL, (int) V16SI_FTYPE_V8DF_V8DF_ROUND },
@@ -34864,7 +34914,7 @@ static const struct builtin_description bdesc_round_args[] =
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtpd2ps512_mask_round, "__builtin_ia32_cvtpd2ps512_mask", IX86_BUILTIN_CVTPD2PS512, UNKNOWN, (int) V8SF_FTYPE_V8DF_V8SF_QI_INT },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_ufix_notruncv8dfv8si2_mask_round, "__builtin_ia32_cvtpd2udq512_mask", IX86_BUILTIN_CVTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtph2ps512_mask_round, "__builtin_ia32_vcvtph2ps512_mask", IX86_BUILTIN_CVTPH2PS512, UNKNOWN, (int) V16SF_FTYPE_V16HI_V16SF_HI_INT },
- { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2dq512_mask", IX86_BUILTIN_CVTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT },
+ { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2dq512_mask", IX86_BUILTIN_CVTPS2DQ512_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtps2pd512_mask_round, "__builtin_ia32_cvtps2pd512_mask", IX86_BUILTIN_CVTPS2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SF_V8DF_QI_INT },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ufix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2udq512_mask", IX86_BUILTIN_CVTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT },
{ OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvtsd2ss_round, "__builtin_ia32_cvtsd2ss_round", IX86_BUILTIN_CVTSD2SS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_INT },
@@ -38441,10 +38491,13 @@ ix86_expand_args_builtin (const struct builtin_description *d,
{
case V2DF_FTYPE_V2DF_ROUND:
case V4DF_FTYPE_V4DF_ROUND:
+ case V8DF_FTYPE_V8DF_ROUND:
case V4SF_FTYPE_V4SF_ROUND:
case V8SF_FTYPE_V8SF_ROUND:
+ case V16SF_FTYPE_V16SF_ROUND:
case V4SI_FTYPE_V4SF_ROUND:
case V8SI_FTYPE_V8SF_ROUND:
+ case V16SI_FTYPE_V16SF_ROUND:
return ix86_expand_sse_round (d, exp, target);
case V4SI_FTYPE_V2DF_V2DF_ROUND:
case V8SI_FTYPE_V4DF_V4DF_ROUND:
@@ -38558,6 +38611,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16SI_FTYPE_V8SI:
case V16SF_FTYPE_V4SF:
case V16SI_FTYPE_V4SI:
+ case V16SI_FTYPE_V16SF:
case V16SF_FTYPE_V16SF:
case V8DI_FTYPE_UQI:
case V8DF_FTYPE_V4DF:
@@ -39315,6 +39369,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case 5:
pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
args[2].op, args[3].op, args[4].op);
+ break;
case 6:
pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
args[2].op, args[3].op, args[4].op,
@@ -39689,6 +39744,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case 5:
pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
args[2].op, args[3].op, args[4].op);
+ break;
case 6:
pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
args[2].op, args[3].op, args[4].op,
@@ -42263,6 +42319,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
else if (out_n == 8 && in_n == 8)
return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
}
break;
@@ -42288,6 +42346,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
else if (out_n == 8 && in_n == 8)
return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
}
break;
@@ -42300,6 +42360,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
else if (out_n == 8 && in_n == 4)
return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
+ else if (out_n == 16 && in_n == 8)
+ return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
}
if (out_mode == SImode && in_mode == SFmode)
{
@@ -42307,6 +42369,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
else if (out_n == 8 && in_n == 8)
return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
}
break;
@@ -42332,6 +42396,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
else if (out_n == 8 && in_n == 8)
return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
}
break;
@@ -42346,6 +42412,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
else if (out_n == 4 && in_n == 4)
return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
+ else if (out_n == 8 && in_n == 8)
+ return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
}
if (out_mode == SFmode && in_mode == SFmode)
{
@@ -42353,6 +42421,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
else if (out_n == 8 && in_n == 8)
return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
}
break;
@@ -42367,6 +42437,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_CEILPD);
else if (out_n == 4 && in_n == 4)
return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
+ else if (out_n == 8 && in_n == 8)
+ return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
}
if (out_mode == SFmode && in_mode == SFmode)
{
@@ -42374,6 +42446,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_CEILPS);
else if (out_n == 8 && in_n == 8)
return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
}
break;
@@ -42388,6 +42462,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
else if (out_n == 4 && in_n == 4)
return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
+ else if (out_n == 8 && in_n == 8)
+ return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
}
if (out_mode == SFmode && in_mode == SFmode)
{
@@ -42395,6 +42471,8 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
else if (out_n == 8 && in_n == 8)
return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
}
break;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d0b418b0fd..ec306f37d3 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -465,6 +465,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
#define TARGET_VECTOR_PARALLEL_EXECUTION \
ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
+#define TARGET_AVOID_4BYTE_PREFIXES \
+ ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES]
#define TARGET_FUSE_CMP_AND_BRANCH_32 \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
#define TARGET_FUSE_CMP_AND_BRANCH_64 \
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index cf29e5d39d..7fbbea619e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3089,7 +3089,7 @@
(match_operand:TF 1 "general_operand" "C ,xm,x,*roF,*rC"))]
"(TARGET_64BIT || TARGET_SSE)
&& !(MEM_P (operands[0]) && MEM_P (operands[1]))
- && (!can_create_pseudo_p ()
+ && (lra_in_progress || reload_completed
|| (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
|| !CONST_DOUBLE_P (operands[1])
|| (optimize_function_for_size_p (cfun)
@@ -3167,7 +3167,7 @@
(match_operand:XF 1 "general_operand"
"fm,f,G,roF,r , *roF,*r,F ,C,roF,rF"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
- && (!can_create_pseudo_p ()
+ && (lra_in_progress || reload_completed
|| (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
|| !CONST_DOUBLE_P (operands[1])
|| (optimize_function_for_size_p (cfun)
@@ -3240,7 +3240,7 @@
(match_operand:DF 1 "general_operand"
"Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,Yj,r ,roF,rF,rmF,rC"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
- && (!can_create_pseudo_p ()
+ && (lra_in_progress || reload_completed
|| (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
|| !CONST_DOUBLE_P (operands[1])
|| (optimize_function_for_size_p (cfun)
@@ -3442,7 +3442,7 @@
(match_operand:SF 1 "general_operand"
"Yf*fm,Yf*f,G ,rmF,rF,C,v,m,v,Yj,r ,*y ,m ,*y,*Yn,r ,rmF,rF"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
- && (!can_create_pseudo_p ()
+ && (lra_in_progress || reload_completed
|| (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
|| !CONST_DOUBLE_P (operands[1])
|| (optimize_function_for_size_p (cfun)
@@ -9332,7 +9332,7 @@
})
(define_split
- [(set (match_operand:SF 0 "register_operand")
+ [(set (match_operand:SF 0 "general_reg_operand")
(match_operator:SF 1 "absneg_operator" [(match_dup 0)]))
(use (match_operand:V4SF 2))
(clobber (reg:CC FLAGS_REG))]
@@ -9356,7 +9356,7 @@
})
(define_split
- [(set (match_operand:DF 0 "register_operand")
+ [(set (match_operand:DF 0 "general_reg_operand")
(match_operator:DF 1 "absneg_operator" [(match_dup 0)]))
(use (match_operand 2))
(clobber (reg:CC FLAGS_REG))]
@@ -9394,7 +9394,7 @@
})
(define_split
- [(set (match_operand:XF 0 "register_operand")
+ [(set (match_operand:XF 0 "general_reg_operand")
(match_operator:XF 1 "absneg_operator" [(match_dup 0)]))
(use (match_operand 2))
(clobber (reg:CC FLAGS_REG))]
@@ -11078,20 +11078,19 @@
(const_int 1))
(clobber (reg:CC FLAGS_REG))])]
"TARGET_64BIT && !TARGET_USE_BT"
- [(const_int 0)]
+ [(parallel [(set (match_dup 0)
+ (ior:DI (match_dup 0) (match_dup 3)))
+ (clobber (reg:CC FLAGS_REG))])]
{
int i = INTVAL (operands[1]);
- rtx op1 = gen_int_mode (HOST_WIDE_INT_1U << i, DImode);
+ operands[3] = gen_int_mode (HOST_WIDE_INT_1U << i, DImode);
- if (i >= 31)
+ if (!x86_64_immediate_operand (operands[3], DImode))
{
- emit_move_insn (operands[2], op1);
- op1 = operands[2];
+ emit_move_insn (operands[2], operands[3]);
+ operands[3] = operands[2];
}
-
- emit_insn (gen_iordi3 (operands[0], operands[0], op1));
- DONE;
})
(define_peephole2
@@ -11103,20 +11102,19 @@
(const_int 0))
(clobber (reg:CC FLAGS_REG))])]
"TARGET_64BIT && !TARGET_USE_BT"
- [(const_int 0)]
+ [(parallel [(set (match_dup 0)
+ (and:DI (match_dup 0) (match_dup 3)))
+ (clobber (reg:CC FLAGS_REG))])]
{
int i = INTVAL (operands[1]);
- rtx op1 = gen_int_mode (HOST_WIDE_INT_1U << i, DImode);
+ operands[3] = gen_int_mode (~(HOST_WIDE_INT_1U << i), DImode);
- if (i >= 32)
+ if (!x86_64_immediate_operand (operands[3], DImode))
{
- emit_move_insn (operands[2], op1);
- op1 = operands[2];
+ emit_move_insn (operands[2], operands[3]);
+ operands[3] = operands[2];
}
-
- emit_insn (gen_anddi3 (operands[0], operands[0], op1));
- DONE;
})
(define_peephole2
@@ -11129,20 +11127,19 @@
(match_dup 0) (const_int 1) (match_dup 1))))
(clobber (reg:CC FLAGS_REG))])]
"TARGET_64BIT && !TARGET_USE_BT"
- [(const_int 0)]
+ [(parallel [(set (match_dup 0)
+ (xor:DI (match_dup 0) (match_dup 3)))
+ (clobber (reg:CC FLAGS_REG))])]
{
int i = INTVAL (operands[1]);
- rtx op1 = gen_int_mode (HOST_WIDE_INT_1U << i, DImode);
+ operands[3] = gen_int_mode (HOST_WIDE_INT_1U << i, DImode);
- if (i >= 31)
+ if (!x86_64_immediate_operand (operands[3], DImode))
{
- emit_move_insn (operands[2], op1);
- op1 = operands[2];
+ emit_move_insn (operands[2], operands[3]);
+ operands[3] = operands[2];
}
-
- emit_insn (gen_xordi3 (operands[0], operands[0], op1));
- DONE;
})
(define_insn "*bt<mode>"
@@ -11859,8 +11856,7 @@
"(peep2_reg_dead_p (3, operands[1])
|| operands_match_p (operands[1], operands[3]))
&& ! reg_overlap_mentioned_p (operands[3], operands[0])
- && ! (GET_CODE (operands[4]) == CLOBBER
- && reg_mentioned_p (operands[3], operands[4]))"
+ && ! reg_set_p (operands[3], operands[4])"
[(parallel [(set (match_dup 5) (match_dup 0))
(match_dup 4)])
(set (strict_low_part (match_dup 6))
@@ -11904,8 +11900,7 @@
"(peep2_reg_dead_p (3, operands[1])
|| operands_match_p (operands[1], operands[3]))
&& ! reg_overlap_mentioned_p (operands[3], operands[0])
- && ! (GET_CODE (operands[4]) == CLOBBER
- && reg_mentioned_p (operands[3], operands[4]))"
+ && ! reg_set_p (operands[3], operands[4])"
[(parallel [(set (match_dup 5) (match_dup 0))
(match_dup 4)])
(set (strict_low_part (match_dup 6))
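A minimal C sketch (not taken from the patch; function names are hypothetical) of the constant single-bit set/clear/flip operations that the reworked TARGET_64BIT && !TARGET_USE_BT peephole2 patterns above are aimed at:

/* Each update uses a compile-time-constant bit index.  When the shifted
   mask is a valid sign-extended 32-bit immediate, the new split emits
   or/and/xor with that immediate directly; otherwise the mask is first
   moved into the scratch register captured as operand 2.  */
unsigned long long set_bit12   (unsigned long long x) { return x | (1ULL << 12); }
unsigned long long clear_bit12 (unsigned long long x) { return x & ~(1ULL << 12); }
unsigned long long flip_bit12  (unsigned long long x) { return x ^ (1ULL << 12); }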
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 14e80d9b48..93dda7bb0e 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -637,7 +637,7 @@
(op, mode == VOIDmode ? mode : Pmode)")
(match_operand 0 "call_register_no_elim_operand")
(ior (and (not (match_test "TARGET_X32"))
- (match_operand 0 "sibcall_memory_operand"))
+ (match_operand 0 "memory_operand"))
(and (match_test "TARGET_X32 && Pmode == DImode")
(match_operand 0 "GOT_memory_operand")))))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1ffb3b9a86..42506efc52 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4488,7 +4488,7 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "<mask_codefor>avx512f_fix_notruncv16sfv16si<mask_name><round_name>"
+(define_insn "avx512f_fix_notruncv16sfv16si<mask_name><round_name>"
[(set (match_operand:V16SI 0 "register_operand" "=v")
(unspec:V16SI
[(match_operand:V16SF 1 "<round_nimm_predicate>" "<round_constraint>")]
@@ -5046,7 +5046,7 @@
(set_attr "ssememalign" "64")
(set_attr "mode" "V2DF")])
-(define_insn "<mask_codefor>avx512f_cvtpd2dq512<mask_name><round_name>"
+(define_insn "avx512f_cvtpd2dq512<mask_name><round_name>"
[(set (match_operand:V8SI 0 "register_operand" "=v")
(unspec:V8SI
[(match_operand:V8DF 1 "<round_nimm_predicate>" "<round_constraint>")]
@@ -6006,6 +6006,23 @@
DONE;
})
+(define_expand "avx512f_vec_pack_sfix_v8df"
+ [(match_operand:V16SI 0 "register_operand")
+ (match_operand:V8DF 1 "nonimmediate_operand")
+ (match_operand:V8DF 2 "nonimmediate_operand")]
+ "TARGET_AVX512F"
+{
+ rtx r1, r2;
+
+ r1 = gen_reg_rtx (V8SImode);
+ r2 = gen_reg_rtx (V8SImode);
+
+ emit_insn (gen_avx512f_cvtpd2dq512 (r1, operands[1]));
+ emit_insn (gen_avx512f_cvtpd2dq512 (r2, operands[2]));
+ emit_insn (gen_avx_vec_concatv16si (operands[0], r1, r2));
+ DONE;
+})
+
(define_expand "vec_pack_sfix_v4df"
[(match_operand:V8SI 0 "register_operand")
(match_operand:V4DF 1 "nonimmediate_operand")
@@ -10566,22 +10583,23 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<shift_insn><mode>3<mask_name>"
- [(set (match_operand:VI48_AVX2 0 "register_operand" "=x,v")
+ [(set (match_operand:VI48_AVX2 0 "register_operand" "=x,x,v")
(any_lshift:VI48_AVX2
- (match_operand:VI48_AVX2 1 "register_operand" "0,v")
- (match_operand:SI 2 "nonmemory_operand" "xN,vN")))]
+ (match_operand:VI48_AVX2 1 "register_operand" "0,x,v")
+ (match_operand:SI 2 "nonmemory_operand" "xN,xN,vN")))]
"TARGET_SSE2 && <mask_mode512bit_condition>"
"@
p<vshift><ssemodesuffix>\t{%2, %0|%0, %2}
- vp<vshift><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
- [(set_attr "isa" "noavx,avx")
+ vp<vshift><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}
+ vp<vshift><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+ [(set_attr "isa" "noavx,avx,avx512bw")
(set_attr "type" "sseishft")
(set (attr "length_immediate")
(if_then_else (match_operand 2 "const_int_operand")
(const_string "1")
(const_string "0")))
- (set_attr "prefix_data16" "1,*")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix_data16" "1,*,*")
+ (set_attr "prefix" "orig,vex,evex")
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<shift_insn><mode>3<mask_name>"
@@ -15246,13 +15264,25 @@
DONE;
})
-(define_expand "avx512f_roundpd512"
- [(match_operand:V8DF 0 "register_operand")
- (match_operand:V8DF 1 "nonimmediate_operand")
+(define_expand "avx512f_round<castmode>512"
+ [(match_operand:VF_512 0 "register_operand")
+ (match_operand:VF_512 1 "nonimmediate_operand")
(match_operand:SI 2 "const_0_to_15_operand")]
"TARGET_AVX512F"
{
- emit_insn (gen_avx512f_rndscalev8df (operands[0], operands[1], operands[2]));
+ emit_insn (gen_avx512f_rndscale<mode> (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "avx512f_roundps512_sfix"
+ [(match_operand:V16SI 0 "register_operand")
+ (match_operand:V16SF 1 "nonimmediate_operand")
+ (match_operand:SI 2 "const_0_to_15_operand")]
+ "TARGET_AVX512F"
+{
+ rtx tmp = gen_reg_rtx (V16SFmode);
+ emit_insn (gen_avx512f_rndscalev16sf (tmp, operands[1], operands[2]));
+ emit_insn (gen_fix_truncv16sfv16si2 (operands[0], tmp));
DONE;
})
@@ -15352,7 +15382,7 @@
(define_expand "round<mode>2_sfix"
[(match_operand:<sseintvecmode> 0 "register_operand")
- (match_operand:VF1_128_256 1 "register_operand")]
+ (match_operand:VF1 1 "register_operand")]
"TARGET_ROUND && !flag_trapping_math"
{
rtx tmp = gen_reg_rtx (<MODE>mode);
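A hedged C sketch of loops the new 512-bit expanders above (avx512f_vec_pack_sfix_v8df, avx512f_roundps512_sfix) appear intended to serve, judging from the pattern names and the existing 128/256-bit *_sfix expanders; the function and array names are hypothetical, and the exact mapping is decided by the vectorizer:

/* Candidates for -O3 -mavx512f -ffast-math style options: narrow rounded
   floating-point data to int in one pass.  */
void
narrow_rinted (int *restrict out, const double *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = (int) __builtin_rint (in[i]);    /* round per current mode, then narrow */
}

void
narrow_rounded (int *restrict out, const float *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = (int) __builtin_roundf (in[i]);  /* round half away from zero */
}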
diff --git a/gcc/config/microblaze/rtems.h b/gcc/config/microblaze/rtems.h
index 68aa381b37..56f3f7087d 100644
--- a/gcc/config/microblaze/rtems.h
+++ b/gcc/config/microblaze/rtems.h
@@ -23,3 +23,10 @@ along with GCC; see the file COPYING3. If not see
builtin_define( "__rtems__" ); \
builtin_assert( "system=rtems" ); \
} while (0)
+
+/* Redefine to include only items relevant for RTEMS */
+#undef LINK_SPEC
+#define LINK_SPEC "%{shared:-shared} -N -relax \
+ %{mbig-endian:-EB --oformat=elf32-microblaze} \
+ %{mlittle-endian:-EL --oformat=elf32-microblazeel} \
+ %{mxl-gp-opt:%{G*}} %{!mxl-gp-opt: -G 0}"
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index b088cf89bb..a6c90b633f 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -483,7 +483,7 @@ nvptx_strict_argument_naming (cumulative_args_t cum_v)
static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
- if (!cfun->machine->doing_call)
+ if (!cfun || !cfun->machine->doing_call)
/* Pretend to return in a hard reg for early uses before pseudos can be
generated. */
return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
@@ -502,6 +502,7 @@ nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
if (outgoing)
{
+ gcc_assert (cfun);
cfun->machine->return_mode = mode;
return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
}
diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c
index 8b1c8327c7..5fb1da78a3 100644
--- a/gcc/config/pa/pa.c
+++ b/gcc/config/pa/pa.c
@@ -4541,63 +4541,78 @@ hppa_profile_hook (int label_no)
lcla2 and load_offset_label_address insn patterns. */
rtx reg = gen_reg_rtx (SImode);
rtx_code_label *label_rtx = gen_label_rtx ();
- rtx begin_label_rtx;
+ rtx mcount = gen_rtx_MEM (Pmode, gen_rtx_SYMBOL_REF (Pmode, "_mcount"));
+ int reg_parm_stack_space = REG_PARM_STACK_SPACE (NULL_TREE);
+ rtx arg_bytes, begin_label_rtx;
rtx_insn *call_insn;
char begin_label_name[16];
+ bool use_mcount_pcrel_call;
+
+ /* If we can reach _mcount with a pc-relative call, we can optimize
+ loading the address of the current function. This requires linker
+ long branch stub support. */
+ if (!TARGET_PORTABLE_RUNTIME
+ && !TARGET_LONG_CALLS
+ && (TARGET_SOM || flag_function_sections))
+ use_mcount_pcrel_call = TRUE;
+ else
+ use_mcount_pcrel_call = FALSE;
ASM_GENERATE_INTERNAL_LABEL (begin_label_name, FUNC_BEGIN_PROLOG_LABEL,
label_no);
begin_label_rtx = gen_rtx_SYMBOL_REF (SImode, ggc_strdup (begin_label_name));
- if (TARGET_64BIT)
- emit_move_insn (arg_pointer_rtx,
- gen_rtx_PLUS (word_mode, virtual_outgoing_args_rtx,
- GEN_INT (64)));
-
emit_move_insn (gen_rtx_REG (word_mode, 26), gen_rtx_REG (word_mode, 2));
- /* The address of the function is loaded into %r25 with an instruction-
- relative sequence that avoids the use of relocations. The sequence
- is split so that the load_offset_label_address instruction can
- occupy the delay slot of the call to _mcount. */
- if (TARGET_PA_20)
- emit_insn (gen_lcla2 (reg, label_rtx));
- else
- emit_insn (gen_lcla1 (reg, label_rtx));
-
- emit_insn (gen_load_offset_label_address (gen_rtx_REG (SImode, 25),
- reg, begin_label_rtx, label_rtx));
-
-#if !NO_DEFERRED_PROFILE_COUNTERS
- {
- rtx count_label_rtx, addr, r24;
- char count_label_name[16];
+ if (!use_mcount_pcrel_call)
+ {
+ /* The address of the function is loaded into %r25 with an instruction-
+ relative sequence that avoids the use of relocations. The sequence
+ is split so that the load_offset_label_address instruction can
+ occupy the delay slot of the call to _mcount. */
+ if (TARGET_PA_20)
+ emit_insn (gen_lcla2 (reg, label_rtx));
+ else
+ emit_insn (gen_lcla1 (reg, label_rtx));
- funcdef_nos.safe_push (label_no);
- ASM_GENERATE_INTERNAL_LABEL (count_label_name, "LP", label_no);
- count_label_rtx = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (count_label_name));
+ emit_insn (gen_load_offset_label_address (gen_rtx_REG (SImode, 25),
+ reg,
+ begin_label_rtx,
+ label_rtx));
+ }
- addr = force_reg (Pmode, count_label_rtx);
- r24 = gen_rtx_REG (Pmode, 24);
- emit_move_insn (r24, addr);
+ if (!NO_DEFERRED_PROFILE_COUNTERS)
+ {
+ rtx count_label_rtx, addr, r24;
+ char count_label_name[16];
- call_insn =
- emit_call_insn (gen_call (gen_rtx_MEM (Pmode,
- gen_rtx_SYMBOL_REF (Pmode,
- "_mcount")),
- GEN_INT (TARGET_64BIT ? 24 : 12)));
+ funcdef_nos.safe_push (label_no);
+ ASM_GENERATE_INTERNAL_LABEL (count_label_name, "LP", label_no);
+ count_label_rtx = gen_rtx_SYMBOL_REF (Pmode,
+ ggc_strdup (count_label_name));
- use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn), r24);
- }
-#else
+ addr = force_reg (Pmode, count_label_rtx);
+ r24 = gen_rtx_REG (Pmode, 24);
+ emit_move_insn (r24, addr);
- call_insn =
- emit_call_insn (gen_call (gen_rtx_MEM (Pmode,
- gen_rtx_SYMBOL_REF (Pmode,
- "_mcount")),
- GEN_INT (TARGET_64BIT ? 16 : 8)));
+ arg_bytes = GEN_INT (TARGET_64BIT ? 24 : 12);
+ if (use_mcount_pcrel_call)
+ call_insn = emit_call_insn (gen_call_mcount (mcount, arg_bytes,
+ begin_label_rtx));
+ else
+ call_insn = emit_call_insn (gen_call (mcount, arg_bytes));
-#endif
+ use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn), r24);
+ }
+ else
+ {
+ arg_bytes = GEN_INT (TARGET_64BIT ? 16 : 8);
+ if (use_mcount_pcrel_call)
+ call_insn = emit_call_insn (gen_call_mcount (mcount, arg_bytes,
+ begin_label_rtx));
+ else
+ call_insn = emit_call_insn (gen_call (mcount, arg_bytes));
+ }
use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn), gen_rtx_REG (SImode, 25));
use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn), gen_rtx_REG (SImode, 26));
@@ -4605,6 +4620,10 @@ hppa_profile_hook (int label_no)
/* Indicate the _mcount call cannot throw, nor will it execute a
non-local goto. */
make_reg_eh_region_note_nothrow_nononlocal (call_insn);
+
+ /* Allocate space for fixed arguments. */
+ if (reg_parm_stack_space > crtl->outgoing_args_size)
+ crtl->outgoing_args_size = reg_parm_stack_space;
}
/* Fetch the return address for the frame COUNT steps up from
diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md
index 75100d2d69..2c52465ca8 100644
--- a/gcc/config/pa/pa.md
+++ b/gcc/config/pa/pa.md
@@ -8216,6 +8216,170 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
(cond [(and (const_int 0) (eq (const_int 0) (pc))) (const_int 12)]
(symbol_ref "pa_attr_length_indirect_call (insn)")))])
+/* Expand special pc-relative call to _mcount. */
+
+(define_expand "call_mcount"
+ [(parallel [(call (match_operand:SI 0 "" "")
+ (match_operand 1 "" ""))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:SI 2))])]
+ "!TARGET_PORTABLE_RUNTIME"
+ "
+{
+ rtx op = XEXP (operands[0], 0);
+ rtx nb = operands[1];
+ rtx lab = operands[2];
+
+ if (TARGET_64BIT)
+ {
+ rtx r4 = gen_rtx_REG (word_mode, 4);
+ emit_move_insn (arg_pointer_rtx,
+ gen_rtx_PLUS (word_mode, virtual_outgoing_args_rtx,
+ GEN_INT (64)));
+ emit_call_insn (gen_call_mcount_64bit (op, nb, lab, r4));
+ }
+ else
+ {
+ if (flag_pic)
+ {
+ rtx r4 = gen_rtx_REG (word_mode, 4);
+ emit_call_insn (gen_call_mcount_pic (op, nb, lab, r4));
+ }
+ else
+ emit_call_insn (gen_call_mcount_nonpic (op, nb, lab));
+ }
+
+ DONE;
+}")
+
+(define_insn "call_mcount_nonpic"
+ [(call (mem:SI (match_operand 0 "call_operand_address" ""))
+ (match_operand 1 "" "i"))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:SI 2))]
+ "!TARGET_PORTABLE_RUNTIME && !TARGET_64BIT"
+ "*
+{
+ pa_output_arg_descriptor (insn);
+ return \"{bl|b,l} %0,%%r2\;ldo %2-.-4(%%r2),%%r25\";
+}"
+ [(set_attr "type" "multi")
+ (set_attr "length" "8")])
+
+(define_insn "call_mcount_pic"
+ [(call (mem:SI (match_operand 0 "call_operand_address" ""))
+ (match_operand 1 "" "i"))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:SI 2))
+ (clobber (match_operand 3))
+ (use (reg:SI 19))]
+ "!TARGET_PORTABLE_RUNTIME && !TARGET_64BIT"
+ "#")
+
+(define_split
+ [(parallel [(call (mem:SI (match_operand 0 "call_operand_address" ""))
+ (match_operand 1 "" ""))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:SI 2))
+ (clobber (match_operand 3))
+ (use (reg:SI 19))])]
+ "!TARGET_PORTABLE_RUNTIME && !TARGET_64BIT && reload_completed"
+ [(set (match_dup 3) (reg:SI 19))
+ (parallel [(call (mem:SI (match_dup 0))
+ (match_dup 1))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_dup 2)
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:SI 2))
+ (use (reg:SI 19))])
+ (set (reg:SI 19) (match_dup 3))]
+ "")
+
+(define_insn "*call_mcount_pic_post_reload"
+ [(call (mem:SI (match_operand 0 "call_operand_address" ""))
+ (match_operand 1 "" "i"))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:SI 2))
+ (use (reg:SI 19))]
+ "!TARGET_PORTABLE_RUNTIME && !TARGET_64BIT"
+ "*
+{
+ pa_output_arg_descriptor (insn);
+ return \"{bl|b,l} %0,%%r2\;ldo %2-.-4(%%r2),%%r25\";
+}"
+ [(set_attr "type" "multi")
+ (set_attr "length" "8")])
+
+(define_insn "call_mcount_64bit"
+ [(call (mem:SI (match_operand 0 "call_operand_address" ""))
+ (match_operand 1 "" "i"))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:DI 2))
+ (clobber (match_operand 3))
+ (use (reg:DI 27))
+ (use (reg:DI 29))]
+ "TARGET_64BIT"
+ "#")
+
+(define_split
+ [(parallel [(call (mem:SI (match_operand 0 "call_operand_address" ""))
+ (match_operand 1 "" ""))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:DI 2))
+ (clobber (match_operand 3))
+ (use (reg:DI 27))
+ (use (reg:DI 29))])]
+ "TARGET_64BIT && reload_completed"
+ [(set (match_dup 3) (reg:DI 27))
+ (parallel [(call (mem:SI (match_dup 0))
+ (match_dup 1))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_dup 2)
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:DI 2))
+ (use (reg:DI 27))
+ (use (reg:DI 29))])
+ (set (reg:DI 27) (match_dup 3))]
+ "")
+
+(define_insn "*call_mcount_64bit_post_reload"
+ [(call (mem:SI (match_operand 0 "call_operand_address" ""))
+ (match_operand 1 "" "i"))
+ (set (reg:SI 25)
+ (plus:SI (reg:SI 2)
+ (minus:SI (match_operand 2 "" "")
+ (plus:SI (pc) (const_int 4)))))
+ (clobber (reg:DI 2))
+ (use (reg:DI 27))
+ (use (reg:DI 29))]
+ "TARGET_64BIT"
+ "{bl|b,l} %0,%%r2\;ldo %2-.-4(%%r2),%%r25"
+ [(set_attr "type" "multi")
+ (set_attr "length" "8")])
+
;; Call subroutine returning any type.
(define_expand "untyped_call"
diff --git a/gcc/config/rl78/rl78-expand.md b/gcc/config/rl78/rl78-expand.md
index 331eec1e90..4fd195865a 100644
--- a/gcc/config/rl78/rl78-expand.md
+++ b/gcc/config/rl78/rl78-expand.md
@@ -159,7 +159,7 @@
[(set (match_operand:HI 0 "register_operand")
(mult:HI (zero_extend:HI (match_operand:QI 1 "register_operand"))
(zero_extend:HI (match_operand:QI 2 "register_operand"))))]
- "!TARGET_G10"
+ ""
""
)
diff --git a/gcc/config/rl78/rl78-real.md b/gcc/config/rl78/rl78-real.md
index aacaefff9e..530b2fe90f 100644
--- a/gcc/config/rl78/rl78-real.md
+++ b/gcc/config/rl78/rl78-real.md
@@ -179,7 +179,7 @@
[(set (match_operand:HI 0 "register_operand" "=A,A")
(mult:HI (match_operand:HI 1 "rl78_nonfar_operand" "0,0")
(match_operand:HI 2 "rl78_24_operand" "N,i")))]
- "rl78_real_insns_ok () && !TARGET_G10"
+ "rl78_real_insns_ok ()"
"@
shlw\t%0, 1
shlw\t%0, 2"
@@ -189,7 +189,7 @@
[(set (match_operand:HI 0 "nonimmediate_operand" "=A")
(mult:HI (zero_extend:HI (match_operand:QI 1 "general_operand" "%a"))
(zero_extend:HI (match_operand:QI 2 "general_operand" "x"))))]
- "rl78_real_insns_ok () && !TARGET_G10"
+ "rl78_real_insns_ok ()"
"mulu\t%2"
)
diff --git a/gcc/config/rl78/rl78-virt.md b/gcc/config/rl78/rl78-virt.md
index e2e7f4750e..8d1b2a8731 100644
--- a/gcc/config/rl78/rl78-virt.md
+++ b/gcc/config/rl78/rl78-virt.md
@@ -116,7 +116,7 @@
[(set (match_operand:HI 0 "register_operand" "=v")
(mult:HI (match_operand:HI 1 "rl78_nonfar_operand" "%vim")
(match_operand:HI 2 "rl78_24_operand" "Ni")))]
- "rl78_virt_insns_ok () && !TARGET_G10"
+ "rl78_virt_insns_ok ()"
"v.mulu\t%0, %1, %2"
[(set_attr "valloc" "umul")]
)
@@ -125,7 +125,7 @@
[(set (match_operand:HI 0 "register_operand" "=v")
(mult:HI (zero_extend:HI (match_operand:QI 1 "rl78_nonfar_operand" "%vim"))
(zero_extend:HI (match_operand:QI 2 "general_operand" "vim"))))]
- "rl78_virt_insns_ok () && !TARGET_G10"
+ "rl78_virt_insns_ok ()"
"v.mulu\t%0, %2"
[(set_attr "valloc" "umul")]
)
diff --git a/gcc/config/rs6000/40x.md b/gcc/config/rs6000/40x.md
index 91e5cffaa3..98d9ae02ba 100644
--- a/gcc/config/rs6000/40x.md
+++ b/gcc/config/rs6000/40x.md
@@ -119,6 +119,6 @@
"bpu_40x")
(define_insn_reservation "ppc405-float" 11
- (and (eq_attr "type" "fpload,fpstore,fpcompare,fp,dmul,sdiv,ddiv")
+ (and (eq_attr "type" "fpload,fpstore,fpcompare,fp,fpsimple,dmul,sdiv,ddiv")
(eq_attr "cpu" "ppc405"))
"fpu_405*10")
diff --git a/gcc/config/rs6000/440.md b/gcc/config/rs6000/440.md
index 6d07ef3ea3..c33f4accb0 100644
--- a/gcc/config/rs6000/440.md
+++ b/gcc/config/rs6000/440.md
@@ -107,7 +107,7 @@
"ppc440_issue,ppc440_f_pipe+ppc440_i_pipe")
(define_insn_reservation "ppc440-fp" 5
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul")
(eq_attr "cpu" "ppc440"))
"ppc440_issue,ppc440_f_pipe")
diff --git a/gcc/config/rs6000/476.md b/gcc/config/rs6000/476.md
index 8c266b992d..4cae8fcc9e 100644
--- a/gcc/config/rs6000/476.md
+++ b/gcc/config/rs6000/476.md
@@ -124,7 +124,7 @@
ppc476_f_pipe+ppc476_i_pipe")
(define_insn_reservation "ppc476-fp" 6
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul")
(eq_attr "cpu" "ppc476"))
"ppc476_issue_fp,\
ppc476_f_pipe")
diff --git a/gcc/config/rs6000/601.md b/gcc/config/rs6000/601.md
index e34c9bf20f..aa869d86d8 100644
--- a/gcc/config/rs6000/601.md
+++ b/gcc/config/rs6000/601.md
@@ -86,7 +86,7 @@
"(fpu_ppc601+iu_ppc601*2),nothing*2,bpu_ppc601")
(define_insn_reservation "ppc601-fp" 4
- (and (eq_attr "type" "fp")
+ (and (eq_attr "type" "fp,fpsimple")
(eq_attr "cpu" "ppc601"))
"fpu_ppc601")
diff --git a/gcc/config/rs6000/603.md b/gcc/config/rs6000/603.md
index 3b07461bf0..052c1c1c95 100644
--- a/gcc/config/rs6000/603.md
+++ b/gcc/config/rs6000/603.md
@@ -105,7 +105,7 @@
"(fpu_603+iu_603*2),bpu_603")
(define_insn_reservation "ppc603-fp" 3
- (and (eq_attr "type" "fp")
+ (and (eq_attr "type" "fp,fpsimple")
(eq_attr "cpu" "ppc603"))
"fpu_603")
diff --git a/gcc/config/rs6000/6xx.md b/gcc/config/rs6000/6xx.md
index 29893aeeef..3ab80a2b26 100644
--- a/gcc/config/rs6000/6xx.md
+++ b/gcc/config/rs6000/6xx.md
@@ -160,7 +160,7 @@
"fpu_6xx")
(define_insn_reservation "ppc604-fp" 3
- (and (eq_attr "type" "fp")
+ (and (eq_attr "type" "fp,fpsimple")
(eq_attr "cpu" "ppc604,ppc604e,ppc620"))
"fpu_6xx")
diff --git a/gcc/config/rs6000/7450.md b/gcc/config/rs6000/7450.md
index 8146369399..0ebf6fa0cd 100644
--- a/gcc/config/rs6000/7450.md
+++ b/gcc/config/rs6000/7450.md
@@ -120,7 +120,7 @@
"ppc7450_du,fpu_7450")
(define_insn_reservation "ppc7450-fp" 5
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul")
(eq_attr "cpu" "ppc7450"))
"ppc7450_du,fpu_7450")
@@ -162,7 +162,7 @@
;; Altivec
(define_insn_reservation "ppc7450-vecsimple" 1
- (and (eq_attr "type" "vecsimple")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove")
(eq_attr "cpu" "ppc7450"))
"ppc7450_du,ppc7450_vec_du,vecsmpl_7450")
@@ -172,7 +172,7 @@
"ppc7450_du,ppc7450_vec_du,veccmplx_7450")
(define_insn_reservation "ppc7450-veccmp" 2
- (and (eq_attr "type" "veccmp")
+ (and (eq_attr "type" "veccmp,veccmpfx")
(eq_attr "cpu" "ppc7450"))
"ppc7450_du,ppc7450_vec_du,veccmplx_7450")
diff --git a/gcc/config/rs6000/7xx.md b/gcc/config/rs6000/7xx.md
index 1da48b77fd..70e2eb17f1 100644
--- a/gcc/config/rs6000/7xx.md
+++ b/gcc/config/rs6000/7xx.md
@@ -113,7 +113,7 @@
"ppc750_du,fpu_7xx")
(define_insn_reservation "ppc750-fp" 3
- (and (eq_attr "type" "fp")
+ (and (eq_attr "type" "fp,fpsimple")
(eq_attr "cpu" "ppc750,ppc7400"))
"ppc750_du,fpu_7xx")
@@ -165,7 +165,7 @@
;; Altivec
(define_insn_reservation "ppc7400-vecsimple" 1
- (and (eq_attr "type" "vecsimple,veccmp")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove,veccmp,veccmpfx")
(eq_attr "cpu" "ppc7400"))
"ppc750_du,ppc7400_vec_du,veccmplx_7xx")
diff --git a/gcc/config/rs6000/8540.md b/gcc/config/rs6000/8540.md
index ae4e45f89b..f39f1f6751 100644
--- a/gcc/config/rs6000/8540.md
+++ b/gcc/config/rs6000/8540.md
@@ -190,7 +190,7 @@
;; Simple vector
(define_insn_reservation "ppc8540_simple_vector" 1
- (and (eq_attr "type" "vecsimple")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove")
(eq_attr "cpu" "ppc8540,ppc8548"))
"ppc8540_decode,ppc8540_issue+ppc8540_su1_stage0+ppc8540_retire")
@@ -202,7 +202,7 @@
;; Vector compare
(define_insn_reservation "ppc8540_vector_compare" 1
- (and (eq_attr "type" "veccmp")
+ (and (eq_attr "type" "veccmp,veccmpfx")
(eq_attr "cpu" "ppc8540,ppc8548"))
"ppc8540_decode,ppc8540_issue+ppc8540_su1_stage0+ppc8540_retire")
diff --git a/gcc/config/rs6000/a2.md b/gcc/config/rs6000/a2.md
index 1fcf1cfb20..e0b800ce61 100644
--- a/gcc/config/rs6000/a2.md
+++ b/gcc/config/rs6000/a2.md
@@ -81,7 +81,7 @@
;; D.8.1
(define_insn_reservation "ppca2-fp" 6
- (and (eq_attr "type" "fp") ;; Ignore fpsimple insn types (SPE only).
+ (and (eq_attr "type" "fp,fpsimple")
(eq_attr "cpu" "ppca2"))
"axu")
diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h
index ea6af8d192..f77d446646 100644
--- a/gcc/config/rs6000/altivec.h
+++ b/gcc/config/rs6000/altivec.h
@@ -327,8 +327,8 @@
#define vec_sqrt __builtin_vec_sqrt
#define vec_vsx_ld __builtin_vec_vsx_ld
#define vec_vsx_st __builtin_vec_vsx_st
-#define vec_xl __builtin_vec_vsx_ld
-#define vec_xst __builtin_vec_vsx_st
+#define vec_xl __builtin_vec_xl
+#define vec_xst __builtin_vec_xst
/* Note, xxsldi and xxpermdi were added as __builtin_vsx_<xxx> functions
instead of __builtin_vec_<xxx> */
@@ -384,6 +384,31 @@
#define vec_vupklsw __builtin_vec_vupklsw
#endif
+#ifdef _ARCH_PWR9
+/* Vector additions added in ISA 3.0. */
+#define vec_vctz __builtin_vec_vctz
+#define vec_cntlz __builtin_vec_vctz
+#define vec_vctzb __builtin_vec_vctzb
+#define vec_vctzd __builtin_vec_vctzd
+#define vec_vctzh __builtin_vec_vctzh
+#define vec_vctzw __builtin_vec_vctzw
+#define vec_vprtyb __builtin_vec_vprtyb
+#define vec_vprtybd __builtin_vec_vprtybd
+#define vec_vprtybw __builtin_vec_vprtybw
+
+#ifdef _ARCH_PPC64
+#define vec_vprtybq __builtin_vec_vprtybq
+#endif
+
+#define vec_slv __builtin_vec_vslv
+#define vec_srv __builtin_vec_vsrv
+
+#define vec_absd __builtin_vec_vadu
+#define vec_absdb __builtin_vec_vadub
+#define vec_absdh __builtin_vec_vaduh
+#define vec_absdw __builtin_vec_vaduw
+#endif
+
/* Predicates.
For C++, we use templates in order to allow non-parenthesized arguments.
For C, instead, we use macros since non-parenthesized arguments were
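A short usage sketch for the new ISA 3.0 names above, assuming the overloaded __builtin_vec_vadu resolution added elsewhere in this commit; the function name is hypothetical:

/* Compile with -mcpu=power9; vec_absd is the element-wise unsigned
   absolute difference, backed by the vabsdu[bhw] instructions defined in
   the altivec.md changes below.  */
#include <altivec.h>

vector unsigned char
byte_absdiff (vector unsigned char a, vector unsigned char b)
{
  return vec_absd (a, b);
}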
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 9c3084dcb8..362fa221c1 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -58,6 +58,7 @@
UNSPEC_VSUM2SWS
UNSPEC_VSUMSWS
UNSPEC_VPERM
+ UNSPEC_VPERMR
UNSPEC_VPERM_UNS
UNSPEC_VRFIN
UNSPEC_VCFUX
@@ -73,6 +74,9 @@
UNSPEC_VUNPACK_LO_SIGN_DIRECT
UNSPEC_VUPKHPX
UNSPEC_VUPKLPX
+ UNSPEC_DARN
+ UNSPEC_DARN_32
+ UNSPEC_DARN_RAW
UNSPEC_DST
UNSPEC_DSTT
UNSPEC_DSTST
@@ -110,6 +114,9 @@
UNSPEC_STVLXL
UNSPEC_STVRX
UNSPEC_STVRXL
+ UNSPEC_VSLV
+ UNSPEC_VSRV
+ UNSPEC_VADU
UNSPEC_VMULWHUB
UNSPEC_VMULWLUB
UNSPEC_VMULWHSB
@@ -189,6 +196,13 @@
(KF "FLOAT128_VECTOR_P (KFmode)")
(TF "FLOAT128_VECTOR_P (TFmode)")])
+;; Specific iterator for parity which does not have a byte/half-word form, but
+;; does have a quad word form
+(define_mode_iterator VParity [V4SI
+ V2DI
+ V1TI
+ (TI "TARGET_VSX_TIMODE")])
+
(define_mode_attr VI_char [(V2DI "d") (V4SI "w") (V8HI "h") (V16QI "b")])
(define_mode_attr VI_scalar [(V2DI "DI") (V4SI "SI") (V8HI "HI") (V16QI "QI")])
(define_mode_attr VI_unit [(V16QI "VECTOR_UNIT_ALTIVEC_P (V16QImode)")
@@ -203,6 +217,9 @@
(define_mode_attr VP_small_lc [(V2DI "v4si") (V4SI "v8hi") (V8HI "v16qi")])
(define_mode_attr VU_char [(V2DI "w") (V4SI "h") (V8HI "b")])
+;; Vector negate
+(define_mode_iterator VNEG [V4SI V2DI])
+
;; Vector move instructions.
(define_insn "*altivec_mov<mode>"
[(set (match_operand:VM2 0 "nonimmediate_operand" "=Z,v,v,*Y,*r,*r,v,v,*r")
@@ -225,7 +242,7 @@
default: gcc_unreachable ();
}
}
- [(set_attr "type" "vecstore,vecload,vecsimple,store,load,*,vecsimple,*,*")
+ [(set_attr "type" "vecstore,vecload,veclogical,store,load,*,veclogical,*,*")
(set_attr "length" "4,4,4,20,20,20,4,8,32")])
;; Unlike other altivec moves, allow the GPRs, since a normal use of TImode
@@ -251,7 +268,7 @@
default: gcc_unreachable ();
}
}
- [(set_attr "type" "vecstore,vecload,vecsimple,store,load,*,vecsimple,*")])
+ [(set_attr "type" "vecstore,vecload,veclogical,store,load,*,veclogical,*")])
;; Load up a vector with the most significant bit set by loading up -1 and
;; doing a shift left
@@ -586,7 +603,7 @@
(match_operand:VI2 2 "altivec_register_operand" "v")))]
"<VI_unit>"
"vcmpequ<VI_char> %0,%1,%2"
- [(set_attr "type" "veccmp")])
+ [(set_attr "type" "veccmpfx")])
(define_insn "*altivec_gt<mode>"
[(set (match_operand:VI2 0 "altivec_register_operand" "=v")
@@ -594,7 +611,7 @@
(match_operand:VI2 2 "altivec_register_operand" "v")))]
"<VI_unit>"
"vcmpgts<VI_char> %0,%1,%2"
- [(set_attr "type" "veccmp")])
+ [(set_attr "type" "veccmpfx")])
(define_insn "*altivec_gtu<mode>"
[(set (match_operand:VI2 0 "altivec_register_operand" "=v")
@@ -602,7 +619,7 @@
(match_operand:VI2 2 "altivec_register_operand" "v")))]
"<VI_unit>"
"vcmpgtu<VI_char> %0,%1,%2"
- [(set_attr "type" "veccmp")])
+ [(set_attr "type" "veccmpfx")])
(define_insn "*altivec_eqv4sf"
[(set (match_operand:V4SF 0 "altivec_register_operand" "=v")
@@ -637,7 +654,7 @@
(match_operand:VM 3 "altivec_register_operand" "v")))]
"VECTOR_MEM_ALTIVEC_P (<MODE>mode)"
"vsel %0,%3,%2,%1"
- [(set_attr "type" "vecperm")])
+ [(set_attr "type" "vecmove")])
(define_insn "*altivec_vsel<mode>_uns"
[(set (match_operand:VM 0 "altivec_register_operand" "=v")
@@ -648,7 +665,7 @@
(match_operand:VM 3 "altivec_register_operand" "v")))]
"VECTOR_MEM_ALTIVEC_P (<MODE>mode)"
"vsel %0,%3,%2,%1"
- [(set_attr "type" "vecperm")])
+ [(set_attr "type" "vecmove")])
;; Fused multiply add.
@@ -1617,6 +1634,24 @@
"vslo %0,%1,%2"
[(set_attr "type" "vecperm")])
+(define_insn "vslv"
+ [(set (match_operand:V16QI 0 "register_operand" "=v")
+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
+ (match_operand:V16QI 2 "register_operand" "v")]
+ UNSPEC_VSLV))]
+ "TARGET_P9_VECTOR"
+ "vslv %0,%1,%2"
+ [(set_attr "type" "vecsimple")])
+
+(define_insn "vsrv"
+ [(set (match_operand:V16QI 0 "register_operand" "=v")
+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
+ (match_operand:V16QI 2 "register_operand" "v")]
+ UNSPEC_VSRV))]
+ "TARGET_P9_VECTOR"
+ "vsrv %0,%1,%2"
+ [(set_attr "type" "vecsimple")])
+
(define_insn "*altivec_vsl<VI_char>"
[(set (match_operand:VI2 0 "register_operand" "=v")
(ashift:VI2 (match_operand:VI2 1 "register_operand" "v")
@@ -1949,32 +1984,30 @@
;; Slightly prefer vperm, since the target does not overlap the source
(define_insn "*altivec_vperm_<mode>_internal"
- [(set (match_operand:VM 0 "register_operand" "=v,?wo,?&wo")
- (unspec:VM [(match_operand:VM 1 "register_operand" "v,0,wo")
- (match_operand:VM 2 "register_operand" "v,wo,wo")
- (match_operand:V16QI 3 "register_operand" "v,wo,wo")]
+ [(set (match_operand:VM 0 "register_operand" "=v,?wo")
+ (unspec:VM [(match_operand:VM 1 "register_operand" "v,wo")
+ (match_operand:VM 2 "register_operand" "v,0")
+ (match_operand:V16QI 3 "register_operand" "v,wo")]
UNSPEC_VPERM))]
"TARGET_ALTIVEC"
"@
vperm %0,%1,%2,%3
- xxperm %x0,%x2,%x3
- xxlor %x0,%x1,%x1\t\t# xxperm fusion\;xxperm %x0,%x2,%x3"
+ xxperm %x0,%x1,%x3"
[(set_attr "type" "vecperm")
- (set_attr "length" "4,4,8")])
+ (set_attr "length" "4")])
(define_insn "altivec_vperm_v8hiv16qi"
- [(set (match_operand:V16QI 0 "register_operand" "=v,?wo,?&wo")
- (unspec:V16QI [(match_operand:V8HI 1 "register_operand" "v,0,wo")
- (match_operand:V8HI 2 "register_operand" "v,wo,wo")
- (match_operand:V16QI 3 "register_operand" "v,wo,wo")]
+ [(set (match_operand:V16QI 0 "register_operand" "=v,?wo")
+ (unspec:V16QI [(match_operand:V8HI 1 "register_operand" "v,wo")
+ (match_operand:V8HI 2 "register_operand" "v,0")
+ (match_operand:V16QI 3 "register_operand" "v,wo")]
UNSPEC_VPERM))]
"TARGET_ALTIVEC"
"@
vperm %0,%1,%2,%3
- xxperm %x0,%x2,%x3
- xxlor %x0,%x1,%x1\t\t# xxperm fusion\;xxperm %x0,%x2,%x3"
+ xxperm %x0,%x1,%x3"
[(set_attr "type" "vecperm")
- (set_attr "length" "4,4,8")])
+ (set_attr "length" "4")])
(define_expand "altivec_vperm_<mode>_uns"
[(set (match_operand:VM 0 "register_operand" "")
@@ -1992,18 +2025,17 @@
})
(define_insn "*altivec_vperm_<mode>_uns_internal"
- [(set (match_operand:VM 0 "register_operand" "=v,?wo,?&wo")
- (unspec:VM [(match_operand:VM 1 "register_operand" "v,0,wo")
- (match_operand:VM 2 "register_operand" "v,wo,wo")
- (match_operand:V16QI 3 "register_operand" "v,wo,wo")]
+ [(set (match_operand:VM 0 "register_operand" "=v,?wo")
+ (unspec:VM [(match_operand:VM 1 "register_operand" "v,wo")
+ (match_operand:VM 2 "register_operand" "v,0")
+ (match_operand:V16QI 3 "register_operand" "v,wo")]
UNSPEC_VPERM_UNS))]
"TARGET_ALTIVEC"
"@
vperm %0,%1,%2,%3
- xxperm %x0,%x2,%x3
- xxlor %x0,%x1,%x1\t\t# xxperm fusion\;xxperm %x0,%x2,%x3"
+ xxperm %x0,%x1,%x3"
[(set_attr "type" "vecperm")
- (set_attr "length" "4,4,8")])
+ (set_attr "length" "4")])
(define_expand "vec_permv16qi"
[(set (match_operand:V16QI 0 "register_operand" "")
@@ -2032,6 +2064,19 @@
FAIL;
})
+(define_insn "*altivec_vpermr_<mode>_internal"
+ [(set (match_operand:VM 0 "register_operand" "=v,?wo")
+ (unspec:VM [(match_operand:VM 1 "register_operand" "v,wo")
+ (match_operand:VM 2 "register_operand" "v,0")
+ (match_operand:V16QI 3 "register_operand" "v,wo")]
+ UNSPEC_VPERMR))]
+ "TARGET_P9_VECTOR"
+ "@
+ vpermr %0,%2,%1,%3
+ xxpermr %x0,%x1,%x3"
+ [(set_attr "type" "vecperm")
+ (set_attr "length" "4")])
+
(define_insn "altivec_vrfip" ; ceil
[(set (match_operand:V4SF 0 "register_operand" "=v")
(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")]
@@ -2238,7 +2283,7 @@
(match_dup 2)))]
"<VI_unit>"
"vcmpequ<VI_char>. %0,%1,%2"
- [(set_attr "type" "veccmp")])
+ [(set_attr "type" "veccmpfx")])
(define_insn "*altivec_vcmpgts<VI_char>_p"
[(set (reg:CC 74)
@@ -2250,7 +2295,7 @@
(match_dup 2)))]
"<VI_unit>"
"vcmpgts<VI_char>. %0,%1,%2"
- [(set_attr "type" "veccmp")])
+ [(set_attr "type" "veccmpfx")])
(define_insn "*altivec_vcmpgtu<VI_char>_p"
[(set (reg:CC 74)
@@ -2262,7 +2307,7 @@
(match_dup 2)))]
"<VI_unit>"
"vcmpgtu<VI_char>. %0,%1,%2"
- [(set_attr "type" "veccmp")])
+ [(set_attr "type" "veccmpfx")])
(define_insn "*altivec_vcmpeqfp_p"
[(set (reg:CC 74)
@@ -2690,20 +2735,28 @@
DONE;
})
+(define_insn "*p9_neg<mode>2"
+ [(set (match_operand:VNEG 0 "altivec_register_operand" "=v")
+ (neg:VNEG (match_operand:VNEG 1 "altivec_register_operand" "v")))]
+ "TARGET_P9_VECTOR"
+ "vneg<VI_char> %0,%1"
+ [(set_attr "type" "vecsimple")])
+
(define_expand "neg<mode>2"
- [(use (match_operand:VI 0 "register_operand" ""))
- (use (match_operand:VI 1 "register_operand" ""))]
- "TARGET_ALTIVEC"
- "
+ [(set (match_operand:VI2 0 "register_operand" "")
+ (neg:VI2 (match_operand:VI2 1 "register_operand" "")))]
+ "<VI_unit>"
{
- rtx vzero;
+ if (!TARGET_P9_VECTOR || (<MODE>mode != V4SImode && <MODE>mode != V2DImode))
+ {
+ rtx vzero;
- vzero = gen_reg_rtx (GET_MODE (operands[0]));
- emit_insn (gen_altivec_vspltis<VI_char> (vzero, const0_rtx));
- emit_insn (gen_sub<mode>3 (operands[0], vzero, operands[1]));
-
- DONE;
-}")
+ vzero = gen_reg_rtx (GET_MODE (operands[0]));
+ emit_move_insn (vzero, CONST0_RTX (<MODE>mode));
+ emit_insn (gen_sub<mode>3 (operands[0], vzero, operands[1]));
+ DONE;
+ }
+})
(define_expand "udot_prod<mode>"
[(set (match_operand:V4SI 0 "register_operand" "=v")
@@ -2791,32 +2844,30 @@
"")
(define_insn "vperm_v8hiv4si"
- [(set (match_operand:V4SI 0 "register_operand" "=v,?wo,?&wo")
- (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v,0,wo")
- (match_operand:V4SI 2 "register_operand" "v,wo,wo")
- (match_operand:V16QI 3 "register_operand" "v,wo,wo")]
+ [(set (match_operand:V4SI 0 "register_operand" "=v,?wo")
+ (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v,wo")
+ (match_operand:V4SI 2 "register_operand" "v,0")
+ (match_operand:V16QI 3 "register_operand" "v,wo")]
UNSPEC_VPERMSI))]
"TARGET_ALTIVEC"
"@
vperm %0,%1,%2,%3
- xxperm %x0,%x2,%x3
- xxlor %x0,%x1,%x1\t\t# xxperm fusion\;xxperm %x0,%x2,%x3"
+ xxperm %x0,%x1,%x3"
[(set_attr "type" "vecperm")
- (set_attr "length" "4,4,8")])
+ (set_attr "length" "4")])
(define_insn "vperm_v16qiv8hi"
- [(set (match_operand:V8HI 0 "register_operand" "=v,?wo,?&wo")
- (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v,0,wo")
- (match_operand:V8HI 2 "register_operand" "v,wo,wo")
- (match_operand:V16QI 3 "register_operand" "v,wo,wo")]
+ [(set (match_operand:V8HI 0 "register_operand" "=v,?wo")
+ (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v,wo")
+ (match_operand:V8HI 2 "register_operand" "v,0")
+ (match_operand:V16QI 3 "register_operand" "v,wo")]
UNSPEC_VPERMHI))]
"TARGET_ALTIVEC"
"@
vperm %0,%1,%2,%3
- xxperm %x0,%x2,%x3
- xxlor %x0,%x1,%x1\t\t# xxperm fusion\;xxperm %x0,%x2,%x3"
+ xxperm %x0,%x1,%x3"
[(set_attr "type" "vecperm")
- (set_attr "length" "4,4,8")])
+ (set_attr "length" "4")])
(define_expand "vec_unpacku_hi_v16qi"
@@ -3353,7 +3404,7 @@
}")
-;; Power8 vector instructions encoded as Altivec instructions
+;; Power8/power9 vector instructions encoded as Altivec instructions
;; Vector count leading zeros
(define_insn "*p8v_clz<mode>2"
@@ -3364,6 +3415,33 @@
[(set_attr "length" "4")
(set_attr "type" "vecsimple")])
+;; Vector absolute difference unsigned
+(define_expand "vadu<mode>3"
+ [(set (match_operand:VI 0 "register_operand")
+ (unspec:VI [(match_operand:VI 1 "register_operand")
+ (match_operand:VI 2 "register_operand")]
+ UNSPEC_VADU))]
+ "TARGET_P9_VECTOR")
+
+;; Vector absolute difference unsigned
+(define_insn "*p9_vadu<mode>3"
+ [(set (match_operand:VI 0 "register_operand" "=v")
+ (unspec:VI [(match_operand:VI 1 "register_operand" "v")
+ (match_operand:VI 2 "register_operand" "v")]
+ UNSPEC_VADU))]
+ "TARGET_P9_VECTOR"
+ "vabsdu<wd> %0,%1,%2"
+ [(set_attr "type" "vecsimple")])
+
+;; Vector count trailing zeros
+(define_insn "*p9v_ctz<mode>2"
+ [(set (match_operand:VI2 0 "register_operand" "=v")
+ (ctz:VI2 (match_operand:VI2 1 "register_operand" "v")))]
+ "TARGET_P9_VECTOR"
+ "vctz<wd> %0,%1"
+ [(set_attr "length" "4")
+ (set_attr "type" "vecsimple")])
+
;; Vector population count
(define_insn "*p8v_popcount<mode>2"
[(set (match_operand:VI2 0 "register_operand" "=v")
@@ -3373,6 +3451,15 @@
[(set_attr "length" "4")
(set_attr "type" "vecsimple")])
+;; Vector parity
+(define_insn "*p9v_parity<mode>2"
+ [(set (match_operand:VParity 0 "register_operand" "=v")
+ (parity:VParity (match_operand:VParity 1 "register_operand" "v")))]
+ "TARGET_P9_VECTOR"
+ "vprtyb<wd> %0,%1"
+ [(set_attr "length" "4")
+ (set_attr "type" "vecsimple")])
+
;; Vector Gather Bits by Bytes by Doubleword
(define_insn "p8v_vgbbd"
[(set (match_operand:V16QI 0 "register_operand" "=v")
@@ -3540,6 +3627,27 @@
[(set_attr "length" "4")
(set_attr "type" "vecsimple")])
+(define_insn "darn_32"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (unspec:SI [(const_int 0)] UNSPEC_DARN_32))]
+ "TARGET_P9_MISC"
+ "darn %0,0"
+ [(set_attr "type" "integer")])
+
+(define_insn "darn_raw"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(const_int 0)] UNSPEC_DARN_RAW))]
+ "TARGET_P9_MISC && TARGET_64BIT"
+ "darn %0,2"
+ [(set_attr "type" "integer")])
+
+(define_insn "darn"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(const_int 0)] UNSPEC_DARN))]
+ "TARGET_P9_MISC && TARGET_64BIT"
+ "darn %0,1"
+ [(set_attr "type" "integer")])
+
(define_expand "bcd<bcd_add_sub>_<code>"
[(parallel [(set (reg:CCFP 74)
(compare:CCFP
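A hedged sketch of the new darn patterns in use, assuming the matching __builtin_darn_32 / __builtin_darn / __builtin_darn_raw built-ins are wired up by the rs6000-builtin.def changes in this commit; the function name is hypothetical:

/* Requires -mcpu=power9.  "darn %0,0" delivers a 32-bit conditioned
   hardware random number; an all-ones result indicates the hardware
   could not produce one.  */
unsigned int
hw_random32 (void)
{
  return (unsigned int) __builtin_darn_32 ();
}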
diff --git a/gcc/config/rs6000/cell.md b/gcc/config/rs6000/cell.md
index b780f09efe..7eee77cd5f 100644
--- a/gcc/config/rs6000/cell.md
+++ b/gcc/config/rs6000/cell.md
@@ -306,7 +306,7 @@
; Basic FP latency is 10 cycles, thoughput is 1/cycle
(define_insn_reservation "cell-fp" 10
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul")
(eq_attr "cpu" "cell"))
"slot01,vsu1_cell,vsu1_cell*8")
@@ -329,7 +329,7 @@
; VMX
(define_insn_reservation "cell-vecsimple" 4
- (and (eq_attr "type" "vecsimple")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove")
(eq_attr "cpu" "cell"))
"slot01,vsu1_cell,vsu1_cell*2")
@@ -341,7 +341,7 @@
;; TODO: add support for recording instructions
(define_insn_reservation "cell-veccmp" 4
- (and (eq_attr "type" "veccmp")
+ (and (eq_attr "type" "veccmp,veccmpfx")
(eq_attr "cpu" "cell"))
"slot01,vsu1_cell,vsu1_cell*2")
diff --git a/gcc/config/rs6000/constraints.md b/gcc/config/rs6000/constraints.md
index ea15764e51..465ad6d921 100644
--- a/gcc/config/rs6000/constraints.md
+++ b/gcc/config/rs6000/constraints.md
@@ -140,6 +140,10 @@
(and (match_code "const_int")
(match_test "TARGET_VSX && (ival == VECTOR_ELEMENT_SCALAR_64BIT)")))
+(define_constraint "wE"
+ "Vector constant that can be loaded with the XXSPLTIB instruction."
+ (match_test "xxspltib_constant_nosplit (op, mode)"))
+
;; Extended fusion store
(define_memory_constraint "wF"
"Memory operand suitable for power9 fusion load/stores"
@@ -156,11 +160,35 @@
(and (match_test "TARGET_DIRECT_MOVE_128")
(match_test "(ival == VECTOR_ELEMENT_MFVSRLD_64BIT)"))))
+;; Generate the XXORC instruction to set a register to all 1's
+(define_constraint "wM"
+ "Match vector constant with all 1's if the XXLORC instruction is available"
+ (and (match_test "TARGET_P8_VECTOR")
+ (match_operand 0 "all_ones_constant")))
+
+;; ISA 3.0 vector d-form addresses
+(define_memory_constraint "wO"
+ "Memory operand suitable for the ISA 3.0 vector d-form instructions."
+ (match_operand 0 "vsx_quad_dform_memory_operand"))
+
;; Lq/stq validates the address for load/store quad
(define_memory_constraint "wQ"
"Memory operand suitable for the load/store quad instructions"
(match_operand 0 "quad_memory_operand"))
+(define_constraint "wS"
+ "Vector constant that can be loaded with XXSPLTIB & sign extension."
+ (match_test "xxspltib_constant_split (op, mode)"))
+
+;; ISA 3.0 DS-form instruction that has the bottom 2 bits 0 and no update form.
+;; Used by LXSD/STXSD/LXSSP/STXSSP. In contrast to "Y", the multiple-of-four
+;; offset is enforced for 32-bit too.
+(define_memory_constraint "wY"
+ "Offsettable memory operand, with bottom 2 bits 0"
+ (and (match_code "mem")
+ (not (match_test "update_address_mem (op, mode)"))
+ (match_test "mem_operand_ds_form (op, mode)")))
+
;; Altivec style load/store that ignores the bottom bits of the address
(define_memory_constraint "wZ"
"Indexed or indirect memory operand, ignoring the bottom 4 bits"
diff --git a/gcc/config/rs6000/crypto.md b/gcc/config/rs6000/crypto.md
index 5957abb8f5..83a26aef36 100644
--- a/gcc/config/rs6000/crypto.md
+++ b/gcc/config/rs6000/crypto.md
@@ -107,4 +107,4 @@
UNSPEC_VSHASIGMA))]
"TARGET_CRYPTO"
"vshasigma<CR_char> %0,%1,%2,%3"
- [(set_attr "type" "crypto")])
+ [(set_attr "type" "vecsimple")])
diff --git a/gcc/config/rs6000/dfp.md b/gcc/config/rs6000/dfp.md
index a631ff5fd9..e6ed70ed8e 100644
--- a/gcc/config/rs6000/dfp.md
+++ b/gcc/config/rs6000/dfp.md
@@ -58,7 +58,7 @@
(float_extend:DD (match_operand:SD 1 "gpc_reg_operand" "f")))]
"TARGET_DFP"
"dctdp %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_expand "extendsdtd2"
[(set (match_operand:TD 0 "gpc_reg_operand" "=d")
@@ -76,7 +76,7 @@
(float_truncate:SD (match_operand:DD 1 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"drsp %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_expand "negdd2"
[(set (match_operand:DD 0 "gpc_reg_operand" "")
@@ -89,7 +89,7 @@
(neg:DD (match_operand:DD 1 "gpc_reg_operand" "d")))]
"TARGET_HARD_FLOAT && TARGET_FPRS"
"fneg %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "fpsimple")])
(define_expand "absdd2"
[(set (match_operand:DD 0 "gpc_reg_operand" "")
@@ -102,14 +102,14 @@
(abs:DD (match_operand:DD 1 "gpc_reg_operand" "d")))]
"TARGET_HARD_FLOAT && TARGET_FPRS"
"fabs %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "fpsimple")])
(define_insn "*nabsdd2_fpr"
[(set (match_operand:DD 0 "gpc_reg_operand" "=d")
(neg:DD (abs:DD (match_operand:DD 1 "gpc_reg_operand" "d"))))]
"TARGET_HARD_FLOAT && TARGET_FPRS"
"fnabs %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "fpsimple")])
(define_expand "negtd2"
[(set (match_operand:TD 0 "gpc_reg_operand" "")
@@ -124,7 +124,7 @@
"@
fneg %0,%1
fneg %0,%1\;fmr %L0,%L1"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fpsimple")
(set_attr "length" "4,8")])
(define_expand "abstd2"
@@ -140,7 +140,7 @@
"@
fabs %0,%1
fabs %0,%1\;fmr %L0,%L1"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fpsimple")
(set_attr "length" "4,8")])
(define_insn "*nabstd2_fpr"
@@ -150,7 +150,7 @@
"@
fnabs %0,%1
fnabs %0,%1\;fmr %L0,%L1"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fpsimple")
(set_attr "length" "4,8")])
;; Hardware support for decimal floating point operations.
@@ -160,7 +160,7 @@
(float_extend:TD (match_operand:DD 1 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dctqpq %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
;; The result of drdpq is an even/odd register pair with the converted
;; value in the even register and zero in the odd register.
@@ -173,7 +173,7 @@
(clobber (match_scratch:TD 2 "=d"))]
"TARGET_DFP"
"drdpq %2,%1\;fmr %0,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "adddd3"
[(set (match_operand:DD 0 "gpc_reg_operand" "=d")
@@ -181,7 +181,7 @@
(match_operand:DD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dadd %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "addtd3"
[(set (match_operand:TD 0 "gpc_reg_operand" "=d")
@@ -189,7 +189,7 @@
(match_operand:TD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"daddq %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "subdd3"
[(set (match_operand:DD 0 "gpc_reg_operand" "=d")
@@ -197,7 +197,7 @@
(match_operand:DD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dsub %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "subtd3"
[(set (match_operand:TD 0 "gpc_reg_operand" "=d")
@@ -205,7 +205,7 @@
(match_operand:TD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dsubq %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "muldd3"
[(set (match_operand:DD 0 "gpc_reg_operand" "=d")
@@ -213,7 +213,7 @@
(match_operand:DD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dmul %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "multd3"
[(set (match_operand:TD 0 "gpc_reg_operand" "=d")
@@ -221,7 +221,7 @@
(match_operand:TD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dmulq %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "divdd3"
[(set (match_operand:DD 0 "gpc_reg_operand" "=d")
@@ -229,7 +229,7 @@
(match_operand:DD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"ddiv %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "divtd3"
[(set (match_operand:TD 0 "gpc_reg_operand" "=d")
@@ -237,7 +237,7 @@
(match_operand:TD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"ddivq %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "*cmpdd_internal1"
[(set (match_operand:CCFP 0 "cc_reg_operand" "=y")
@@ -245,7 +245,7 @@
(match_operand:DD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dcmpu %0,%1,%2"
- [(set_attr "type" "fpcompare")])
+ [(set_attr "type" "dfp")])
(define_insn "*cmptd_internal1"
[(set (match_operand:CCFP 0 "cc_reg_operand" "=y")
@@ -253,21 +253,21 @@
(match_operand:TD 2 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dcmpuq %0,%1,%2"
- [(set_attr "type" "fpcompare")])
+ [(set_attr "type" "dfp")])
(define_insn "floatdidd2"
[(set (match_operand:DD 0 "gpc_reg_operand" "=d")
(float:DD (match_operand:DI 1 "gpc_reg_operand" "d")))]
"TARGET_DFP && TARGET_POPCNTD"
"dcffix %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "floatditd2"
[(set (match_operand:TD 0 "gpc_reg_operand" "=d")
(float:TD (match_operand:DI 1 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dcffixq %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
;; Convert a decimal64 to a decimal64 whose value is an integer.
;; This is the first stage of converting it to an integer type.
@@ -277,7 +277,7 @@
(fix:DD (match_operand:DD 1 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"drintn. 0,%0,%1,1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
;; Convert a decimal64 whose value is an integer to an actual integer.
;; This is the second stage of converting decimal float to integer type.
@@ -287,7 +287,7 @@
(fix:DI (match_operand:DD 1 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dctfix %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
;; Convert a decimal128 to a decimal128 whose value is an integer.
;; This is the first stage of converting it to an integer type.
@@ -297,7 +297,7 @@
(fix:TD (match_operand:TD 1 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"drintnq. 0,%0,%1,1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
;; Convert a decimal128 whose value is an integer to an actual integer.
;; This is the second stage of converting decimal float to integer type.
@@ -307,7 +307,7 @@
(fix:DI (match_operand:TD 1 "gpc_reg_operand" "d")))]
"TARGET_DFP"
"dctfixq %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
;; Decimal builtin support
@@ -318,8 +318,11 @@
UNSPEC_DXEX
UNSPEC_DIEX
UNSPEC_DSCLI
+ UNSPEC_DTSTSFI
UNSPEC_DSCRI])
+(define_code_iterator DFP_TEST [eq lt gt unordered])
+
(define_mode_iterator D64_D128 [DD TD])
(define_mode_attr dfp_suffix [(DD "")
@@ -332,7 +335,7 @@
UNSPEC_DDEDPD))]
"TARGET_DFP"
"ddedpd<dfp_suffix> %1,%0,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "dfp_denbcd_<mode>"
[(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d")
@@ -341,7 +344,7 @@
UNSPEC_DENBCD))]
"TARGET_DFP"
"denbcd<dfp_suffix> %1,%0,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "dfp_dxex_<mode>"
[(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d")
@@ -349,7 +352,7 @@
UNSPEC_DXEX))]
"TARGET_DFP"
"dxex<dfp_suffix> %0,%1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "dfp_diex_<mode>"
[(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d")
@@ -358,6 +361,42 @@
UNSPEC_DXEX))]
"TARGET_DFP"
"diex<dfp_suffix> %0,%1,%2"
+ [(set_attr "type" "dfp")])
+
+(define_expand "dfptstsfi_<code>_<mode>"
+ [(set (match_dup 3)
+ (compare:CCFP
+ (unspec:D64_D128
+ [(match_operand:SI 1 "const_int_operand" "n")
+ (match_operand:D64_D128 2 "gpc_reg_operand" "d")]
+ UNSPEC_DTSTSFI)
+ (match_dup 4)))
+ (set (match_operand:SI 0 "register_operand" "")
+ (DFP_TEST:SI (match_dup 3)
+ (const_int 0)))
+ ]
+ "TARGET_P9_MISC"
+{
+ operands[3] = gen_reg_rtx (CCFPmode);
+ operands[4] = const0_rtx;
+})
+
+(define_insn "*dfp_sgnfcnc_<mode>"
+ [(set (match_operand:CCFP 0 "" "=y")
+ (compare:CCFP
+ (unspec:D64_D128 [(match_operand:SI 1 "const_int_operand" "n")
+ (match_operand:D64_D128 2 "gpc_reg_operand" "d")]
+ UNSPEC_DTSTSFI)
+ (match_operand:SI 3 "zero_constant" "j")))]
+ "TARGET_P9_MISC"
+{
+ /* If immediate operand is greater than 63, it will behave as if
+ the value had been 63. The code generator does not support
+ immediate operand values greater than 63. */
+ if (!(IN_RANGE (INTVAL (operands[1]), 0, 63)))
+ operands[1] = GEN_INT (63);
+ return "dtstsfi<dfp_suffix> %0,%1,%2";
+}
[(set_attr "type" "fp")])
(define_insn "dfp_dscli_<mode>"
@@ -367,7 +406,7 @@
UNSPEC_DSCLI))]
"TARGET_DFP"
"dscli<dfp_suffix> %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
(define_insn "dfp_dscri_<mode>"
[(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d")
@@ -376,4 +415,4 @@
UNSPEC_DSCRI))]
"TARGET_DFP"
"dscri<dfp_suffix> %0,%1,%2"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "dfp")])
diff --git a/gcc/config/rs6000/e300c2c3.md b/gcc/config/rs6000/e300c2c3.md
index 5865e95e2d..e48979979a 100644
--- a/gcc/config/rs6000/e300c2c3.md
+++ b/gcc/config/rs6000/e300c2c3.md
@@ -150,7 +150,7 @@
"ppce300c3_decode,ppce300c3_issue+ppce300c3_fpu,nothing,ppce300c3_retire")
(define_insn_reservation "ppce300c3_fp" 3
- (and (eq_attr "type" "fp")
+ (and (eq_attr "type" "fp,fpsimple")
(eq_attr "cpu" "ppce300c3"))
"ppce300c3_decode,ppce300c3_issue+ppce300c3_fpu,nothing,ppce300c3_retire")
diff --git a/gcc/config/rs6000/e6500.md b/gcc/config/rs6000/e6500.md
index 428222d14b..e094192d61 100644
--- a/gcc/config/rs6000/e6500.md
+++ b/gcc/config/rs6000/e6500.md
@@ -205,7 +205,7 @@
;; VSFX.
(define_insn_reservation "e6500_vecsimple" 1
- (and (eq_attr "type" "vecsimple,veccmp")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove,veccmp,veccmpfx")
(eq_attr "cpu" "ppce6500"))
"e6500_decode,e6500_vec")
diff --git a/gcc/config/rs6000/htm.md b/gcc/config/rs6000/htm.md
index 0d0823824a..c0203a9c0c 100644
--- a/gcc/config/rs6000/htm.md
+++ b/gcc/config/rs6000/htm.md
@@ -72,7 +72,7 @@
(set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabort. %0"
- [(set_attr "type" "htm")
+ [(set_attr "type" "htmsimple")
(set_attr "length" "4")])
(define_expand "tabort<wd>c"
@@ -98,7 +98,7 @@
(set (match_operand:BLK 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabort<wd>c. %0,%1,%2"
- [(set_attr "type" "htm")
+ [(set_attr "type" "htmsimple")
(set_attr "length" "4")])
(define_expand "tabort<wd>ci"
@@ -124,7 +124,7 @@
(set (match_operand:BLK 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabort<wd>ci. %0,%1,%2"
- [(set_attr "type" "htm")
+ [(set_attr "type" "htmsimple")
(set_attr "length" "4")])
(define_expand "tbegin"
@@ -208,7 +208,7 @@
(set (match_operand:BLK 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"trechkpt."
- [(set_attr "type" "htm")
+ [(set_attr "type" "htmsimple")
(set_attr "length" "4")])
(define_expand "treclaim"
@@ -230,7 +230,7 @@
(set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"treclaim. %0"
- [(set_attr "type" "htm")
+ [(set_attr "type" "htmsimple")
(set_attr "length" "4")])
(define_expand "tsr"
@@ -252,7 +252,7 @@
(set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tsr. %0"
- [(set_attr "type" "htm")
+ [(set_attr "type" "htmsimple")
(set_attr "length" "4")])
(define_expand "ttest"
@@ -272,7 +272,7 @@
(set (match_operand:BLK 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabortwci. 0,1,0"
- [(set_attr "type" "htm")
+ [(set_attr "type" "htmsimple")
(set_attr "length" "4")])
(define_insn "htm_mfspr_<mode>"
diff --git a/gcc/config/rs6000/mpc.md b/gcc/config/rs6000/mpc.md
index 010dc9444e..42cb11a598 100644
--- a/gcc/config/rs6000/mpc.md
+++ b/gcc/config/rs6000/mpc.md
@@ -81,7 +81,7 @@
"fpu_mpc,bpu_mpc")
(define_insn_reservation "mpccore-fp" 4
- (and (eq_attr "type" "fp")
+ (and (eq_attr "type" "fp,fpsimple")
(eq_attr "cpu" "mpccore"))
"fpu_mpc*2")
diff --git a/gcc/config/rs6000/power4.md b/gcc/config/rs6000/power4.md
index 7b0ccbedaa..84ac439fe9 100644
--- a/gcc/config/rs6000/power4.md
+++ b/gcc/config/rs6000/power4.md
@@ -381,7 +381,7 @@
; Basic FP latency is 6 cycles
(define_insn_reservation "power4-fp" 6
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul")
(eq_attr "cpu" "power4"))
"fpq_power4")
@@ -410,7 +410,7 @@
; VMX
(define_insn_reservation "power4-vecsimple" 2
- (and (eq_attr "type" "vecsimple")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove")
(eq_attr "cpu" "power4"))
"vq_power4")
@@ -421,7 +421,7 @@
; vecfp compare
(define_insn_reservation "power4-veccmp" 8
- (and (eq_attr "type" "veccmp")
+ (and (eq_attr "type" "veccmp,veccmpfx")
(eq_attr "cpu" "power4"))
"vq_power4")
diff --git a/gcc/config/rs6000/power5.md b/gcc/config/rs6000/power5.md
index 2d7c15e59c..b00d5ead14 100644
--- a/gcc/config/rs6000/power5.md
+++ b/gcc/config/rs6000/power5.md
@@ -322,7 +322,7 @@
; Basic FP latency is 6 cycles
(define_insn_reservation "power5-fp" 6
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul")
(eq_attr "cpu" "power5"))
"fpq_power5")
diff --git a/gcc/config/rs6000/power6.md b/gcc/config/rs6000/power6.md
index 15d31eb81a..a94052417e 100644
--- a/gcc/config/rs6000/power6.md
+++ b/gcc/config/rs6000/power6.md
@@ -500,7 +500,7 @@
(define_bypass 9 "power6-mtcr" "power6-branch")
(define_insn_reservation "power6-fp" 6
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul,dfp")
(eq_attr "cpu" "power6"))
"FPU_power6")
@@ -556,7 +556,7 @@
"LSF_power6")
(define_insn_reservation "power6-vecsimple" 3
- (and (eq_attr "type" "vecsimple")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove")
(eq_attr "cpu" "power6"))
"FPU_power6")
@@ -568,7 +568,7 @@
(define_bypass 4 "power6-vecsimple" "power6-vecstore" )
(define_insn_reservation "power6-veccmp" 1
- (and (eq_attr "type" "veccmp")
+ (and (eq_attr "type" "veccmp,veccmpfx")
(eq_attr "cpu" "power6"))
"FPU_power6")
diff --git a/gcc/config/rs6000/power7.md b/gcc/config/rs6000/power7.md
index 9c6326dd26..91ebbf97f9 100644
--- a/gcc/config/rs6000/power7.md
+++ b/gcc/config/rs6000/power7.md
@@ -292,7 +292,7 @@
; VS Unit (includes FP/VSX/VMX/DFP)
(define_insn_reservation "power7-fp" 6
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul,dfp")
(eq_attr "cpu" "power7"))
"DU_power7,VSU_power7")
@@ -324,7 +324,7 @@
"DU_power7,VSU_power7")
(define_insn_reservation "power7-vecsimple" 2
- (and (eq_attr "type" "vecsimple,veccmp")
+ (and (eq_attr "type" "vecsimple,veclogical,vecmove,veccmp,veccmpfx")
(eq_attr "cpu" "power7"))
"DU_power7,vsu1_power7")
diff --git a/gcc/config/rs6000/power8.md b/gcc/config/rs6000/power8.md
index 6b6f0ffb8d..4bb323ff43 100644
--- a/gcc/config/rs6000/power8.md
+++ b/gcc/config/rs6000/power8.md
@@ -317,7 +317,7 @@
; VS Unit (includes FP/VSX/VMX/DFP/Crypto)
(define_insn_reservation "power8-fp" 6
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul,dfp")
(eq_attr "cpu" "power8"))
"DU_any_power8,VSU_power8")
@@ -350,7 +350,8 @@
"DU_any_power8,VSU_power8")
(define_insn_reservation "power8-vecsimple" 2
- (and (eq_attr "type" "vecperm,vecsimple,veccmp")
+ (and (eq_attr "type" "vecperm,vecsimple,veclogical,vecmove,veccmp,
+ veccmpfx")
(eq_attr "cpu" "power8"))
"DU_any_power8,VSU_power8")
diff --git a/gcc/config/rs6000/power9.md b/gcc/config/rs6000/power9.md
new file mode 100644
index 0000000000..015b5ba58b
--- /dev/null
+++ b/gcc/config/rs6000/power9.md
@@ -0,0 +1,477 @@
+;; Scheduling description for IBM POWER9 processor.
+;; Copyright (C) 2016 Free Software Foundation, Inc.
+;;
+;; Contributed by Pat Haugen (pthaugen@us.ibm.com).
+
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "power9dsp,power9lsu,power9vsu,power9misc")
+
+(define_cpu_unit "lsu0_power9,lsu1_power9,lsu2_power9,lsu3_power9" "power9lsu")
+(define_cpu_unit "vsu0_power9,vsu1_power9,vsu2_power9,vsu3_power9" "power9vsu")
+; Two vector permute units, part of vsu
+(define_cpu_unit "prm0_power9,prm1_power9" "power9vsu")
+; Two fixed point divide units, not pipelined
+(define_cpu_unit "fx_div0_power9,fx_div1_power9" "power9misc")
+(define_cpu_unit "bru_power9,cryptu_power9,dfu_power9" "power9misc")
+
+(define_cpu_unit "x0_power9,x1_power9,xa0_power9,xa1_power9,
+ x2_power9,x3_power9,xb0_power9,xb1_power9,
+ br0_power9,br1_power9" "power9dsp")
+
+
+; Dispatch port reservations
+;
+; Power9 can dispatch a maximum of 6 iops per cycle with the following
+; general restrictions (other restrictions also apply):
+; 1) At most 2 iops per execution slice
+; 2) At most 2 iops to the branch unit
+; Note that an insn's position in a dispatch group of 6 insns does not
+; determine which execution slice the insn is routed to.  The units are used
+; to model the conflicts that exist (e.g. an 'even' requirement will preclude
+; dispatch with 2 insns that have a 'superslice' requirement).
+
+; The xa0/xa1 units really represent the 3rd dispatch port for a superslice but
+; are listed as separate units to allow those insns that preclude its use to
+; still be scheduled two to a superslice while reserving the 3rd slot. The
+; same applies for xb0/xb1.
+(define_reservation "DU_xa_power9" "xa0_power9+xa1_power9")
+(define_reservation "DU_xb_power9" "xb0_power9+xb1_power9")
+
+; Any execution slice dispatch
+(define_reservation "DU_any_power9"
+ "x0_power9|x1_power9|DU_xa_power9|x2_power9|x3_power9|
+ DU_xb_power9")
+
+; Even slice, actually takes even/odd slots
+(define_reservation "DU_even_power9" "x0_power9+x1_power9|x2_power9+x3_power9")
+
+; Slice plus 3rd slot
+(define_reservation "DU_slice_3_power9"
+ "x0_power9+xa0_power9|x1_power9+xa1_power9|
+ x2_power9+xb0_power9|x3_power9+xb1_power9")
+
+; Superslice
+(define_reservation "DU_super_power9"
+ "x0_power9+x1_power9|x2_power9+x3_power9")
+
+; 2-way cracked
+(define_reservation "DU_C2_power9" "x0_power9+x1_power9|
+ x1_power9+DU_xa_power9|
+ x1_power9+x2_power9|
+ DU_xa_power9+x2_power9|
+ x2_power9+x3_power9|
+ x3_power9+DU_xb_power9")
+
+; 2-way cracked plus 3rd slot
+(define_reservation "DU_C2_3_power9" "x0_power9+x1_power9+xa0_power9|
+ x1_power9+x2_power9+xa0_power9|
+ x1_power9+x2_power9+xb0_power9|
+ x2_power9+x3_power9+xb0_power9")
+
+; 3-way cracked (consumes whole decode/dispatch cycle)
+(define_reservation "DU_C3_power9"
+ "x0_power9+x1_power9+xa0_power9+xa1_power9+x2_power9+
+ x3_power9+xb0_power9+xb1_power9+br0_power9+br1_power9")
+
+; Branch ports
+(define_reservation "DU_branch_power9" "br0_power9|br1_power9")
+
+
+; Execution unit reservations
+(define_reservation "LSU_power9"
+ "lsu0_power9|lsu1_power9|lsu2_power9|lsu3_power9")
+
+(define_reservation "LSU_pair_power9"
+ "lsu0_power9+lsu1_power9|lsu1_power9+lsu2_power9|
+ lsu2_power9+lsu3_power9|lsu3_power9+lsu0_power9")
+
+(define_reservation "VSU_power9"
+ "vsu0_power9|vsu1_power9|vsu2_power9|vsu3_power9")
+
+(define_reservation "VSU_super_power9"
+ "vsu0_power9+vsu1_power9|vsu2_power9+vsu3_power9")
+
+(define_reservation "VSU_PRM_power9" "prm0_power9|prm1_power9")
+
+
+; LS Unit
+(define_insn_reservation "power9-load" 4
+ (and (eq_attr "type" "load")
+ (eq_attr "sign_extend" "no")
+ (eq_attr "update" "no")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,LSU_power9")
+
+(define_insn_reservation "power9-load-update" 4
+ (and (eq_attr "type" "load")
+ (eq_attr "sign_extend" "no")
+ (eq_attr "update" "yes")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_power9,LSU_power9+VSU_power9")
+
+(define_insn_reservation "power9-load-ext" 6
+ (and (eq_attr "type" "load")
+ (eq_attr "sign_extend" "yes")
+ (eq_attr "update" "no")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_power9,LSU_power9")
+
+(define_insn_reservation "power9-load-ext-update" 6
+ (and (eq_attr "type" "load")
+ (eq_attr "sign_extend" "yes")
+ (eq_attr "update" "yes")
+ (eq_attr "cpu" "power9"))
+ "DU_C3_power9,LSU_power9+VSU_power9")
+
+(define_insn_reservation "power9-fpload-double" 4
+ (and (eq_attr "type" "fpload")
+ (eq_attr "update" "no")
+ (eq_attr "size" "64")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,LSU_power9")
+
+(define_insn_reservation "power9-fpload-update-double" 4
+ (and (eq_attr "type" "fpload")
+ (eq_attr "update" "yes")
+ (eq_attr "size" "64")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_3_power9,LSU_power9+VSU_power9")
+
+; SFmode loads are cracked and have an additional 2 cycles of latency over DFmode
+(define_insn_reservation "power9-fpload-single" 6
+ (and (eq_attr "type" "fpload")
+ (eq_attr "update" "no")
+ (eq_attr "size" "32")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_3_power9,LSU_power9")
+
+(define_insn_reservation "power9-fpload-update-single" 6
+ (and (eq_attr "type" "fpload")
+ (eq_attr "update" "yes")
+ (eq_attr "size" "32")
+ (eq_attr "cpu" "power9"))
+ "DU_C3_power9,LSU_power9+VSU_power9")
+
+(define_insn_reservation "power9-vecload" 5
+ (and (eq_attr "type" "vecload")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,LSU_pair_power9")
+
+; Store data can issue 2 cycles after AGEN issue, 3 cycles for vector store
+(define_insn_reservation "power9-store" 0
+ (and (eq_attr "type" "store")
+ (eq_attr "update" "no")
+ (eq_attr "indexed" "no")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,LSU_power9")
+
+(define_insn_reservation "power9-store-indexed" 0
+ (and (eq_attr "type" "store")
+ (eq_attr "update" "no")
+ (eq_attr "indexed" "yes")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,LSU_power9")
+
+; Update forms have 2 cycle latency for updated addr reg
+(define_insn_reservation "power9-store-update" 2
+ (and (eq_attr "type" "store")
+ (eq_attr "update" "yes")
+ (eq_attr "indexed" "no")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_3_power9,LSU_power9+VSU_power9")
+
+; Update forms have 2 cycle latency for updated addr reg
+(define_insn_reservation "power9-store-update-indexed" 2
+ (and (eq_attr "type" "store")
+ (eq_attr "update" "yes")
+ (eq_attr "indexed" "yes")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_3_power9,LSU_power9+VSU_power9")
+
+(define_insn_reservation "power9-fpstore" 0
+ (and (eq_attr "type" "fpstore")
+ (eq_attr "update" "no")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,LSU_power9")
+
+; Update forms have 2 cycle latency for updated addr reg
+(define_insn_reservation "power9-fpstore-update" 2
+ (and (eq_attr "type" "fpstore")
+ (eq_attr "update" "yes")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_3_power9,LSU_power9+VSU_power9")
+
+(define_insn_reservation "power9-vecstore" 0
+ (and (eq_attr "type" "vecstore")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,LSU_pair_power9")
+
+(define_insn_reservation "power9-larx" 4
+ (and (eq_attr "type" "load_l")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,LSU_power9")
+
+(define_insn_reservation "power9-stcx" 2
+ (and (eq_attr "type" "store_c")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_3_power9,LSU_power9+VSU_power9")
+
+(define_insn_reservation "power9-sync" 4
+ (and (eq_attr "type" "sync,isync")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,LSU_power9")
+
+
+; VSU Execution Unit
+
+; Fixed point ops
+
+; Most ALU insns are simple 2 cycle, including record form
+(define_insn_reservation "power9-alu" 2
+ (and (ior (eq_attr "type" "add,cmp,exts,integer,logical,isel")
+ (and (eq_attr "type" "insert,shift")
+ (eq_attr "dot" "no")))
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+; Record form rotate/shift are cracked
+(define_insn_reservation "power9-cracked-alu" 2
+ (and (eq_attr "type" "insert,shift")
+ (eq_attr "dot" "yes")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_power9,VSU_power9")
+; 4 cycle CR latency
+(define_bypass 4 "power9-cracked-alu"
+ "power9-crlogical,power9-mfcr,power9-mfcrf,power9-branch")
+
+(define_insn_reservation "power9-alu2" 3
+ (and (eq_attr "type" "cntlz,popcnt,trap")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+; Treat 'two' and 'three' types as 2 or 3 way cracked
+(define_insn_reservation "power9-two" 4
+ (and (eq_attr "type" "two")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_power9,VSU_power9")
+
+(define_insn_reservation "power9-three" 6
+ (and (eq_attr "type" "three")
+ (eq_attr "cpu" "power9"))
+ "DU_C3_power9,VSU_power9")
+
+(define_insn_reservation "power9-mul" 4
+ (and (eq_attr "type" "mul")
+ (eq_attr "dot" "no")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+(define_insn_reservation "power9-mul-compare" 4
+ (and (eq_attr "type" "mul")
+ (eq_attr "dot" "yes")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_power9,VSU_power9")
+; 6 cycle CR latency
+(define_bypass 6 "power9-mul-compare"
+ "power9-crlogical,power9-mfcr,power9-mfcrf,power9-branch")
+
+; Fixed point divides reserve the divide units for a minimum of 8 cycles
+(define_insn_reservation "power9-idiv" 16
+ (and (eq_attr "type" "div")
+ (eq_attr "size" "32")
+ (eq_attr "cpu" "power9"))
+ "DU_even_power9,fx_div0_power9*8|fx_div1_power9*8")
+
+(define_insn_reservation "power9-ldiv" 24
+ (and (eq_attr "type" "div")
+ (eq_attr "size" "64")
+ (eq_attr "cpu" "power9"))
+ "DU_even_power9,fx_div0_power9*8|fx_div1_power9*8")
+
+(define_insn_reservation "power9-crlogical" 2
+ (and (eq_attr "type" "cr_logical,delayed_cr")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+(define_insn_reservation "power9-mfcrf" 2
+ (and (eq_attr "type" "mfcrf")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+(define_insn_reservation "power9-mfcr" 6
+ (and (eq_attr "type" "mfcr")
+ (eq_attr "cpu" "power9"))
+ "DU_C3_power9,VSU_power9")
+
+; Should differentiate between a target of 1 CR field and > 1, since the
+; > 1 CR field form is cracked
+(define_insn_reservation "power9-mtcr" 2
+ (and (eq_attr "type" "mtcr")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+; Moves to LR/CTR are executed in the VSU
+(define_insn_reservation "power9-mtjmpr" 5
+ (and (eq_attr "type" "mtjmpr")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+; Floating point/Vector ops
+(define_insn_reservation "power9-fpsimple" 2
+ (and (eq_attr "type" "fpsimple")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+(define_insn_reservation "power9-fp" 7
+ (and (eq_attr "type" "fp,dmul")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+(define_insn_reservation "power9-fpcompare" 3
+ (and (eq_attr "type" "fpcompare")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+; FP div/sqrt are executed in the VSU slices. They are not pipelined wrt other
+; divide insns, but for the most part do not block pipelined ops.
+(define_insn_reservation "power9-sdiv" 22
+ (and (eq_attr "type" "sdiv")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+(define_insn_reservation "power9-ddiv" 33
+ (and (eq_attr "type" "ddiv")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+(define_insn_reservation "power9-sqrt" 26
+ (and (eq_attr "type" "ssqrt")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+(define_insn_reservation "power9-dsqrt" 36
+ (and (eq_attr "type" "dsqrt")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+(define_insn_reservation "power9-vec-2cyc" 2
+ (and (eq_attr "type" "vecmove,veclogical,vecexts,veccmpfx")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_super_power9")
+
+(define_insn_reservation "power9-veccmp" 3
+ (and (eq_attr "type" "veccmp")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_super_power9")
+
+(define_insn_reservation "power9-vecsimple" 3
+ (and (eq_attr "type" "vecsimple")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_super_power9")
+
+(define_insn_reservation "power9-vecnormal" 7
+ (and (eq_attr "type" "vecfloat,vecdouble")
+ (eq_attr "size" "!128")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_super_power9")
+
+; Quad-precision FP ops execute in the DFU
+(define_insn_reservation "power9-qp" 12
+ (and (eq_attr "type" "vecfloat,vecdouble")
+ (eq_attr "size" "128")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,dfu_power9")
+
+(define_insn_reservation "power9-vecperm" 3
+ (and (eq_attr "type" "vecperm")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_PRM_power9")
+
+(define_insn_reservation "power9-veccomplex" 7
+ (and (eq_attr "type" "veccomplex")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_super_power9")
+
+(define_insn_reservation "power9-vecfdiv" 28
+ (and (eq_attr "type" "vecfdiv")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_super_power9")
+
+(define_insn_reservation "power9-vecdiv" 32
+ (and (eq_attr "type" "vecdiv")
+ (eq_attr "size" "!128")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,VSU_super_power9")
+
+(define_insn_reservation "power9-qpdiv" 56
+ (and (eq_attr "type" "vecdiv")
+ (eq_attr "size" "128")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,dfu_power9")
+
+(define_insn_reservation "power9-mffgpr" 2
+ (and (eq_attr "type" "mffgpr")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+(define_insn_reservation "power9-mftgpr" 2
+ (and (eq_attr "type" "mftgpr")
+ (eq_attr "cpu" "power9"))
+ "DU_slice_3_power9,VSU_power9")
+
+
+; Branch Unit
+; Moves from LR/CTR are executed in the BRU but consume a writeback port from
+; an execution slice.
+(define_insn_reservation "power9-mfjmpr" 6
+ (and (eq_attr "type" "mfjmpr")
+ (eq_attr "cpu" "power9"))
+ "DU_branch_power9,bru_power9+VSU_power9")
+
+; Branch is 2 cycles
+(define_insn_reservation "power9-branch" 2
+ (and (eq_attr "type" "jmpreg,branch")
+ (eq_attr "cpu" "power9"))
+ "DU_branch_power9,bru_power9")
+
+
+; Crypto Unit
+(define_insn_reservation "power9-crypto" 6
+ (and (eq_attr "type" "crypto")
+ (eq_attr "cpu" "power9"))
+ "DU_super_power9,cryptu_power9")
+
+
+; HTM Unit
+(define_insn_reservation "power9-htm" 4
+ (and (eq_attr "type" "htm")
+ (eq_attr "cpu" "power9"))
+ "DU_C2_power9,LSU_power9")
+
+(define_insn_reservation "power9-htm-simple" 2
+ (and (eq_attr "type" "htmsimple")
+ (eq_attr "cpu" "power9"))
+ "DU_any_power9,VSU_power9")
+
+
+; DFP Unit
+(define_insn_reservation "power9-dfp" 12
+ (and (eq_attr "type" "dfp")
+ (eq_attr "cpu" "power9"))
+ "DU_even_power9,dfu_power9")
+
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 71fac765e2..41694a51f1 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -572,6 +572,38 @@
}
})
+;; Return 1 if the operand is a CONST_VECTOR or VEC_DUPLICATE of a constant
+;; that can be loaded with an XXSPLTIB instruction and then a VUPKHSB,
+;; VECSB2W or VECSB2D instruction.
+
+(define_predicate "xxspltib_constant_split"
+ (match_code "const_vector,vec_duplicate,const_int")
+{
+ int value = 256;
+ int num_insns = -1;
+
+ if (!xxspltib_constant_p (op, mode, &num_insns, &value))
+ return false;
+
+ return num_insns > 1;
+})
+
+
+;; Return 1 if the operand is a CONST_VECTOR that can be loaded directly with
+;; an XXSPLTIB instruction.
+
+(define_predicate "xxspltib_constant_nosplit"
+ (match_code "const_vector,vec_duplicate,const_int")
+{
+ int value = 256;
+ int num_insns = -1;
+
+ if (!xxspltib_constant_p (op, mode, &num_insns, &value))
+ return false;
+
+ return num_insns == 1;
+})
+
;; Return 1 if the operand is a CONST_VECTOR and can be loaded into a
;; vector register without using memory.
(define_predicate "easy_vector_constant"
@@ -590,7 +622,14 @@
if (VECTOR_MEM_ALTIVEC_OR_VSX_P (mode))
{
- if (zero_constant (op, mode))
+ int value = 256;
+ int num_insns = -1;
+
+ if (zero_constant (op, mode) || all_ones_constant (op, mode))
+ return true;
+
+ if (TARGET_P9_VECTOR
+ && xxspltib_constant_p (op, mode, &num_insns, &value))
return true;
return easy_altivec_constant (op, mode);
@@ -669,6 +708,11 @@
(and (match_code "const_int,const_double,const_wide_int,const_vector")
(match_test "op == CONST0_RTX (mode)")))
+;; Return 1 if operand is constant -1 (scalars and vectors).
+(define_predicate "all_ones_constant"
+ (and (match_code "const_int,const_double,const_wide_int,const_vector")
+ (match_test "op == CONSTM1_RTX (mode) && !FLOAT_MODE_P (mode)")))
+
;; Return 1 if operand is 0.0.
(define_predicate "zero_fp_constant"
(and (match_code "const_double")
@@ -698,48 +742,25 @@
(define_predicate "quad_memory_operand"
(match_code "mem")
{
- rtx addr, op0, op1;
- int ret;
-
if (!TARGET_QUAD_MEMORY && !TARGET_SYNC_TI)
- ret = 0;
-
- else if (!memory_operand (op, mode))
- ret = 0;
-
- else if (GET_MODE_SIZE (GET_MODE (op)) != 16)
- ret = 0;
-
- else if (MEM_ALIGN (op) < 128)
- ret = 0;
-
- else
- {
- addr = XEXP (op, 0);
- if (int_reg_operand (addr, Pmode))
- ret = 1;
+ return false;
- else if (GET_CODE (addr) != PLUS)
- ret = 0;
+ if (GET_MODE_SIZE (mode) != 16 || !MEM_P (op) || MEM_ALIGN (op) < 128)
+ return false;
- else
- {
- op0 = XEXP (addr, 0);
- op1 = XEXP (addr, 1);
- ret = (int_reg_operand (op0, Pmode)
- && GET_CODE (op1) == CONST_INT
- && IN_RANGE (INTVAL (op1), -32768, 32767)
- && (INTVAL (op1) & 15) == 0);
- }
- }
+ return quad_address_p (XEXP (op, 0), mode, false);
+})
- if (TARGET_DEBUG_ADDR)
- {
- fprintf (stderr, "\nquad_memory_operand, ret = %s\n", ret ? "true" : "false");
- debug_rtx (op);
- }
+;; Return 1 if the operand is suitable for load/store to vector registers with
+;; d-form addressing (register+offset), which was added in ISA 3.0.
+;; Unlike quad_memory_operand, we do not have to check for alignment.
+(define_predicate "vsx_quad_dform_memory_operand"
+ (match_code "mem")
+{
+ if (!TARGET_P9_DFORM_VECTOR || !MEM_P (op) || GET_MODE_SIZE (mode) != 16)
+ return false;
- return ret;
+ return quad_address_p (XEXP (op, 0), mode, false);
})
;; Return 1 if the operand is an indexed or indirect memory operand.
@@ -1054,6 +1075,10 @@
mode = V2DFmode;
else if (mode == DImode)
mode = V2DImode;
+ else if (mode == SImode && TARGET_P9_VECTOR)
+ mode = V4SImode;
+ else if (mode == SFmode && TARGET_P9_VECTOR)
+ mode = V4SFmode;
else
gcc_unreachable ();
return memory_address_addr_space_p (mode, XEXP (op, 0),
@@ -1091,10 +1116,6 @@
(define_special_predicate "equality_operator"
(match_code "eq,ne"))
-;; Return true if operand is MIN or MAX operator.
-(define_predicate "min_max_operator"
- (match_code "smin,smax,umin,umax"))
-
;; Return 1 if OP is a comparison operation that is valid for a branch
;; instruction. We check the opcode against the mode of the CC value.
;; validate_condition_mode is an assertion.
@@ -1137,6 +1158,11 @@
(and (match_operand 0 "branch_comparison_operator")
(match_code "ne,le,ge,leu,geu,ordered")))
+;; Return 1 if OP is a comparison operator suitable for vector/scalar
+;; comparisons that generate a -1/0 mask.
+(define_predicate "fpmask_comparison_operator"
+ (match_code "eq,gt,ge"))
+
;; Return 1 if OP is a comparison operation that is valid for a branch
;; insn, which is true if the corresponding bit in the CC register is set.
(define_predicate "branch_positive_comparison_operator"
diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def
index 5b82b00449..a33faa6e5b 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -24,6 +24,7 @@
<http://www.gnu.org/licenses/>. */
/* Before including this file, some macros must be defined:
+ RS6000_BUILTIN_0 -- 0 arg builtins
RS6000_BUILTIN_1 -- 1 arg builtins
RS6000_BUILTIN_2 -- 2 arg builtins
RS6000_BUILTIN_3 -- 3 arg builtins
@@ -43,6 +44,10 @@
ATTR builtin attribute information.
 ICODE Insn code of the function that implements the builtin. */
+#ifndef RS6000_BUILTIN_0
+ #error "RS6000_BUILTIN_0 is not defined."
+#endif
+
#ifndef RS6000_BUILTIN_1
#error "RS6000_BUILTIN_1 is not defined."
#endif
@@ -637,6 +642,91 @@
| RS6000_BTC_TERNARY), \
CODE_FOR_ ## ICODE) /* ICODE */
+/* Miscellaneous builtins for instructions added in ISA 3.0. These
+ instructions don't require either the DFP or VSX options, just the basic
+ ISA 3.0 enablement since they operate on general purpose registers. */
+#define BU_P9_MISC_0(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_0 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_SPECIAL), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_P9_MISC_1(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_1 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_UNARY), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+/* Miscellaneous builtins for instructions added in ISA 3.0. These
+ instructions don't require either the DFP or VSX options, just the basic
+ ISA 3.0 enablement since they operate on general purpose registers,
+ and they require 64-bit addressing. */
+#define BU_P9_64BIT_MISC_0(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_0 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC \
+ | RS6000_BTM_64BIT, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_SPECIAL), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+/* Miscellaneous builtins for decimal floating point instructions
+ added in ISA 3.0. These instructions don't require the VSX
+ options, just the basic ISA 3.0 enablement since they operate on
+ general purpose registers. */
+#define BU_P9_DFP_MISC_0(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_0 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_SPECIAL), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_P9_DFP_MISC_1(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_1 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_SPECIAL), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_P9_DFP_MISC_2(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_2 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_SPECIAL), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+/* Decimal floating point overloaded functions added in ISA 3.0 */
+#define BU_P9_DFP_OVERLOAD_1(ENUM, NAME) \
+ RS6000_BUILTIN_1 (P9_BUILTIN_DFP_ ## ENUM, /* ENUM */ \
+ "__builtin_dfp_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_OVERLOADED /* ATTR */ \
+ | RS6000_BTC_UNARY), \
+ CODE_FOR_nothing) /* ICODE */
+
+#define BU_P9_DFP_OVERLOAD_2(ENUM, NAME) \
+ RS6000_BUILTIN_2 (P9_BUILTIN_DFP_ ## ENUM, /* ENUM */ \
+ "__builtin_dfp_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_OVERLOADED /* ATTR */ \
+ | RS6000_BTC_BINARY), \
+ CODE_FOR_nothing) /* ICODE */
+
+#define BU_P9_DFP_OVERLOAD_3(ENUM, NAME) \
+ RS6000_BUILTIN_3 (P9_BUILTIN_DFP_ ## ENUM, /* ENUM */ \
+ "__builtin_dfp_" NAME, /* NAME */ \
+ RS6000_BTM_P9_MISC, /* MASK */ \
+ (RS6000_BTC_OVERLOADED /* ATTR */ \
+ | RS6000_BTC_TERNARY), \
+ CODE_FOR_nothing) /* ICODE */
+
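
As a concrete illustration (not part of the patch), here is how one of the convenience macros above expands; the expansion is plain C preprocessing of the definitions shown, and RS6000_BUILTIN_2 itself is whatever the file that includes rs6000-builtin.def has defined it to be:

  /* BU_P9_DFP_MISC_2 (TSTSFI_LT_DD, "dtstsfi_lt_dd", CONST, dfptstsfi_lt_dd),
     used later in this file, expands to:  */
  RS6000_BUILTIN_2 (MISC_BUILTIN_TSTSFI_LT_DD,      /* ENUM */
                    "__builtin_dtstsfi_lt_dd",      /* NAME */
                    RS6000_BTM_P9_MISC,             /* MASK */
                    (RS6000_BTC_CONST               /* ATTR */
                     | RS6000_BTC_SPECIAL),
                    CODE_FOR_dfptstsfi_lt_dd)       /* ICODE */
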
/* 128-bit long double floating point builtins. */
#define BU_LDBL128_2(ENUM, NAME, ATTR, ICODE) \
RS6000_BUILTIN_2 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
@@ -647,8 +737,94 @@
| RS6000_BTC_BINARY), \
CODE_FOR_ ## ICODE) /* ICODE */
+/* IEEE 128-bit floating-point builtins. */
+#define BU_FLOAT128_2(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_2 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_FLOAT128, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_BINARY), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_FLOAT128_1(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_1 (MISC_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_" NAME, /* NAME */ \
+ RS6000_BTM_FLOAT128, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_UNARY), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+/* ISA 3.0 (power9) vector convenience macros. */
+/* For the instructions that are encoded as altivec instructions use
+ __builtin_altivec_ as the builtin name. */
+#define BU_P9V_AV_1(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_1 (P9V_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_altivec_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_UNARY), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_P9V_AV_2(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_2 (P9V_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_altivec_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_BINARY), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_P9V_AV_3(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_3 (P9V_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_altivec_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_TERNARY), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_P9V_AV_P(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_P (P9V_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_altivec_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_PREDICATE), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+/* For the instructions encoded as VSX instructions, use __builtin_vsx_ as the
+   builtin name.  */
+#define BU_P9V_VSX_1(ENUM, NAME, ATTR, ICODE) \
+ RS6000_BUILTIN_1 (P9V_BUILTIN_ ## ENUM, /* ENUM */ \
+ "__builtin_vsx_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_ ## ATTR /* ATTR */ \
+ | RS6000_BTC_UNARY), \
+ CODE_FOR_ ## ICODE) /* ICODE */
+
+#define BU_P9V_OVERLOAD_1(ENUM, NAME) \
+ RS6000_BUILTIN_1 (P9V_BUILTIN_VEC_ ## ENUM, /* ENUM */ \
+ "__builtin_vec_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_OVERLOADED /* ATTR */ \
+ | RS6000_BTC_UNARY), \
+ CODE_FOR_nothing) /* ICODE */
+
+#define BU_P9V_OVERLOAD_2(ENUM, NAME) \
+ RS6000_BUILTIN_2 (P9V_BUILTIN_VEC_ ## ENUM, /* ENUM */ \
+ "__builtin_vec_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_OVERLOADED /* ATTR */ \
+ | RS6000_BTC_BINARY), \
+ CODE_FOR_nothing) /* ICODE */
+
+#define BU_P9V_OVERLOAD_3(ENUM, NAME) \
+ RS6000_BUILTIN_3 (P9V_BUILTIN_VEC_ ## ENUM, /* ENUM */ \
+ "__builtin_vec_" NAME, /* NAME */ \
+ RS6000_BTM_P9_VECTOR, /* MASK */ \
+ (RS6000_BTC_OVERLOADED /* ATTR */ \
+ | RS6000_BTC_TERNARY), \
+ CODE_FOR_nothing) /* ICODE */
#endif
+
/* Insure 0 is not a legitimate index. */
BU_SPECIAL_X (RS6000_BUILTIN_NONE, NULL, 0, RS6000_BTC_MISC)
@@ -1391,13 +1567,25 @@ BU_VSX_X (LXVW4X_V4SI, "lxvw4x_v4si", MEM)
BU_VSX_X (LXVW4X_V8HI, "lxvw4x_v8hi", MEM)
BU_VSX_X (LXVW4X_V16QI, "lxvw4x_v16qi", MEM)
BU_VSX_X (STXSDX, "stxsdx", MEM)
-BU_VSX_X (STXVD2X_V1TI, "stxsdx_v1ti", MEM)
-BU_VSX_X (STXVD2X_V2DF, "stxsdx_v2df", MEM)
-BU_VSX_X (STXVD2X_V2DI, "stxsdx_v2di", MEM)
-BU_VSX_X (STXVW4X_V4SF, "stxsdx_v4sf", MEM)
-BU_VSX_X (STXVW4X_V4SI, "stxsdx_v4si", MEM)
-BU_VSX_X (STXVW4X_V8HI, "stxsdx_v8hi", MEM)
-BU_VSX_X (STXVW4X_V16QI, "stxsdx_v16qi", MEM)
+BU_VSX_X (STXVD2X_V1TI, "stxvd2x_v1ti", MEM)
+BU_VSX_X (STXVD2X_V2DF, "stxvd2x_v2df", MEM)
+BU_VSX_X (STXVD2X_V2DI, "stxvd2x_v2di", MEM)
+BU_VSX_X (STXVW4X_V4SF, "stxvw4x_v4sf", MEM)
+BU_VSX_X (STXVW4X_V4SI, "stxvw4x_v4si", MEM)
+BU_VSX_X (STXVW4X_V8HI, "stxvw4x_v8hi", MEM)
+BU_VSX_X (STXVW4X_V16QI, "stxvw4x_v16qi", MEM)
+BU_VSX_X (LD_ELEMREV_V2DF, "ld_elemrev_v2df", MEM)
+BU_VSX_X (LD_ELEMREV_V2DI, "ld_elemrev_v2di", MEM)
+BU_VSX_X (LD_ELEMREV_V4SF, "ld_elemrev_v4sf", MEM)
+BU_VSX_X (LD_ELEMREV_V4SI, "ld_elemrev_v4si", MEM)
+BU_VSX_X (LD_ELEMREV_V8HI, "ld_elemrev_v8hi", MEM)
+BU_VSX_X (LD_ELEMREV_V16QI, "ld_elemrev_v16qi", MEM)
+BU_VSX_X (ST_ELEMREV_V2DF, "st_elemrev_v2df", MEM)
+BU_VSX_X (ST_ELEMREV_V2DI, "st_elemrev_v2di", MEM)
+BU_VSX_X (ST_ELEMREV_V4SF, "st_elemrev_v4sf", MEM)
+BU_VSX_X (ST_ELEMREV_V4SI, "st_elemrev_v4si", MEM)
+BU_VSX_X (ST_ELEMREV_V8HI, "st_elemrev_v8hi", MEM)
+BU_VSX_X (ST_ELEMREV_V16QI, "st_elemrev_v16qi", MEM)
BU_VSX_X (XSABSDP, "xsabsdp", CONST)
BU_VSX_X (XSADDDP, "xsadddp", FP)
BU_VSX_X (XSCMPODP, "xscmpodp", FP)
@@ -1455,6 +1643,8 @@ BU_VSX_OVERLOAD_1 (DOUBLE, "double")
/* VSX builtins that are handled as special cases. */
BU_VSX_OVERLOAD_X (LD, "ld")
BU_VSX_OVERLOAD_X (ST, "st")
+BU_VSX_OVERLOAD_X (XL, "xl")
+BU_VSX_OVERLOAD_X (XST, "xst")
/* 1 argument VSX instructions added in ISA 2.07. */
BU_P8V_VSX_1 (XSCVSPDPN, "xscvspdpn", CONST, vsx_xscvspdpn)
@@ -1589,6 +1779,25 @@ BU_P8V_OVERLOAD_3 (VADDEUQM, "vaddeuqm")
BU_P8V_OVERLOAD_3 (VSUBECUQ, "vsubecuq")
BU_P8V_OVERLOAD_3 (VSUBEUQM, "vsubeuqm")
+/* ISA 3.0 vector 2-argument functions. */
+BU_P9V_AV_2 (VSLV, "vslv", CONST, vslv)
+BU_P9V_AV_2 (VSRV, "vsrv", CONST, vsrv)
+
+/* ISA 3.0 vector overloaded 2-argument functions. */
+BU_P9V_OVERLOAD_2 (VSLV, "vslv")
+BU_P9V_OVERLOAD_2 (VSRV, "vsrv")
+
+/* 2 argument vector functions added in ISA 3.0 (power9). */
+BU_P9V_AV_2 (VADUB, "vadub", CONST, vaduv16qi3)
+BU_P9V_AV_2 (VADUH, "vaduh", CONST, vaduv8hi3)
+BU_P9V_AV_2 (VADUW, "vaduw", CONST, vaduv4si3)
+
+/* ISA 3.0 vector overloaded 2 argument functions. */
+BU_P9V_OVERLOAD_2 (VADU, "vadu")
+BU_P9V_OVERLOAD_2 (VADUB, "vadub")
+BU_P9V_OVERLOAD_2 (VADUH, "vaduh")
+BU_P9V_OVERLOAD_2 (VADUW, "vaduw")
+
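
A hedged usage sketch (not part of the patch) for the 2-argument builtins above; it assumes -mcpu=power9 -maltivec and calls the overloaded __builtin_vec_* names directly, since any friendlier <altivec.h> spellings are not shown in this hunk:

  #include <altivec.h>

  vector unsigned char
  shift_bytes_left (vector unsigned char data, vector unsigned char counts)
  {
    /* Per-byte variable shift left (vslv): each byte of DATA is shifted by
       the low-order bits of the corresponding byte of COUNTS.  */
    return __builtin_vec_vslv (data, counts);
  }
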
/* 2 argument extended divide functions added in ISA 2.06. */
BU_P7_MISC_2 (DIVWE, "divwe", CONST, dive_si)
@@ -1639,12 +1848,72 @@ BU_P8V_MISC_3 (BCDSUB_OV, "bcdsub_ov", CONST, bcdsub_unordered)
BU_DFP_MISC_2 (PACK_TD, "pack_dec128", CONST, packtd)
BU_DFP_MISC_2 (UNPACK_TD, "unpack_dec128", CONST, unpacktd)
+/* 0 argument general-purpose register functions added in ISA 3.0 (power9). */
+BU_P9_MISC_0 (DARN_32, "darn_32", MISC, darn_32)
+BU_P9_64BIT_MISC_0 (DARN_RAW, "darn_raw", MISC, darn_raw)
+BU_P9_64BIT_MISC_0 (DARN, "darn", MISC, darn)
+
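
A hedged usage sketch (not part of the patch) for the new 0-argument builtins; compile with -mcpu=power9, and note that __builtin_darn and __builtin_darn_raw also require a 64-bit target per the RS6000_BTM_64BIT mask above. The return types below are assumptions inferred from the 32-bit/64-bit split of the three variants:

  long long conditioned_random (void)
  {
    return __builtin_darn ();      /* conditioned 64-bit random number */
  }

  long long raw_random (void)
  {
    return __builtin_darn_raw ();  /* raw 64-bit random number */
  }

  int conditioned_random_32 (void)
  {
    return __builtin_darn_32 ();   /* conditioned 32-bit random number */
  }
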
BU_LDBL128_2 (PACK_TF, "pack_longdouble", CONST, packtf)
BU_LDBL128_2 (UNPACK_TF, "unpack_longdouble", CONST, unpacktf)
BU_P7_MISC_2 (PACK_V1TI, "pack_vector_int128", CONST, packv1ti)
BU_P7_MISC_2 (UNPACK_V1TI, "unpack_vector_int128", CONST, unpackv1ti)
+/* 2 argument DFP (Decimal Floating Point) functions added in ISA 3.0. */
+BU_P9_DFP_MISC_2 (TSTSFI_LT_DD, "dtstsfi_lt_dd", CONST, dfptstsfi_lt_dd)
+BU_P9_DFP_MISC_2 (TSTSFI_LT_TD, "dtstsfi_lt_td", CONST, dfptstsfi_lt_td)
+
+BU_P9_DFP_MISC_2 (TSTSFI_EQ_DD, "dtstsfi_eq_dd", CONST, dfptstsfi_eq_dd)
+BU_P9_DFP_MISC_2 (TSTSFI_EQ_TD, "dtstsfi_eq_td", CONST, dfptstsfi_eq_td)
+
+BU_P9_DFP_MISC_2 (TSTSFI_GT_DD, "dtstsfi_gt_dd", CONST, dfptstsfi_gt_dd)
+BU_P9_DFP_MISC_2 (TSTSFI_GT_TD, "dtstsfi_gt_td", CONST, dfptstsfi_gt_td)
+
+BU_P9_DFP_MISC_2 (TSTSFI_OV_DD, "dtstsfi_ov_dd", CONST, dfptstsfi_unordered_dd)
+BU_P9_DFP_MISC_2 (TSTSFI_OV_TD, "dtstsfi_ov_td", CONST, dfptstsfi_unordered_td)
+
+/* 2 argument overloaded DFP functions added in ISA 3.0. */
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_LT, "dtstsfi_lt")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_LT_DD, "dtstsfi_lt_dd")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_LT_TD, "dtstsfi_lt_td")
+
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_EQ, "dtstsfi_eq")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_EQ_DD, "dtstsfi_eq_dd")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_EQ_TD, "dtstsfi_eq_td")
+
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_GT, "dtstsfi_gt")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_GT_DD, "dtstsfi_gt_dd")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_GT_TD, "dtstsfi_gt_td")
+
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_OV, "dtstsfi_ov")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_OV_DD, "dtstsfi_ov_dd")
+BU_P9_DFP_OVERLOAD_2 (TSTSFI_OV_TD, "dtstsfi_ov_td")
+
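
A hedged usage sketch (not part of the patch) for the overloaded significance-test builtins above; the argument and result types follow the overload table added to rs6000-c.c later in this patch (int result, unsigned int immediate, _Decimal64 or _Decimal128 value), while the precise truth-value semantics are assumptions taken from the dtstsfi ("test significance immediate") mnemonic:

  /* Compile with -mcpu=power9 and decimal floating point support.  */
  int significance_checks (_Decimal64 d64, _Decimal128 d128)
  {
    int lt = __builtin_dfp_dtstsfi_lt (6, d64);    /* _Decimal64 overload */
    int gt = __builtin_dfp_dtstsfi_gt (6, d128);   /* _Decimal128 overload */
    return lt || gt;
  }
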
+/* 1 argument vector functions added in ISA 3.0 (power9). */
+BU_P9V_AV_1 (VCTZB, "vctzb", CONST, ctzv16qi2)
+BU_P9V_AV_1 (VCTZH, "vctzh", CONST, ctzv8hi2)
+BU_P9V_AV_1 (VCTZW, "vctzw", CONST, ctzv4si2)
+BU_P9V_AV_1 (VCTZD, "vctzd", CONST, ctzv2di2)
+BU_P9V_AV_1 (VPRTYBD, "vprtybd", CONST, parityv2di2)
+BU_P9V_AV_1 (VPRTYBQ, "vprtybq", CONST, parityv1ti2)
+BU_P9V_AV_1 (VPRTYBW, "vprtybw", CONST, parityv4si2)
+
+/* ISA 3.0 vector overloaded 1 argument functions. */
+BU_P9V_OVERLOAD_1 (VCTZ, "vctz")
+BU_P9V_OVERLOAD_1 (VCTZB, "vctzb")
+BU_P9V_OVERLOAD_1 (VCTZH, "vctzh")
+BU_P9V_OVERLOAD_1 (VCTZW, "vctzw")
+BU_P9V_OVERLOAD_1 (VCTZD, "vctzd")
+BU_P9V_OVERLOAD_1 (VPRTYB, "vprtyb")
+BU_P9V_OVERLOAD_1 (VPRTYBD, "vprtybd")
+BU_P9V_OVERLOAD_1 (VPRTYBQ, "vprtybq")
+BU_P9V_OVERLOAD_1 (VPRTYBW, "vprtybw")
+
+/* 1 argument IEEE 128-bit floating-point functions. */
+BU_FLOAT128_1 (FABSQ, "fabsq", CONST, abskf2)
+
+/* 2 argument IEEE 128-bit floating-point functions. */
+BU_FLOAT128_2 (COPYSIGNQ, "copysignq", CONST, copysignkf3)
/* 1 argument crypto functions. */
BU_CRYPTO_1 (VSBOX, "vsbox", CONST, crypto_vsbox)
@@ -2022,6 +2291,18 @@ BU_SPECIAL_X (RS6000_BUILTIN_CPU_IS, "__builtin_cpu_is",
BU_SPECIAL_X (RS6000_BUILTIN_CPU_SUPPORTS, "__builtin_cpu_supports",
RS6000_BTM_ALWAYS, RS6000_BTC_MISC)
+BU_SPECIAL_X (RS6000_BUILTIN_NANQ, "__builtin_nanq",
+ RS6000_BTM_FLOAT128, RS6000_BTC_CONST)
+
+BU_SPECIAL_X (RS6000_BUILTIN_NANSQ, "__builtin_nansq",
+ RS6000_BTM_FLOAT128, RS6000_BTC_CONST)
+
+BU_SPECIAL_X (RS6000_BUILTIN_INFQ, "__builtin_infq",
+ RS6000_BTM_FLOAT128, RS6000_BTC_CONST)
+
+BU_SPECIAL_X (RS6000_BUILTIN_HUGE_VALQ, "__builtin_huge_valq",
+ RS6000_BTM_FLOAT128, RS6000_BTC_CONST)
+
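
A hedged usage sketch (not part of the patch) for the IEEE 128-bit builtins registered above; it assumes the __float128 type and -mfloat128, and the argument/return types are inferred from the builtins' purpose rather than stated in this file:

  __float128 clamp_magnitude (__float128 x, __float128 limit)
  {
    if (__builtin_fabsq (x) > limit)
      return __builtin_copysignq (limit, x);   /* keep the sign of X */
    return x;
  }

  __float128 positive_infinity (void)
  {
    return __builtin_infq ();
  }
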
/* Darwin CfString builtin. */
BU_SPECIAL_X (RS6000_BUILTIN_CFSTRING, "__builtin_cfstring", RS6000_BTM_ALWAYS,
RS6000_BTC_MISC)
diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index ceb80b216b..9eb6d545c5 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -2726,6 +2726,49 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0 },
{ ALTIVEC_BUILTIN_VEC_SUMS, ALTIVEC_BUILTIN_VSUMSWS,
RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V2DF,
+ RS6000_BTI_V2DF, RS6000_BTI_INTSI, ~RS6000_BTI_V2DF, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V2DF,
+ RS6000_BTI_V2DF, RS6000_BTI_INTSI, ~RS6000_BTI_double, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V2DI,
+ RS6000_BTI_V2DI, RS6000_BTI_INTSI, ~RS6000_BTI_V2DI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V2DI,
+ RS6000_BTI_V2DI, RS6000_BTI_INTSI, ~RS6000_BTI_long_long, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V2DI,
+ RS6000_BTI_unsigned_V2DI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_V2DI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V2DI,
+ RS6000_BTI_unsigned_V2DI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_long_long, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V4SF,
+ RS6000_BTI_V4SF, RS6000_BTI_INTSI, ~RS6000_BTI_V4SF, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V4SF,
+ RS6000_BTI_V4SF, RS6000_BTI_INTSI, ~RS6000_BTI_float, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V4SI,
+ RS6000_BTI_V4SI, RS6000_BTI_INTSI, ~RS6000_BTI_V4SI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V4SI,
+ RS6000_BTI_V4SI, RS6000_BTI_INTSI, ~RS6000_BTI_INTSI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V4SI,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_INTSI, ~RS6000_BTI_unsigned_V4SI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V4SI,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTSI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V8HI,
+ RS6000_BTI_V8HI, RS6000_BTI_INTSI, ~RS6000_BTI_V8HI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V8HI,
+ RS6000_BTI_V8HI, RS6000_BTI_INTSI, ~RS6000_BTI_INTHI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V8HI,
+ RS6000_BTI_unsigned_V8HI, RS6000_BTI_INTSI, ~RS6000_BTI_unsigned_V8HI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V8HI,
+ RS6000_BTI_unsigned_V8HI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTHI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V16QI,
+ RS6000_BTI_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_V16QI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V16QI,
+ RS6000_BTI_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_INTQI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V16QI,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_V16QI, 0 },
+ { VSX_BUILTIN_VEC_XL, VSX_BUILTIN_LD_ELEMREV_V16QI,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI, 0 },
{ ALTIVEC_BUILTIN_VEC_XOR, ALTIVEC_BUILTIN_VXOR,
RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 },
{ ALTIVEC_BUILTIN_VEC_XOR, ALTIVEC_BUILTIN_VXOR,
@@ -3475,6 +3518,55 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
RS6000_BTI_void, RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_unsigned_V16QI },
{ ALTIVEC_BUILTIN_VEC_STVRXL, ALTIVEC_BUILTIN_STVRXL,
RS6000_BTI_void, RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V2DF,
+ RS6000_BTI_void, RS6000_BTI_V2DF, RS6000_BTI_INTSI, ~RS6000_BTI_V2DF },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V2DF,
+ RS6000_BTI_void, RS6000_BTI_V2DF, RS6000_BTI_INTSI, ~RS6000_BTI_double },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V2DI,
+ RS6000_BTI_void, RS6000_BTI_V2DI, RS6000_BTI_INTSI, ~RS6000_BTI_V2DI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V2DI,
+ RS6000_BTI_void, RS6000_BTI_V2DI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_long_long },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V2DI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V2DI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_V2DI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V2DI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V2DI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_long_long },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V4SF,
+ RS6000_BTI_void, RS6000_BTI_V4SF, RS6000_BTI_INTSI, ~RS6000_BTI_V4SF },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V4SF,
+ RS6000_BTI_void, RS6000_BTI_V4SF, RS6000_BTI_INTSI, ~RS6000_BTI_float },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V4SI,
+ RS6000_BTI_void, RS6000_BTI_V4SI, RS6000_BTI_INTSI, ~RS6000_BTI_V4SI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V4SI,
+ RS6000_BTI_void, RS6000_BTI_V4SI, RS6000_BTI_INTSI, ~RS6000_BTI_INTSI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V4SI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V4SI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_V4SI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V4SI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V4SI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_UINTSI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V8HI,
+ RS6000_BTI_void, RS6000_BTI_V8HI, RS6000_BTI_INTSI, ~RS6000_BTI_V8HI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V8HI,
+ RS6000_BTI_void, RS6000_BTI_V8HI, RS6000_BTI_INTSI, ~RS6000_BTI_INTHI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V8HI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V8HI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_V8HI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V8HI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V8HI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_UINTHI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V16QI,
+ RS6000_BTI_void, RS6000_BTI_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_V16QI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V16QI,
+ RS6000_BTI_void, RS6000_BTI_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_INTQI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V16QI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_unsigned_V16QI },
+ { VSX_BUILTIN_VEC_XST, VSX_BUILTIN_ST_ELEMREV_V16QI,
+ RS6000_BTI_void, RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI,
+ ~RS6000_BTI_UINTQI },
{ VSX_BUILTIN_VEC_XXSLDWI, VSX_BUILTIN_XXSLDWI_16QI,
RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_NOT_OPAQUE },
{ VSX_BUILTIN_VEC_XXSLDWI, VSX_BUILTIN_XXSLDWI_16QI,
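
A hedged usage sketch (not part of the patch) for the vec_xl / vec_xst overloads registered above; it assumes a VSX-enabled target and the <altivec.h> spellings, and relies on the LD_ELEMREV / ST_ELEMREV builtins these overloads map to, which keep vector elements in array order on little-endian systems:

  #include <altivec.h>

  void copy_four_ints (const int *src, int *dst)
  {
    vector signed int v = vec_xl (0, src);   /* VSX_BUILTIN_LD_ELEMREV_V4SI */
    vec_xst (v, 0, dst);                     /* VSX_BUILTIN_ST_ELEMREV_V4SI */
  }
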
@@ -4123,6 +4215,105 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
{ P8V_BUILTIN_VEC_VCLZD, P8V_BUILTIN_VCLZD,
RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_LT, MISC_BUILTIN_TSTSFI_LT_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_LT, MISC_BUILTIN_TSTSFI_LT_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9_BUILTIN_DFP_TSTSFI_LT_TD, MISC_BUILTIN_TSTSFI_LT_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_LT_DD, MISC_BUILTIN_TSTSFI_LT_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9_BUILTIN_DFP_TSTSFI_EQ, MISC_BUILTIN_TSTSFI_EQ_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_EQ, MISC_BUILTIN_TSTSFI_EQ_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9_BUILTIN_DFP_TSTSFI_EQ_TD, MISC_BUILTIN_TSTSFI_EQ_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_EQ_DD, MISC_BUILTIN_TSTSFI_EQ_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9_BUILTIN_DFP_TSTSFI_GT, MISC_BUILTIN_TSTSFI_GT_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_GT, MISC_BUILTIN_TSTSFI_GT_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9_BUILTIN_DFP_TSTSFI_GT_TD, MISC_BUILTIN_TSTSFI_GT_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_GT_DD, MISC_BUILTIN_TSTSFI_GT_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9_BUILTIN_DFP_TSTSFI_OV, MISC_BUILTIN_TSTSFI_OV_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_OV, MISC_BUILTIN_TSTSFI_OV_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9_BUILTIN_DFP_TSTSFI_OV_TD, MISC_BUILTIN_TSTSFI_OV_TD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat128, 0 },
+ { P9_BUILTIN_DFP_TSTSFI_OV_DD, MISC_BUILTIN_TSTSFI_OV_DD,
+ RS6000_BTI_INTSI, RS6000_BTI_UINTSI, RS6000_BTI_dfloat64, 0 },
+
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZB,
+ RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZB,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZH,
+ RS6000_BTI_V8HI, RS6000_BTI_V8HI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZH,
+ RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZW,
+ RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZW,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZD,
+ RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZ, P9V_BUILTIN_VCTZD,
+ RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VCTZB, P9V_BUILTIN_VCTZB,
+ RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZB, P9V_BUILTIN_VCTZB,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VCTZH, P9V_BUILTIN_VCTZH,
+ RS6000_BTI_V8HI, RS6000_BTI_V8HI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZH, P9V_BUILTIN_VCTZH,
+ RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VCTZW, P9V_BUILTIN_VCTZW,
+ RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZW, P9V_BUILTIN_VCTZW,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VCTZD, P9V_BUILTIN_VCTZD,
+ RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0, 0 },
+ { P9V_BUILTIN_VEC_VCTZD, P9V_BUILTIN_VCTZD,
+ RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VADU, P9V_BUILTIN_VADUB,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI,
+ RS6000_BTI_unsigned_V16QI, 0 },
+ { P9V_BUILTIN_VEC_VADU, P9V_BUILTIN_VADUH,
+ RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI,
+ RS6000_BTI_unsigned_V8HI, 0 },
+ { P9V_BUILTIN_VEC_VADU, P9V_BUILTIN_VADUW,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI,
+ RS6000_BTI_unsigned_V4SI, 0 },
+
+ { P9V_BUILTIN_VEC_VADUB, P9V_BUILTIN_VADUB,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI,
+ RS6000_BTI_unsigned_V16QI, 0 },
+
+ { P9V_BUILTIN_VEC_VADUH, P9V_BUILTIN_VADUH,
+ RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI,
+ RS6000_BTI_unsigned_V8HI, 0 },
+
+ { P9V_BUILTIN_VEC_VADUW, P9V_BUILTIN_VADUW,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI,
+ RS6000_BTI_unsigned_V4SI, 0 },
+
{ P8V_BUILTIN_VEC_VGBBD, P8V_BUILTIN_VGBBD,
RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0, 0 },
{ P8V_BUILTIN_VEC_VGBBD, P8V_BUILTIN_VGBBD,
@@ -4252,6 +4443,42 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
{ P8V_BUILTIN_VEC_VPOPCNTD, P8V_BUILTIN_VPOPCNTD,
RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBW,
+ RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBW,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBD,
+ RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBD,
+ RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_INTTI, RS6000_BTI_INTTI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYB, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_UINTTI, RS6000_BTI_UINTTI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VPRTYBW, P9V_BUILTIN_VPRTYBW,
+ RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYBW, P9V_BUILTIN_VPRTYBW,
+ RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VPRTYBD, P9V_BUILTIN_VPRTYBD,
+ RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYBD, P9V_BUILTIN_VPRTYBD,
+ RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0, 0 },
+
+ { P9V_BUILTIN_VEC_VPRTYBQ, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYBQ, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYBQ, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_INTTI, RS6000_BTI_INTTI, 0, 0 },
+ { P9V_BUILTIN_VEC_VPRTYBQ, P9V_BUILTIN_VPRTYBQ,
+ RS6000_BTI_UINTTI, RS6000_BTI_UINTTI, 0, 0 },
+
{ P8V_BUILTIN_VEC_VPKUDUM, P8V_BUILTIN_VPKUDUM,
RS6000_BTI_V4SI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0 },
{ P8V_BUILTIN_VEC_VPKUDUM, P8V_BUILTIN_VPKUDUM,
@@ -4328,6 +4555,13 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
{ P8V_BUILTIN_VEC_VGBBD, P8V_BUILTIN_VGBBD,
RS6000_BTI_unsigned_V16QI, 0, 0, 0 },
+ { P9V_BUILTIN_VEC_VSLV, P9V_BUILTIN_VSLV,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI,
+ RS6000_BTI_unsigned_V16QI, 0 },
+ { P9V_BUILTIN_VEC_VSRV, P9V_BUILTIN_VSRV,
+ RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI,
+ RS6000_BTI_unsigned_V16QI, 0 },
+
/* Crypto builtins. */
{ CRYPTO_BUILTIN_VPERMXOR, CRYPTO_BUILTIN_VPERMXOR_V16QI,
RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI,
diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def
index 275404a63a..7d97f7f84e 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -60,15 +60,26 @@
| OPTION_MASK_UPPER_REGS_SF)
/* Add ISEL back into ISA 3.0, since it is supposed to be a win. Do not add
- P9_DFORM or P9_MINMAX until they are fully debugged. */
+ P9_MINMAX until the hardware that supports it is available. Do not add
+ FLOAT128_HW here until we are ready to make -mfloat128 on by default. */
#define ISA_3_0_MASKS_SERVER (ISA_2_7_MASKS_SERVER \
- | OPTION_MASK_FLOAT128_HW \
| OPTION_MASK_ISEL \
| OPTION_MASK_MODULO \
| OPTION_MASK_P9_FUSION \
- | OPTION_MASK_P9_DFORM \
+ | OPTION_MASK_P9_DFORM_SCALAR \
+ | OPTION_MASK_P9_DFORM_VECTOR \
+ | OPTION_MASK_P9_MISC \
| OPTION_MASK_P9_VECTOR)
+/* Support for the IEEE 128-bit floating point hardware requires a lot of the
+ VSX instructions that are part of ISA 3.0. */
+#define ISA_3_0_MASKS_IEEE (OPTION_MASK_VSX \
+ | OPTION_MASK_P8_VECTOR \
+ | OPTION_MASK_P9_VECTOR \
+ | OPTION_MASK_DIRECT_MOVE \
+ | OPTION_MASK_UPPER_REGS_DF \
+ | OPTION_MASK_UPPER_REGS_SF)
+
#define POWERPC_7400_MASK (OPTION_MASK_PPC_GFXOPT | OPTION_MASK_ALTIVEC)
/* Deal with ports that do not have -mstrict-align. */
@@ -94,6 +105,7 @@
| OPTION_MASK_FPRND \
| OPTION_MASK_HTM \
| OPTION_MASK_ISEL \
+ | OPTION_MASK_LRA \
| OPTION_MASK_MFCRF \
| OPTION_MASK_MFPGPR \
| OPTION_MASK_MODULO \
@@ -101,9 +113,11 @@
| OPTION_MASK_NO_UPDATE \
| OPTION_MASK_P8_FUSION \
| OPTION_MASK_P8_VECTOR \
- | OPTION_MASK_P9_DFORM \
+ | OPTION_MASK_P9_DFORM_SCALAR \
+ | OPTION_MASK_P9_DFORM_VECTOR \
| OPTION_MASK_P9_FUSION \
| OPTION_MASK_P9_MINMAX \
+ | OPTION_MASK_P9_MISC \
| OPTION_MASK_P9_VECTOR \
| OPTION_MASK_POPCNTB \
| OPTION_MASK_POPCNTD \
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index d9a6b1f784..3bb25c0493 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -31,6 +31,7 @@ extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, int, int, int,
#endif /* TREE_CODE */
extern bool easy_altivec_constant (rtx, machine_mode);
+extern bool xxspltib_constant_p (rtx, machine_mode, int *, int *);
extern int vspltis_shifted (rtx);
extern HOST_WIDE_INT const_vector_elt_as_int (rtx, unsigned int);
extern bool macho_lo_sum_memory_operand (rtx, machine_mode);
@@ -38,6 +39,7 @@ extern int num_insns_constant (rtx, machine_mode);
extern int num_insns_constant_wide (HOST_WIDE_INT);
extern int small_data_operand (rtx, machine_mode);
extern bool mem_operand_gpr (rtx, machine_mode);
+extern bool mem_operand_ds_form (rtx, machine_mode);
extern bool toc_relative_expr_p (const_rtx, bool);
extern bool invalid_e500_subreg (rtx, machine_mode);
extern void validate_condition_mode (enum rtx_code, machine_mode);
@@ -86,6 +88,7 @@ extern int registers_ok_for_quad_peep (rtx, rtx);
extern int mems_ok_for_quad_peep (rtx, rtx);
extern bool gpr_or_gpr_p (rtx, rtx);
extern bool direct_move_p (rtx, rtx);
+extern bool quad_address_p (rtx, machine_mode, bool);
extern bool quad_load_store_p (rtx, rtx);
extern bool fusion_gpr_load_p (rtx, rtx, rtx, rtx);
extern void expand_fusion_gpr_load (rtx *);
@@ -133,6 +136,7 @@ extern bool rs6000_emit_set_const (rtx, rtx);
extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx);
extern int rs6000_emit_vector_cond_expr (rtx, rtx, rtx, rtx, rtx, rtx);
extern void rs6000_emit_minmax (rtx, enum rtx_code, rtx, rtx);
+extern void rs6000_split_signbit (rtx, rtx);
extern void rs6000_expand_atomic_compare_and_swap (rtx op[]);
extern void rs6000_expand_atomic_exchange (rtx op[]);
extern void rs6000_expand_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 1d0076c41f..9c24ad516a 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -452,6 +452,7 @@ typedef unsigned char addr_mask_type;
#define RELOAD_REG_PRE_INCDEC 0x10 /* PRE_INC/PRE_DEC valid. */
#define RELOAD_REG_PRE_MODIFY 0x20 /* PRE_MODIFY valid. */
#define RELOAD_REG_AND_M16 0x40 /* AND -16 addressing. */
+#define RELOAD_REG_QUAD_OFFSET 0x80 /* quad offset is limited. */
/* Register type masks based on the type, of valid addressing modes. */
struct rs6000_reg_addr {
@@ -499,6 +500,16 @@ mode_supports_vmx_dform (machine_mode mode)
return ((reg_addr[mode].addr_mask[RELOAD_REG_VMX] & RELOAD_REG_OFFSET) != 0);
}
+/* Return true if we have D-form addressing in VSX registers. This addressing
+ is more limited than normal d-form addressing in that the offset must be
+ aligned on a 16-byte boundary. */
+static inline bool
+mode_supports_vsx_dform_quad (machine_mode mode)
+{
+ return ((reg_addr[mode].addr_mask[RELOAD_REG_ANY] & RELOAD_REG_QUAD_OFFSET)
+ != 0);
+}
+
/* Target cpu costs. */
@@ -1093,16 +1104,16 @@ struct processor_costs power9_cost = {
COSTS_N_INSNS (3), /* mulsi_const */
COSTS_N_INSNS (3), /* mulsi_const9 */
COSTS_N_INSNS (3), /* muldi */
- COSTS_N_INSNS (19), /* divsi */
- COSTS_N_INSNS (35), /* divdi */
+ COSTS_N_INSNS (8), /* divsi */
+ COSTS_N_INSNS (12), /* divdi */
COSTS_N_INSNS (3), /* fp */
COSTS_N_INSNS (3), /* dmul */
- COSTS_N_INSNS (14), /* sdiv */
- COSTS_N_INSNS (17), /* ddiv */
+ COSTS_N_INSNS (13), /* sdiv */
+ COSTS_N_INSNS (18), /* ddiv */
128, /* cache line size */
32, /* l1 cache */
- 256, /* l2 cache */
- 12, /* prefetch streams */
+ 512, /* l2 cache */
+ 8, /* prefetch streams */
COSTS_N_INSNS (3), /* SF->DF convert */
};
@@ -1128,6 +1139,7 @@ struct processor_costs ppca2_cost = {
/* Table that classifies rs6000 builtin functions (pure, const, etc.). */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -1140,6 +1152,9 @@ struct processor_costs ppca2_cost = {
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE) \
+ { NAME, ICODE, MASK, ATTR },
+
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE) \
{ NAME, ICODE, MASK, ATTR },
@@ -1185,6 +1200,7 @@ static const struct rs6000_builtin_info_type rs6000_builtin_info[] =
#include "rs6000-builtin.def"
};
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -1312,6 +1328,7 @@ static bool rs6000_secondary_reload_move (enum rs6000_reg_type,
bool);
rtl_opt_pass *make_pass_analyze_swaps (gcc::context*);
static bool rs6000_keep_leaf_when_profiled () __attribute__ ((unused));
+static tree rs6000_fold_builtin (tree, int, tree *, bool);
/* Hash table stuff for keeping track of TOC entries. */
@@ -1586,6 +1603,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL rs6000_builtin_decl
+#undef TARGET_FOLD_BUILTIN
+#define TARGET_FOLD_BUILTIN rs6000_fold_builtin
+
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN rs6000_expand_builtin
@@ -1866,7 +1886,7 @@ rs6000_hard_regno_nregs_internal (int regno, machine_mode mode)
128-bit floating point that can go in vector registers, which has VSX
memory addressing. */
if (FP_REGNO_P (regno))
- reg_size = (VECTOR_MEM_VSX_P (mode)
+ reg_size = (VECTOR_MEM_VSX_P (mode) || FLOAT128_VECTOR_P (mode)
? UNITS_PER_VSX_WORD
: UNITS_PER_FP_WORD);
@@ -1898,6 +1918,9 @@ rs6000_hard_regno_mode_ok (int regno, machine_mode mode)
{
int last_regno = regno + rs6000_hard_regno_nregs[mode][regno] - 1;
+ if (COMPLEX_MODE_P (mode))
+ mode = GET_MODE_INNER (mode);
+
/* PTImode can only go in GPRs. Quad word memory operations require even/odd
register combinations, and use PTImode where we need to deal with quad
word memory operations. Don't allow quad words in the argument or frame
@@ -2105,7 +2128,9 @@ rs6000_debug_addr_mask (addr_mask_type mask, bool keep_spaces)
else if (keep_spaces)
*p++ = ' ';
- if ((mask & RELOAD_REG_OFFSET) != 0)
+ if ((mask & RELOAD_REG_QUAD_OFFSET) != 0)
+ *p++ = 'O';
+ else if ((mask & RELOAD_REG_OFFSET) != 0)
*p++ = 'o';
else if (keep_spaces)
*p++ = ' ';
@@ -2642,8 +2667,7 @@ rs6000_debug_reg_global (void)
if (TARGET_LINK_STACK)
fprintf (stderr, DEBUG_FMT_S, "link_stack", "true");
- if (targetm.lra_p ())
- fprintf (stderr, DEBUG_FMT_S, "lra", "true");
+ fprintf (stderr, DEBUG_FMT_S, "lra", TARGET_LRA ? "true" : "false");
if (TARGET_P8_FUSION)
{
@@ -2699,8 +2723,17 @@ rs6000_setup_reg_addr_masks (void)
for (m = 0; m < NUM_MACHINE_MODES; ++m)
{
- machine_mode m2 = (machine_mode)m;
- unsigned short msize = GET_MODE_SIZE (m2);
+ machine_mode m2 = (machine_mode) m;
+ bool complex_p = false;
+ size_t msize;
+
+ if (COMPLEX_MODE_P (m2))
+ {
+ complex_p = true;
+ m2 = GET_MODE_INNER (m2);
+ }
+
+ msize = GET_MODE_SIZE (m2);
/* SDmode is special in that we want to access it only via REG+REG
addressing on power7 and above, since we want to use the LFIWZX and
@@ -2722,7 +2755,7 @@ rs6000_setup_reg_addr_masks (void)
/* Indicate if the mode takes more than 1 physical register. If
it takes a single register, indicate it can do REG+REG
addressing. */
- if (nregs > 1 || m == BLKmode)
+ if (nregs > 1 || m == BLKmode || complex_p)
addr_mask |= RELOAD_REG_MULTIPLE;
else
addr_mask |= RELOAD_REG_INDEXED;
@@ -2738,7 +2771,7 @@ rs6000_setup_reg_addr_masks (void)
&& msize <= 8
&& !VECTOR_MODE_P (m2)
&& !FLOAT128_VECTOR_P (m2)
- && !COMPLEX_MODE_P (m2)
+ && !complex_p
&& (m2 != DFmode || !TARGET_UPPER_REGS_DF)
&& (m2 != SFmode || !TARGET_UPPER_REGS_SF)
&& !(TARGET_E500_DOUBLE && msize == 8))
@@ -2769,17 +2802,31 @@ rs6000_setup_reg_addr_masks (void)
}
/* GPR and FPR registers can do REG+OFFSET addressing, except
- possibly for SDmode. ISA 3.0 (i.e. power9) adds D-form
- addressing for scalars to altivec registers. */
+ possibly for SDmode. ISA 3.0 (i.e. power9) adds D-form addressing
+ for 64-bit scalars and 32-bit SFmode to altivec registers. */
if ((addr_mask != 0) && !indexed_only_p
&& msize <= 8
&& (rc == RELOAD_REG_GPR
- || rc == RELOAD_REG_FPR
- || (rc == RELOAD_REG_VMX
- && TARGET_P9_DFORM
- && (m2 == DFmode || m2 == SFmode))))
+ || ((msize == 8 || m2 == SFmode)
+ && (rc == RELOAD_REG_FPR
+ || (rc == RELOAD_REG_VMX
+ && TARGET_P9_DFORM_SCALAR)))))
addr_mask |= RELOAD_REG_OFFSET;
+ /* VSX registers can do REG+OFFSET addressing if ISA 3.0
+ instructions are enabled. The offset field for 128-bit VSX registers is
+ only 12 bits. While GPRs can handle the full offset range, VSX
+ registers can only handle the restricted range. */
+ else if ((addr_mask != 0) && !indexed_only_p
+ && msize == 16 && TARGET_P9_DFORM_VECTOR
+ && (ALTIVEC_OR_VSX_VECTOR_MODE (m2)
+ || (m2 == TImode && TARGET_VSX_TIMODE)))
+ {
+ addr_mask |= RELOAD_REG_OFFSET;
+ if (rc == RELOAD_REG_FPR || rc == RELOAD_REG_VMX)
+ addr_mask |= RELOAD_REG_QUAD_OFFSET;
+ }
+
/* VMX registers can do (REG & -16) and ((REG+REG) & -16)
addressing on 128-bit types. */
if (rc == RELOAD_REG_VMX && msize == 16
@@ -3102,7 +3149,7 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
}
/* Support for new D-form instructions. */
- if (TARGET_P9_DFORM)
+ if (TARGET_P9_DFORM_SCALAR)
rs6000_constraints[RS6000_CONSTRAINT_wb] = ALTIVEC_REGS;
/* Support for ISA 3.0 (power9) vectors. */
@@ -3621,11 +3668,16 @@ rs6000_builtin_mask_calculate (void)
| ((TARGET_POPCNTD) ? RS6000_BTM_POPCNTD : 0)
| ((rs6000_cpu == PROCESSOR_CELL) ? RS6000_BTM_CELL : 0)
| ((TARGET_P8_VECTOR) ? RS6000_BTM_P8_VECTOR : 0)
+ | ((TARGET_P9_VECTOR) ? RS6000_BTM_P9_VECTOR : 0)
+ | ((TARGET_P9_MISC) ? RS6000_BTM_P9_MISC : 0)
+ | ((TARGET_MODULO) ? RS6000_BTM_MODULO : 0)
+ | ((TARGET_64BIT) ? RS6000_BTM_64BIT : 0)
| ((TARGET_CRYPTO) ? RS6000_BTM_CRYPTO : 0)
| ((TARGET_HTM) ? RS6000_BTM_HTM : 0)
| ((TARGET_DFP) ? RS6000_BTM_DFP : 0)
| ((TARGET_HARD_FLOAT) ? RS6000_BTM_HARD_FLOAT : 0)
- | ((TARGET_LONG_DOUBLE_128) ? RS6000_BTM_LDBL128 : 0));
+ | ((TARGET_LONG_DOUBLE_128) ? RS6000_BTM_LDBL128 : 0)
+ | ((TARGET_FLOAT128) ? RS6000_BTM_FLOAT128 : 0));
}
/* Implement TARGET_MD_ASM_ADJUST. All asm statements are considered
@@ -3784,22 +3836,7 @@ rs6000_option_override_internal (bool global_init_p)
if (rs6000_tune_index >= 0)
tune_index = rs6000_tune_index;
else if (have_cpu)
- {
- /* Until power9 tuning is available, use power8 tuning if -mcpu=power9. */
- if (processor_target_table[cpu_index].processor != PROCESSOR_POWER9)
- rs6000_tune_index = tune_index = cpu_index;
- else
- {
- size_t i;
- tune_index = -1;
- for (i = 0; i < ARRAY_SIZE (processor_target_table); i++)
- if (processor_target_table[i].processor == PROCESSOR_POWER8)
- {
- rs6000_tune_index = tune_index = i;
- break;
- }
- }
- }
+ rs6000_tune_index = tune_index = cpu_index;
else
{
size_t i;
@@ -3974,7 +4011,8 @@ rs6000_option_override_internal (bool global_init_p)
/* For the newer switches (vsx, dfp, etc.) set some of the older options,
unless the user explicitly used the -mno-<option> to disable the code. */
- if (TARGET_P9_VECTOR || TARGET_MODULO || TARGET_P9_DFORM || TARGET_P9_MINMAX)
+ if (TARGET_P9_VECTOR || TARGET_MODULO || TARGET_P9_DFORM_SCALAR
+ || TARGET_P9_DFORM_VECTOR || TARGET_P9_DFORM_BOTH > 0 || TARGET_P9_MINMAX)
rs6000_isa_flags |= (ISA_3_0_MASKS_SERVER & ~rs6000_isa_flags_explicit);
else if (TARGET_P8_VECTOR || TARGET_DIRECT_MOVE || TARGET_CRYPTO)
rs6000_isa_flags |= (ISA_2_7_MASKS_SERVER & ~rs6000_isa_flags_explicit);
@@ -4188,34 +4226,80 @@ rs6000_option_override_internal (bool global_init_p)
&& !(rs6000_isa_flags_explicit & OPTION_MASK_TOC_FUSION))
rs6000_isa_flags |= OPTION_MASK_TOC_FUSION;
+ /* ISA 3.0 vector instructions include ISA 2.07. */
+ if (TARGET_P9_VECTOR && !TARGET_P8_VECTOR)
+ {
+ if (rs6000_isa_flags_explicit & OPTION_MASK_P8_VECTOR)
+ error ("-mpower9-vector requires -mpower8-vector");
+ rs6000_isa_flags &= ~OPTION_MASK_P9_VECTOR;
+ }
+
+ /* -mpower9-dform turns on both -mpower9-dform-scalar and
+ -mpower9-dform-vector. */
+ if (TARGET_P9_DFORM_BOTH > 0)
+ {
+ if (!(rs6000_isa_flags_explicit & OPTION_MASK_P9_DFORM_VECTOR))
+ rs6000_isa_flags |= OPTION_MASK_P9_DFORM_VECTOR;
+
+ if (!(rs6000_isa_flags_explicit & OPTION_MASK_P9_DFORM_SCALAR))
+ rs6000_isa_flags |= OPTION_MASK_P9_DFORM_SCALAR;
+ }
+ else if (TARGET_P9_DFORM_BOTH == 0)
+ {
+ if (!(rs6000_isa_flags_explicit & OPTION_MASK_P9_DFORM_VECTOR))
+ rs6000_isa_flags &= ~OPTION_MASK_P9_DFORM_VECTOR;
+
+ if (!(rs6000_isa_flags_explicit & OPTION_MASK_P9_DFORM_SCALAR))
+ rs6000_isa_flags &= ~OPTION_MASK_P9_DFORM_SCALAR;
+ }
+
/* ISA 3.0 D-form instructions require p9-vector and upper-regs. */
- if (TARGET_P9_DFORM && !TARGET_P9_VECTOR)
+ if ((TARGET_P9_DFORM_SCALAR || TARGET_P9_DFORM_VECTOR) && !TARGET_P9_VECTOR)
{
if (rs6000_isa_flags_explicit & OPTION_MASK_P9_VECTOR)
error ("-mpower9-dform requires -mpower9-vector");
- rs6000_isa_flags &= ~OPTION_MASK_P9_DFORM;
+ rs6000_isa_flags &= ~(OPTION_MASK_P9_DFORM_SCALAR
+ | OPTION_MASK_P9_DFORM_VECTOR);
}
- if (TARGET_P9_DFORM && !TARGET_UPPER_REGS_DF)
+ if (TARGET_P9_DFORM_SCALAR && !TARGET_UPPER_REGS_DF)
{
if (rs6000_isa_flags_explicit & OPTION_MASK_UPPER_REGS_DF)
error ("-mpower9-dform requires -mupper-regs-df");
- rs6000_isa_flags &= ~OPTION_MASK_P9_DFORM;
+ rs6000_isa_flags &= ~OPTION_MASK_P9_DFORM_SCALAR;
}
- if (TARGET_P9_DFORM && !TARGET_UPPER_REGS_SF)
+ if (TARGET_P9_DFORM_SCALAR && !TARGET_UPPER_REGS_SF)
{
if (rs6000_isa_flags_explicit & OPTION_MASK_UPPER_REGS_SF)
error ("-mpower9-dform requires -mupper-regs-sf");
- rs6000_isa_flags &= ~OPTION_MASK_P9_DFORM;
+ rs6000_isa_flags &= ~OPTION_MASK_P9_DFORM_SCALAR;
}
- /* ISA 3.0 vector instructions include ISA 2.07. */
- if (TARGET_P9_VECTOR && !TARGET_P8_VECTOR)
+ /* There have been bugs with -mvsx-timode that don't show up with -mlra,
+ but do show up with -mno-lra. Given -mlra will become the default once
+ PR 69847 is fixed, turn off the options with problems by default if
+ -mno-lra was used, and warn if the user explicitly asked for the option.
+
+ Enable -mpower9-dform-vector by default if LRA and the other power9
+ options are enabled. Enable -mvsx-timode by default if LRA and VSX are
+ enabled. */
+ if (!TARGET_LRA)
{
- if (rs6000_isa_flags_explicit & OPTION_MASK_P8_VECTOR)
- error ("-mpower9-vector requires -mpower8-vector");
- rs6000_isa_flags &= ~OPTION_MASK_P9_VECTOR;
+ if (TARGET_VSX_TIMODE)
+ {
+ if ((rs6000_isa_flags_explicit & OPTION_MASK_VSX_TIMODE) != 0)
+ warning (0, "-mvsx-timode might need -mlra");
+
+ else
+ rs6000_isa_flags &= ~OPTION_MASK_VSX_TIMODE;
+ }
+ }
+
+ else
+ {
+ if (TARGET_VSX && !TARGET_VSX_TIMODE
+ && (rs6000_isa_flags_explicit & OPTION_MASK_VSX_TIMODE) == 0)
+ rs6000_isa_flags |= OPTION_MASK_VSX_TIMODE;
}
/* Set -mallow-movmisalign to explicitly on if we have full ISA 2.07
@@ -4267,13 +4351,21 @@ rs6000_option_override_internal (bool global_init_p)
rs6000_isa_flags &= ~(OPTION_MASK_FLOAT128 | OPTION_MASK_FLOAT128_HW);
}
+ /* If we have -mfloat128 and full ISA 3.0 support, enable -mfloat128-hardware
+ by default. */
+ if (TARGET_FLOAT128 && !TARGET_FLOAT128_HW
+ && (rs6000_isa_flags & ISA_3_0_MASKS_IEEE) == ISA_3_0_MASKS_IEEE
+ && !(rs6000_isa_flags_explicit & OPTION_MASK_FLOAT128_HW))
+ {
+ rs6000_isa_flags |= OPTION_MASK_FLOAT128_HW;
+ if ((rs6000_isa_flags & OPTION_MASK_FLOAT128) != 0)
+ rs6000_isa_flags_explicit |= OPTION_MASK_FLOAT128_HW;
+ }
+
/* IEEE 128-bit floating point hardware instructions imply enabling
__float128. */
if (TARGET_FLOAT128_HW
- && (rs6000_isa_flags & (OPTION_MASK_P9_VECTOR
- | OPTION_MASK_DIRECT_MOVE
- | OPTION_MASK_UPPER_REGS_DF
- | OPTION_MASK_UPPER_REGS_SF)) == 0)
+ && (rs6000_isa_flags & ISA_3_0_MASKS_IEEE) != ISA_3_0_MASKS_IEEE)
{
if ((rs6000_isa_flags_explicit & OPTION_MASK_FLOAT128_HW) != 0)
error ("-mfloat128-hardware requires full ISA 3.0 support");
@@ -4281,10 +4373,6 @@ rs6000_option_override_internal (bool global_init_p)
rs6000_isa_flags &= ~OPTION_MASK_FLOAT128_HW;
}
- else if (TARGET_P9_VECTOR && !TARGET_FLOAT128_HW
- && (rs6000_isa_flags_explicit & OPTION_MASK_FLOAT128_HW) == 0)
- rs6000_isa_flags |= OPTION_MASK_FLOAT128_HW;
-
if (TARGET_FLOAT128_HW
&& (rs6000_isa_flags_explicit & OPTION_MASK_FLOAT128) == 0)
rs6000_isa_flags |= OPTION_MASK_FLOAT128;
@@ -4494,8 +4582,7 @@ rs6000_option_override_internal (bool global_init_p)
rs6000_sched_groups = (rs6000_cpu == PROCESSOR_POWER4
|| rs6000_cpu == PROCESSOR_POWER5
|| rs6000_cpu == PROCESSOR_POWER7
- || rs6000_cpu == PROCESSOR_POWER8
- || rs6000_cpu == PROCESSOR_POWER9);
+ || rs6000_cpu == PROCESSOR_POWER8);
rs6000_align_branch_targets = (rs6000_cpu == PROCESSOR_POWER4
|| rs6000_cpu == PROCESSOR_POWER5
|| rs6000_cpu == PROCESSOR_POWER6
@@ -5677,8 +5764,8 @@ rs6000_file_start (void)
}
#ifdef USING_ELFOS_H
- if (rs6000_default_cpu == 0 || rs6000_default_cpu[0] == '\0'
- || !global_options_set.x_rs6000_cpu_index)
+ if (!(rs6000_default_cpu && rs6000_default_cpu[0])
+ && !global_options_set.x_rs6000_cpu_index)
{
fputs ("\t.machine ", asm_out_file);
if ((rs6000_isa_flags & OPTION_MASK_MODULO) != 0)
@@ -6134,6 +6221,128 @@ gen_easy_altivec_constant (rtx op)
gcc_unreachable ();
}
+/* Return true if OP is of the given MODE and can be synthesized with ISA 3.0
+ instructions (xxspltib, vupkhsb/vextsb2w/vextsb2d).
+
+ Store the number of instructions needed (1 or 2) in the location pointed
+ to by NUM_INSNS_PTR.
+
+ If NOSPLIT_P, only return true for constants that only generate the XXSPLTIB
+ instruction and can go in any VSX register. If !NOSPLIT_P, only return true
+ for constants that generate XXSPLTIB and need a sign extend operation, which
+ restricts us to the Altivec registers.
+
+ Allow either (const_vector [...]) or (vec_duplicate <const>). If OP is a
+ valid XXSPLTIB constant, return the constant being set via the CONSTANT_PTR
+ pointer. */
+
+bool
+xxspltib_constant_p (rtx op,
+ machine_mode mode,
+ int *num_insns_ptr,
+ int *constant_ptr)
+{
+ size_t nunits = GET_MODE_NUNITS (mode);
+ size_t i;
+ HOST_WIDE_INT value;
+ rtx element;
+
+ /* Set the returned values to out-of-bounds values. */
+ *num_insns_ptr = -1;
+ *constant_ptr = 256;
+
+ if (!TARGET_P9_VECTOR)
+ return false;
+
+ if (mode == VOIDmode)
+ mode = GET_MODE (op);
+
+ else if (mode != GET_MODE (op))
+ return false;
+
+ /* Handle (vec_duplicate <constant>). */
+ if (GET_CODE (op) == VEC_DUPLICATE)
+ {
+ if (mode != V16QImode && mode != V8HImode && mode != V4SImode
+ && mode != V2DImode)
+ return false;
+
+ element = XEXP (op, 0);
+ if (!CONST_INT_P (element))
+ return false;
+
+ value = INTVAL (element);
+ if (!IN_RANGE (value, -128, 127))
+ return false;
+ }
+
+ /* Handle (const_vector [...]). */
+ else if (GET_CODE (op) == CONST_VECTOR)
+ {
+ if (mode != V16QImode && mode != V8HImode && mode != V4SImode
+ && mode != V2DImode)
+ return false;
+
+ element = CONST_VECTOR_ELT (op, 0);
+ if (!CONST_INT_P (element))
+ return false;
+
+ value = INTVAL (element);
+ if (!IN_RANGE (value, -128, 127))
+ return false;
+
+ for (i = 1; i < nunits; i++)
+ {
+ element = CONST_VECTOR_ELT (op, i);
+ if (!CONST_INT_P (element))
+ return false;
+
+ if (value != INTVAL (element))
+ return false;
+ }
+
+ /* See if we could generate vspltisw/vspltish directly instead of
+ xxspltib + sign extend. Special case 0/-1 to allow getting
+ any VSX register instead of an Altivec register. */
+ if (!IN_RANGE (value, -1, 0) && EASY_VECTOR_15 (value)
+ && (mode == V4SImode || mode == V8HImode))
+ return false;
+ }
+
+ /* Handle integer constants being loaded into the upper part of the VSX
+ register as a scalar. If the value isn't 0/-1, only allow it if
+ the mode can go in Altivec registers. */
+ else if (CONST_INT_P (op))
+ {
+ if (!SCALAR_INT_MODE_P (mode))
+ return false;
+
+ value = INTVAL (op);
+ if (!IN_RANGE (value, -128, 127))
+ return false;
+
+ if (!IN_RANGE (value, -1, 0)
+ && (reg_addr[mode].addr_mask[RELOAD_REG_VMX] & RELOAD_REG_VALID) == 0)
+ return false;
+ }
+
+ else
+ return false;
+
+ /* Return # of instructions and the constant byte for XXSPLTIB. */
+ if (mode == V16QImode)
+ *num_insns_ptr = 1;
+
+ else if (IN_RANGE (value, -1, 0))
+ *num_insns_ptr = 1;
+
+ else
+ *num_insns_ptr = 2;
+
+ *constant_ptr = (int) value;
+ return true;
+}
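/* Illustrative sketch (not part of the patch): the core uniformity test done
   above for CONST_VECTORs, restated over a plain array.  Every element must
   equal the first one and the shared value must fit in a signed byte, since
   XXSPLTIB replicates a single 8-bit immediate across the register.  The
   function name and types below are hypothetical.  */
#include <stdbool.h>
#include <stddef.h>

bool
splattable_byte_p (const long *elts, size_t n, int *byte_out)
{
  /* The splat value must be representable as a signed byte.  */
  if (n == 0 || elts[0] < -128 || elts[0] > 127)
    return false;

  /* Every element must be identical to the first.  */
  for (size_t i = 1; i < n; i++)
    if (elts[i] != elts[0])
      return false;

  *byte_out = (int) elts[0];
  return true;
}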
+
const char *
output_vec_const_move (rtx *operands)
{
@@ -6147,23 +6356,60 @@ output_vec_const_move (rtx *operands)
if (TARGET_VSX)
{
+ bool dest_vmx_p = ALTIVEC_REGNO_P (REGNO (dest));
+ int xxspltib_value = 256;
+ int num_insns = -1;
+
if (zero_constant (vec, mode))
- return "xxlxor %x0,%x0,%x0";
+ {
+ if (TARGET_P9_VECTOR)
+ return "xxspltib %x0,0";
+
+ else if (dest_vmx_p)
+ return "vspltisw %0,0";
+
+ else
+ return "xxlxor %x0,%x0,%x0";
+ }
+
+ if (all_ones_constant (vec, mode))
+ {
+ if (TARGET_P9_VECTOR)
+ return "xxspltib %x0,255";
- if (TARGET_P8_VECTOR && vec == CONSTM1_RTX (mode))
- return "xxlorc %x0,%x0,%x0";
+ else if (dest_vmx_p)
+ return "vspltisw %0,-1";
- if ((mode == V2DImode || mode == V1TImode)
- && INTVAL (CONST_VECTOR_ELT (vec, 0)) == -1
- && INTVAL (CONST_VECTOR_ELT (vec, 1)) == -1)
- return (TARGET_P8_VECTOR) ? "xxlorc %x0,%x0,%x0" : "vspltisw %0,-1";
+ else if (TARGET_P8_VECTOR)
+ return "xxlorc %x0,%x0,%x0";
+
+ else
+ gcc_unreachable ();
+ }
+
+ if (TARGET_P9_VECTOR
+ && xxspltib_constant_p (vec, mode, &num_insns, &xxspltib_value))
+ {
+ if (num_insns == 1)
+ {
+ operands[2] = GEN_INT (xxspltib_value & 0xff);
+ return "xxspltib %x0,%2";
+ }
+
+ return "#";
+ }
}
if (TARGET_ALTIVEC)
{
rtx splat_vec;
+
+ gcc_assert (ALTIVEC_REGNO_P (REGNO (dest)));
if (zero_constant (vec, mode))
- return "vxor %0,%0,%0";
+ return "vspltisw %0,0";
+
+ if (all_ones_constant (vec, mode))
+ return "vspltisw %0,-1";
/* Do we need to construct a value using VSLDOI? */
shift = vspltis_shifted (vec);
@@ -6436,6 +6682,15 @@ rs6000_expand_vector_init (rtx target, rtx vals)
return;
}
+ /* Word values on ISA 3.0 can use mtvsrws, lxvwsx, or vspltisw. V4SF is
+ complicated since scalars are stored as doubles in the registers. */
+ if (TARGET_P9_VECTOR && mode == V4SImode && all_same
+ && VECTOR_MEM_VSX_P (mode))
+ {
+ emit_insn (gen_vsx_splat_v4si (target, XVECEXP (vals, 0, 0)));
+ return;
+ }
+
/* With single precision floating point on VSX, know that internally single
precision is actually represented as a double, and either make 2 V2DF
vectors, and convert these vectors to single precision, or do one
@@ -6444,14 +6699,23 @@ rs6000_expand_vector_init (rtx target, rtx vals)
{
if (all_same)
{
- rtx freg = gen_reg_rtx (V4SFmode);
- rtx sreg = force_reg (SFmode, XVECEXP (vals, 0, 0));
- rtx cvt = ((TARGET_XSCVDPSPN)
- ? gen_vsx_xscvdpspn_scalar (freg, sreg)
- : gen_vsx_xscvdpsp_scalar (freg, sreg));
+ rtx op0 = XVECEXP (vals, 0, 0);
+
+ if (TARGET_P9_VECTOR)
+ emit_insn (gen_vsx_splat_v4sf (target, op0));
- emit_insn (cvt);
- emit_insn (gen_vsx_xxspltw_v4sf_direct (target, freg, const0_rtx));
+ else
+ {
+ rtx freg = gen_reg_rtx (V4SFmode);
+ rtx sreg = force_reg (SFmode, op0);
+ rtx cvt = (TARGET_XSCVDPSPN
+ ? gen_vsx_xscvdpspn_scalar (freg, sreg)
+ : gen_vsx_xscvdpsp_scalar (freg, sreg));
+
+ emit_insn (cvt);
+ emit_insn (gen_vsx_xxspltw_v4sf_direct (target, freg,
+ const0_rtx));
+ }
}
else
{
@@ -6572,21 +6836,29 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt)
gen_rtvec (3, target, reg,
force_reg (V16QImode, x)),
UNSPEC_VPERM);
- else
+ else
{
- /* Invert selector. We prefer to generate VNAND on P8 so
- that future fusion opportunities can kick in, but must
- generate VNOR elsewhere. */
- rtx notx = gen_rtx_NOT (V16QImode, force_reg (V16QImode, x));
- rtx iorx = (TARGET_P8_VECTOR
- ? gen_rtx_IOR (V16QImode, notx, notx)
- : gen_rtx_AND (V16QImode, notx, notx));
- rtx tmp = gen_reg_rtx (V16QImode);
- emit_insn (gen_rtx_SET (tmp, iorx));
+ if (TARGET_P9_VECTOR)
+ x = gen_rtx_UNSPEC (mode,
+ gen_rtvec (3, target, reg,
+ force_reg (V16QImode, x)),
+ UNSPEC_VPERMR);
+ else
+ {
+ /* Invert selector. We prefer to generate VNAND on P8 so
+ that future fusion opportunities can kick in, but must
+ generate VNOR elsewhere. */
+ rtx notx = gen_rtx_NOT (V16QImode, force_reg (V16QImode, x));
+ rtx iorx = (TARGET_P8_VECTOR
+ ? gen_rtx_IOR (V16QImode, notx, notx)
+ : gen_rtx_AND (V16QImode, notx, notx));
+ rtx tmp = gen_reg_rtx (V16QImode);
+ emit_insn (gen_rtx_SET (tmp, iorx));
- /* Permute with operands reversed and adjusted selector. */
- x = gen_rtx_UNSPEC (mode, gen_rtvec (3, reg, target, tmp),
- UNSPEC_VPERM);
+ /* Permute with operands reversed and adjusted selector. */
+ x = gen_rtx_UNSPEC (mode, gen_rtvec (3, reg, target, tmp),
+ UNSPEC_VPERM);
+ }
}
emit_insn (gen_rtx_SET (target, x));
@@ -6902,6 +7174,49 @@ direct_move_p (rtx op0, rtx op1)
return false;
}
+/* Return true if the OFFSET is valid for the quad address instructions that
+ use d-form (register + offset) addressing. */
+
+static inline bool
+quad_address_offset_p (HOST_WIDE_INT offset)
+{
+ return (IN_RANGE (offset, -32768, 32767) && ((offset) & 0xf) == 0);
+}
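/* Illustrative sketch (not part of the patch): the same predicate written as
   a self-contained function.  A quad (DQ-form) offset must fit in a signed
   16-bit field and have its four least significant bits clear, i.e. be a
   multiple of 16.  The name quad_offset_ok is hypothetical.  */
#include <stdbool.h>

bool
quad_offset_ok (long offset)
{
  return offset >= -32768 && offset <= 32767 && (offset & 0xf) == 0;
}

/* quad_offset_ok (32) and quad_offset_ok (-16384) hold;
   quad_offset_ok (20) and quad_offset_ok (40000) do not.  */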
+
+/* Return true if the ADDR is an acceptable address for a quad memory
+ operation of mode MODE (either LQ/STQ for general purpose registers, or
+ LXV/STXV for vector registers under ISA 3.0). STRICT says whether strict
+ register checking of the base register should be done. */
+
+bool
+quad_address_p (rtx addr, machine_mode mode, bool strict)
+{
+ rtx op0, op1;
+
+ if (GET_MODE_SIZE (mode) != 16)
+ return false;
+
+ if (legitimate_indirect_address_p (addr, strict))
+ return true;
+
+ if (VECTOR_MODE_P (mode) && !mode_supports_vsx_dform_quad (mode))
+ return false;
+
+ if (GET_CODE (addr) != PLUS)
+ return false;
+
+ op0 = XEXP (addr, 0);
+ if (!REG_P (op0) || !INT_REG_OK_FOR_BASE_P (op0, strict))
+ return false;
+
+ op1 = XEXP (addr, 1);
+ if (!CONST_INT_P (op1))
+ return false;
+
+ return quad_address_offset_p (INTVAL (op1));
+}
+
/* Return true if this is a load or store quad operation. This function does
not handle the atomic quad memory instructions. */
@@ -7005,6 +7320,39 @@ mem_operand_gpr (rtx op, machine_mode mode)
return offset + 0x8000 < 0x10000u - extra;
}
+
+/* As above, but for DS-FORM VSX insns. Unlike mem_operand_gpr,
+ enforce an offset divisible by 4 even for 32-bit. */
+
+bool
+mem_operand_ds_form (rtx op, machine_mode mode)
+{
+ unsigned HOST_WIDE_INT offset;
+ int extra;
+ rtx addr = XEXP (op, 0);
+
+ if (!offsettable_address_p (false, mode, addr))
+ return false;
+
+ op = address_offset (addr);
+ if (op == NULL_RTX)
+ return true;
+
+ offset = INTVAL (op);
+ if ((offset & 3) != 0)
+ return false;
+
+ extra = GET_MODE_SIZE (mode) - UNITS_PER_WORD;
+ if (extra < 0)
+ extra = 0;
+
+ if (GET_CODE (addr) == LO_SUM)
+ /* For lo_sum addresses, we must allow any offset except one that
+ causes a wrap, so test only the low 16 bits. */
+ offset = ((offset & 0xffff) ^ 0x8000) - 0x8000;
+
+ return offset + 0x8000 < 0x10000u - extra;
+}
/* Subroutines of rs6000_legitimize_address and rs6000_legitimate_address_p. */
@@ -7023,13 +7371,14 @@ reg_offset_addressing_ok_p (machine_mode mode)
case TImode:
case TFmode:
case KFmode:
- /* AltiVec/VSX vector modes. Only reg+reg addressing is valid. While
- TImode is not a vector mode, if we want to use the VSX registers to
- move it around, we need to restrict ourselves to reg+reg addressing.
- Similarly for IEEE 128-bit floating point that is passed in a single
- vector register. */
+ /* AltiVec/VSX vector modes. Only reg+reg addressing was valid until the
+ ISA 3.0 vector d-form addressing mode was added. While TImode is not
+ a vector mode, if we want to use the VSX registers to move it around,
+ we need to restrict ourselves to reg+reg addressing. Similarly for
+ IEEE 128-bit floating point that is passed in a single vector
+ register. */
if (VECTOR_MEM_ALTIVEC_OR_VSX_P (mode))
- return false;
+ return mode_supports_vsx_dform_quad (mode);
break;
case V4HImode:
@@ -7096,6 +7445,11 @@ offsettable_ok_by_alignment (rtx op, HOST_WIDE_INT offset,
if (GET_CODE (op) != SYMBOL_REF)
return false;
+ /* ISA 3.0 vector d-form addressing is restricted, don't allow
+ SYMBOL_REF. */
+ if (mode_supports_vsx_dform_quad (mode))
+ return false;
+
dsize = GET_MODE_SIZE (mode);
decl = SYMBOL_REF_DECL (op);
if (!decl)
@@ -7250,6 +7604,8 @@ rs6000_legitimate_offset_address_p (machine_mode mode, rtx x,
return false;
if (!INT_REG_OK_FOR_BASE_P (XEXP (x, 0), strict))
return false;
+ if (mode_supports_vsx_dform_quad (mode))
+ return quad_address_p (x, mode, strict);
if (!reg_offset_addressing_ok_p (mode))
return virtual_stack_registers_memory_p (x);
if (legitimate_constant_pool_address_p (x, mode, strict || lra_in_progress))
@@ -7388,6 +7744,9 @@ legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict)
return false;
if (!INT_REG_OK_FOR_BASE_P (XEXP (x, 0), strict))
return false;
+ /* Quad word addresses are restricted, and we can't use LO_SUM. */
+ if (mode_supports_vsx_dform_quad (mode))
+ return false;
/* Restrict addressing for DI because of our SUBREG hackery. */
if (TARGET_E500_DOUBLE && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
return false;
@@ -7399,7 +7758,7 @@ legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict)
if (DEFAULT_ABI == ABI_V4 && flag_pic)
return false;
- /* LRA don't use LEGITIMIZE_RELOAD_ADDRESS as it usually calls
+ /* LRA doesn't use LEGITIMIZE_RELOAD_ADDRESS as it usually calls
push_reload from reload pass code. LEGITIMIZE_RELOAD_ADDRESS
recognizes some LO_SUM addresses as valid although this
function says opposite. In most cases, LRA through different
@@ -7453,7 +7812,8 @@ rs6000_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
{
unsigned int extra;
- if (!reg_offset_addressing_ok_p (mode))
+ if (!reg_offset_addressing_ok_p (mode)
+ || mode_supports_vsx_dform_quad (mode))
{
if (virtual_stack_registers_memory_p (x))
return x;
@@ -8148,13 +8508,18 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
int ind_levels ATTRIBUTE_UNUSED, int *win)
{
bool reg_offset_p = reg_offset_addressing_ok_p (mode);
+ bool quad_offset_p = mode_supports_vsx_dform_quad (mode);
- /* Nasty hack for vsx_splat_V2DF/V2DI load from mem, which takes a
- DFmode/DImode MEM. */
+ /* Nasty hack for vsx_splat_v2df/v2di load from mem, which takes a
+ DFmode/DImode MEM. Ditto for ISA 3.0 vsx_splat_v4sf/v4si. */
if (reg_offset_p
&& opnum == 1
&& ((mode == DFmode && recog_data.operand_mode[0] == V2DFmode)
- || (mode == DImode && recog_data.operand_mode[0] == V2DImode)))
+ || (mode == DImode && recog_data.operand_mode[0] == V2DImode)
+ || (mode == SFmode && recog_data.operand_mode[0] == V4SFmode
+ && TARGET_P9_VECTOR)
+ || (mode == SImode && recog_data.operand_mode[0] == V4SImode
+ && TARGET_P9_VECTOR)))
reg_offset_p = false;
/* We must recognize output that we have already generated ourselves. */
@@ -8164,6 +8529,11 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
&& GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT
&& GET_CODE (XEXP (x, 1)) == CONST_INT)
{
+ if (TARGET_DEBUG_ADDR)
+ {
+ fprintf (stderr, "\nlegitimize_reload_address push_reload #1:\n");
+ debug_rtx (x);
+ }
push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
opnum, (enum reload_type) type);
@@ -8175,6 +8545,11 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
if (GET_CODE (x) == LO_SUM
&& GET_CODE (XEXP (x, 0)) == HIGH)
{
+ if (TARGET_DEBUG_ADDR)
+ {
+ fprintf (stderr, "\nlegitimize_reload_address push_reload #2:\n");
+ debug_rtx (x);
+ }
push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
BASE_REG_CLASS, Pmode, VOIDmode, 0, 0,
opnum, (enum reload_type) type);
@@ -8203,10 +8578,16 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
if (TARGET_CMODEL != CMODEL_SMALL
&& reg_offset_p
+ && !quad_offset_p
&& small_toc_ref (x, VOIDmode))
{
rtx hi = gen_rtx_HIGH (Pmode, copy_rtx (x));
x = gen_rtx_LO_SUM (Pmode, hi, x);
+ if (TARGET_DEBUG_ADDR)
+ {
+ fprintf (stderr, "\nlegitimize_reload_address push_reload #3:\n");
+ debug_rtx (x);
+ }
push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
BASE_REG_CLASS, Pmode, VOIDmode, 0, 0,
opnum, (enum reload_type) type);
@@ -8215,22 +8596,24 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
}
if (GET_CODE (x) == PLUS
- && GET_CODE (XEXP (x, 0)) == REG
+ && REG_P (XEXP (x, 0))
&& REGNO (XEXP (x, 0)) < FIRST_PSEUDO_REGISTER
&& INT_REG_OK_FOR_BASE_P (XEXP (x, 0), 1)
- && GET_CODE (XEXP (x, 1)) == CONST_INT
+ && CONST_INT_P (XEXP (x, 1))
&& reg_offset_p
&& !SPE_VECTOR_MODE (mode)
&& !(TARGET_E500_DOUBLE && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
- && (!VECTOR_MODE_P (mode) || VECTOR_MEM_NONE_P (mode)))
+ && (quad_offset_p || !VECTOR_MODE_P (mode) || VECTOR_MEM_NONE_P (mode)))
{
HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
HOST_WIDE_INT low = ((val & 0xffff) ^ 0x8000) - 0x8000;
HOST_WIDE_INT high
= (((val - low) & 0xffffffff) ^ 0x80000000) - 0x80000000;
- /* Check for 32-bit overflow. */
- if (high + low != val)
+ /* Check for 32-bit overflow or quad addresses with one of the
+ four least significant bits set. */
+ if (high + low != val
+ || (quad_offset_p && (low & 0xf)))
{
*win = 0;
return x;
@@ -8244,6 +8627,11 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
GEN_INT (high)),
GEN_INT (low));
+ if (TARGET_DEBUG_ADDR)
+ {
+ fprintf (stderr, "\nlegitimize_reload_address push_reload #4:\n");
+ debug_rtx (x);
+ }
push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
opnum, (enum reload_type) type);
@@ -8253,6 +8641,7 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
if (GET_CODE (x) == SYMBOL_REF
&& reg_offset_p
+ && !quad_offset_p
&& (!VECTOR_MODE_P (mode) || VECTOR_MEM_NONE_P (mode))
&& !SPE_VECTOR_MODE (mode)
#if TARGET_MACHO
@@ -8304,6 +8693,11 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
x = gen_rtx_LO_SUM (GET_MODE (x),
gen_rtx_HIGH (Pmode, x), x);
+ if (TARGET_DEBUG_ADDR)
+ {
+ fprintf (stderr, "\nlegitimize_reload_address push_reload #5:\n");
+ debug_rtx (x);
+ }
push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
BASE_REG_CLASS, Pmode, VOIDmode, 0, 0,
opnum, (enum reload_type) type);
@@ -8332,14 +8726,22 @@ rs6000_legitimize_reload_address (rtx x, machine_mode mode,
if (TARGET_TOC
&& reg_offset_p
+ && !quad_offset_p
&& GET_CODE (x) == SYMBOL_REF
&& use_toc_relative_ref (x, mode))
{
x = create_TOC_reference (x, NULL_RTX);
if (TARGET_CMODEL != CMODEL_SMALL)
- push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
- BASE_REG_CLASS, Pmode, VOIDmode, 0, 0,
- opnum, (enum reload_type) type);
+ {
+ if (TARGET_DEBUG_ADDR)
+ {
+ fprintf (stderr, "\nlegitimize_reload_address push_reload #6:\n");
+ debug_rtx (x);
+ }
+ push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
+ BASE_REG_CLASS, Pmode, VOIDmode, 0, 0,
+ opnum, (enum reload_type) type);
+ }
*win = 1;
return x;
}
@@ -8395,6 +8797,7 @@ static bool
rs6000_legitimate_address_p (machine_mode mode, rtx x, bool reg_ok_strict)
{
bool reg_offset_p = reg_offset_addressing_ok_p (mode);
+ bool quad_offset_p = mode_supports_vsx_dform_quad (mode);
/* If this is an unaligned stvx/ldvx type address, discard the outer AND. */
if (VECTOR_MEM_ALTIVEC_P (mode)
@@ -8412,17 +8815,27 @@ rs6000_legitimate_address_p (machine_mode mode, rtx x, bool reg_ok_strict)
&& mode_supports_pre_incdec_p (mode)
&& legitimate_indirect_address_p (XEXP (x, 0), reg_ok_strict))
return 1;
- if (virtual_stack_registers_memory_p (x))
- return 1;
- if (reg_offset_p && legitimate_small_data_p (mode, x))
+ /* Handle restricted vector d-form offsets in ISA 3.0. */
+ if (quad_offset_p)
+ {
+ if (quad_address_p (x, mode, reg_ok_strict))
+ return 1;
+ }
+ else if (virtual_stack_registers_memory_p (x))
return 1;
- if (reg_offset_p
- && legitimate_constant_pool_address_p (x, mode,
+
+ else if (reg_offset_p)
+ {
+ if (legitimate_small_data_p (mode, x))
+ return 1;
+ if (legitimate_constant_pool_address_p (x, mode,
reg_ok_strict || lra_in_progress))
- return 1;
- if (reg_offset_p && reg_addr[mode].fused_toc && GET_CODE (x) == UNSPEC
- && XINT (x, 1) == UNSPEC_FUSION_ADDIS)
- return 1;
+ return 1;
+ if (reg_addr[mode].fused_toc && GET_CODE (x) == UNSPEC
+ && XINT (x, 1) == UNSPEC_FUSION_ADDIS)
+ return 1;
+ }
+
/* For TImode, if we have load/store quad and TImode in VSX registers, only
allow register indirect addresses. This will allow the values to go in
either GPRs or VSX registers without reloading. The vector types would
@@ -8461,7 +8874,8 @@ rs6000_legitimate_address_p (machine_mode mode, rtx x, bool reg_ok_strict)
&& legitimate_indexed_address_p (XEXP (x, 1), reg_ok_strict)))
&& rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0)))
return 1;
- if (reg_offset_p && legitimate_lo_sum_address_p (mode, x, reg_ok_strict))
+ if (reg_offset_p && !quad_offset_p
+ && legitimate_lo_sum_address_p (mode, x, reg_ok_strict))
return 1;
return 0;
}
@@ -10044,6 +10458,35 @@ rs6000_must_pass_in_stack (machine_mode mode, const_tree type)
return must_pass_in_stack_var_size_or_pad (mode, type);
}
+static inline bool
+is_complex_IBM_long_double (machine_mode mode)
+{
+ return mode == ICmode || (!TARGET_IEEEQUAD && mode == TCmode);
+}
+
+/* Whether ABI_V4 passes MODE args to a function in floating point
+ registers. */
+
+static bool
+abi_v4_pass_in_fpr (machine_mode mode)
+{
+ if (!TARGET_FPRS || !TARGET_HARD_FLOAT)
+ return false;
+ if (TARGET_SINGLE_FLOAT && mode == SFmode)
+ return true;
+ if (TARGET_DOUBLE_FLOAT && mode == DFmode)
+ return true;
+ /* ABI_V4 passes complex IBM long double in 8 gprs.
+ Stupid, but we can't change the ABI now. */
+ if (is_complex_IBM_long_double (mode))
+ return false;
+ if (FLOAT128_2REG_P (mode))
+ return true;
+ if (DECIMAL_FLOAT_MODE_P (mode))
+ return true;
+ return false;
+}
+
/* If defined, a C expression which determines whether, and in which
direction, to pad out an argument with extra space. The value
should be of type `enum direction': either `upward' to pad above
@@ -10128,6 +10571,7 @@ rs6000_function_arg_boundary (machine_mode mode, const_tree type)
&& (GET_MODE_SIZE (mode) == 8
|| (TARGET_HARD_FLOAT
&& TARGET_FPRS
+ && !is_complex_IBM_long_double (mode)
&& FLOAT128_2REG_P (mode))))
return 64;
else if (FLOAT128_VECTOR_P (mode))
@@ -10507,11 +10951,7 @@ rs6000_function_arg_advance_1 (CUMULATIVE_ARGS *cum, machine_mode mode,
}
else if (DEFAULT_ABI == ABI_V4)
{
- if (TARGET_HARD_FLOAT && TARGET_FPRS
- && ((TARGET_SINGLE_FLOAT && mode == SFmode)
- || (TARGET_DOUBLE_FLOAT && mode == DFmode)
- || FLOAT128_2REG_P (mode)
- || DECIMAL_FLOAT_MODE_P (mode)))
+ if (abi_v4_pass_in_fpr (mode))
{
/* _Decimal128 must use an even/odd register pair. This assumes
that the register number is odd when fregno is odd. */
@@ -11168,11 +11608,7 @@ rs6000_function_arg (cumulative_args_t cum_v, machine_mode mode,
else if (abi == ABI_V4)
{
- if (TARGET_HARD_FLOAT && TARGET_FPRS
- && ((TARGET_SINGLE_FLOAT && mode == SFmode)
- || (TARGET_DOUBLE_FLOAT && mode == DFmode)
- || FLOAT128_2REG_P (mode)
- || DECIMAL_FLOAT_MODE_P (mode)))
+ if (abi_v4_pass_in_fpr (mode))
{
/* _Decimal128 must use an even/odd register pair. This assumes
that the register number is odd when fregno is odd. */
@@ -12093,19 +12529,15 @@ rs6000_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
rsize = (size + 3) / 4;
align = 1;
- if (TARGET_HARD_FLOAT && TARGET_FPRS
- && ((TARGET_SINGLE_FLOAT && TYPE_MODE (type) == SFmode)
- || (TARGET_DOUBLE_FLOAT
- && (TYPE_MODE (type) == DFmode
- || FLOAT128_2REG_P (TYPE_MODE (type))
- || DECIMAL_FLOAT_MODE_P (TYPE_MODE (type))))))
+ machine_mode mode = TYPE_MODE (type);
+ if (abi_v4_pass_in_fpr (mode))
{
/* FP args go in FP registers, if present. */
reg = fpr;
n_reg = (size + 7) / 8;
sav_ofs = ((TARGET_HARD_FLOAT && TARGET_DOUBLE_FLOAT) ? 8 : 4) * 4;
sav_scale = ((TARGET_HARD_FLOAT && TARGET_DOUBLE_FLOAT) ? 8 : 4);
- if (TYPE_MODE (type) != SFmode && TYPE_MODE (type) != SDmode)
+ if (mode != SFmode && mode != SDmode)
align = 8;
}
else
@@ -12125,7 +12557,7 @@ rs6000_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
addr = create_tmp_var (ptr_type_node, "addr");
/* AltiVec vectors never go in registers when -mabi=altivec. */
- if (TARGET_ALTIVEC_ABI && ALTIVEC_VECTOR_MODE (TYPE_MODE (type)))
+ if (TARGET_ALTIVEC_ABI && ALTIVEC_VECTOR_MODE (mode))
align = 16;
else
{
@@ -12146,7 +12578,7 @@ rs6000_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
}
/* _Decimal128 is passed in even/odd fpr pairs; the stored
reg number is 0 for f1, so we want to make it odd. */
- else if (reg == fpr && TYPE_MODE (type) == TDmode)
+ else if (reg == fpr && mode == TDmode)
{
t = build2 (BIT_IOR_EXPR, TREE_TYPE (reg), unshare_expr (reg),
build_int_cst (TREE_TYPE (reg), 1));
@@ -12173,7 +12605,7 @@ rs6000_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
FP register for 32-bit binaries. */
if (TARGET_32BIT
&& TARGET_HARD_FLOAT && TARGET_FPRS
- && TYPE_MODE (type) == SDmode)
+ && mode == SDmode)
t = fold_build_pointer_plus_hwi (t, size);
gimplify_assign (addr, t, pre_p);
@@ -12260,7 +12692,7 @@ def_builtin (const char *name, tree type, enum rs6000_builtins code)
/* const function, function only depends on the inputs. */
TREE_READONLY (t) = 1;
TREE_NOTHROW (t) = 1;
- attr_string = ", pure";
+ attr_string = ", const";
}
else if ((classify & RS6000_BTC_PURE) != 0)
{
@@ -12268,7 +12700,7 @@ def_builtin (const char *name, tree type, enum rs6000_builtins code)
external state. */
DECL_PURE_P (t) = 1;
TREE_NOTHROW (t) = 1;
- attr_string = ", const";
+ attr_string = ", pure";
}
else if ((classify & RS6000_BTC_FP) != 0)
{
@@ -12300,6 +12732,7 @@ def_builtin (const char *name, tree type, enum rs6000_builtins code)
/* Simple ternary operations: VECd = foo (VECa, VECb, VECc). */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12312,6 +12745,7 @@ def_builtin (const char *name, tree type, enum rs6000_builtins code)
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE) \
@@ -12333,6 +12767,7 @@ static const struct builtin_description bdesc_3arg[] =
/* DST operations: void foo (void *, const int, const char). */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12345,6 +12780,7 @@ static const struct builtin_description bdesc_3arg[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
@@ -12366,6 +12802,7 @@ static const struct builtin_description bdesc_dst[] =
/* Simple binary operations: VECc = foo (VECa, VECb). */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12378,6 +12815,7 @@ static const struct builtin_description bdesc_dst[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE) \
{ MASK, ICODE, NAME, ENUM },
@@ -12397,6 +12835,7 @@ static const struct builtin_description bdesc_2arg[] =
#include "rs6000-builtin.def"
};
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12409,6 +12848,7 @@ static const struct builtin_description bdesc_2arg[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
@@ -12431,6 +12871,7 @@ static const struct builtin_description bdesc_altivec_preds[] =
};
/* SPE predicates. */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12443,6 +12884,7 @@ static const struct builtin_description bdesc_altivec_preds[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
@@ -12463,6 +12905,7 @@ static const struct builtin_description bdesc_spe_predicates[] =
};
/* SPE evsel predicates. */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12475,6 +12918,7 @@ static const struct builtin_description bdesc_spe_predicates[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
@@ -12495,6 +12939,7 @@ static const struct builtin_description bdesc_spe_evsel[] =
};
/* PAIRED predicates. */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12507,6 +12952,7 @@ static const struct builtin_description bdesc_spe_evsel[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
@@ -12528,6 +12974,7 @@ static const struct builtin_description bdesc_paired_preds[] =
/* ABS* operations. */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12540,6 +12987,7 @@ static const struct builtin_description bdesc_paired_preds[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
@@ -12562,6 +13010,7 @@ static const struct builtin_description bdesc_abs[] =
/* Simple unary operations: VECb = foo (unsigned literal) or VECb =
foo (VECa). */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12574,6 +13023,7 @@ static const struct builtin_description bdesc_abs[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE) \
{ MASK, ICODE, NAME, ENUM },
@@ -12593,7 +13043,43 @@ static const struct builtin_description bdesc_1arg[] =
#include "rs6000-builtin.def"
};
+/* Simple no-argument operations: result = __builtin_darn_32 (). */
+
+#undef RS6000_BUILTIN_0
+#undef RS6000_BUILTIN_1
+#undef RS6000_BUILTIN_2
+#undef RS6000_BUILTIN_3
+#undef RS6000_BUILTIN_A
+#undef RS6000_BUILTIN_D
+#undef RS6000_BUILTIN_E
+#undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_P
+#undef RS6000_BUILTIN_Q
+#undef RS6000_BUILTIN_S
+#undef RS6000_BUILTIN_X
+
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE) \
+ { MASK, ICODE, NAME, ENUM },
+
+#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_E(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_Q(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_S(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
+
+static const struct builtin_description bdesc_0arg[] =
+{
+#include "rs6000-builtin.def"
+};
+
/* HTM builtins. */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12606,6 +13092,7 @@ static const struct builtin_description bdesc_1arg[] =
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
@@ -12625,6 +13112,7 @@ static const struct builtin_description bdesc_htm[] =
#include "rs6000-builtin.def"
};
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -12845,6 +13333,24 @@ rs6000_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
return const0_rtx;
}
}
+ else if (icode == CODE_FOR_dfptstsfi_eq_dd
+ || icode == CODE_FOR_dfptstsfi_lt_dd
+ || icode == CODE_FOR_dfptstsfi_gt_dd
+ || icode == CODE_FOR_dfptstsfi_unordered_dd
+ || icode == CODE_FOR_dfptstsfi_eq_td
+ || icode == CODE_FOR_dfptstsfi_lt_td
+ || icode == CODE_FOR_dfptstsfi_gt_td
+ || icode == CODE_FOR_dfptstsfi_unordered_td)
+ {
+ /* Only allow 6-bit unsigned literals. */
+ STRIP_NOPS (arg0);
+ if (TREE_CODE (arg0) != INTEGER_CST
+ || !IN_RANGE (TREE_INT_CST_LOW (arg0), 0, 63))
+ {
+ error ("argument 1 must be a 6-bit unsigned literal");
+ return CONST0_RTX (tmode);
+ }
+ }
if (target == 0
|| GET_MODE (target) != tmode
@@ -13768,6 +14274,7 @@ altivec_expand_ld_builtin (tree exp, rtx target, bool *expandedp)
break;
case ALTIVEC_BUILTIN_LD_INTERNAL_2di:
icode = CODE_FOR_vector_altivec_load_v2di;
+ break;
case ALTIVEC_BUILTIN_LD_INTERNAL_1ti:
icode = CODE_FOR_vector_altivec_load_v1ti;
break;
@@ -13829,6 +14336,7 @@ altivec_expand_st_builtin (tree exp, rtx target ATTRIBUTE_UNUSED,
break;
case ALTIVEC_BUILTIN_ST_INTERNAL_2di:
icode = CODE_FOR_vector_altivec_store_v2di;
+ break;
case ALTIVEC_BUILTIN_ST_INTERNAL_1ti:
icode = CODE_FOR_vector_altivec_store_v1ti;
break;
@@ -14129,6 +14637,47 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp)
case VSX_BUILTIN_STXVW4X_V16QI:
return altivec_expand_stv_builtin (CODE_FOR_vsx_store_v16qi, exp);
+ /* For the following on big endian, it's ok to use any appropriate
+ unaligned-supporting store, so use a generic expander. For
+ little-endian, the exact element-reversing instruction must
+ be used. */
+ case VSX_BUILTIN_ST_ELEMREV_V2DF:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2df
+ : CODE_FOR_vsx_st_elemrev_v2df);
+ return altivec_expand_stv_builtin (code, exp);
+ }
+ case VSX_BUILTIN_ST_ELEMREV_V2DI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2di
+ : CODE_FOR_vsx_st_elemrev_v2di);
+ return altivec_expand_stv_builtin (code, exp);
+ }
+ case VSX_BUILTIN_ST_ELEMREV_V4SF:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4sf
+ : CODE_FOR_vsx_st_elemrev_v4sf);
+ return altivec_expand_stv_builtin (code, exp);
+ }
+ case VSX_BUILTIN_ST_ELEMREV_V4SI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4si
+ : CODE_FOR_vsx_st_elemrev_v4si);
+ return altivec_expand_stv_builtin (code, exp);
+ }
+ case VSX_BUILTIN_ST_ELEMREV_V8HI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v8hi
+ : CODE_FOR_vsx_st_elemrev_v8hi);
+ return altivec_expand_stv_builtin (code, exp);
+ }
+ case VSX_BUILTIN_ST_ELEMREV_V16QI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v16qi
+ : CODE_FOR_vsx_st_elemrev_v16qi);
+ return altivec_expand_stv_builtin (code, exp);
+ }
+
case ALTIVEC_BUILTIN_MFVSCR:
icode = CODE_FOR_altivec_mfvscr;
tmode = insn_data[icode].operand[0].mode;
@@ -14323,6 +14872,46 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp)
case VSX_BUILTIN_LXVW4X_V16QI:
return altivec_expand_lv_builtin (CODE_FOR_vsx_load_v16qi,
exp, target, false);
+ /* For the following on big endian, it's ok to use any appropriate
+ unaligned-supporting load, so use a generic expander. For
+ little-endian, the exact element-reversing instruction must
+ be used. */
+ case VSX_BUILTIN_LD_ELEMREV_V2DF:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2df
+ : CODE_FOR_vsx_ld_elemrev_v2df);
+ return altivec_expand_lv_builtin (code, exp, target, false);
+ }
+ case VSX_BUILTIN_LD_ELEMREV_V2DI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2di
+ : CODE_FOR_vsx_ld_elemrev_v2di);
+ return altivec_expand_lv_builtin (code, exp, target, false);
+ }
+ case VSX_BUILTIN_LD_ELEMREV_V4SF:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4sf
+ : CODE_FOR_vsx_ld_elemrev_v4sf);
+ return altivec_expand_lv_builtin (code, exp, target, false);
+ }
+ case VSX_BUILTIN_LD_ELEMREV_V4SI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4si
+ : CODE_FOR_vsx_ld_elemrev_v4si);
+ return altivec_expand_lv_builtin (code, exp, target, false);
+ }
+ case VSX_BUILTIN_LD_ELEMREV_V8HI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v8hi
+ : CODE_FOR_vsx_ld_elemrev_v8hi);
+ return altivec_expand_lv_builtin (code, exp, target, false);
+ }
+ case VSX_BUILTIN_LD_ELEMREV_V16QI:
+ {
+ enum insn_code code = (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v16qi
+ : CODE_FOR_vsx_ld_elemrev_v16qi);
+ return altivec_expand_lv_builtin (code, exp, target, false);
+ }
break;
default:
break;
@@ -14792,17 +15381,71 @@ rs6000_invalid_builtin (enum rs6000_builtins fncode)
error ("Builtin function %s requires the -mhard-dfp option", name);
else if ((fnmask & RS6000_BTM_P8_VECTOR) != 0)
error ("Builtin function %s requires the -mpower8-vector option", name);
+ else if ((fnmask & RS6000_BTM_P9_VECTOR) != 0)
+ error ("Builtin function %s requires the -mpower9-vector option", name);
+ else if ((fnmask & (RS6000_BTM_P9_MISC | RS6000_BTM_64BIT))
+ == (RS6000_BTM_P9_MISC | RS6000_BTM_64BIT))
+ error ("Builtin function %s requires the -mpower9-misc and"
+ " -m64 options", name);
+ else if ((fnmask & RS6000_BTM_P9_MISC) == RS6000_BTM_P9_MISC)
+ error ("Builtin function %s requires the -mpower9-misc option", name);
else if ((fnmask & (RS6000_BTM_HARD_FLOAT | RS6000_BTM_LDBL128))
== (RS6000_BTM_HARD_FLOAT | RS6000_BTM_LDBL128))
error ("Builtin function %s requires the -mhard-float and"
" -mlong-double-128 options", name);
else if ((fnmask & RS6000_BTM_HARD_FLOAT) != 0)
error ("Builtin function %s requires the -mhard-float option", name);
+ else if ((fnmask & RS6000_BTM_FLOAT128) != 0)
+ error ("Builtin function %s requires the -mfloat128 option", name);
else
error ("Builtin function %s is not supported with the current options",
name);
}
+/* Target hook for early folding of built-ins, shamelessly stolen
+ from ia64.c. */
+
+static tree
+rs6000_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED,
+ tree *args, bool ignore ATTRIBUTE_UNUSED)
+{
+ if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+ {
+ enum rs6000_builtins fn_code
+ = (enum rs6000_builtins) DECL_FUNCTION_CODE (fndecl);
+ switch (fn_code)
+ {
+ case RS6000_BUILTIN_NANQ:
+ case RS6000_BUILTIN_NANSQ:
+ {
+ tree type = TREE_TYPE (TREE_TYPE (fndecl));
+ const char *str = c_getstr (*args);
+ int quiet = fn_code == RS6000_BUILTIN_NANQ;
+ REAL_VALUE_TYPE real;
+
+ if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
+ return build_real (type, real);
+ return NULL_TREE;
+ }
+ case RS6000_BUILTIN_INFQ:
+ case RS6000_BUILTIN_HUGE_VALQ:
+ {
+ tree type = TREE_TYPE (TREE_TYPE (fndecl));
+ REAL_VALUE_TYPE inf;
+ real_inf (&inf);
+ return build_real (type, inf);
+ }
+ default:
+ break;
+ }
+ }
+#ifdef SUBTARGET_FOLD_BUILTIN
+ return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
+#else
+ return NULL_TREE;
+#endif
+}
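/* Usage note (not part of the patch): with this hook installed, calls such as

     __float128 x = __builtin_infq ();
     __float128 y = __builtin_nanq ("");

   can be folded to constants before expansion, provided the operand of
   __builtin_nanq is a string literal that c_getstr can see.  */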
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
@@ -14990,9 +15633,11 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
}
unsigned attr = rs6000_builtin_info[uns_fcode].attr & RS6000_BTC_TYPE_MASK;
+ /* RS6000_BTC_SPECIAL represents no-operand operators. */
gcc_assert (attr == RS6000_BTC_UNARY
|| attr == RS6000_BTC_BINARY
- || attr == RS6000_BTC_TERNARY);
+ || attr == RS6000_BTC_TERNARY
+ || attr == RS6000_BTC_SPECIAL);
/* Handle simple unary operations. */
d = bdesc_1arg;
@@ -15012,6 +15657,12 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
if (d->code == fcode)
return rs6000_expand_ternop_builtin (d->icode, exp, target);
+ /* Handle simple no-argument operations. */
+ d = bdesc_0arg;
+ for (i = 0; i < ARRAY_SIZE (bdesc_0arg); i++, d++)
+ if (d->code == fcode)
+ return rs6000_expand_zeroop_builtin (d->icode, target);
+
gcc_unreachable ();
}
@@ -15049,6 +15700,10 @@ rs6000_init_builtins (void)
opaque_p_V2SI_type_node = build_pointer_type (opaque_V2SI_type_node);
opaque_V4SI_type_node = build_opaque_vector_type (intSI_type_node, 4);
+ const_str_type_node
+ = build_pointer_type (build_qualified_type (char_type_node,
+ TYPE_QUAL_CONST));
+
/* We use V1TI mode as a special container to hold __int128_t items that
must live in VSX registers. */
if (intTI_type_node)
@@ -15111,6 +15766,12 @@ rs6000_init_builtins (void)
lang_hooks.types.register_builtin_type (ibm128_float_type_node,
"__ibm128");
}
+ else
+ {
+ /* All types must be nonzero, or self-test barfs during bootstrap. */
+ ieee128_float_type_node = long_double_type_node;
+ ibm128_float_type_node = long_double_type_node;
+ }
/* Initialize the modes for builtin_function_type, mapping a machine mode to
tree type node. */
@@ -15252,6 +15913,15 @@ rs6000_init_builtins (void)
if (TARGET_EXTRA_BUILTINS || TARGET_SPE || TARGET_PAIRED_FLOAT)
rs6000_common_init_builtins ();
+ ftype = build_function_type_list (ieee128_float_type_node,
+ const_str_type_node, NULL_TREE);
+ def_builtin ("__builtin_nanq", ftype, RS6000_BUILTIN_NANQ);
+ def_builtin ("__builtin_nansq", ftype, RS6000_BUILTIN_NANSQ);
+
+ ftype = build_function_type_list (ieee128_float_type_node, NULL_TREE);
+ def_builtin ("__builtin_infq", ftype, RS6000_BUILTIN_INFQ);
+ def_builtin ("__builtin_huge_valq", ftype, RS6000_BUILTIN_HUGE_VALQ);
+
ftype = builtin_function_type (DFmode, DFmode, DFmode, VOIDmode,
RS6000_BUILTIN_RECIP, "__builtin_recipdiv");
def_builtin ("__builtin_recipdiv", ftype, RS6000_BUILTIN_RECIP);
@@ -15816,10 +16486,44 @@ altivec_init_builtins (void)
VSX_BUILTIN_STXVW4X_V8HI);
def_builtin ("__builtin_vsx_stxvw4x_v16qi", void_ftype_v16qi_long_pvoid,
VSX_BUILTIN_STXVW4X_V16QI);
+
+ def_builtin ("__builtin_vsx_ld_elemrev_v2df", v2df_ftype_long_pcvoid,
+ VSX_BUILTIN_LD_ELEMREV_V2DF);
+ def_builtin ("__builtin_vsx_ld_elemrev_v2di", v2di_ftype_long_pcvoid,
+ VSX_BUILTIN_LD_ELEMREV_V2DI);
+ def_builtin ("__builtin_vsx_ld_elemrev_v4sf", v4sf_ftype_long_pcvoid,
+ VSX_BUILTIN_LD_ELEMREV_V4SF);
+ def_builtin ("__builtin_vsx_ld_elemrev_v4si", v4si_ftype_long_pcvoid,
+ VSX_BUILTIN_LD_ELEMREV_V4SI);
+ def_builtin ("__builtin_vsx_st_elemrev_v2df", void_ftype_v2df_long_pvoid,
+ VSX_BUILTIN_ST_ELEMREV_V2DF);
+ def_builtin ("__builtin_vsx_st_elemrev_v2di", void_ftype_v2di_long_pvoid,
+ VSX_BUILTIN_ST_ELEMREV_V2DI);
+ def_builtin ("__builtin_vsx_st_elemrev_v4sf", void_ftype_v4sf_long_pvoid,
+ VSX_BUILTIN_ST_ELEMREV_V4SF);
+ def_builtin ("__builtin_vsx_st_elemrev_v4si", void_ftype_v4si_long_pvoid,
+ VSX_BUILTIN_ST_ELEMREV_V4SI);
+
+ if (TARGET_P9_VECTOR)
+ {
+ def_builtin ("__builtin_vsx_ld_elemrev_v8hi", v8hi_ftype_long_pcvoid,
+ VSX_BUILTIN_LD_ELEMREV_V8HI);
+ def_builtin ("__builtin_vsx_ld_elemrev_v16qi", v16qi_ftype_long_pcvoid,
+ VSX_BUILTIN_LD_ELEMREV_V16QI);
+ def_builtin ("__builtin_vsx_st_elemrev_v8hi",
+ void_ftype_v8hi_long_pvoid, VSX_BUILTIN_ST_ELEMREV_V8HI);
+ def_builtin ("__builtin_vsx_st_elemrev_v16qi",
+ void_ftype_v16qi_long_pvoid, VSX_BUILTIN_ST_ELEMREV_V16QI);
+ }
+
def_builtin ("__builtin_vec_vsx_ld", opaque_ftype_long_pcvoid,
VSX_BUILTIN_VEC_LD);
def_builtin ("__builtin_vec_vsx_st", void_ftype_opaque_long_pvoid,
VSX_BUILTIN_VEC_ST);
+ def_builtin ("__builtin_vec_xl", opaque_ftype_long_pcvoid,
+ VSX_BUILTIN_VEC_XL);
+ def_builtin ("__builtin_vec_xst", void_ftype_opaque_long_pvoid,
+ VSX_BUILTIN_VEC_XST);
def_builtin ("__builtin_vec_step", int_ftype_opaque, ALTIVEC_BUILTIN_VEC_STEP);
def_builtin ("__builtin_vec_splats", opaque_ftype_opaque, ALTIVEC_BUILTIN_VEC_SPLATS);
@@ -16351,10 +17055,6 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0,
while (num_args > 0 && h.mode[num_args] == VOIDmode)
num_args--;
- if (num_args == 0)
- fatal_error (input_location,
- "internal error: builtin function %s had no type", name);
-
ret_type = builtin_mode_to_type[h.mode[0]][h.uns_p[0]];
if (!ret_type && h.uns_p[0])
ret_type = builtin_mode_to_type[h.mode[0]][0];
@@ -16406,6 +17106,7 @@ rs6000_common_init_builtins (void)
tree opaque_ftype_opaque = NULL_TREE;
tree opaque_ftype_opaque_opaque = NULL_TREE;
tree opaque_ftype_opaque_opaque_opaque = NULL_TREE;
+ tree v2si_ftype = NULL_TREE;
tree v2si_ftype_qi = NULL_TREE;
tree v2si_ftype_v2si_qi = NULL_TREE;
tree v2si_ftype_int_qi = NULL_TREE;
@@ -16622,6 +17323,64 @@ rs6000_common_init_builtins (void)
def_builtin (d->name, type, d->code);
}
+
+ /* Add the simple no-argument operators. */
+ d = bdesc_0arg;
+ for (i = 0; i < ARRAY_SIZE (bdesc_0arg); i++, d++)
+ {
+ machine_mode mode0;
+ tree type;
+ HOST_WIDE_INT mask = d->mask;
+
+ if ((mask & builtin_mask) != mask)
+ {
+ if (TARGET_DEBUG_BUILTIN)
+ fprintf (stderr, "rs6000_builtin, skip no-argument %s\n", d->name);
+ continue;
+ }
+ if (rs6000_overloaded_builtin_p (d->code))
+ {
+ if (!opaque_ftype_opaque)
+ opaque_ftype_opaque
+ = build_function_type_list (opaque_V4SI_type_node, NULL_TREE);
+ type = opaque_ftype_opaque;
+ }
+ else
+ {
+ enum insn_code icode = d->icode;
+ if (d->name == 0)
+ {
+ if (TARGET_DEBUG_BUILTIN)
+ fprintf (stderr, "rs6000_builtin, bdesc_0arg[%lu] no name\n",
+ (long unsigned) i);
+ continue;
+ }
+ if (icode == CODE_FOR_nothing)
+ {
+ if (TARGET_DEBUG_BUILTIN)
+ fprintf (stderr,
+ "rs6000_builtin, skip no-argument %s (no code)\n",
+ d->name);
+ continue;
+ }
+ mode0 = insn_data[icode].operand[0].mode;
+ if (mode0 == V2SImode)
+ {
+ /* Code for SPE. */
+ if (! (type = v2si_ftype))
+ {
+ v2si_ftype
+ = build_function_type_list (opaque_V2SI_type_node,
+ NULL_TREE);
+ type = v2si_ftype;
+ }
+ }
+ else
+ type = builtin_function_type (mode0, VOIDmode, VOIDmode, VOIDmode,
+ d->code, d->name);
+ }
+ def_builtin (d->name, type, d->code);
+ }
}
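/* Usage note (not part of the patch): a built-in registered through
   bdesc_0arg takes no arguments, e.g.

     int r = __builtin_darn_32 ();

   using the name quoted in the comment that introduces bdesc_0arg above.  */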
/* Set up AIX/Darwin/64-bit Linux quad floating point routines. */
@@ -18006,25 +18765,33 @@ rs6000_secondary_reload_memory (rtx addr,
addr_mask = (reg_addr[mode].addr_mask[RELOAD_REG_VMX]
& ~RELOAD_REG_AND_M16);
- else
+ /* If the register allocator hasn't made up its mind yet on the register
+ class to use, settle on sensible defaults. */
+ else if (rclass == NO_REGS)
{
- if (TARGET_DEBUG_ADDR)
- fprintf (stderr,
- "rs6000_secondary_reload_memory: mode = %s, class = %s, "
- "class is not GPR, FPR, VMX\n",
- GET_MODE_NAME (mode), reg_class_names[rclass]);
+ addr_mask = (reg_addr[mode].addr_mask[RELOAD_REG_ANY]
+ & ~RELOAD_REG_AND_M16);
- return -1;
+ if ((addr_mask & RELOAD_REG_MULTIPLE) != 0)
+ addr_mask &= ~(RELOAD_REG_INDEXED
+ | RELOAD_REG_PRE_INCDEC
+ | RELOAD_REG_PRE_MODIFY);
}
+ else
+ addr_mask = 0;
+
/* If the register isn't valid in this register class, just return now. */
if ((addr_mask & RELOAD_REG_VALID) == 0)
{
if (TARGET_DEBUG_ADDR)
- fprintf (stderr,
- "rs6000_secondary_reload_memory: mode = %s, class = %s, "
- "not valid in class\n",
- GET_MODE_NAME (mode), reg_class_names[rclass]);
+ {
+ fprintf (stderr,
+ "rs6000_secondary_reload_memory: mode = %s, class = %s, "
+ "not valid in class\n",
+ GET_MODE_NAME (mode), reg_class_names[rclass]);
+ debug_rtx (addr);
+ }
return -1;
}
@@ -18152,13 +18919,23 @@ rs6000_secondary_reload_memory (rtx addr,
}
}
+ else if ((addr_mask & RELOAD_REG_QUAD_OFFSET) != 0
+ && CONST_INT_P (plus_arg1))
+ {
+ if (!quad_address_offset_p (INTVAL (plus_arg1)))
+ {
+ extra_cost = 1;
+ type = "vector d-form offset";
+ }
+ }
+
/* Make sure the register class can handle offset addresses. */
else if (rs6000_legitimate_offset_address_p (mode, addr, false, true))
{
if ((addr_mask & RELOAD_REG_OFFSET) == 0)
{
extra_cost = 1;
- type = "offset";
+ type = "offset #2";
}
}
@@ -18171,7 +18948,14 @@ rs6000_secondary_reload_memory (rtx addr,
break;
case LO_SUM:
- if (!legitimate_lo_sum_address_p (mode, addr, false))
+ /* Quad offsets are restricted and can't handle normal addresses. */
+ if ((addr_mask & RELOAD_REG_QUAD_OFFSET) != 0)
+ {
+ extra_cost = -1;
+ type = "vector d-form lo_sum";
+ }
+
+ else if (!legitimate_lo_sum_address_p (mode, addr, false))
{
fail_msg = "bad LO_SUM";
extra_cost = -1;
@@ -18188,8 +18972,17 @@ rs6000_secondary_reload_memory (rtx addr,
case CONST:
case SYMBOL_REF:
case LABEL_REF:
- type = "address";
- extra_cost = rs6000_secondary_reload_toc_costs (addr_mask);
+ if ((addr_mask & RELOAD_REG_QUAD_OFFSET) != 0)
+ {
+ extra_cost = -1;
+ type = "vector d-form lo_sum #2";
+ }
+
+ else
+ {
+ type = "address";
+ extra_cost = rs6000_secondary_reload_toc_costs (addr_mask);
+ }
break;
/* TOC references look like offsetable memory. */
@@ -18200,6 +18993,12 @@ rs6000_secondary_reload_memory (rtx addr,
extra_cost = -1;
}
+ else if ((addr_mask & RELOAD_REG_QUAD_OFFSET) != 0)
+ {
+ extra_cost = -1;
+ type = "vector d-form lo_sum #3";
+ }
+
else if ((addr_mask & RELOAD_REG_OFFSET) == 0)
{
extra_cost = 1;
@@ -18256,7 +19055,8 @@ rs6000_secondary_reload_simple_move (enum rs6000_reg_type to_type,
simple move insns are issued. At present, 32-bit integers are not allowed
in FPR/VSX registers. Single precision binary floating is not a simple
move because we need to convert to the single precision memory layout.
- The 4-byte SDmode can be moved. */
+ The 4-byte SDmode can be moved. TDmode values are disallowed since they
+ need special direct move handling, which we do not support yet. */
size = GET_MODE_SIZE (mode);
if (TARGET_DIRECT_MOVE
&& ((mode == SDmode) || (TARGET_POWERPC64 && size == 8))
@@ -18264,7 +19064,7 @@ rs6000_secondary_reload_simple_move (enum rs6000_reg_type to_type,
|| (to_type == VSX_REG_TYPE && from_type == GPR_REG_TYPE)))
return true;
- else if (TARGET_DIRECT_MOVE_128 && size == 16
+ else if (TARGET_DIRECT_MOVE_128 && size == 16 && mode != TDmode
&& ((to_type == VSX_REG_TYPE && from_type == GPR_REG_TYPE)
|| (to_type == GPR_REG_TYPE && from_type == VSX_REG_TYPE)))
return true;
@@ -18460,6 +19260,7 @@ rs6000_secondary_reload (bool in_p,
&& MEM_P (SUBREG_REG (x))));
sri->icode = CODE_FOR_nothing;
+ sri->t_icode = CODE_FOR_nothing;
sri->extra_cost = 0;
icode = ((in_p)
? reg_addr[mode].reload_load
@@ -18653,6 +19454,9 @@ rs6000_secondary_reload (bool in_p,
fprintf (stderr, ", reload func = %s, extra cost = %d",
insn_data[sri->icode].name, sri->extra_cost);
+ else if (sri->extra_cost > 0)
+ fprintf (stderr, ", extra cost = %d", sri->extra_cost);
+
fputs ("\n", stderr);
debug_rtx (x);
}
@@ -18827,6 +19631,16 @@ rs6000_secondary_reload_inner (rtx reg, rtx mem, rtx scratch, bool store_p)
}
}
+ else if (mode_supports_vsx_dform_quad (mode) && CONST_INT_P (op1))
+ {
+ if (((addr_mask & RELOAD_REG_QUAD_OFFSET) == 0)
+ || !quad_address_p (addr, mode, false))
+ {
+ emit_insn (gen_rtx_SET (scratch, addr));
+ new_addr = scratch;
+ }
+ }
+
/* Make sure the register class can handle offset addresses. */
else if (rs6000_legitimate_offset_address_p (mode, addr, false, true))
{
@@ -18857,6 +19671,13 @@ rs6000_secondary_reload_inner (rtx reg, rtx mem, rtx scratch, bool store_p)
}
}
+ /* Quad offsets are restricted and can't handle normal addresses. */
+ else if (mode_supports_vsx_dform_quad (mode))
+ {
+ emit_insn (gen_rtx_SET (scratch, addr));
+ new_addr = scratch;
+ }
+
/* Make sure the register class can handle offset addresses. */
else if (legitimate_lo_sum_address_p (mode, addr, false))
{
@@ -19046,6 +19867,16 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
machine_mode mode = GET_MODE (x);
bool is_constant = CONSTANT_P (x);
+ /* If a mode can't go in FPR/ALTIVEC/VSX registers, don't return a preferred
+ reload class for it. */
+ if ((rclass == ALTIVEC_REGS || rclass == VSX_REGS)
+ && (reg_addr[mode].addr_mask[RELOAD_REG_VMX] & RELOAD_REG_VALID) == 0)
+ return NO_REGS;
+
+ if ((rclass == FLOAT_REGS || rclass == VSX_REGS)
+ && (reg_addr[mode].addr_mask[RELOAD_REG_FPR] & RELOAD_REG_VALID) == 0)
+ return NO_REGS;
+
/* For VSX, see if we should prefer FLOAT_REGS or ALTIVEC_REGS. Do not allow
the reloading of address expressions using PLUS into floating point
registers. */
@@ -19067,7 +19898,8 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
}
/* D-form addressing can easily reload the value. */
- if (mode_supports_vmx_dform (mode))
+ if (mode_supports_vmx_dform (mode)
+ || mode_supports_vsx_dform_quad (mode))
return rclass;
/* If this is a scalar floating point value and we don't have D-form
@@ -19095,6 +19927,25 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
return NO_REGS;
}
+ /* If we haven't picked a register class, and the type is a vector or
+ floating point type, prefer to use the VSX, FPR, or Altivec register
+ classes. */
+ if (rclass == NO_REGS)
+ {
+ if (TARGET_VSX && VECTOR_MEM_VSX_OR_P8_VECTOR_P (mode))
+ return VSX_REGS;
+
+ if (TARGET_ALTIVEC && VECTOR_MEM_ALTIVEC_P (mode))
+ return ALTIVEC_REGS;
+
+ if (DECIMAL_FLOAT_MODE_P (mode))
+ return TARGET_DFP ? FLOAT_REGS : NO_REGS;
+
+ if (TARGET_FPRS && TARGET_HARD_FLOAT && FLOAT_MODE_P (mode)
+ && (reg_addr[mode].addr_mask[RELOAD_REG_FPR] & RELOAD_REG_VALID) == 0)
+ return FLOAT_REGS;
+ }
+
if (GET_MODE_CLASS (mode) == MODE_INT && rclass == NON_SPECIAL_REGS)
return GENERAL_REGS;
@@ -19483,8 +20334,16 @@ rs6000_output_move_128bit (rtx operands[])
else if (TARGET_VSX && dest_vsx_p)
{
- if (mode == V16QImode || mode == V8HImode || mode == V4SImode)
+ if (mode_supports_vsx_dform_quad (mode)
+ && quad_address_p (XEXP (src, 0), mode, true))
+ return "lxv %x0,%1";
+
+ else if (TARGET_P9_VECTOR)
+ return "lxvx %x0,%y1";
+
+ else if (mode == V16QImode || mode == V8HImode || mode == V4SImode)
return "lxvw4x %x0,%y1";
+
else
return "lxvd2x %x0,%y1";
}
@@ -19513,8 +20372,16 @@ rs6000_output_move_128bit (rtx operands[])
else if (TARGET_VSX && src_vsx_p)
{
- if (mode == V16QImode || mode == V8HImode || mode == V4SImode)
+ if (mode_supports_vsx_dform_quad (mode)
+ && quad_address_p (XEXP (dest, 0), mode, true))
+ return "stxv %x1,%0";
+
+ else if (TARGET_P9_VECTOR)
+ return "stxvx %x1,%y0";
+
+ else if (mode == V16QImode || mode == V8HImode || mode == V4SImode)
return "stxvw4x %x1,%y0";
+
else
return "stxvd2x %x1,%y0";
}
@@ -19536,10 +20403,8 @@ rs6000_output_move_128bit (rtx operands[])
if (dest_gpr_p)
return "#";
- else if (TARGET_VSX && dest_vsx_p && zero_constant (src, mode))
- return "xxlxor %x0,%x0,%x0";
-
- else if (TARGET_ALTIVEC && dest_vmx_p)
+ else if ((dest_vmx_p && TARGET_ALTIVEC)
+ || (dest_vsx_p && TARGET_VSX))
return output_vec_const_move (operands);
}
@@ -20732,8 +21597,8 @@ rs6000_generate_compare (rtx cmp, machine_mode mode)
else if (!TARGET_FLOAT128_HW && FLOAT128_VECTOR_P (mode))
{
rtx libfunc = NULL_RTX;
- bool uneq_or_ltgt = false;
- rtx dest = gen_reg_rtx (SImode);
+ bool check_nan = false;
+ rtx dest;
switch (code)
{
@@ -20760,21 +21625,23 @@ rs6000_generate_compare (rtx cmp, machine_mode mode)
case UNGE:
case UNGT:
- libfunc = optab_libfunc (le_optab, mode);
+ check_nan = true;
+ libfunc = optab_libfunc (ge_optab, mode);
code = (code == UNGE) ? GE : GT;
break;
case UNLE:
case UNLT:
- libfunc = optab_libfunc (ge_optab, mode);
+ check_nan = true;
+ libfunc = optab_libfunc (le_optab, mode);
code = (code == UNLE) ? LE : LT;
break;
case UNEQ:
case LTGT:
- libfunc = optab_libfunc (le_optab, mode);
- uneq_or_ltgt = true;
- code = (code = UNEQ) ? NE : EQ;
+ check_nan = true;
+ libfunc = optab_libfunc (eq_optab, mode);
+ code = (code == UNEQ) ? EQ : NE;
break;
default:
@@ -20782,21 +21649,56 @@ rs6000_generate_compare (rtx cmp, machine_mode mode)
}
gcc_assert (libfunc);
- dest = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST,
- SImode, 2, op0, mode, op1, mode);
- /* If this is UNEQ or LTGT, we call __lekf2, which returns -1 for less
- than, 0 for equal, +1 for greater, and +2 for nan. We add 1, to give
- a value of 0..3, and then do and AND immediate of 1 to isolate whether
- it is 0/Nan (i.e. bottom bit is 0), or less than/greater than
- (i.e. bottom bit is 1). */
- if (uneq_or_ltgt)
+ if (!check_nan)
+ dest = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST,
+ SImode, 2, op0, mode, op1, mode);
+
+ /* The library signals an exception for signalling NaNs, so we need to
+ handle isgreater, etc. by first checking isordered. */
+ else
{
- rtx add_result = gen_reg_rtx (SImode);
- rtx and_result = gen_reg_rtx (SImode);
- emit_insn (gen_addsi3 (add_result, dest, GEN_INT (1)));
- emit_insn (gen_andsi3 (and_result, add_result, GEN_INT (1)));
- dest = and_result;
+ rtx ne_rtx, normal_dest, unord_dest;
+ rtx unord_func = optab_libfunc (unord_optab, mode);
+ rtx join_label = gen_label_rtx ();
+ rtx join_ref = gen_rtx_LABEL_REF (VOIDmode, join_label);
+ rtx unord_cmp = gen_reg_rtx (comp_mode);
+
+
+ /* Test for either value being a NaN. */
+ gcc_assert (unord_func);
+ unord_dest = emit_library_call_value (unord_func, NULL_RTX, LCT_CONST,
+ SImode, 2, op0, mode, op1,
+ mode);
+
+ /* Set value (1) if either value is a NaN, and jump to the join
+ label. */
+ dest = gen_reg_rtx (SImode);
+ emit_move_insn (dest, const1_rtx);
+ emit_insn (gen_rtx_SET (unord_cmp,
+ gen_rtx_COMPARE (comp_mode, unord_dest,
+ const0_rtx)));
+
+ ne_rtx = gen_rtx_NE (comp_mode, unord_cmp, const0_rtx);
+ emit_jump_insn (gen_rtx_SET (pc_rtx,
+ gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
+ join_ref,
+ pc_rtx)));
+
+ /* Do the normal comparison, knowing that the values are not
+ NaNs. */
+ normal_dest = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST,
+ SImode, 2, op0, mode, op1,
+ mode);
+
+ emit_insn (gen_cstoresi4 (dest,
+ gen_rtx_fmt_ee (code, SImode, normal_dest,
+ const0_rtx),
+ normal_dest, const0_rtx));
+
+ /* Join NaN and non-NaN paths. Compare dest against 0. */
+ emit_label (join_label);
+ code = NE;
}
emit_insn (gen_rtx_SET (compare_result,
@@ -21747,6 +22649,101 @@ rs6000_emit_vector_cond_expr (rtx dest, rtx op_true, rtx op_false,
return 1;
}
+/* ISA 3.0 (power9) minmax subcase to emit a XSMAXCDP or XSMINCDP instruction
+ for SF/DF scalars. Move TRUE_COND to DEST if OP of the operands of the last
+ comparison is nonzero/true, FALSE_COND if it is zero/false. Return 0 if the
+ hardware has no such operation. */
+
+static int
+rs6000_emit_p9_fp_minmax (rtx dest, rtx op, rtx true_cond, rtx false_cond)
+{
+ enum rtx_code code = GET_CODE (op);
+ rtx op0 = XEXP (op, 0);
+ rtx op1 = XEXP (op, 1);
+ machine_mode compare_mode = GET_MODE (op0);
+ machine_mode result_mode = GET_MODE (dest);
+ bool max_p = false;
+
+ if (result_mode != compare_mode)
+ return 0;
+
+ if (code == GE || code == GT)
+ max_p = true;
+ else if (code == LE || code == LT)
+ max_p = false;
+ else
+ return 0;
+
+ if (rtx_equal_p (op0, true_cond) && rtx_equal_p (op1, false_cond))
+ ;
+
+ else if (rtx_equal_p (op1, true_cond) && rtx_equal_p (op0, false_cond))
+ max_p = !max_p;
+
+ else
+ return 0;
+
+ rs6000_emit_minmax (dest, max_p ? SMAX : SMIN, op0, op1);
+ return 1;
+}
+
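In source terms, rs6000_emit_p9_fp_minmax matches conditional expressions whose arms are the comparison operands themselves; with -mcpu=power9 these are expected (though not guaranteed) to become xsmaxcdp/xsmincdp. A small illustration:

    /* (a >= b ? a : b) is the max_p case; reversing the arms flips it to min,
       mirroring the rtx_equal_p checks above.  */
    double dmax (double a, double b) { return a >= b ? a : b; }
    double dmin (double a, double b) { return a >= b ? b : a; }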
+/* ISA 3.0 (power9) conditional move subcase to emit XSCMP{EQ,GE,GT,NE}DP and
+ XXSEL instructions for SF/DF scalars. Move TRUE_COND to DEST if OP of the
+ operands of the last comparison is nonzero/true, FALSE_COND if it is
+ zero/false. Return 0 if the hardware has no such operation. */
+
+static int
+rs6000_emit_p9_fp_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond)
+{
+ enum rtx_code code = GET_CODE (op);
+ rtx op0 = XEXP (op, 0);
+ rtx op1 = XEXP (op, 1);
+ machine_mode result_mode = GET_MODE (dest);
+ rtx compare_rtx;
+ rtx cmove_rtx;
+ rtx clobber_rtx;
+
+ if (!can_create_pseudo_p ())
+ return 0;
+
+ switch (code)
+ {
+ case EQ:
+ case GE:
+ case GT:
+ break;
+
+ case NE:
+ case LT:
+ case LE:
+ code = swap_condition (code);
+ std::swap (op0, op1);
+ break;
+
+ default:
+ return 0;
+ }
+
+ /* Generate: [(parallel [(set (dest)
+ (if_then_else (op (cmp1) (cmp2))
+ (true)
+ (false)))
+ (clobber (scratch))])]. */
+
+ compare_rtx = gen_rtx_fmt_ee (code, CCFPmode, op0, op1);
+ cmove_rtx = gen_rtx_SET (dest,
+ gen_rtx_IF_THEN_ELSE (result_mode,
+ compare_rtx,
+ true_cond,
+ false_cond));
+
+ clobber_rtx = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V2DImode));
+ emit_insn (gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (2, cmove_rtx, clobber_rtx)));
+
+ return 1;
+}
+
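The parallel built by rs6000_emit_p9_fp_cmove is later split (see the *mov<SFDF:mode><SFDF2:mode>cc_p9 pattern added to rs6000.md below) into a compare that yields an all-ones/all-zeros V2DI mask plus an xxsel. A sketch of the kind of source that reaches this path and of the conceptual two-step lowering; the mask formulation is an illustration, not the literal RTL:

    /* A full conditional move between two unrelated values.  */
    double cmove (double a, double b, double c, double d)
    {
      return a > b ? c : d;
    }

    /* Conceptually, on power9:
         mask   = (a > b) ? all-ones : all-zeros;   -- xscmpgtdp
         result = mask ? c : d;                     -- xxsel      */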
/* Emit a conditional move: move TRUE_COND to DEST if OP of the
operands of the last comparison is nonzero/true, FALSE_COND if it
is zero/false. Return 0 if the hardware has no such operation. */
@@ -21773,6 +22770,18 @@ rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond)
if (GET_MODE (false_cond) != result_mode)
return 0;
+ /* See if we can use the ISA 3.0 (power9) min/max/compare functions. */
+ if (TARGET_P9_MINMAX
+ && (compare_mode == SFmode || compare_mode == DFmode)
+ && (result_mode == SFmode || result_mode == DFmode))
+ {
+ if (rs6000_emit_p9_fp_minmax (dest, op, true_cond, false_cond))
+ return 1;
+
+ if (rs6000_emit_p9_fp_cmove (dest, op, true_cond, false_cond))
+ return 1;
+ }
+
/* Don't allow using floating point comparisons for integer results for
now. */
if (FLOAT_MODE_P (compare_mode) && !FLOAT_MODE_P (result_mode))
@@ -22034,6 +23043,48 @@ rs6000_emit_minmax (rtx dest, enum rtx_code code, rtx op0, rtx op1)
emit_move_insn (dest, target);
}
+/* Split a signbit operation on 64-bit machines with direct move. Also allow
+ for the value to come from memory or if it is already loaded into a GPR. */
+
+void
+rs6000_split_signbit (rtx dest, rtx src)
+{
+ machine_mode d_mode = GET_MODE (dest);
+ machine_mode s_mode = GET_MODE (src);
+ rtx dest_di = (d_mode == DImode) ? dest : gen_lowpart (DImode, dest);
+ rtx shift_reg = dest_di;
+
+ gcc_assert (REG_P (dest));
+ gcc_assert (REG_P (src) || MEM_P (src));
+ gcc_assert (s_mode == KFmode || s_mode == TFmode);
+
+ if (MEM_P (src))
+ {
+ rtx mem = (WORDS_BIG_ENDIAN
+ ? adjust_address (src, DImode, 0)
+ : adjust_address (src, DImode, 8));
+ emit_insn (gen_rtx_SET (dest_di, mem));
+ }
+
+ else
+ {
+ unsigned int r = REGNO (src);
+
+ /* If this is a VSX register, generate the special mfvsrd instruction
+ to get it in a GPR. Until we support SF and DF modes, that will
+ always be true. */
+ gcc_assert (VSX_REGNO_P (r));
+
+ if (s_mode == KFmode)
+ emit_insn (gen_signbitkf2_dm2 (dest_di, src));
+ else
+ emit_insn (gen_signbittf2_dm2 (dest_di, src));
+ }
+
+ emit_insn (gen_lshrdi3 (dest_di, shift_reg, GEN_INT (63)));
+ return;
+}
+
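The split above reduces signbit to loading the most-significant doubleword of the 128-bit value (directly from memory, or via mfvsrd from a VSX register) and shifting it right by 63. A standalone C sketch of the same computation; the offset choice mirrors the WORDS_BIG_ENDIAN handling in the MEM case:

    #include <stdint.h>
    #include <string.h>

    /* Sketch: the sign is the top bit of the high 64-bit half of the value. */
    static int
    signbit128 (const void *val, int big_endian)
    {
      uint64_t high;
      memcpy (&high, (const char *) val + (big_endian ? 0 : 8), 8);
      return (int) (high >> 63);
    }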
/* A subroutine of the atomic operation splitters. Jump to LABEL if
COND is true. Mark the jump as unlikely to be taken. */
@@ -25949,25 +27000,37 @@ rs6000_emit_prologue (void)
if (info->vrsave_mask & ALTIVEC_REG_BIT (i))
{
rtx areg, savereg, mem;
- int offset;
+ HOST_WIDE_INT offset;
offset = (info->altivec_save_offset + frame_off
+ 16 * (i - info->first_altivec_reg_save));
savereg = gen_rtx_REG (V4SImode, i);
- NOT_INUSE (0);
- areg = gen_rtx_REG (Pmode, 0);
- emit_move_insn (areg, GEN_INT (offset));
-
- /* AltiVec addressing mode is [reg+reg]. */
- mem = gen_frame_mem (V4SImode,
- gen_rtx_PLUS (Pmode, frame_reg_rtx, areg));
+ if (TARGET_P9_DFORM_VECTOR && quad_address_offset_p (offset))
+ {
+ mem = gen_frame_mem (V4SImode,
+ gen_rtx_PLUS (Pmode, frame_reg_rtx,
+ GEN_INT (offset)));
+ insn = emit_insn (gen_rtx_SET (mem, savereg));
+ areg = NULL_RTX;
+ }
+ else
+ {
+ NOT_INUSE (0);
+ areg = gen_rtx_REG (Pmode, 0);
+ emit_move_insn (areg, GEN_INT (offset));
- /* Rather than emitting a generic move, force use of the stvx
- instruction, which we always want. In particular we don't
- want xxpermdi/stxvd2x for little endian. */
- insn = emit_insn (gen_altivec_stvx_v4si_internal (mem, savereg));
+ /* AltiVec addressing mode is [reg+reg]. */
+ mem = gen_frame_mem (V4SImode,
+ gen_rtx_PLUS (Pmode, frame_reg_rtx, areg));
+
+ /* Rather than emitting a generic move, force use of the stvx
+ instruction, which we always want on ISA 2.07 (power8) systems.
+ In particular we don't want xxpermdi/stxvd2x for little
+ endian. */
+ insn = emit_insn (gen_altivec_stvx_v4si_internal (mem, savereg));
+ }
rs6000_frame_related (insn, frame_reg_rtx, sp_off - frame_off,
areg, GEN_INT (offset));
@@ -26687,23 +27750,35 @@ rs6000_emit_epilogue (int sibcall)
for (i = info->first_altivec_reg_save; i <= LAST_ALTIVEC_REGNO; ++i)
if (info->vrsave_mask & ALTIVEC_REG_BIT (i))
{
- rtx addr, areg, mem, reg;
+ rtx addr, areg, mem, insn;
+ rtx reg = gen_rtx_REG (V4SImode, i);
+ HOST_WIDE_INT offset
+ = (info->altivec_save_offset + frame_off
+ + 16 * (i - info->first_altivec_reg_save));
- areg = gen_rtx_REG (Pmode, 0);
- emit_move_insn
- (areg, GEN_INT (info->altivec_save_offset
- + frame_off
- + 16 * (i - info->first_altivec_reg_save)));
+ if (TARGET_P9_DFORM_VECTOR && quad_address_offset_p (offset))
+ {
+ mem = gen_frame_mem (V4SImode,
+ gen_rtx_PLUS (Pmode, frame_reg_rtx,
+ GEN_INT (offset)));
+ insn = gen_rtx_SET (reg, mem);
+ }
+ else
+ {
+ areg = gen_rtx_REG (Pmode, 0);
+ emit_move_insn (areg, GEN_INT (offset));
- /* AltiVec addressing mode is [reg+reg]. */
- addr = gen_rtx_PLUS (Pmode, frame_reg_rtx, areg);
- mem = gen_frame_mem (V4SImode, addr);
-
- reg = gen_rtx_REG (V4SImode, i);
- /* Rather than emitting a generic move, force use of the
- lvx instruction, which we always want. In particular
- we don't want lxvd2x/xxpermdi for little endian. */
- (void) emit_insn (gen_altivec_lvx_v4si_internal (reg, mem));
+ /* AltiVec addressing mode is [reg+reg]. */
+ addr = gen_rtx_PLUS (Pmode, frame_reg_rtx, areg);
+ mem = gen_frame_mem (V4SImode, addr);
+
+ /* Rather than emitting a generic move, force use of the
+ lvx instruction, which we always want. In particular we
+ don't want lxvd2x/xxpermdi for little endian. */
+ insn = gen_altivec_lvx_v4si_internal (reg, mem);
+ }
+
+ (void) emit_insn (insn);
}
}
@@ -26890,23 +27965,35 @@ rs6000_emit_epilogue (int sibcall)
for (i = info->first_altivec_reg_save; i <= LAST_ALTIVEC_REGNO; ++i)
if (info->vrsave_mask & ALTIVEC_REG_BIT (i))
{
- rtx addr, areg, mem, reg;
+ rtx addr, areg, mem, insn;
+ rtx reg = gen_rtx_REG (V4SImode, i);
+ HOST_WIDE_INT offset
+ = (info->altivec_save_offset + frame_off
+ + 16 * (i - info->first_altivec_reg_save));
- areg = gen_rtx_REG (Pmode, 0);
- emit_move_insn
- (areg, GEN_INT (info->altivec_save_offset
- + frame_off
- + 16 * (i - info->first_altivec_reg_save)));
+ if (TARGET_P9_DFORM_VECTOR && quad_address_offset_p (offset))
+ {
+ mem = gen_frame_mem (V4SImode,
+ gen_rtx_PLUS (Pmode, frame_reg_rtx,
+ GEN_INT (offset)));
+ insn = gen_rtx_SET (reg, mem);
+ }
+ else
+ {
+ areg = gen_rtx_REG (Pmode, 0);
+ emit_move_insn (areg, GEN_INT (offset));
- /* AltiVec addressing mode is [reg+reg]. */
- addr = gen_rtx_PLUS (Pmode, frame_reg_rtx, areg);
- mem = gen_frame_mem (V4SImode, addr);
-
- reg = gen_rtx_REG (V4SImode, i);
- /* Rather than emitting a generic move, force use of the
- lvx instruction, which we always want. In particular
- we don't want lxvd2x/xxpermdi for little endian. */
- (void) emit_insn (gen_altivec_lvx_v4si_internal (reg, mem));
+ /* AltiVec addressing mode is [reg+reg]. */
+ addr = gen_rtx_PLUS (Pmode, frame_reg_rtx, areg);
+ mem = gen_frame_mem (V4SImode, addr);
+
+ /* Rather than emitting a generic move, force use of the
+ lvx instruction, which we always want. In particular we
+ don't want lxvd2x/xxpermdi for little endian. */
+ insn = gen_altivec_lvx_v4si_internal (reg, mem);
+ }
+
+ (void) emit_insn (insn);
}
}
@@ -27724,6 +28811,11 @@ rs6000_expand_split_stack_prologue (void)
const0_rtx, const0_rtx));
call_fusage = NULL_RTX;
use_reg (&call_fusage, r12);
+ /* Say the call uses r0, even though it doesn't, to stop regrename
+ from twiddling with the insns saving lr, trashing args for cfun.
+ The insns restoring lr are similarly protected by making
+ split_stack_return use r0. */
+ use_reg (&call_fusage, r0);
add_function_usage_to (insn, call_fusage);
emit_insn (gen_frame_load (r0, r1, info->lr_save_offset));
insn = emit_move_insn (lr, r0);
@@ -28763,13 +29855,20 @@ output_function_profiler (FILE *file, int labelno)
/* The following variable value is the last issued insn. */
-static rtx last_scheduled_insn;
+static rtx_insn *last_scheduled_insn;
/* The following variable helps to balance issuing of load and
store instructions */
static int load_store_pendulum;
+/* The following variable helps pair divide insns during scheduling. */
+static int divide_cnt;
+/* The following variable helps pair and alternate vector and vector load
+ insns during scheduling. */
+static int vec_load_pendulum;
+
+
/* Power4 load update and store update instructions are cracked into a
load or store and an integer insn which are executed in the same cycle.
Branches have their own dispatch slot which does not count against the
@@ -28844,7 +29943,7 @@ rs6000_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
some cycles later. */
/* Separate a load from a narrower, dependent store. */
- if (rs6000_sched_groups
+ if ((rs6000_sched_groups || rs6000_cpu_attr == CPU_POWER9)
&& GET_CODE (PATTERN (insn)) == SET
&& GET_CODE (PATTERN (dep_insn)) == SET
&& GET_CODE (XEXP (PATTERN (insn), 1)) == MEM
@@ -29070,7 +30169,9 @@ rs6000_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
switch (attr_type)
{
case TYPE_FP:
- if (get_attr_type (dep_insn) == TYPE_FP)
+ case TYPE_FPSIMPLE:
+ if (get_attr_type (dep_insn) == TYPE_FP
+ || get_attr_type (dep_insn) == TYPE_FPSIMPLE)
return 1;
break;
case TYPE_FPLOAD:
@@ -29082,6 +30183,8 @@ rs6000_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
break;
}
}
+ /* Fall through, no cost for output dependency. */
+
case REG_DEP_ANTI:
/* Anti dependency; DEP_INSN reads a register that INSN writes some
cycles later. */
@@ -29454,8 +30557,9 @@ rs6000_issue_rate (void)
case CPU_POWER7:
return 5;
case CPU_POWER8:
- case CPU_POWER9:
return 7;
+ case CPU_POWER9:
+ return 6;
default:
return 1;
}
@@ -29613,6 +30717,28 @@ is_store_insn (rtx insn, rtx *str_mem)
return is_store_insn1 (PATTERN (insn), str_mem);
}
+/* Return whether TYPE is a Power9 pairable vector instruction type. */
+
+static bool
+is_power9_pairable_vec_type (enum attr_type type)
+{
+ switch (type)
+ {
+ case TYPE_VECSIMPLE:
+ case TYPE_VECCOMPLEX:
+ case TYPE_VECDIV:
+ case TYPE_VECCMP:
+ case TYPE_VECPERM:
+ case TYPE_VECFLOAT:
+ case TYPE_VECFDIV:
+ case TYPE_VECDOUBLE:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
/* Returns whether the dependence between INSN and NEXT is considered
costly by the given target. */
@@ -29689,6 +30815,229 @@ get_next_active_insn (rtx_insn *insn, rtx_insn *tail)
return insn;
}
+/* Do Power9 specific sched_reorder2 reordering of ready list. */
+
+static int
+power9_sched_reorder2 (rtx_insn **ready, int lastpos)
+{
+ int pos;
+ int i;
+ rtx_insn *tmp;
+ enum attr_type type;
+
+ type = get_attr_type (last_scheduled_insn);
+
+ /* Try to issue fixed point divides back-to-back in pairs so they will be
+ routed to separate execution units and execute in parallel. */
+ if (type == TYPE_DIV && divide_cnt == 0)
+ {
+ /* First divide has been scheduled. */
+ divide_cnt = 1;
+
+ /* Scan the ready list looking for another divide, if found move it
+ to the end of the list so it is chosen next. */
+ pos = lastpos;
+ while (pos >= 0)
+ {
+ if (recog_memoized (ready[pos]) >= 0
+ && get_attr_type (ready[pos]) == TYPE_DIV)
+ {
+ tmp = ready[pos];
+ for (i = pos; i < lastpos; i++)
+ ready[i] = ready[i + 1];
+ ready[lastpos] = tmp;
+ break;
+ }
+ pos--;
+ }
+ }
+ else
+ {
+ /* Last insn was the 2nd divide or not a divide, reset the counter. */
+ divide_cnt = 0;
+
+ /* Power9 can execute 2 vector operations and 2 vector loads in a single
+ cycle. So try to pair up and alternate groups of vector and vector
+ load instructions.
+
+ To aid this formation, a counter is maintained to keep track of
+ vec/vecload insns issued. The value of vec_load_pendulum maintains
+ the current state with the following values:
+
+ 0 : Initial state, no vec/vecload group has been started.
+
+ -1 : 1 vector load has been issued and another has been found on
+ the ready list and moved to the end.
+
+ -2 : 2 vector loads have been issued and a vector operation has
+ been found and moved to the end of the ready list.
+
+ -3 : 2 vector loads and a vector insn have been issued and a
+ vector operation has been found and moved to the end of the
+ ready list.
+
+ 1 : 1 vector insn has been issued and another has been found and
+ moved to the end of the ready list.
+
+ 2 : 2 vector insns have been issued and a vector load has been
+ found and moved to the end of the ready list.
+
+ 3 : 2 vector insns and a vector load have been issued and another
+ vector load has been found and moved to the end of the ready
+ list. */
+ if (type == TYPE_VECLOAD)
+ {
+ /* Issued a vecload. */
+ if (vec_load_pendulum == 0)
+ {
+ /* We issued a single vecload, look for another and move it to
+ the end of the ready list so it will be scheduled next.
+ Set pendulum if found. */
+ pos = lastpos;
+ while (pos >= 0)
+ {
+ if (recog_memoized (ready[pos]) >= 0
+ && get_attr_type (ready[pos]) == TYPE_VECLOAD)
+ {
+ tmp = ready[pos];
+ for (i = pos; i < lastpos; i++)
+ ready[i] = ready[i + 1];
+ ready[lastpos] = tmp;
+ vec_load_pendulum = -1;
+ return cached_can_issue_more;
+ }
+ pos--;
+ }
+ }
+ else if (vec_load_pendulum == -1)
+ {
+ /* This is the second vecload we've issued, search the ready
+ list for a vector operation so we can try to schedule a
+ pair of those next. If found move to the end of the ready
+ list so it is scheduled next and set the pendulum. */
+ pos = lastpos;
+ while (pos >= 0)
+ {
+ if (recog_memoized (ready[pos]) >= 0
+ && is_power9_pairable_vec_type (
+ get_attr_type (ready[pos])))
+ {
+ tmp = ready[pos];
+ for (i = pos; i < lastpos; i++)
+ ready[i] = ready[i + 1];
+ ready[lastpos] = tmp;
+ vec_load_pendulum = -2;
+ return cached_can_issue_more;
+ }
+ pos--;
+ }
+ }
+ else if (vec_load_pendulum == 2)
+ {
+ /* Two vector ops have been issued and we've just issued a
+ vecload, look for another vecload and move to end of ready
+ list if found. */
+ pos = lastpos;
+ while (pos >= 0)
+ {
+ if (recog_memoized (ready[pos]) >= 0
+ && get_attr_type (ready[pos]) == TYPE_VECLOAD)
+ {
+ tmp = ready[pos];
+ for (i = pos; i < lastpos; i++)
+ ready[i] = ready[i + 1];
+ ready[lastpos] = tmp;
+ /* Set pendulum so that next vecload will be seen as
+ finishing a group, not start of one. */
+ vec_load_pendulum = 3;
+ return cached_can_issue_more;
+ }
+ pos--;
+ }
+ }
+ }
+ else if (is_power9_pairable_vec_type (type))
+ {
+ /* Issued a vector operation. */
+ if (vec_load_pendulum == 0)
+ /* We issued a single vec op, look for another and move it
+ to the end of the ready list so it will be scheduled next.
+ Set pendulum if found. */
+ {
+ pos = lastpos;
+ while (pos >= 0)
+ {
+ if (recog_memoized (ready[pos]) >= 0
+ && is_power9_pairable_vec_type (
+ get_attr_type (ready[pos])))
+ {
+ tmp = ready[pos];
+ for (i = pos; i < lastpos; i++)
+ ready[i] = ready[i + 1];
+ ready[lastpos] = tmp;
+ vec_load_pendulum = 1;
+ return cached_can_issue_more;
+ }
+ pos--;
+ }
+ }
+ else if (vec_load_pendulum == 1)
+ {
+ /* This is the second vec op we've issued, search the ready
+ list for a vecload operation so we can try to schedule a
+ pair of those next. If found move to the end of the ready
+ list so it is scheduled next and set the pendulum. */
+ pos = lastpos;
+ while (pos >= 0)
+ {
+ if (recog_memoized (ready[pos]) >= 0
+ && get_attr_type (ready[pos]) == TYPE_VECLOAD)
+ {
+ tmp = ready[pos];
+ for (i = pos; i < lastpos; i++)
+ ready[i] = ready[i + 1];
+ ready[lastpos] = tmp;
+ vec_load_pendulum = 2;
+ return cached_can_issue_more;
+ }
+ pos--;
+ }
+ }
+ else if (vec_load_pendulum == -2)
+ {
+ /* Two vecload ops have been issued and we've just issued a
+ vec op, look for another vec op and move to end of ready
+ list if found. */
+ pos = lastpos;
+ while (pos >= 0)
+ {
+ if (recog_memoized (ready[pos]) >= 0
+ && is_power9_pairable_vec_type (
+ get_attr_type (ready[pos])))
+ {
+ tmp = ready[pos];
+ for (i = pos; i < lastpos; i++)
+ ready[i] = ready[i + 1];
+ ready[lastpos] = tmp;
+ /* Set pendulum so that next vec op will be seen as
+ finishing a group, not start of one. */
+ vec_load_pendulum = -3;
+ return cached_can_issue_more;
+ }
+ pos--;
+ }
+ }
+ }
+
+ /* We've either finished a vec/vecload group, couldn't find an insn to
+ continue the current group, or the last insn had nothing to do with
+ a group. In any case, reset the pendulum. */
+ vec_load_pendulum = 0;
+ }
+
+ return cached_can_issue_more;
+}
+
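Each scan in power9_sched_reorder2 ends with the same four-line rotation that moves the chosen insn to the end of the ready list, which is where the scheduler issues from next. A hypothetical helper (not in the patch) showing that idiom in isolation:

    /* Move READY[POS] to READY[LASTPOS], shifting the entries in between
       down by one, so the chosen insn is issued next.  */
    static void
    move_to_end_of_ready (rtx_insn **ready, int pos, int lastpos)
    {
      rtx_insn *tmp = ready[pos];
      for (int i = pos; i < lastpos; i++)
        ready[i] = ready[i + 1];
      ready[lastpos] = tmp;
    }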
/* We are about to begin issuing insns for this clock cycle. */
static int
@@ -29920,6 +31269,11 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready,
}
}
+ /* Do Power9 dependent reordering if necessary. */
+ if (rs6000_cpu == PROCESSOR_POWER9 && last_scheduled_insn
+ && recog_memoized (last_scheduled_insn) >= 0)
+ return power9_sched_reorder2 (ready, *pn_ready - 1);
+
return cached_can_issue_more;
}
@@ -30088,7 +31442,6 @@ insn_must_be_first_in_group (rtx_insn *insn)
}
break;
case PROCESSOR_POWER8:
- case PROCESSOR_POWER9:
type = get_attr_type (insn);
switch (type)
@@ -30219,7 +31572,6 @@ insn_must_be_last_in_group (rtx_insn *insn)
}
break;
case PROCESSOR_POWER8:
- case PROCESSOR_POWER9:
type = get_attr_type (insn);
switch (type)
@@ -30338,7 +31690,7 @@ force_new_group (int sched_verbose, FILE *dump, rtx *group_insns,
/* Do we have a special group ending nop? */
if (rs6000_cpu_attr == CPU_POWER6 || rs6000_cpu_attr == CPU_POWER7
- || rs6000_cpu_attr == CPU_POWER8 || rs6000_cpu_attr == CPU_POWER9)
+ || rs6000_cpu_attr == CPU_POWER8)
{
nop = gen_group_ending_nop ();
emit_insn_before (nop, next_insn);
@@ -30592,8 +31944,10 @@ rs6000_sched_init (FILE *dump ATTRIBUTE_UNUSED,
int sched_verbose ATTRIBUTE_UNUSED,
int max_ready ATTRIBUTE_UNUSED)
{
- last_scheduled_insn = NULL_RTX;
+ last_scheduled_insn = NULL;
load_store_pendulum = 0;
+ divide_cnt = 0;
+ vec_load_pendulum = 0;
}
/* The following function is called at the end of scheduling BB.
@@ -30634,14 +31988,16 @@ rs6000_sched_finish (FILE *dump, int sched_verbose)
}
}
-struct _rs6000_sched_context
+struct rs6000_sched_context
{
short cached_can_issue_more;
- rtx last_scheduled_insn;
+ rtx_insn *last_scheduled_insn;
int load_store_pendulum;
+ int divide_cnt;
+ int vec_load_pendulum;
};
-typedef struct _rs6000_sched_context rs6000_sched_context_def;
+typedef struct rs6000_sched_context rs6000_sched_context_def;
typedef rs6000_sched_context_def *rs6000_sched_context_t;
/* Allocate store for new scheduling context. */
@@ -30661,14 +32017,18 @@ rs6000_init_sched_context (void *_sc, bool clean_p)
if (clean_p)
{
sc->cached_can_issue_more = 0;
- sc->last_scheduled_insn = NULL_RTX;
+ sc->last_scheduled_insn = NULL;
sc->load_store_pendulum = 0;
+ sc->divide_cnt = 0;
+ sc->vec_load_pendulum = 0;
}
else
{
sc->cached_can_issue_more = cached_can_issue_more;
sc->last_scheduled_insn = last_scheduled_insn;
sc->load_store_pendulum = load_store_pendulum;
+ sc->divide_cnt = divide_cnt;
+ sc->vec_load_pendulum = vec_load_pendulum;
}
}
@@ -30683,6 +32043,8 @@ rs6000_set_sched_context (void *_sc)
cached_can_issue_more = sc->cached_can_issue_more;
last_scheduled_insn = sc->last_scheduled_insn;
load_store_pendulum = sc->load_store_pendulum;
+ divide_cnt = sc->divide_cnt;
+ vec_load_pendulum = sc->vec_load_pendulum;
}
/* Free _SC. */
@@ -33448,17 +34810,25 @@ altivec_expand_vec_perm_le (rtx operands[4])
if (!REG_P (target))
tmp = gen_reg_rtx (mode);
- /* Invert the selector with a VNAND if available, else a VNOR.
- The VNAND is preferred for future fusion opportunities. */
- notx = gen_rtx_NOT (V16QImode, sel);
- iorx = (TARGET_P8_VECTOR
- ? gen_rtx_IOR (V16QImode, notx, notx)
- : gen_rtx_AND (V16QImode, notx, notx));
- emit_insn (gen_rtx_SET (norreg, iorx));
+ if (TARGET_P9_VECTOR)
+ {
+ unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op0, op1, sel),
+ UNSPEC_VPERMR);
+ }
+ else
+ {
+ /* Invert the selector with a VNAND if available, else a VNOR.
+ The VNAND is preferred for future fusion opportunities. */
+ notx = gen_rtx_NOT (V16QImode, sel);
+ iorx = (TARGET_P8_VECTOR
+ ? gen_rtx_IOR (V16QImode, notx, notx)
+ : gen_rtx_AND (V16QImode, notx, notx));
+ emit_insn (gen_rtx_SET (norreg, iorx));
- /* Permute with operands reversed and adjusted selector. */
- unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op1, op0, norreg),
- UNSPEC_VPERM);
+ /* Permute with operands reversed and adjusted selector. */
+ unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op1, op0, norreg),
+ UNSPEC_VPERM);
+ }
/* Copy into target, possibly by way of a register. */
if (!REG_P (target))
@@ -33869,8 +35239,14 @@ rs6000_complex_function_value (machine_mode mode)
machine_mode inner = GET_MODE_INNER (mode);
unsigned int inner_bytes = GET_MODE_UNIT_SIZE (mode);
- if (FLOAT_MODE_P (mode) && TARGET_HARD_FLOAT && TARGET_FPRS)
+ if (TARGET_FLOAT128
+ && (mode == KCmode
+ || (mode == TCmode && TARGET_IEEEQUAD)))
+ regno = ALTIVEC_ARG_RETURN;
+
+ else if (FLOAT_MODE_P (mode) && TARGET_HARD_FLOAT && TARGET_FPRS)
regno = FP_ARG_RETURN;
+
else
{
regno = GP_ARG_RETURN;
@@ -33992,7 +35368,8 @@ rs6000_function_value (const_tree valtype,
if (DECIMAL_FLOAT_MODE_P (mode) && TARGET_HARD_FLOAT && TARGET_FPRS)
/* _Decimal128 must use an even/odd register pair. */
regno = (mode == TDmode) ? FP_ARG_RETURN + 1 : FP_ARG_RETURN;
- else if (SCALAR_FLOAT_MODE_NOT_VECTOR_P (mode) && TARGET_HARD_FLOAT && TARGET_FPRS
+ else if (SCALAR_FLOAT_TYPE_P (valtype) && TARGET_HARD_FLOAT && TARGET_FPRS
+ && !FLOAT128_VECTOR_P (mode)
&& ((TARGET_SINGLE_FLOAT && (mode == SFmode)) || TARGET_DOUBLE_FLOAT))
regno = FP_ARG_RETURN;
else if (TREE_CODE (valtype) == COMPLEX_TYPE
@@ -34056,7 +35433,7 @@ rs6000_libcall_value (machine_mode mode)
static bool
rs6000_lra_p (void)
{
- return rs6000_lra_flag;
+ return TARGET_LRA;
}
/* Given FROM and TO register numbers, say whether this elimination is allowed.
@@ -34417,9 +35794,11 @@ static struct rs6000_opt_mask const rs6000_opt_masks[] =
{ "power8-fusion", OPTION_MASK_P8_FUSION, false, true },
{ "power8-fusion-sign", OPTION_MASK_P8_FUSION_SIGN, false, true },
{ "power8-vector", OPTION_MASK_P8_VECTOR, false, true },
- { "power9-dform", OPTION_MASK_P9_DFORM, false, true },
+ { "power9-dform-scalar", OPTION_MASK_P9_DFORM_SCALAR, false, true },
+ { "power9-dform-vector", OPTION_MASK_P9_DFORM_VECTOR, false, true },
{ "power9-fusion", OPTION_MASK_P9_FUSION, false, true },
{ "power9-minmax", OPTION_MASK_P9_MINMAX, false, true },
+ { "power9-misc", OPTION_MASK_P9_MISC, false, true },
{ "power9-vector", OPTION_MASK_P9_VECTOR, false, true },
{ "powerpc-gfxopt", OPTION_MASK_PPC_GFXOPT, false, true },
{ "powerpc-gpopt", OPTION_MASK_PPC_GPOPT, false, true },
@@ -34474,11 +35853,14 @@ static struct rs6000_opt_mask const rs6000_builtin_mask_names[] =
{ "popcntd", RS6000_BTM_POPCNTD, false, false },
{ "cell", RS6000_BTM_CELL, false, false },
{ "power8-vector", RS6000_BTM_P8_VECTOR, false, false },
+ { "power9-vector", RS6000_BTM_P9_VECTOR, false, false },
+ { "power9-misc", RS6000_BTM_P9_MISC, false, false },
{ "crypto", RS6000_BTM_CRYPTO, false, false },
{ "htm", RS6000_BTM_HTM, false, false },
{ "hard-dfp", RS6000_BTM_DFP, false, false },
{ "hard-float", RS6000_BTM_HARD_FLOAT, false, false },
{ "long-double-128", RS6000_BTM_LDBL128, false, false },
+ { "float128", RS6000_BTM_FLOAT128, false, false },
};
/* Option variables that we want to support inside attribute((target)) and
@@ -35049,7 +36431,9 @@ rs6000_print_options_internal (FILE *file,
size_t i;
size_t start_column = 0;
size_t cur_column;
- size_t max_column = 76;
+ size_t max_column = 120;
+ size_t prefix_len = strlen (prefix);
+ size_t comma_len = 0;
const char *comma = "";
if (indent)
@@ -35067,27 +36451,45 @@ rs6000_print_options_internal (FILE *file,
cur_column = start_column;
for (i = 0; i < num_elements; i++)
{
- if ((flags & opts[i].mask) != 0)
+ bool invert = opts[i].invert;
+ const char *name = opts[i].name;
+ const char *no_str = "";
+ HOST_WIDE_INT mask = opts[i].mask;
+ size_t len = comma_len + prefix_len + strlen (name);
+
+ if (!invert)
{
- const char *no_str = rs6000_opt_masks[i].invert ? "no-" : "";
- size_t len = (strlen (comma)
- + strlen (prefix)
- + strlen (no_str)
- + strlen (rs6000_opt_masks[i].name));
+ if ((flags & mask) == 0)
+ {
+ no_str = "no-";
+ len += sizeof ("no-") - 1;
+ }
+
+ flags &= ~mask;
+ }
- cur_column += len;
- if (cur_column > max_column)
+ else
+ {
+ if ((flags & mask) != 0)
{
- fprintf (stderr, ", \\\n%*s", (int)start_column, "");
- cur_column = start_column + len;
- comma = "";
+ no_str = "no-";
+ len += sizeof ("no-") - 1;
}
- fprintf (file, "%s%s%s%s", comma, prefix, no_str,
- rs6000_opt_masks[i].name);
- flags &= ~ opts[i].mask;
- comma = ", ";
+ flags |= mask;
}
+
+ cur_column += len;
+ if (cur_column > max_column)
+ {
+ fprintf (stderr, ", \\\n%*s", (int)start_column, "");
+ cur_column = start_column + len;
+ comma = "";
+ }
+
+ fprintf (file, "%s%s%s%s", comma, prefix, no_str, name);
+ comma = ", ";
+ comma_len = sizeof (", ") - 1;
}
fputs ("\n", file);
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 8c6bd07dd5..6285478392 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -302,6 +302,28 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define TARGET_P8_VECTOR 0
#endif
+/* Define the ISA 3.0 flags as 0 if the target assembler does not support
+ Power9 instructions. Allow -mpower9-fusion, since it does not add new
+ instructions. Allow -misel, since it predates ISA 3.0 and does
+ not require any Power9 features. */
+
+#ifndef HAVE_AS_POWER9
+#undef TARGET_FLOAT128_HW
+#undef TARGET_MODULO
+#undef TARGET_P9_VECTOR
+#undef TARGET_P9_MINMAX
+#undef TARGET_P9_DFORM_SCALAR
+#undef TARGET_P9_DFORM_VECTOR
+#undef TARGET_P9_MISC
+#define TARGET_FLOAT128_HW 0
+#define TARGET_MODULO 0
+#define TARGET_P9_VECTOR 0
+#define TARGET_P9_MINMAX 0
+#define TARGET_P9_DFORM_SCALAR 0
+#define TARGET_P9_DFORM_VECTOR 0
+#define TARGET_P9_MISC 0
+#endif
+
/* Define TARGET_LWSYNC_INSTRUCTION if the assembler knows about lwsync. If
not, generate the lwsync code as an integer constant. */
#ifdef HAVE_AS_LWSYNC
@@ -418,12 +440,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
Similarly IFmode is the IBM long double format even if the default is IEEE
128-bit. */
#define FLOAT128_IEEE_P(MODE) \
- (((MODE) == TFmode && TARGET_IEEEQUAD) \
- || ((MODE) == KFmode))
+ ((TARGET_IEEEQUAD && ((MODE) == TFmode || (MODE) == TCmode)) \
+ || ((MODE) == KFmode) || ((MODE) == KCmode))
#define FLOAT128_IBM_P(MODE) \
- (((MODE) == TFmode && !TARGET_IEEEQUAD) \
- || ((MODE) == IFmode))
+ ((!TARGET_IEEEQUAD && ((MODE) == TFmode || (MODE) == TCmode)) \
+ || ((MODE) == IFmode) || ((MODE) == ICmode))
/* Helper macros to say whether a 128-bit floating point type can go in a
single vector register, or whether it needs paired scalar values. */
@@ -594,6 +616,15 @@ extern int rs6000_vector_align[];
in the register. */
#define TARGET_NO_SDMODE_STACK (TARGET_LFIWZX && TARGET_STFIWX && TARGET_DFP)
+/* ISA 3.0 has new min/max functions that don't need fast math and that are being
+ phased in. Min/max using FSEL or XSMAXDP/XSMINDP do not return the correct
+ answers if the arguments are not in the normal range. */
+#define TARGET_MINMAX_SF (TARGET_SF_FPR && TARGET_PPC_GFXOPT \
+ && (TARGET_P9_MINMAX || !flag_trapping_math))
+
+#define TARGET_MINMAX_DF (TARGET_DF_FPR && TARGET_PPC_GFXOPT \
+ && (TARGET_P9_MINMAX || !flag_trapping_math))
+
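The comment above is terse; the underlying issue is that emulating min/max with a compare-and-select is order dependent once NaNs (or other values outside the normal range) appear, which is why the FSEL/XSMAXDP forms are only used when trapping math is off, while the ISA 3.0 xsmincdp/xsmaxcdp instructions give a defined result for these cases. A small, target-independent C illustration:

    /* With a NaN operand the ordered comparison is false, so the second arm
       is chosen regardless of which operand is the NaN:
         sel_max (nan, 1.0) == 1.0
         sel_max (1.0, nan) == NaN  */
    double sel_max (double a, double b) { return a >= b ? a : b; }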
/* In switching from using target_flags to using rs6000_isa_flags, the options
machinery creates OPTION_MASK_<xxx> instead of MASK_<xxx>. For now map
OPTION_MASK_<xxx> back into MASK_<xxx>. */
@@ -615,6 +646,8 @@ extern int rs6000_vector_align[];
#define MASK_MULTIPLE OPTION_MASK_MULTIPLE
#define MASK_NO_UPDATE OPTION_MASK_NO_UPDATE
#define MASK_P8_VECTOR OPTION_MASK_P8_VECTOR
+#define MASK_P9_VECTOR OPTION_MASK_P9_VECTOR
+#define MASK_P9_MISC OPTION_MASK_P9_MISC
#define MASK_POPCNTB OPTION_MASK_POPCNTB
#define MASK_POPCNTD OPTION_MASK_POPCNTD
#define MASK_PPC_GFXOPT OPTION_MASK_PPC_GFXOPT
@@ -655,6 +688,11 @@ extern int rs6000_vector_align[];
#define MASK_PROTOTYPE OPTION_MASK_PROTOTYPE
#endif
+#ifdef TARGET_MODULO
+#define RS6000_BTM_MODULO OPTION_MASK_MODULO
+#endif
+
+
/* For power systems, we want to enable Altivec and VSX builtins even if the
user did not use -maltivec or -mvsx to allow the builtins to be used inside
of #pragma GCC target or the target attribute to change the code level for a
@@ -1774,7 +1812,9 @@ extern enum reg_class rs6000_constraints[RS6000_CONSTRAINT_MAX];
#define ALTIVEC_ARG_RETURN (FIRST_ALTIVEC_REGNO + 2)
#define FP_ARG_MAX_RETURN (DEFAULT_ABI != ABI_ELFv2 ? FP_ARG_RETURN \
: (FP_ARG_RETURN + AGGR_ARG_NUM_REG - 1))
-#define ALTIVEC_ARG_MAX_RETURN (DEFAULT_ABI != ABI_ELFv2 ? ALTIVEC_ARG_RETURN \
+#define ALTIVEC_ARG_MAX_RETURN (DEFAULT_ABI != ABI_ELFv2 \
+ ? (ALTIVEC_ARG_RETURN \
+ + (TARGET_FLOAT128 ? 1 : 0)) \
: (ALTIVEC_ARG_RETURN + AGGR_ARG_NUM_REG - 1))
/* Flags for the call/call_value rtl operations set up by function_arg */
@@ -2638,7 +2678,9 @@ extern int frame_pointer_needed;
#define RS6000_BTC_MISC 0x00000000 /* No special attributes. */
#define RS6000_BTC_CONST 0x00000100 /* uses no global state. */
-#define RS6000_BTC_PURE 0x00000200 /* reads global state/mem. */
+#define RS6000_BTC_PURE 0x00000200 /* reads global
+ state/mem and does
+ not modify global state. */
#define RS6000_BTC_FP 0x00000400 /* depends on rounding mode. */
#define RS6000_BTC_ATTR_MASK 0x00000700 /* Mask of the attributes. */
@@ -2660,6 +2702,8 @@ extern int frame_pointer_needed;
#define RS6000_BTM_ALTIVEC MASK_ALTIVEC /* VMX/altivec vectors. */
#define RS6000_BTM_VSX MASK_VSX /* VSX (vector/scalar). */
#define RS6000_BTM_P8_VECTOR MASK_P8_VECTOR /* ISA 2.07 vector. */
+#define RS6000_BTM_P9_VECTOR MASK_P9_VECTOR /* ISA 3.00 vector. */
+#define RS6000_BTM_P9_MISC MASK_P9_MISC /* ISA 3.0 misc. non-vector. */
#define RS6000_BTM_CRYPTO MASK_CRYPTO /* crypto funcs. */
#define RS6000_BTM_HTM MASK_HTM /* hardware TM funcs. */
#define RS6000_BTM_SPE MASK_STRING /* E500 */
@@ -2673,10 +2717,15 @@ extern int frame_pointer_needed;
#define RS6000_BTM_DFP MASK_DFP /* Decimal floating point. */
#define RS6000_BTM_HARD_FLOAT MASK_SOFT_FLOAT /* Hardware floating point. */
#define RS6000_BTM_LDBL128 MASK_MULTIPLE /* 128-bit long double. */
+#define RS6000_BTM_64BIT MASK_64BIT /* 64-bit addressing. */
+#define RS6000_BTM_FLOAT128 MASK_P9_VECTOR /* IEEE 128-bit float. */
#define RS6000_BTM_COMMON (RS6000_BTM_ALTIVEC \
| RS6000_BTM_VSX \
| RS6000_BTM_P8_VECTOR \
+ | RS6000_BTM_P9_VECTOR \
+ | RS6000_BTM_P9_MISC \
+ | RS6000_BTM_MODULO \
| RS6000_BTM_CRYPTO \
| RS6000_BTM_FRE \
| RS6000_BTM_FRES \
@@ -2687,10 +2736,12 @@ extern int frame_pointer_needed;
| RS6000_BTM_CELL \
| RS6000_BTM_DFP \
| RS6000_BTM_HARD_FLOAT \
- | RS6000_BTM_LDBL128)
+ | RS6000_BTM_LDBL128 \
+ | RS6000_BTM_FLOAT128)
/* Define builtin enum index. */
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -2703,6 +2754,7 @@ extern int frame_pointer_needed;
#undef RS6000_BUILTIN_S
#undef RS6000_BUILTIN_X
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
@@ -2722,6 +2774,7 @@ enum rs6000_builtins
RS6000_BUILTIN_COUNT
};
+#undef RS6000_BUILTIN_0
#undef RS6000_BUILTIN_1
#undef RS6000_BUILTIN_2
#undef RS6000_BUILTIN_3
@@ -2788,6 +2841,7 @@ enum rs6000_builtin_type_index
RS6000_BTI_void, /* void_type_node */
RS6000_BTI_ieee128_float, /* ieee 128-bit floating point */
RS6000_BTI_ibm128_float, /* IBM 128-bit floating point */
+ RS6000_BTI_const_str, /* pointer to const char * */
RS6000_BTI_MAX
};
@@ -2844,6 +2898,7 @@ enum rs6000_builtin_type_index
#define void_type_internal_node (rs6000_builtin_types[RS6000_BTI_void])
#define ieee128_float_type_node (rs6000_builtin_types[RS6000_BTI_ieee128_float])
#define ibm128_float_type_node (rs6000_builtin_types[RS6000_BTI_ibm128_float])
+#define const_str_type_node (rs6000_builtin_types[RS6000_BTI_const_str])
extern GTY(()) tree rs6000_builtin_types[RS6000_BTI_MAX];
extern GTY(()) tree rs6000_builtin_decls[RS6000_BUILTIN_COUNT];
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 849b19a7b0..29d82c9f97 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -147,6 +147,8 @@
UNSPEC_ROUND_TO_ODD
UNSPEC_IEEE128_MOVE
UNSPEC_IEEE128_CONVERT
+ UNSPEC_SIGNBIT
+ UNSPEC_DOLOOP
])
;;
@@ -183,12 +185,13 @@
brinc,
vecsimple,veccomplex,vecdiv,veccmp,veccmpsimple,vecperm,
vecfloat,vecfdiv,vecdouble,mffgpr,mftgpr,crypto,
- htm"
+ veclogical,veccmpfx,vecexts,vecmove,
+ htm,htmsimple,dfp"
(const_string "integer"))
;; What data size does this instruction work on?
-;; This is used for insert, mul.
-(define_attr "size" "8,16,32,64" (const_string "32"))
+;; This is used for insert, mul and others as necessary.
+(define_attr "size" "8,16,32,64,128" (const_string "32"))
;; Is this instruction record form ("dot", signed compare to 0, writing CR0)?
;; This is used for add, logical, shift, exts, mul.
@@ -298,6 +301,7 @@
(include "power6.md")
(include "power7.md")
(include "power8.md")
+(include "power9.md")
(include "cell.md")
(include "xfpu.md")
(include "a2.md")
@@ -442,7 +446,7 @@
(define_mode_attr f32_lr [(SF "f") (SD "wz")])
(define_mode_attr f32_lr2 [(SF "wb") (SD "wn")])
(define_mode_attr f32_lm [(SF "m") (SD "Z")])
-(define_mode_attr f32_lm2 [(SF "o") (SD "wn")])
+(define_mode_attr f32_lm2 [(SF "wY") (SD "wn")])
(define_mode_attr f32_li [(SF "lfs%U1%X1 %0,%1") (SD "lfiwzx %0,%y1")])
(define_mode_attr f32_li2 [(SF "lxssp %0,%1") (SD "lfiwzx %0,%y1")])
(define_mode_attr f32_lv [(SF "lxsspx %x0,%y1") (SD "lxsiwzx %x0,%y1")])
@@ -451,7 +455,7 @@
(define_mode_attr f32_sr [(SF "f") (SD "wx")])
(define_mode_attr f32_sr2 [(SF "wb") (SD "wn")])
(define_mode_attr f32_sm [(SF "m") (SD "Z")])
-(define_mode_attr f32_sm2 [(SF "o") (SD "wn")])
+(define_mode_attr f32_sm2 [(SF "wY") (SD "wn")])
(define_mode_attr f32_si [(SF "stfs%U0%X0 %1,%0") (SD "stfiwx %1,%y0")])
(define_mode_attr f32_si2 [(SF "stxssp %1,%0") (SD "stfiwx %1,%y0")])
(define_mode_attr f32_sv [(SF "stxsspx %x1,%y0") (SD "stxsiwzx %x1,%y0")])
@@ -489,6 +493,10 @@
; Iterator for just SF/DF
(define_mode_iterator SFDF [SF DF])
+; Like SFDF, but a different name to match conditional move where the
+; comparison operands may be a different mode than the input operands.
+(define_mode_iterator SFDF2 [SF DF])
+
; Iterator for 128-bit floating point that uses the IBM double-double format
(define_mode_iterator IBM128 [(IF "FLOAT128_IBM_P (IFmode)")
(TF "FLOAT128_IBM_P (TFmode)")])
@@ -502,6 +510,13 @@
(IF "TARGET_FLOAT128")
(TF "TARGET_LONG_DOUBLE_128")])
+; Iterator for signbit on 64-bit machines with direct move
+(define_mode_iterator SIGNBIT [(KF "FLOAT128_VECTOR_P (KFmode)")
+ (TF "FLOAT128_VECTOR_P (TFmode)")])
+
+(define_mode_attr Fsignbit [(KF "wa")
+ (TF "wa")])
+
; SF/DF suffix for traditional floating instructions
(define_mode_attr Ftrad [(SF "s") (DF "")])
@@ -577,7 +592,9 @@
(V16QI "b")
(V8HI "h")
(V4SI "w")
- (V2DI "d")])
+ (V2DI "d")
+ (V1TI "q")
+ (TI "q")])
;; How many bits in this mode?
(define_mode_attr bits [(QI "8") (HI "16") (SI "32") (DI "64")])
@@ -698,6 +715,15 @@
(define_mode_iterator RELOAD [V16QI V8HI V4SI V2DI V4SF V2DF V1TI
SF SD SI DF DD DI TI PTI KF IF TF])
+;; Iterate over smin, smax
+(define_code_iterator fp_minmax [smin smax])
+
+(define_code_attr minmax [(smin "min")
+ (smax "max")])
+
+(define_code_attr SMINMAX [(smin "SMIN")
+ (smax "SMAX")])
+
;; Start with fixed-point load and store insns. Here we put only the more
;; complex forms. Basic data transfer is done later.
@@ -4044,7 +4070,7 @@
if (REGNO (cr) == CR0_REGNO)
{
- emit_insn (gen_ashdi3_extswsli_dot (dest, src2, shift, cr));
+ emit_insn (gen_ashdi3_extswsli_dot2 (dest, src2, shift, cr));
DONE;
}
@@ -4305,7 +4331,7 @@
"@
fabs %0,%1
xsabsdp %x0,%x1"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fpsimple")
(set_attr "fp_type" "fp_addsub_<Fs>")])
(define_insn "*nabs<mode>2_fpr"
@@ -4317,7 +4343,7 @@
"@
fnabs %0,%1
xsnabsdp %x0,%x1"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fpsimple")
(set_attr "fp_type" "fp_addsub_<Fs>")])
(define_expand "neg<mode>2"
@@ -4333,7 +4359,7 @@
"@
fneg %0,%1
xsnegdp %x0,%x1"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fpsimple")
(set_attr "fp_type" "fp_addsub_<Fs>")])
(define_expand "add<mode>3"
@@ -4478,7 +4504,7 @@
(define_insn_and_split "*extendsfdf2_fpr"
[(set (match_operand:DF 0 "gpc_reg_operand" "=d,?d,d,ws,?ws,wu,wb")
- (float_extend:DF (match_operand:SF 1 "reg_or_mem_operand" "0,f,m,0,wy,Z,o")))]
+ (float_extend:DF (match_operand:SF 1 "reg_or_mem_operand" "0,f,m,0,wy,Z,wY")))]
"TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT"
"@
#
@@ -4494,7 +4520,7 @@
emit_note (NOTE_INSN_DELETED);
DONE;
}
- [(set_attr "type" "fp,fp,fpload,fp,fp,fpload,fpload")])
+ [(set_attr "type" "fp,fpsimple,fpload,fp,fpsimple,fpload,fpload")])
(define_expand "truncdfsf2"
[(set (match_operand:SF 0 "gpc_reg_operand" "")
@@ -4516,7 +4542,7 @@
;; when little-endian.
(define_expand "signbit<mode>2"
[(set (match_dup 2)
- (float_truncate:DF (match_operand:IBM128 1 "gpc_reg_operand" "")))
+ (float_truncate:DF (match_operand:FLOAT128 1 "gpc_reg_operand" "")))
(set (match_dup 3)
(subreg:DI (match_dup 2) 0))
(set (match_dup 4)
@@ -4524,8 +4550,20 @@
(set (match_operand:SI 0 "gpc_reg_operand" "")
(match_dup 6))]
"TARGET_HARD_FLOAT
- && (TARGET_FPRS || TARGET_E500_DOUBLE)"
+ && (TARGET_FPRS || TARGET_E500_DOUBLE)
+ && (!FLOAT128_IEEE_P (<MODE>mode)
+ || (TARGET_POWERPC64 && TARGET_DIRECT_MOVE))"
{
+ if (FLOAT128_IEEE_P (<MODE>mode))
+ {
+ if (<MODE>mode == KFmode)
+ emit_insn (gen_signbitkf2_dm (operands[0], operands[1]));
+ else if (<MODE>mode == TFmode)
+ emit_insn (gen_signbittf2_dm (operands[0], operands[1]));
+ else
+ gcc_unreachable ();
+ DONE;
+ }
operands[2] = gen_reg_rtx (DFmode);
operands[3] = gen_reg_rtx (DImode);
if (TARGET_POWERPC64)
@@ -4573,6 +4611,37 @@
operands[5] = CONST0_RTX (<MODE>mode);
})
+;; Optimize signbit on 64-bit systems with direct move to avoid doing the store
+;; and load.
+(define_insn_and_split "signbit<mode>2_dm"
+ [(set (match_operand:SI 0 "gpc_reg_operand" "=r,r,r")
+ (unspec:SI
+ [(match_operand:SIGNBIT 1 "input_operand" "<Fsignbit>,m,r")]
+ UNSPEC_SIGNBIT))]
+ "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ rs6000_split_signbit (operands[0], operands[1]);
+ DONE;
+}
+ [(set_attr "length" "8,8,12")
+ (set_attr "type" "mftgpr,load,integer")])
+
+;; MODES_TIEABLE_P doesn't allow DImode to be tied with the various floating
+;; point types, which makes normal SUBREG's problematical. Instead use a
+;; special pattern to avoid using a normal movdi.
+(define_insn "signbit<mode>2_dm2"
+ [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+ (unspec:DI [(match_operand:SIGNBIT 1 "gpc_reg_operand" "<Fsignbit>")
+ (const_int 0)]
+ UNSPEC_SIGNBIT))]
+ "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+ "mfvsrd %0,%x1"
+ [(set_attr "type" "mftgpr")])
+
+
;; Use an unspec rather than providing an if-then-else in RTL, to prevent the
;; compiler from optimizing -0.0
(define_insn "copysign<mode>3_fcpsgn"
@@ -4584,7 +4653,7 @@
"@
fcpsgn %0,%2,%1
xscpsgndp %x0,%x2,%x1"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "fpsimple")])
;; For MIN, MAX, and conditional move, we use DEFINE_EXPAND's that involve a
;; fsel instruction and some auxiliary computations. Then we just have a
@@ -4599,74 +4668,45 @@
;; On VSX, we only check for TARGET_VSX instead of checking for a vsx/p8 vector
;; to allow either DF/SF to use only traditional registers.
-(define_expand "smax<mode>3"
+(define_expand "s<minmax><mode>3"
[(set (match_operand:SFDF 0 "gpc_reg_operand" "")
- (if_then_else:SFDF (ge (match_operand:SFDF 1 "gpc_reg_operand" "")
- (match_operand:SFDF 2 "gpc_reg_operand" ""))
- (match_dup 1)
- (match_dup 2)))]
- "TARGET_<MODE>_FPR && TARGET_PPC_GFXOPT && !flag_trapping_math"
+ (fp_minmax:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "")
+ (match_operand:SFDF 2 "gpc_reg_operand" "")))]
+ "TARGET_MINMAX_<MODE>"
{
- rs6000_emit_minmax (operands[0], SMAX, operands[1], operands[2]);
+ rs6000_emit_minmax (operands[0], <SMINMAX>, operands[1], operands[2]);
DONE;
})
-(define_insn "*smax<mode>3_vsx"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (smax:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")))]
- "TARGET_<MODE>_FPR && TARGET_VSX"
- "xsmaxdp %x0,%x1,%x2"
- [(set_attr "type" "fp")])
-
-(define_expand "smin<mode>3"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "")
- (if_then_else:SFDF (ge (match_operand:SFDF 1 "gpc_reg_operand" "")
- (match_operand:SFDF 2 "gpc_reg_operand" ""))
- (match_dup 2)
- (match_dup 1)))]
- "TARGET_<MODE>_FPR && TARGET_PPC_GFXOPT && !flag_trapping_math"
+(define_insn "*s<minmax><mode>3_vsx"
+ [(set (match_operand:SFDF 0 "vsx_register_operand" "=<Fv>")
+ (fp_minmax:SFDF (match_operand:SFDF 1 "vsx_register_operand" "<Fv>")
+ (match_operand:SFDF 2 "vsx_register_operand" "<Fv>")))]
+ "TARGET_VSX && TARGET_<MODE>_FPR"
{
- rs6000_emit_minmax (operands[0], SMIN, operands[1], operands[2]);
- DONE;
-})
-
-(define_insn "*smin<mode>3_vsx"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (smin:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")))]
- "TARGET_<MODE>_FPR && TARGET_VSX"
- "xsmindp %x0,%x1,%x2"
+ return (TARGET_P9_MINMAX
+ ? "xs<minmax>cdp %x0,%x1,%x2"
+ : "xs<minmax>dp %x0,%x1,%x2");
+}
[(set_attr "type" "fp")])
-(define_split
+;; The conditional move instructions allow us to perform max and min operations
+;; even when we don't have the appropriate max/min instruction, by using the
+;; FSEL instruction.
+
+(define_insn_and_split "*s<minmax><mode>3_fpr"
[(set (match_operand:SFDF 0 "gpc_reg_operand" "")
- (match_operator:SFDF 3 "min_max_operator"
- [(match_operand:SFDF 1 "gpc_reg_operand" "")
- (match_operand:SFDF 2 "gpc_reg_operand" "")]))]
- "TARGET_<MODE>_FPR && TARGET_PPC_GFXOPT && !flag_trapping_math
- && !TARGET_VSX"
+ (fp_minmax:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "")
+ (match_operand:SFDF 2 "gpc_reg_operand" "")))]
+ "!TARGET_VSX && TARGET_MINMAX_<MODE>"
+ "#"
+ "&& 1"
[(const_int 0)]
{
- rs6000_emit_minmax (operands[0], GET_CODE (operands[3]), operands[1],
- operands[2]);
+ rs6000_emit_minmax (operands[0], <SMINMAX>, operands[1], operands[2]);
DONE;
})
-(define_split
- [(set (match_operand:SF 0 "gpc_reg_operand" "")
- (match_operator:SF 3 "min_max_operator"
- [(match_operand:SF 1 "gpc_reg_operand" "")
- (match_operand:SF 2 "gpc_reg_operand" "")]))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS
- && TARGET_SINGLE_FLOAT && !flag_trapping_math"
- [(const_int 0)]
- "
-{ rs6000_emit_minmax (operands[0], GET_CODE (operands[3]),
- operands[1], operands[2]);
- DONE;
-}")
-
(define_expand "mov<mode>cc"
[(set (match_operand:GPR 0 "gpc_reg_operand" "")
(if_then_else:GPR (match_operand 1 "comparison_operator" "")
@@ -4749,12 +4789,13 @@
[(set_attr "type" "isel")
(set_attr "length" "4")])
-(define_expand "movsfcc"
- [(set (match_operand:SF 0 "gpc_reg_operand" "")
- (if_then_else:SF (match_operand 1 "comparison_operator" "")
- (match_operand:SF 2 "gpc_reg_operand" "")
- (match_operand:SF 3 "gpc_reg_operand" "")))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT"
+;; Floating point conditional move
+(define_expand "mov<mode>cc"
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "")
+ (if_then_else:SFDF (match_operand 1 "comparison_operator" "")
+ (match_operand:SFDF 2 "gpc_reg_operand" "")
+ (match_operand:SFDF 3 "gpc_reg_operand" "")))]
+ "TARGET_<MODE>_FPR && TARGET_PPC_GFXOPT"
"
{
if (rs6000_emit_cmove (operands[0], operands[1], operands[2], operands[3]))
@@ -4763,76 +4804,70 @@
FAIL;
}")
-(define_insn "*fselsfsf4"
- [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
- (if_then_else:SF (ge (match_operand:SF 1 "gpc_reg_operand" "f")
- (match_operand:SF 4 "zero_fp_constant" "F"))
- (match_operand:SF 2 "gpc_reg_operand" "f")
- (match_operand:SF 3 "gpc_reg_operand" "f")))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT"
+(define_insn "*fsel<SFDF:mode><SFDF2:mode>4"
+ [(set (match_operand:SFDF 0 "fpr_reg_operand" "=&<SFDF:rreg2>")
+ (if_then_else:SFDF
+ (ge (match_operand:SFDF2 1 "fpr_reg_operand" "<SFDF2:rreg2>")
+ (match_operand:SFDF2 4 "zero_fp_constant" "F"))
+ (match_operand:SFDF 2 "fpr_reg_operand" "<SFDF:rreg2>")
+ (match_operand:SFDF 3 "fpr_reg_operand" "<SFDF:rreg2>")))]
+ "TARGET_<MODE>_FPR && TARGET_PPC_GFXOPT"
"fsel %0,%1,%2,%3"
[(set_attr "type" "fp")])
-(define_insn "*fseldfsf4"
- [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
- (if_then_else:SF (ge (match_operand:DF 1 "gpc_reg_operand" "d")
- (match_operand:DF 4 "zero_fp_constant" "F"))
- (match_operand:SF 2 "gpc_reg_operand" "f")
- (match_operand:SF 3 "gpc_reg_operand" "f")))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT && TARGET_SINGLE_FLOAT"
- "fsel %0,%1,%2,%3"
- [(set_attr "type" "fp")])
-
-;; The conditional move instructions allow us to perform max and min
-;; operations even when
-
-(define_split
- [(set (match_operand:DF 0 "gpc_reg_operand" "")
- (match_operator:DF 3 "min_max_operator"
- [(match_operand:DF 1 "gpc_reg_operand" "")
- (match_operand:DF 2 "gpc_reg_operand" "")]))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
- && !flag_trapping_math"
- [(const_int 0)]
- "
-{ rs6000_emit_minmax (operands[0], GET_CODE (operands[3]),
- operands[1], operands[2]);
- DONE;
-}")
-
-(define_expand "movdfcc"
- [(set (match_operand:DF 0 "gpc_reg_operand" "")
- (if_then_else:DF (match_operand 1 "comparison_operator" "")
- (match_operand:DF 2 "gpc_reg_operand" "")
- (match_operand:DF 3 "gpc_reg_operand" "")))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT"
- "
+(define_insn_and_split "*mov<SFDF:mode><SFDF2:mode>cc_p9"
+ [(set (match_operand:SFDF 0 "vsx_register_operand" "=&<SFDF:Fv>,<SFDF:Fv>")
+ (if_then_else:SFDF
+ (match_operator:CCFP 1 "fpmask_comparison_operator"
+ [(match_operand:SFDF2 2 "vsx_register_operand" "<SFDF2:Fv>,<SFDF2:Fv>")
+ (match_operand:SFDF2 3 "vsx_register_operand" "<SFDF2:Fv>,<SFDF2:Fv>")])
+ (match_operand:SFDF 4 "vsx_register_operand" "<SFDF:Fv>,<SFDF:Fv>")
+ (match_operand:SFDF 5 "vsx_register_operand" "<SFDF:Fv>,<SFDF:Fv>")))
+ (clobber (match_scratch:V2DI 6 "=0,&wa"))]
+ "TARGET_P9_MINMAX"
+ "#"
+ ""
+ [(set (match_dup 6)
+ (if_then_else:V2DI (match_dup 1)
+ (match_dup 7)
+ (match_dup 8)))
+ (set (match_dup 0)
+ (if_then_else:SFDF (ne (match_dup 6)
+ (match_dup 8))
+ (match_dup 4)
+ (match_dup 5)))]
{
- if (rs6000_emit_cmove (operands[0], operands[1], operands[2], operands[3]))
- DONE;
- else
- FAIL;
-}")
+ if (GET_CODE (operands[6]) == SCRATCH)
+ operands[6] = gen_reg_rtx (V2DImode);
-(define_insn "*fseldfdf4"
- [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
- (if_then_else:DF (ge (match_operand:DF 1 "gpc_reg_operand" "d")
- (match_operand:DF 4 "zero_fp_constant" "F"))
- (match_operand:DF 2 "gpc_reg_operand" "d")
- (match_operand:DF 3 "gpc_reg_operand" "d")))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT"
- "fsel %0,%1,%2,%3"
- [(set_attr "type" "fp")])
+ operands[7] = CONSTM1_RTX (V2DImode);
+ operands[8] = CONST0_RTX (V2DImode);
+}
+ [(set_attr "length" "8")
+ (set_attr "type" "vecperm")])
+
+(define_insn "*fpmask<mode>"
+ [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
+ (if_then_else:V2DI
+ (match_operator:CCFP 1 "fpmask_comparison_operator"
+ [(match_operand:SFDF 2 "vsx_register_operand" "<Fv>")
+ (match_operand:SFDF 3 "vsx_register_operand" "<Fv>")])
+ (match_operand:V2DI 4 "all_ones_constant" "")
+ (match_operand:V2DI 5 "zero_constant" "")))]
+ "TARGET_P9_MINMAX"
+ "xscmp%V1dp %x0,%x2,%x3"
+ [(set_attr "type" "fpcompare")])
+
+(define_insn "*xxsel<mode>"
+ [(set (match_operand:SFDF 0 "vsx_register_operand" "=<Fv>")
+ (if_then_else:SFDF (ne (match_operand:V2DI 1 "vsx_register_operand" "wa")
+ (match_operand:V2DI 2 "zero_constant" ""))
+ (match_operand:SFDF 3 "vsx_register_operand" "<Fv>")
+ (match_operand:SFDF 4 "vsx_register_operand" "<Fv>")))]
+ "TARGET_P9_MINMAX"
+ "xxsel %x0,%x1,%x3,%x4"
+ [(set_attr "type" "vecmove")])
-(define_insn "*fselsfdf4"
- [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
- (if_then_else:DF (ge (match_operand:SF 1 "gpc_reg_operand" "f")
- (match_operand:SF 4 "zero_fp_constant" "F"))
- (match_operand:DF 2 "gpc_reg_operand" "d")
- (match_operand:DF 3 "gpc_reg_operand" "d")))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT && TARGET_SINGLE_FLOAT"
- "fsel %0,%1,%2,%3"
- [(set_attr "type" "fp")])
;; Conversions to and from floating-point.
@@ -5942,7 +5977,7 @@
[(set (attr "type")
(if_then_else
(match_test "vsx_register_operand (operands[0], <MODE>mode)")
- (const_string "vecsimple")
+ (const_string "veclogical")
(const_string "integer")))
(set (attr "length")
(if_then_else
@@ -5978,7 +6013,7 @@
[(set (attr "type")
(if_then_else
(match_test "vsx_register_operand (operands[0], <MODE>mode)")
- (const_string "vecsimple")
+ (const_string "veclogical")
(const_string "integer")))
(set (attr "length")
(if_then_else
@@ -6016,7 +6051,7 @@
[(set (attr "type")
(if_then_else
(match_test "vsx_register_operand (operands[0], <MODE>mode)")
- (const_string "vecsimple")
+ (const_string "veclogical")
(const_string "integer")))
(set (attr "length")
(if_then_else
@@ -6076,7 +6111,7 @@
[(set (attr "type")
(if_then_else
(match_test "vsx_register_operand (operands[0], <MODE>mode)")
- (const_string "vecsimple")
+ (const_string "veclogical")
(const_string "integer")))
(set (attr "length")
(if_then_else
@@ -6134,7 +6169,7 @@
[(set (attr "type")
(if_then_else
(match_test "vsx_register_operand (operands[0], <MODE>mode)")
- (const_string "vecsimple")
+ (const_string "veclogical")
(const_string "integer")))
(set (attr "length")
(if_then_else
@@ -6190,7 +6225,7 @@
[(set (attr "type")
(if_then_else
(match_test "vsx_register_operand (operands[0], <MODE>mode)")
- (const_string "vecsimple")
+ (const_string "veclogical")
(const_string "integer")))
(set (attr "length")
(if_then_else
@@ -6481,32 +6516,37 @@
}")
(define_insn "mov<mode>_hardfloat"
- [(set (match_operand:FMOVE32 0 "nonimmediate_operand" "=!r,!r,m,f,<f32_vsx>,<f32_vsx>,!r,<f32_lr>,<f32_lr2>,<f32_sm>,<f32_sm2>,<f32_av>,Z,?<f32_dm>,?r,*c*l,!r,*h")
- (match_operand:FMOVE32 1 "input_operand" "r,m,r,f,<f32_vsx>,<zero_fp>,<zero_fp>,<f32_lm>,<f32_lm2>,<f32_sr>,<f32_sr2>,Z,<f32_av>,r,<f32_dm>,r,h,0"))]
+ [(set (match_operand:FMOVE32 0 "nonimmediate_operand"
+ "=!r, <f32_lr>, <f32_lr2>, <f32_av>, m, <f32_sm>,
+ <f32_sm2>, Z, <f32_vsx>, !r, ?<f32_dm>, ?r,
+ f, <f32_vsx>, !r, *c*l, !r, *h")
+ (match_operand:FMOVE32 1 "input_operand"
+ "m, <f32_lm>, <f32_lm2>, Z, r, <f32_sr>,
+ <f32_sr2>, <f32_av>, <zero_fp>, <zero_fp>, r, <f32_dm>,
+ f, <f32_vsx>, r, r, *h, 0"))]
"(gpc_reg_operand (operands[0], <MODE>mode)
|| gpc_reg_operand (operands[1], <MODE>mode))
&& (TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT)"
"@
- mr %0,%1
lwz%U1%X1 %0,%1
- stw%U0%X0 %1,%0
- fmr %0,%1
- xscpsgndp %x0,%x1,%x1
- xxlxor %x0,%x0,%x0
- li %0,0
<f32_li>
<f32_li2>
+ <f32_lv>
+ stw%U0%X0 %1,%0
<f32_si>
<f32_si2>
- <f32_lv>
<f32_sv>
+ xxlxor %x0,%x0,%x0
+ li %0,0
mtvsrwz %x0,%1
mfvsrwz %0,%x1
+ fmr %0,%1
+ xscpsgndp %x0,%x1,%x1
+ mr %0,%1
mt%0 %1
mf%1 %0
nop"
- [(set_attr "type" "*,load,store,fp,fp,vecsimple,integer,fpload,fpload,fpstore,fpstore,fpload,fpstore,mffgpr,mftgpr,mtjmpr,mfjmpr,*")
- (set_attr "length" "4")])
+ [(set_attr "type" "load,fpload,fpload,fpload,store,fpstore,fpstore,fpstore,veclogical,integer,mffgpr,mftgpr,fpsimple,fpsimple,*,mtjmpr,mfjmpr,*")])
(define_insn "*mov<mode>_softfloat"
[(set (match_operand:FMOVE32 0 "nonimmediate_operand" "=r,cl,r,r,m,r,r,r,r,*h")
@@ -6640,7 +6680,8 @@
#
#
#"
- [(set_attr "type" "fpstore,fpload,fp,fpload,fpstore,fpload,fpstore,vecsimple,vecsimple,two,store,load,two")
+ [(set_attr "type" "fpstore,fpload,fpsimple,fpload,fpstore,fpload,fpstore,veclogical,veclogical,two,store,load,two")
+ (set_attr "size" "64")
(set_attr "length" "4,4,4,4,4,4,4,4,4,8,8,8,8")])
(define_insn "*mov<mode>_softfloat32"
@@ -6685,7 +6726,8 @@
mffgpr %0,%1
mfvsrd %0,%x1
mtvsrd %x0,%1"
- [(set_attr "type" "fpstore,fpload,fp,fpload,fpstore,fpload,fpstore,vecsimple,vecsimple,integer,store,load,*,mtjmpr,mfjmpr,*,mftgpr,mffgpr,mftgpr,mffgpr")
+ [(set_attr "type" "fpstore,fpload,fpsimple,fpload,fpstore,fpload,fpstore,veclogical,veclogical,integer,store,load,*,mtjmpr,mfjmpr,*,mftgpr,mffgpr,mftgpr,mffgpr")
+ (set_attr "size" "64")
(set_attr "length" "4")])
(define_insn "*mov<mode>_softfloat64"
@@ -6896,7 +6938,7 @@
emit_note (NOTE_INSN_DELETED);
DONE;
}
- [(set_attr "type" "fp")])
+ [(set_attr "type" "fpsimple")])
(define_insn "trunc<mode>df2_internal2"
[(set (match_operand:DF 0 "gpc_reg_operand" "=d")
@@ -7129,7 +7171,7 @@
else
return \"fneg %0,%1\;fneg %L0,%L1\";
}"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fpsimple")
(set_attr "length" "8")])
(define_expand "abs<mode>2"
@@ -7264,7 +7306,7 @@
(use (match_operand:V16QI 2 "register_operand" "v"))]
"TARGET_FLOAT128 && !TARGET_FLOAT128_HW"
"xxlxor %x0,%x1,%x2"
- [(set_attr "type" "vecsimple")])
+ [(set_attr "type" "veclogical")])
;; IEEE 128-bit absolute value
(define_insn_and_split "ieee_128bit_vsx_abs<mode>2"
@@ -7293,7 +7335,7 @@
(use (match_operand:V16QI 2 "register_operand" "v"))]
"TARGET_FLOAT128 && !TARGET_FLOAT128_HW"
"xxlandc %x0,%x1,%x2"
- [(set_attr "type" "vecsimple")])
+ [(set_attr "type" "veclogical")])
;; IEEE 128-bit negative absolute value
(define_insn_and_split "*ieee_128bit_vsx_nabs<mode>2"
@@ -7326,7 +7368,7 @@
(use (match_operand:V16QI 2 "register_operand" "v"))]
"TARGET_FLOAT128 && !TARGET_FLOAT128_HW"
"xxlor %x0,%x1,%x2"
- [(set_attr "type" "vecsimple")])
+ [(set_attr "type" "veclogical")])
;; Float128 conversion functions. These expand to library function calls.
;; We use expand to convert from IBM double double to IEEE 128-bit
@@ -7482,7 +7524,7 @@
UNSPEC_P8V_FMRGOW))]
"!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
"fmrgow %0,%1,%2"
- [(set_attr "type" "vecperm")])
+ [(set_attr "type" "fpsimple")])
(define_insn "p8_mtvsrwz"
[(set (match_operand:DF 0 "register_operand" "=d")
@@ -7705,7 +7747,8 @@
lfd%U1%X1 %0,%1
fmr %0,%1
#"
- [(set_attr "type" "store,load,*,fpstore,fpload,fp,*")])
+ [(set_attr "type" "store,load,*,fpstore,fpload,fpsimple,*")
+ (set_attr "size" "64")])
(define_split
[(set (match_operand:DI 0 "gpc_reg_operand" "")
@@ -7759,7 +7802,8 @@
mfvsrd %0,%x1
mtvsrd %x0,%1
xxlxor %x0,%x0,%x0"
- [(set_attr "type" "store,load,*,*,*,*,fpstore,fpload,fp,mfjmpr,mtjmpr,*,mftgpr,mffgpr,mftgpr,mffgpr,vecsimple")
+ [(set_attr "type" "store,load,*,*,*,*,fpstore,fpload,fpsimple,mfjmpr,mtjmpr,*,mftgpr,mffgpr,mftgpr,mffgpr,veclogical")
+ (set_attr "size" "64")
(set_attr "length" "4,4,4,4,4,20,4,4,4,4,4,4,4,4,4,4,4")])
; Some DImode loads are best done as a load of -1 followed by a mask
@@ -8767,7 +8811,8 @@
lfdu %3,%2(%0)"
[(set_attr "type" "fpload")
(set_attr "update" "yes")
- (set_attr "indexed" "yes,no")])
+ (set_attr "indexed" "yes,no")
+ (set_attr "size" "64")])
(define_insn "*movdf_update2"
[(set (mem:DF (plus:SI (match_operand:SI 1 "gpc_reg_operand" "0,0")
@@ -11902,6 +11947,7 @@
(set (match_dup 0)
(plus:P (match_dup 0)
(const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_DOLOOP)
(clobber (match_scratch:CC 2 ""))
(clobber (match_scratch:P 3 ""))])]
""
@@ -11912,6 +11958,7 @@
;; JUMP_INSNs.
;; For the length attribute to be calculated correctly, the
;; label MUST be operand 0.
+;; The UNSPEC is present to prevent combine from creating this pattern.
(define_insn "*ctr<mode>_internal1"
[(set (pc)
@@ -11919,9 +11966,10 @@
(const_int 1))
(label_ref (match_operand 0 "" ""))
(pc)))
- (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*c*l")
+ (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
(plus:P (match_dup 1)
(const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_DOLOOP)
(clobber (match_scratch:CC 3 "=X,&x,&x,&x"))
(clobber (match_scratch:P 4 "=X,X,&r,r"))]
""
@@ -11943,9 +11991,10 @@
(const_int 1))
(pc)
(label_ref (match_operand 0 "" ""))))
- (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*c*l")
+ (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
(plus:P (match_dup 1)
(const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_DOLOOP)
(clobber (match_scratch:CC 3 "=X,&x,&x,&x"))
(clobber (match_scratch:P 4 "=X,X,&r,r"))]
""
@@ -11969,9 +12018,10 @@
(const_int 1))
(label_ref (match_operand 0 "" ""))
(pc)))
- (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*c*l")
+ (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
(plus:P (match_dup 1)
(const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_DOLOOP)
(clobber (match_scratch:CC 3 "=X,&x,&x,&x"))
(clobber (match_scratch:P 4 "=X,X,&r,r"))]
""
@@ -11993,9 +12043,10 @@
(const_int 1))
(pc)
(label_ref (match_operand 0 "" ""))))
- (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*c*l")
+ (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
(plus:P (match_dup 1)
(const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_DOLOOP)
(clobber (match_scratch:CC 3 "=X,&x,&x,&x"))
(clobber (match_scratch:P 4 "=X,X,&r,r"))]
""
@@ -12022,6 +12073,7 @@
(match_operand 6 "" "")))
(set (match_operand:P 0 "int_reg_operand" "")
(plus:P (match_dup 1) (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_DOLOOP)
(clobber (match_scratch:CC 3 ""))
(clobber (match_scratch:P 4 ""))]
"reload_completed"
@@ -12047,6 +12099,7 @@
(match_operand 6 "" "")))
(set (match_operand:P 0 "nonimmediate_operand" "")
(plus:P (match_dup 1) (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_DOLOOP)
(clobber (match_scratch:CC 3 ""))
(clobber (match_scratch:P 4 ""))]
"reload_completed && ! gpc_reg_operand (operands[0], SImode)"
@@ -12563,8 +12616,10 @@
(set_attr "indexed" "no")])
;; A return instruction which the middle-end doesn't see.
+;; Use r0 to stop regrename twiddling with lr restore insns emitted
+;; after the call to __morestack.
(define_insn "split_stack_return"
- [(unspec_volatile [(const_int 0)] UNSPECV_SPLIT_STACK_RETURN)]
+ [(unspec_volatile [(use (reg:SI 0))] UNSPECV_SPLIT_STACK_RETURN)]
""
"blr"
[(set_attr "type" "jmpreg")])
@@ -13166,7 +13221,7 @@
operands[3] = gen_rtx_REG (<FP128_64>mode, dest_hi);
operands[4] = gen_rtx_REG (<FP128_64>mode, dest_lo);
}
- [(set_attr "type" "fp,fp")
+ [(set_attr "type" "fpsimple,fp")
(set_attr "length" "4,8")])
(define_insn "unpack<mode>"
@@ -13205,7 +13260,8 @@
(match_operand:IEEE128 2 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsaddqp %0,%1,%2"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "sub<mode>3"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13214,7 +13270,8 @@
(match_operand:IEEE128 2 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xssubqp %0,%1,%2"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "mul<mode>3"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13223,7 +13280,8 @@
(match_operand:IEEE128 2 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsmulqp %0,%1,%2"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "div<mode>3"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13232,7 +13290,8 @@
(match_operand:IEEE128 2 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsdivqp %0,%1,%2"
- [(set_attr "type" "vecdiv")])
+ [(set_attr "type" "vecdiv")
+ (set_attr "size" "128")])
(define_insn "sqrt<mode>2"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13240,9 +13299,28 @@
(match_operand:IEEE128 1 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xssqrtqp %0,%1"
- [(set_attr "type" "vecdiv")])
+ [(set_attr "type" "vecdiv")
+ (set_attr "size" "128")])
-(define_insn "copysign<mode>3"
+(define_expand "copysign<mode>3"
+ [(use (match_operand:IEEE128 0 "altivec_register_operand"))
+ (use (match_operand:IEEE128 1 "altivec_register_operand"))
+ (use (match_operand:IEEE128 2 "altivec_register_operand"))]
+ "FLOAT128_IEEE_P (<MODE>mode)"
+{
+ if (TARGET_FLOAT128_HW)
+ emit_insn (gen_copysign<mode>3_hard (operands[0], operands[1],
+ operands[2]));
+ else
+ {
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_copysign<mode>3_soft (operands[0], operands[1],
+ operands[2], tmp));
+ }
+ DONE;
+})
+
+(define_insn "copysign<mode>3_hard"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
(unspec:IEEE128
[(match_operand:IEEE128 1 "altivec_register_operand" "v")
@@ -13250,7 +13328,20 @@
UNSPEC_COPYSIGN))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xscpsgnqp %0,%2,%1"
- [(set_attr "type" "vecsimple")])
+ [(set_attr "type" "vecmove")
+ (set_attr "size" "128")])
+
+(define_insn "copysign<mode>3_soft"
+ [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
+ (unspec:IEEE128
+ [(match_operand:IEEE128 1 "altivec_register_operand" "v")
+ (match_operand:IEEE128 2 "altivec_register_operand" "v")
+ (match_operand:IEEE128 3 "altivec_register_operand" "+v")]
+ UNSPEC_COPYSIGN))]
+ "!TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
+ "xscpsgndp %x3,%x2,%x1\;xxpermdi %x0,%x3,%x1,1"
+ [(set_attr "type" "veccomplex")
+ (set_attr "length" "8")])
(define_insn "neg<mode>2_hw"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13258,7 +13349,8 @@
(match_operand:IEEE128 1 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsnegqp %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecmove")
+ (set_attr "size" "128")])
(define_insn "abs<mode>2_hw"
@@ -13267,7 +13359,8 @@
(match_operand:IEEE128 1 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsabsqp %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecmove")
+ (set_attr "size" "128")])
(define_insn "*nabs<mode>2_hw"
@@ -13277,7 +13370,8 @@
(match_operand:IEEE128 1 "altivec_register_operand" "v"))))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsnabsqp %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecmove")
+ (set_attr "size" "128")])
;; Initially don't worry about doing fusion
(define_insn "*fma<mode>4_hw"
@@ -13288,7 +13382,8 @@
(match_operand:IEEE128 3 "altivec_register_operand" "0")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsmaddqp %0,%1,%2"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "*fms<mode>4_hw"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13299,7 +13394,8 @@
(match_operand:IEEE128 3 "altivec_register_operand" "0"))))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsmsubqp %0,%1,%2"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "*nfma<mode>4_hw"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13310,7 +13406,8 @@
(match_operand:IEEE128 3 "altivec_register_operand" "0"))))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsnmaddqp %0,%1,%2"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "*nfms<mode>4_hw"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13322,7 +13419,8 @@
(match_operand:IEEE128 3 "altivec_register_operand" "0")))))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xsnmsubqp %0,%1,%2"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "extend<SFDF:mode><IEEE128:mode>2_hw"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13330,7 +13428,8 @@
(match_operand:SFDF 1 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<IEEE128:MODE>mode)"
"xscvdpqp %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
;; Conversion between KFmode and TFmode if TFmode is ieee 128-bit floating
;; point is a simple copy.
@@ -13347,7 +13446,7 @@
emit_note (NOTE_INSN_DELETED);
DONE;
}
- [(set_attr "type" "*,vecsimple")
+ [(set_attr "type" "*,veclogical")
(set_attr "length" "0,4")])
(define_insn_and_split "trunctfkf2"
@@ -13363,7 +13462,7 @@
emit_note (NOTE_INSN_DELETED);
DONE;
}
- [(set_attr "type" "*,vecsimple")
+ [(set_attr "type" "*,veclogical")
(set_attr "length" "0,4")])
(define_insn "trunc<mode>df2_hw"
@@ -13372,7 +13471,8 @@
(match_operand:IEEE128 1 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xscvqpdp %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
;; There is no KFmode -> SFmode instruction. Preserve the accuracy by doing
;; the KFmode -> DFmode conversion using round to odd rather than the normal
@@ -13469,7 +13569,8 @@
UNSPEC_IEEE128_CONVERT))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xscvqp<su>wz %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "*xscvqp<su>dz_<mode>"
[(set (match_operand:V2DI 0 "altivec_register_operand" "=v")
@@ -13479,7 +13580,8 @@
UNSPEC_IEEE128_CONVERT))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xscvqp<su>dz %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "*xscv<su>dqp_<mode>"
[(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
@@ -13488,7 +13590,8 @@
UNSPEC_IEEE128_CONVERT)))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xscv<su>dqp %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
(define_insn "*ieee128_mfvsrd_64bit"
[(set (match_operand:DI 0 "reg_or_indexed_operand" "=wr,Z,wi")
@@ -13499,7 +13602,7 @@
mfvsrd %0,%x1
stxsdx %x1,%y0
xxlor %x0,%x1,%x1"
- [(set_attr "type" "mftgpr,fpstore,vecsimple")])
+ [(set_attr "type" "mftgpr,fpstore,veclogical")])
(define_insn "*ieee128_mfvsrd_32bit"
@@ -13510,7 +13613,7 @@
"@
stxsdx %x1,%y0
xxlor %x0,%x1,%x1"
- [(set_attr "type" "fpstore,vecsimple")])
+ [(set_attr "type" "fpstore,veclogical")])
(define_insn "*ieee128_mfvsrwz"
[(set (match_operand:SI 0 "reg_or_indexed_operand" "=r,Z")
@@ -13546,7 +13649,7 @@
mtvsrd %x0,%1
lxsdx %x0,%y1
xxlor %x0,%x1,%x1"
- [(set_attr "type" "mffgpr,fpload,vecsimple")])
+ [(set_attr "type" "mffgpr,fpload,veclogical")])
(define_insn "*ieee128_mtvsrd_32bit"
[(set (match_operand:V2DI 0 "altivec_register_operand" "=v,v")
@@ -13556,7 +13659,7 @@
"@
lxsdx %x0,%y1
xxlor %x0,%x1,%x1"
- [(set_attr "type" "fpload,vecsimple")])
+ [(set_attr "type" "fpload,veclogical")])
;; IEEE 128-bit instructions with round to odd semantics
(define_insn "*trunc<mode>df2_odd"
@@ -13565,7 +13668,8 @@
UNSPEC_ROUND_TO_ODD))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xscvqpdpo %0,%1"
- [(set_attr "type" "vecfloat")])
+ [(set_attr "type" "vecfloat")
+ (set_attr "size" "128")])
;; IEEE 128-bit comparisons
(define_insn "*cmp<mode>_hw"
@@ -13574,7 +13678,8 @@
(match_operand:IEEE128 2 "altivec_register_operand" "v")))]
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (<MODE>mode)"
"xscmpuqp %0,%1,%2"
- [(set_attr "type" "fpcompare")])
+ [(set_attr "type" "veccmp")
+ (set_attr "size" "128")])
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 9a155ce0e0..611ed01b2a 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -470,8 +470,8 @@ Target RejectNegative Joined UInteger Var(rs6000_long_double_type_size) Save
-mlong-double-<n> Specify size of long double (64 or 128 bits).
mlra
-Target Report Var(rs6000_lra_flag) Init(0) Save
-Use LRA instead of reload.
+Target Report Mask(LRA) Var(rs6000_isa_flags)
+Enable Local Register Allocation.
msched-costly-dep=
Target RejectNegative Joined Var(rs6000_sched_costly_dep_str)
@@ -605,13 +605,25 @@ mpower9-fusion
Target Report Mask(P9_FUSION) Var(rs6000_isa_flags)
Fuse certain operations together for better performance on power9.
+mpower9-misc
+Target Undocumented Report Mask(P9_MISC) Var(rs6000_isa_flags)
+Use/do not use certain scalar instructions added in ISA 3.0.
+
mpower9-vector
Target Report Mask(P9_VECTOR) Var(rs6000_isa_flags)
-Use/do not use vector and scalar instructions added in ISA 3.0.
+Use/do not use vector instructions added in ISA 3.0.
+
+mpower9-dform-scalar
+Target Undocumented Mask(P9_DFORM_SCALAR) Var(rs6000_isa_flags)
+Use/do not use scalar register+offset memory instructions added in ISA 3.0.
+
+mpower9-dform-vector
+Target Undocumented Mask(P9_DFORM_VECTOR) Var(rs6000_isa_flags)
+Use/do not use vector register+offset memory instructions added in ISA 3.0.
mpower9-dform
-Target Undocumented Mask(P9_DFORM) Var(rs6000_isa_flags)
-Use/do not use vector and scalar instructions added in ISA 3.0.
+Target Report Var(TARGET_P9_DFORM_BOTH) Init(-1) Save
+Use/do not use register+offset memory instructions added in ISA 3.0.
mpower9-minmax
Target Undocumented Mask(P9_MINMAX) Var(rs6000_isa_flags)
diff --git a/gcc/config/rs6000/rs64.md b/gcc/config/rs6000/rs64.md
index b730aa82ec..e33cb78559 100644
--- a/gcc/config/rs6000/rs64.md
+++ b/gcc/config/rs6000/rs64.md
@@ -111,7 +111,7 @@
"mciu_rs64,fpu_rs64,bpu_rs64")
(define_insn_reservation "rs64a-fp" 4
- (and (eq_attr "type" "fp,dmul")
+ (and (eq_attr "type" "fp,fpsimple,dmul")
(eq_attr "cpu" "rs64a"))
"mciu_rs64,fpu_rs64")
diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h
index cbf909722d..ba35fd27d0 100644
--- a/gcc/config/rs6000/sysv4.h
+++ b/gcc/config/rs6000/sysv4.h
@@ -744,21 +744,32 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
%{!mnewlib: %{pthread:-lpthread} %{shared:-lc} \
%{!shared: %{profile:-lc_p} %{!profile:-lc}}}"
+#if ENABLE_OFFLOADING == 1
+#define CRTOFFLOADBEGIN "%{fopenacc|fopenmp:crtoffloadbegin%O%s}"
+#define CRTOFFLOADEND "%{fopenacc|fopenmp:crtoffloadend%O%s}"
+#else
+#define CRTOFFLOADBEGIN ""
+#define CRTOFFLOADEND ""
+#endif
+
#ifdef HAVE_LD_PIE
#define STARTFILE_LINUX_SPEC "\
%{!shared: %{pg|p|profile:gcrt1.o%s;pie:Scrt1.o%s;:crt1.o%s}} \
%{mnewlib:ecrti.o%s;:crti.o%s} \
-%{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
+%{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s} \
+" CRTOFFLOADBEGIN
#else
#define STARTFILE_LINUX_SPEC "\
%{!shared: %{pg|p|profile:gcrt1.o%s;:crt1.o%s}} \
%{mnewlib:ecrti.o%s;:crti.o%s} \
-%{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
+%{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s} \
+" CRTOFFLOADBEGIN
#endif
#define ENDFILE_LINUX_SPEC "\
%{shared|pie:crtendS.o%s;:crtend.o%s} \
-%{mnewlib:ecrtn.o%s;:crtn.o%s}"
+%{mnewlib:ecrtn.o%s;:crtn.o%s} \
+" CRTOFFLOADEND
#define LINK_START_LINUX_SPEC ""
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index 0ba0af0666..f72f729d3a 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -50,6 +50,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
$(srcdir)/config/rs6000/power6.md \
$(srcdir)/config/rs6000/power7.md \
$(srcdir)/config/rs6000/power8.md \
+ $(srcdir)/config/rs6000/power9.md \
$(srcdir)/config/rs6000/cell.md \
$(srcdir)/config/rs6000/xfpu.md \
$(srcdir)/config/rs6000/a2.md \
diff --git a/gcc/config/rs6000/titan.md b/gcc/config/rs6000/titan.md
index 74389534b4..e6658d67bd 100644
--- a/gcc/config/rs6000/titan.md
+++ b/gcc/config/rs6000/titan.md
@@ -156,7 +156,7 @@
;; Make sure the "titan_fp" rule stays last, as it's a catch all for
;; double-precision and unclassified (e.g. fsel) FP-instructions
(define_insn_reservation "titan_fp" 10
- (and (eq_attr "type" "fpcompare,fp,dmul")
+ (and (eq_attr "type" "fpcompare,fp,fpsimple,dmul")
(eq_attr "cpu" "titan"))
"titan_issue,titan_fp0*2,nothing*8,titan_fpwb")
diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 02fb3e3d12..3983c3b8f0 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -26,6 +26,13 @@
;; Vector int modes
(define_mode_iterator VEC_I [V16QI V8HI V4SI V2DI])
+;; Vector int modes for parity
+(define_mode_iterator VEC_IP [V8HI
+ V4SI
+ V2DI
+ V1TI
+ (TI "TARGET_VSX_TIMODE")])
+
;; Vector float modes
(define_mode_iterator VEC_F [V4SF V2DF])
@@ -738,12 +745,24 @@
(clz:VEC_I (match_operand:VEC_I 1 "register_operand" "")))]
"TARGET_P8_VECTOR")
+;; Vector count trailing zeros
+(define_expand "ctz<mode>2"
+ [(set (match_operand:VEC_I 0 "register_operand" "")
+ (ctz:VEC_I (match_operand:VEC_I 1 "register_operand" "")))]
+ "TARGET_P9_VECTOR")
+
;; Vector population count
(define_expand "popcount<mode>2"
[(set (match_operand:VEC_I 0 "register_operand" "")
(popcount:VEC_I (match_operand:VEC_I 1 "register_operand" "")))]
"TARGET_P8_VECTOR")
+;; Vector parity
+(define_expand "parity<mode>2"
+ [(set (match_operand:VEC_IP 0 "register_operand" "")
+ (parity:VEC_IP (match_operand:VEC_IP 1 "register_operand" "")))]
+ "TARGET_P9_VECTOR")
+
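The parity<mode>2 expander above exposes ISA 3.0's vector parity operations: each element of the result is 1 if the corresponding source element has an odd number of set bits, and 0 otherwise. A scalar C model for one 32-bit element (the function name is illustrative):

#include <stdint.h>

/* Fold the word onto itself with XOR; the low bit ends up holding the
   parity (1 when the popcount is odd).  Illustrative only.  */
static uint32_t parity32_model (uint32_t x)
{
  x ^= x >> 16;
  x ^= x >> 8;
  x ^= x >> 4;
  x ^= x >> 2;
  x ^= x >> 1;
  return x & 1;
}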
;; Same size conversions
(define_expand "float<VEC_int><mode>2"
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 45af23361a..09cc23229c 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -55,8 +55,7 @@
(KF "FLOAT128_VECTOR_P (KFmode)")
(TF "FLOAT128_VECTOR_P (TFmode)")])
-;; Iterator for memory move. Handle TImode specially to allow
-;; it to use gprs as well as vsx registers.
+;; Iterator for memory moves.
(define_mode_iterator VSX_M [V16QI
V8HI
V4SI
@@ -65,18 +64,8 @@
V2DF
V1TI
(KF "FLOAT128_VECTOR_P (KFmode)")
- (TF "FLOAT128_VECTOR_P (TFmode)")])
-
-(define_mode_iterator VSX_M2 [V16QI
- V8HI
- V4SI
- V2DI
- V4SF
- V2DF
- V1TI
- (KF "FLOAT128_VECTOR_P (KFmode)")
- (TF "FLOAT128_VECTOR_P (TFmode)")
- (TI "TARGET_VSX_TIMODE")])
+ (TF "FLOAT128_VECTOR_P (TFmode)")
+ (TI "TARGET_VSX_TIMODE")])
;; Map into the appropriate load/store name based on the type
(define_mode_attr VSm [(V16QI "vw4")
@@ -270,6 +259,10 @@
(define_mode_attr VS_64reg [(V2DF "ws")
(V2DI "wi")])
+;; Iterators for loading constants with xxspltib
+(define_mode_iterator VSINT_84 [V4SI V2DI])
+(define_mode_iterator VSINT_842 [V8HI V4SI V2DI])
+
;; Constants for creating unspecs
(define_c_enum "unspec"
[UNSPEC_VSX_CONCAT
@@ -299,26 +292,9 @@
UNSPEC_VSX_XVCVUXDDP
UNSPEC_VSX_XVCVDPSXDS
UNSPEC_VSX_XVCVDPUXDS
+ UNSPEC_VSX_SIGN_EXTEND
])
-;; VSX (P9) moves
-
-(define_insn "*p9_vecload_<mode>"
- [(set (match_operand:VSX_M2 0 "vsx_register_operand" "=<VSa>")
- (match_operand:VSX_M2 1 "memory_operand" "Z"))]
- "TARGET_P9_VECTOR"
- "lxvx %x0,%y1"
- [(set_attr "type" "vecload")
- (set_attr "length" "4")])
-
-(define_insn "*p9_vecstore_<mode>"
- [(set (match_operand:VSX_M2 0 "memory_operand" "=Z")
- (match_operand:VSX_M2 1 "vsx_register_operand" "<VSa>"))]
- "TARGET_P9_VECTOR"
- "stxvx %x1,%y0"
- [(set_attr "type" "vecstore")
- (set_attr "length" "4")])
-
;; VSX moves
;; The patterns for LE permuted loads and stores come before the general
@@ -709,7 +685,7 @@
}
}
[(set_attr "length" "0,4")
- (set_attr "type" "vecsimple")])
+ (set_attr "type" "veclogical")])
(define_insn_and_split "*vsx_le_perm_load_<mode>"
[(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>")
@@ -787,92 +763,141 @@
(const_int 64)))]
"")
-(define_insn "*vsx_mov<mode>"
- [(set (match_operand:VSX_M 0 "nonimmediate_operand" "=Z,<VSr>,<VSr>,?Z,?<VSa>,?<VSa>,r,we,wQ,?&r,??Y,??r,??r,<VSr>,?<VSa>,*r,v,wZ,v")
- (match_operand:VSX_M 1 "input_operand" "<VSr>,Z,<VSr>,<VSa>,Z,<VSa>,we,b,r,wQ,r,Y,r,j,j,j,W,v,wZ"))]
- "VECTOR_MEM_VSX_P (<MODE>mode)
- && (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+;; Vector constants that can be generated with XXSPLTIB, which was added in ISA
+;; 3.0. Both (const_vector [..]) and (vec_duplicate ...) forms are recognized.
+(define_insn "xxspltib_v16qi"
+ [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
+ (vec_duplicate:V16QI (match_operand:SI 1 "s8bit_cint_operand" "n")))]
+ "TARGET_P9_VECTOR"
{
- return rs6000_output_move_128bit (operands);
+ operands[2] = GEN_INT (INTVAL (operands[1]) & 0xff);
+ return "xxspltib %x0,%2";
}
- [(set_attr "type" "vecstore,vecload,vecsimple,vecstore,vecload,vecsimple,mffgpr,mftgpr,load,store,store,load, *,vecsimple,vecsimple,*, *,vecstore,vecload")
- (set_attr "length" "4,4,4,4,4,4,8,4,12,12,12,12,16,4,4,*,16,4,4")])
-
-;; Unlike other VSX moves, allow the GPRs even for reloading, since a normal
-;; use of TImode is for unions. However for plain data movement, slightly
-;; favor the vector loads
-(define_insn "*vsx_movti_64bit"
- [(set (match_operand:TI 0 "nonimmediate_operand" "=Z,wa,wa,wa,r,we,v,v,wZ,wQ,&r,Y,r,r,?r")
- (match_operand:TI 1 "input_operand" "wa,Z,wa,O,we,b,W,wZ,v,r,wQ,r,Y,r,n"))]
- "TARGET_POWERPC64 && VECTOR_MEM_VSX_P (TImode)
- && (register_operand (operands[0], TImode)
- || register_operand (operands[1], TImode))"
+ [(set_attr "type" "vecperm")])
+
+(define_insn "xxspltib_<mode>_nosplit"
+ [(set (match_operand:VSINT_842 0 "vsx_register_operand" "=wa,wa")
+ (match_operand:VSINT_842 1 "xxspltib_constant_nosplit" "jwM,wE"))]
+ "TARGET_P9_VECTOR"
{
- return rs6000_output_move_128bit (operands);
+ rtx op1 = operands[1];
+ int value = 256;
+ int num_insns = -1;
+
+ if (!xxspltib_constant_p (op1, <MODE>mode, &num_insns, &value)
+ || num_insns != 1)
+ gcc_unreachable ();
+
+ operands[2] = GEN_INT (value & 0xff);
+ return "xxspltib %x0,%2";
}
- [(set_attr "type" "vecstore,vecload,vecsimple,vecsimple,mffgpr,mftgpr,vecsimple,vecstore,vecload,store,load,store,load,*,*")
- (set_attr "length" "4,4,4,4,8,4,16,4,4,8,8,8,8,8,8")])
-
-(define_insn "*vsx_movti_32bit"
- [(set (match_operand:TI 0 "nonimmediate_operand" "=Z,wa,wa,wa,v, v,wZ,Q,Y,????r,????r,????r,r")
- (match_operand:TI 1 "input_operand" "wa, Z,wa, O,W,wZ, v,r,r, Q, Y, r,n"))]
- "! TARGET_POWERPC64 && VECTOR_MEM_VSX_P (TImode)
- && (register_operand (operands[0], TImode)
- || register_operand (operands[1], TImode))"
+ [(set_attr "type" "vecperm")])
+
+(define_insn_and_split "*xxspltib_<mode>_split"
+ [(set (match_operand:VSINT_842 0 "altivec_register_operand" "=v")
+ (match_operand:VSINT_842 1 "xxspltib_constant_split" "wS"))]
+ "TARGET_P9_VECTOR"
+ "#"
+ "&& 1"
+ [(const_int 0)]
{
- switch (which_alternative)
- {
- case 0:
- return "stxvd2x %x1,%y0";
+ int value = 256;
+ int num_insns = -1;
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx tmp = ((can_create_pseudo_p ())
+ ? gen_reg_rtx (V16QImode)
+ : gen_lowpart (V16QImode, op0));
- case 1:
- return "lxvd2x %x0,%y1";
+ if (!xxspltib_constant_p (op1, <MODE>mode, &num_insns, &value)
+ || num_insns != 2)
+ gcc_unreachable ();
- case 2:
- return "xxlor %x0,%x1,%x1";
+ emit_insn (gen_xxspltib_v16qi (tmp, GEN_INT (value)));
- case 3:
- return "xxlxor %x0,%x0,%x0";
+ if (<MODE>mode == V2DImode)
+ emit_insn (gen_vsx_sign_extend_qi_v2di (op0, tmp));
- case 4:
- return output_vec_const_move (operands);
+ else if (<MODE>mode == V4SImode)
+ emit_insn (gen_vsx_sign_extend_qi_v4si (op0, tmp));
- case 5:
- return "stvx %1,%y0";
+ else if (<MODE>mode == V8HImode)
+ emit_insn (gen_altivec_vupkhsb (op0, tmp));
- case 6:
- return "lvx %0,%y1";
+ else
+ gcc_unreachable ();
- case 7:
- if (TARGET_STRING)
- return \"stswi %1,%P0,16\";
+ DONE;
+}
+ [(set_attr "type" "vecperm")
+ (set_attr "length" "8")])
- case 8:
- return \"#\";
- case 9:
- /* If the address is not used in the output, we can use lsi. Otherwise,
- fall through to generating four loads. */
- if (TARGET_STRING
- && ! reg_overlap_mentioned_p (operands[0], operands[1]))
- return \"lswi %0,%P1,16\";
- /* ... fall through ... */
+;; Prefer using vector registers over GPRs.  Prefer using ISA 3.0's XXSPLTIB
+;; or Altivec VSPLTISW 0/-1 over XXLXOR/XXLORC to set a register to all 0's or
+;; all 1's, since the machine does not have to wait for the previous
+;; instruction using the register being set (such as a store waiting on a slow
+;; instruction). But generate XXLXOR/XXLORC if it will avoid a register move.
- case 10:
- case 11:
- case 12:
- return \"#\";
- default:
- gcc_unreachable ();
- }
+;; VSX store VSX load VSX move VSX->GPR GPR->VSX LQ (GPR)
+;; STQ (GPR) GPR load GPR store GPR move XXSPLTIB VSPLTISW
+;; VSX 0/-1 GPR 0/-1 VMX const GPR const LVX (VMX) STVX (VMX)
+(define_insn "*vsx_mov<mode>_64bit"
+ [(set (match_operand:VSX_M 0 "nonimmediate_operand"
+ "=ZwO, <VSa>, <VSa>, r, we, ?wQ,
+ ?&r, ??r, ??Y, ??r, wo, v,
+ ?<VSa>, *r, v, ??r, wZ, v")
+
+ (match_operand:VSX_M 1 "input_operand"
+ "<VSa>, ZwO, <VSa>, we, r, r,
+ wQ, Y, r, r, wE, jwM,
+ ?jwM, jwM, W, W, v, wZ"))]
+
+ "TARGET_POWERPC64 && VECTOR_MEM_VSX_P (<MODE>mode)
+ && (register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))"
+{
+ return rs6000_output_move_128bit (operands);
}
- [(set_attr "type" "vecstore,vecload,vecsimple,vecsimple,vecsimple,vecstore,vecload,store,store,load,load, *, *")
- (set_attr "update" " *, *, *, *, *, *, *, yes, yes, yes, yes, *, *")
- (set_attr "length" " 4, 4, 4, 4, 8, 4, 4, 16, 16, 16, 16,16,16")
- (set (attr "cell_micro") (if_then_else (match_test "TARGET_STRING")
- (const_string "always")
- (const_string "conditional")))])
+ [(set_attr "type"
+ "vecstore, vecload, vecsimple, mffgpr, mftgpr, load,
+ store, load, store, *, vecsimple, vecsimple,
+ vecsimple, *, *, *, vecstore, vecload")
+
+ (set_attr "length"
+ "4, 4, 4, 8, 4, 8,
+ 8, 8, 8, 8, 4, 4,
+ 4, 8, 20, 20, 4, 4")])
+
+;; VSX store VSX load VSX move GPR load GPR store GPR move
+;; XXSPLTIB VSPLTISW VSX 0/-1 GPR 0/-1 VMX const GPR const
+;; LVX (VMX) STVX (VMX)
+(define_insn "*vsx_mov<mode>_32bit"
+ [(set (match_operand:VSX_M 0 "nonimmediate_operand"
+ "=ZwO, <VSa>, <VSa>, ??r, ??Y, ??r,
+ wo, v, ?<VSa>, *r, v, ??r,
+ wZ, v")
+
+ (match_operand:VSX_M 1 "input_operand"
+ "<VSa>, ZwO, <VSa>, Y, r, r,
+ wE, jwM, ?jwM, jwM, W, W,
+ v, wZ"))]
+
+ "!TARGET_POWERPC64 && VECTOR_MEM_VSX_P (<MODE>mode)
+ && (register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))"
+{
+ return rs6000_output_move_128bit (operands);
+}
+ [(set_attr "type"
+ "vecstore, vecload, vecsimple, load, store, *,
+ vecsimple, vecsimple, vecsimple, *, *, *,
+ vecstore, vecload")
+
+ (set_attr "length"
+ "4, 4, 4, 16, 16, 16,
+ 4, 4, 4, 16, 20, 32,
+ 4, 4")])
;; Explicit load/store expanders for the builtin functions
(define_expand "vsx_load_<mode>"
@@ -887,6 +912,140 @@
"VECTOR_MEM_VSX_P (<MODE>mode)"
"")
+;; Explicit load/store expanders for the builtin functions for lxvd2x, etc.,
+;; when you really want their element-reversing behavior.
+(define_insn "vsx_ld_elemrev_v2di"
+ [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
+ (vec_select:V2DI
+ (match_operand:V2DI 1 "memory_operand" "Z")
+ (parallel [(const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V2DImode) && !BYTES_BIG_ENDIAN"
+ "lxvd2x %x0,%y1"
+ [(set_attr "type" "vecload")])
+
+(define_insn "vsx_ld_elemrev_v2df"
+ [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa")
+ (vec_select:V2DF
+ (match_operand:V2DF 1 "memory_operand" "Z")
+ (parallel [(const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V2DFmode) && !BYTES_BIG_ENDIAN"
+ "lxvd2x %x0,%y1"
+ [(set_attr "type" "vecload")])
+
+(define_insn "vsx_ld_elemrev_v4si"
+ [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa")
+ (vec_select:V4SI
+ (match_operand:V4SI 1 "memory_operand" "Z")
+ (parallel [(const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V4SImode) && !BYTES_BIG_ENDIAN"
+ "lxvw4x %x0,%y1"
+ [(set_attr "type" "vecload")])
+
+(define_insn "vsx_ld_elemrev_v4sf"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (vec_select:V4SF
+ (match_operand:V4SF 1 "memory_operand" "Z")
+ (parallel [(const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V4SFmode) && !BYTES_BIG_ENDIAN"
+ "lxvw4x %x0,%y1"
+ [(set_attr "type" "vecload")])
+
+(define_insn "vsx_ld_elemrev_v8hi"
+ [(set (match_operand:V8HI 0 "vsx_register_operand" "=wa")
+ (vec_select:V8HI
+ (match_operand:V8HI 1 "memory_operand" "Z")
+ (parallel [(const_int 7) (const_int 6)
+ (const_int 5) (const_int 4)
+ (const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V8HImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
+ "lxvh8x %x0,%y1"
+ [(set_attr "type" "vecload")])
+
+(define_insn "vsx_ld_elemrev_v16qi"
+ [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
+ (vec_select:V16QI
+ (match_operand:V16QI 1 "memory_operand" "Z")
+ (parallel [(const_int 15) (const_int 14)
+ (const_int 13) (const_int 12)
+ (const_int 11) (const_int 10)
+ (const_int 9) (const_int 8)
+ (const_int 7) (const_int 6)
+ (const_int 5) (const_int 4)
+ (const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V16QImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
+ "lxvb16x %x0,%y1"
+ [(set_attr "type" "vecload")])
+
+(define_insn "vsx_st_elemrev_v2df"
+ [(set (match_operand:V2DF 0 "memory_operand" "=Z")
+ (vec_select:V2DF
+ (match_operand:V2DF 1 "vsx_register_operand" "wa")
+ (parallel [(const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V2DFmode) && !BYTES_BIG_ENDIAN"
+ "stxvd2x %x1,%y0"
+ [(set_attr "type" "vecstore")])
+
+(define_insn "vsx_st_elemrev_v2di"
+ [(set (match_operand:V2DI 0 "memory_operand" "=Z")
+ (vec_select:V2DI
+ (match_operand:V2DI 1 "vsx_register_operand" "wa")
+ (parallel [(const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V2DImode) && !BYTES_BIG_ENDIAN"
+ "stxvd2x %x1,%y0"
+ [(set_attr "type" "vecstore")])
+
+(define_insn "vsx_st_elemrev_v4sf"
+ [(set (match_operand:V4SF 0 "memory_operand" "=Z")
+ (vec_select:V4SF
+ (match_operand:V4SF 1 "vsx_register_operand" "wa")
+ (parallel [(const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V4SFmode) && !BYTES_BIG_ENDIAN"
+ "stxvw4x %x1,%y0"
+ [(set_attr "type" "vecstore")])
+
+(define_insn "vsx_st_elemrev_v4si"
+ [(set (match_operand:V4SI 0 "memory_operand" "=Z")
+ (vec_select:V4SI
+ (match_operand:V4SI 1 "vsx_register_operand" "wa")
+ (parallel [(const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V4SImode) && !BYTES_BIG_ENDIAN"
+ "stxvw4x %x1,%y0"
+ [(set_attr "type" "vecstore")])
+
+(define_insn "vsx_st_elemrev_v8hi"
+ [(set (match_operand:V8HI 0 "memory_operand" "=Z")
+ (vec_select:V8HI
+ (match_operand:V8HI 1 "vsx_register_operand" "wa")
+ (parallel [(const_int 7) (const_int 6)
+ (const_int 5) (const_int 4)
+ (const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V8HImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
+ "stxvh8x %x1,%y0"
+ [(set_attr "type" "vecstore")])
+
+(define_insn "vsx_st_elemrev_v16qi"
+ [(set (match_operand:V16QI 0 "memory_operand" "=Z")
+ (vec_select:V16QI
+ (match_operand:V16QI 1 "vsx_register_operand" "wa")
+ (parallel [(const_int 15) (const_int 14)
+ (const_int 13) (const_int 12)
+ (const_int 11) (const_int 10)
+ (const_int 9) (const_int 8)
+ (const_int 7) (const_int 6)
+ (const_int 5) (const_int 4)
+ (const_int 3) (const_int 2)
+ (const_int 1) (const_int 0)])))]
+ "VECTOR_MEM_VSX_P (V16QImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
+ "stxvb16x %x1,%y0"
+ [(set_attr "type" "vecstore")])
+
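The vsx_ld_elemrev_*/vsx_st_elemrev_* patterns above give the built-in functions direct access to the element-reversing behaviour of lxvd2x/lxvw4x (and the new lxvh8x/lxvb16x) on little-endian targets: element 0 of the register corresponds to the highest-numbered element in memory. A C model of the V4SI load (names are illustrative):

#include <stdint.h>

static void ld_elemrev_v4si_model (const int32_t mem[4], int32_t reg[4])
{
  /* Register element i receives memory element 3 - i.  */
  for (int i = 0; i < 4; i++)
    reg[i] = mem[3 - i];
}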
;; VSX vector floating point arithmetic instructions. The VSX scalar
;; instructions are now combined with the insn for the traditional floating
@@ -1333,7 +1492,7 @@
(match_operand:VSX_L 3 "vsx_register_operand" "<VSr>,<VSa>")))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
"xxsel %x0,%x3,%x2,%x1"
- [(set_attr "type" "vecperm")])
+ [(set_attr "type" "vecmove")])
(define_insn "*vsx_xxsel<mode>_uns"
[(set (match_operand:VSX_L 0 "vsx_register_operand" "=<VSr>,?<VSa>")
@@ -1344,7 +1503,7 @@
(match_operand:VSX_L 3 "vsx_register_operand" "<VSr>,<VSa>")))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
"xxsel %x0,%x3,%x2,%x1"
- [(set_attr "type" "vecperm")])
+ [(set_attr "type" "vecmove")])
;; Copy sign
(define_insn "vsx_copysign<mode>3"
@@ -1583,10 +1742,15 @@
{
rtx op0 = operands[0];
rtx op1 = operands[1];
- rtx tmp = gen_reg_rtx (V2DFmode);
- int scale = INTVAL(operands[2]);
- if (scale != 0)
- rs6000_scale_v2df (tmp, op1, scale);
+ rtx tmp;
+ int scale = INTVAL (operands[2]);
+ if (scale == 0)
+ tmp = op1;
+ else
+ {
+ tmp = gen_reg_rtx (V2DFmode);
+ rs6000_scale_v2df (tmp, op1, scale);
+ }
emit_insn (gen_vsx_xvcvdpsxds (op0, tmp));
DONE;
})
@@ -1607,10 +1771,15 @@
{
rtx op0 = operands[0];
rtx op1 = operands[1];
- rtx tmp = gen_reg_rtx (V2DFmode);
- int scale = INTVAL(operands[2]);
- if (scale != 0)
- rs6000_scale_v2df (tmp, op1, scale);
+ rtx tmp;
+ int scale = INTVAL (operands[2]);
+ if (scale == 0)
+ tmp = op1;
+ else
+ {
+ tmp = gen_reg_rtx (V2DFmode);
+ rs6000_scale_v2df (tmp, op1, scale);
+ }
emit_insn (gen_vsx_xvcvdpuxds (op0, tmp));
DONE;
})
@@ -1960,7 +2129,7 @@
return "xxlor %x0,%x1,%x1";
}
- [(set_attr "type" "fp,vecsimple,mftgpr,mftgpr")
+ [(set_attr "type" "fpsimple,veclogical,mftgpr,mftgpr")
(set_attr "length" "4")])
(define_insn "*vsx_extract_<mode>_internal2"
@@ -1995,7 +2164,7 @@
operands[3] = GEN_INT (fldDM);
return "xxpermdi %x0,%x1,%x1,%3";
}
- [(set_attr "type" "fp,vecsimple,vecperm")
+ [(set_attr "type" "fpsimple,veclogical,vecperm")
(set_attr "length" "4")])
;; Optimize extracting a single scalar element from memory if the scalar is in
@@ -2215,20 +2384,61 @@
;; V2DF/V2DI splat
(define_insn "vsx_splat_<mode>"
- [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wd,wd,wd,?<VSa>,?<VSa>,?<VSa>")
+ [(set (match_operand:VSX_D 0 "vsx_register_operand" "=<VSa>,<VSa>,we")
(vec_duplicate:VSX_D
- (match_operand:<VS_scalar> 1 "splat_input_operand" "<VS_64reg>,f,Z,<VSa>,<VSa>,Z")))]
+ (match_operand:<VS_scalar> 1 "splat_input_operand" "<VS_64reg>,Z,b")))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
"@
xxpermdi %x0,%x1,%x1,0
- xxpermdi %x0,%x1,%x1,0
lxvdsx %x0,%y1
- xxpermdi %x0,%x1,%x1,0
- xxpermdi %x0,%x1,%x1,0
- lxvdsx %x0,%y1"
- [(set_attr "type" "vecperm,vecperm,vecload,vecperm,vecperm,vecload")])
+ mtvsrdd %x0,%1,%1"
+ [(set_attr "type" "vecperm,vecload,mftgpr")])
-;; V4SF/V4SI splat
+;; V4SI splat (ISA 3.0)
+;; When SI's are allowed in VSX registers, add XXSPLTW support
+(define_expand "vsx_splat_<mode>"
+ [(set (match_operand:VSX_W 0 "vsx_register_operand" "")
+ (vec_duplicate:VSX_W
+ (match_operand:<VS_scalar> 1 "splat_input_operand" "")))]
+ "TARGET_P9_VECTOR"
+{
+ if (MEM_P (operands[1]))
+ operands[1] = rs6000_address_for_fpconvert (operands[1]);
+ else if (!REG_P (operands[1]))
+ operands[1] = force_reg (<VS_scalar>mode, operands[1]);
+})
+
+(define_insn "*vsx_splat_v4si_internal"
+ [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa,wa")
+ (vec_duplicate:V4SI
+ (match_operand:SI 1 "splat_input_operand" "r,Z")))]
+ "TARGET_P9_VECTOR"
+ "@
+ mtvsrws %x0,%1
+ lxvwsx %x0,%y1"
+ [(set_attr "type" "mftgpr,vecload")])
+
+;; V4SF splat (ISA 3.0)
+(define_insn_and_split "*vsx_splat_v4sf_internal"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa,wa")
+ (vec_duplicate:V4SF
+ (match_operand:SF 1 "splat_input_operand" "Z,wy,r")))]
+ "TARGET_P9_VECTOR"
+ "@
+ lxvwsx %x0,%y1
+ #
+ mtvsrws %x0,%1"
+ "&& reload_completed && vsx_register_operand (operands[1], SFmode)"
+ [(set (match_dup 0)
+ (unspec:V4SF [(match_dup 1)] UNSPEC_VSX_CVDPSPN))
+ (set (match_dup 0)
+ (unspec:V4SF [(match_dup 0)
+ (const_int 0)] UNSPEC_VSX_XXSPLTW))]
+ ""
+ [(set_attr "type" "vecload,vecperm,mftgpr")
+ (set_attr "length" "4,8,4")])
+
+;; V4SF/V4SI splat from a vector element
(define_insn "vsx_xxspltw_<mode>"
[(set (match_operand:VSX_W 0 "vsx_register_operand" "=wf,?<VSa>")
(vec_duplicate:VSX_W
@@ -2471,21 +2681,50 @@
(define_peephole
[(set (match_operand:P 0 "base_reg_operand" "")
(match_operand:P 1 "short_cint_operand" ""))
- (set (match_operand:VSX_M2 2 "vsx_register_operand" "")
- (mem:VSX_M2 (plus:P (match_dup 0)
- (match_operand:P 3 "int_reg_operand" ""))))]
+ (set (match_operand:VSX_M 2 "vsx_register_operand" "")
+ (mem:VSX_M (plus:P (match_dup 0)
+ (match_operand:P 3 "int_reg_operand" ""))))]
"TARGET_VSX && TARGET_P8_FUSION && !TARGET_P9_VECTOR"
- "li %0,%1\t\t\t# vector load fusion\;lx<VSX_M2:VSm>x %x2,%0,%3"
+ "li %0,%1\t\t\t# vector load fusion\;lx<VSX_M:VSm>x %x2,%0,%3"
[(set_attr "length" "8")
(set_attr "type" "vecload")])
(define_peephole
[(set (match_operand:P 0 "base_reg_operand" "")
(match_operand:P 1 "short_cint_operand" ""))
- (set (match_operand:VSX_M2 2 "vsx_register_operand" "")
- (mem:VSX_M2 (plus:P (match_operand:P 3 "int_reg_operand" "")
- (match_dup 0))))]
+ (set (match_operand:VSX_M 2 "vsx_register_operand" "")
+ (mem:VSX_M (plus:P (match_operand:P 3 "int_reg_operand" "")
+ (match_dup 0))))]
"TARGET_VSX && TARGET_P8_FUSION && !TARGET_P9_VECTOR"
- "li %0,%1\t\t\t# vector load fusion\;lx<VSX_M2:VSm>x %x2,%0,%3"
+ "li %0,%1\t\t\t# vector load fusion\;lx<VSX_M:VSm>x %x2,%0,%3"
[(set_attr "length" "8")
(set_attr "type" "vecload")])
+
+
+;; ISA 3.0 vector extend sign support
+
+(define_insn "vsx_sign_extend_qi_<mode>"
+ [(set (match_operand:VSINT_84 0 "vsx_register_operand" "=v")
+ (unspec:VSINT_84
+ [(match_operand:V16QI 1 "vsx_register_operand" "v")]
+ UNSPEC_VSX_SIGN_EXTEND))]
+ "TARGET_P9_VECTOR"
+ "vextsb2<wd> %0,%1"
+ [(set_attr "type" "vecexts")])
+
+(define_insn "*vsx_sign_extend_hi_<mode>"
+ [(set (match_operand:VSINT_84 0 "vsx_register_operand" "=v")
+ (unspec:VSINT_84
+ [(match_operand:V8HI 1 "vsx_register_operand" "v")]
+ UNSPEC_VSX_SIGN_EXTEND))]
+ "TARGET_P9_VECTOR"
+ "vextsh2<wd> %0,%1"
+ [(set_attr "type" "vecexts")])
+
+(define_insn "*vsx_sign_extend_si_v2di"
+ [(set (match_operand:V2DI 0 "vsx_register_operand" "=v")
+ (unspec:V2DI [(match_operand:V4SI 1 "vsx_register_operand" "v")]
+ UNSPEC_VSX_SIGN_EXTEND))]
+ "TARGET_P9_VECTOR"
+ "vextsw2d %0,%1"
+ [(set_attr "type" "vecexts")])
diff --git a/gcc/config/rs6000/xfpu.md b/gcc/config/rs6000/xfpu.md
index 14557eb81b..963a1b5e75 100644
--- a/gcc/config/rs6000/xfpu.md
+++ b/gcc/config/rs6000/xfpu.md
@@ -55,7 +55,7 @@
(define_insn_reservation "fp-default" 2
(and (and
- (eq_attr "type" "fp")
+ (eq_attr "type" "fp,fpsimple")
(eq_attr "fp_type" "fp_default"))
(eq_attr "cpu" "ppc405"))
"Xfpu_issue*2")
@@ -67,14 +67,14 @@
(define_insn_reservation "fp-addsub-s" 14
(and (and
- (eq_attr "type" "fp")
+ (eq_attr "type" "fp,fpsimple")
(eq_attr "fp_type" "fp_addsub_s"))
(eq_attr "cpu" "ppc405"))
"Xfpu_issue*2,Xfpu_addsub")
(define_insn_reservation "fp-addsub-d" 18
(and (and
- (eq_attr "type" "fp")
+ (eq_attr "type" "fp,fpsimple")
(eq_attr "fp_type" "fp_addsub_d"))
(eq_attr "cpu" "ppc405"))
"Xfpu_issue*2,Xfpu_addsub")
diff --git a/gcc/config/rtems.h b/gcc/config/rtems.h
index f13f72fd17..e005547fd9 100644
--- a/gcc/config/rtems.h
+++ b/gcc/config/rtems.h
@@ -45,6 +45,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define LIB_SPEC "%{!qrtems: " STD_LIB_SPEC "} " \
"%{!nostdlib: %{qrtems: --start-group \
-lrtemsbsp -lrtemscpu \
- -lc -lgcc --end-group %{!qnolinkcmds: -T linkcmds%s}}}"
+ -latomic -lc -lgcc --end-group %{!qnolinkcmds: -T linkcmds%s}}}"
#define TARGET_POSIX_IO
diff --git a/gcc/config/s390/s390-builtin-types.def b/gcc/config/s390/s390-builtin-types.def
index 6179b040e2..f5fcf986c8 100644
--- a/gcc/config/s390/s390-builtin-types.def
+++ b/gcc/config/s390/s390-builtin-types.def
@@ -19,29 +19,29 @@
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
-#define DEF_FN_TYPE_1(FN_TYPE, FLAGS, T1) \
+#define DEF_FN_TYPE_0(FN_TYPE, FLAGS, T1) \
DEF_FN_TYPE (FN_TYPE, \
FLAGS, \
s390_builtin_types[T1])
-#define DEF_FN_TYPE_2(FN_TYPE, FLAGS, T1, T2) \
+#define DEF_FN_TYPE_1(FN_TYPE, FLAGS, T1, T2) \
DEF_FN_TYPE (FN_TYPE, \
FLAGS, \
s390_builtin_types[T1], \
s390_builtin_types[T2])
-#define DEF_FN_TYPE_3(FN_TYPE, FLAGS, T1, T2, T3) \
+#define DEF_FN_TYPE_2(FN_TYPE, FLAGS, T1, T2, T3) \
DEF_FN_TYPE (FN_TYPE, \
FLAGS, \
s390_builtin_types[T1], \
s390_builtin_types[T2], \
s390_builtin_types[T3])
-#define DEF_FN_TYPE_4(FN_TYPE, FLAGS, T1, T2, T3, T4) \
+#define DEF_FN_TYPE_3(FN_TYPE, FLAGS, T1, T2, T3, T4) \
DEF_FN_TYPE (FN_TYPE, \
FLAGS, \
s390_builtin_types[T1], \
s390_builtin_types[T2], \
s390_builtin_types[T3], \
s390_builtin_types[T4])
-#define DEF_FN_TYPE_5(FN_TYPE, FLAGS, T1, T2, T3, T4, T5) \
+#define DEF_FN_TYPE_4(FN_TYPE, FLAGS, T1, T2, T3, T4, T5) \
DEF_FN_TYPE (FN_TYPE, \
FLAGS, \
s390_builtin_types[T1], \
@@ -49,7 +49,7 @@
s390_builtin_types[T3], \
s390_builtin_types[T4], \
s390_builtin_types[T5])
-#define DEF_FN_TYPE_6(FN_TYPE, FLAGS, T1, T2, T3, T4, T5, T6) \
+#define DEF_FN_TYPE_5(FN_TYPE, FLAGS, T1, T2, T3, T4, T5, T6) \
DEF_FN_TYPE (FN_TYPE, \
FLAGS, \
s390_builtin_types[T1], \
@@ -66,6 +66,7 @@ DEF_TYPE (BT_FLT, B_VX, float_type_node, 0)
DEF_TYPE (BT_UINT, 0, unsigned_type_node, 0)
DEF_TYPE (BT_VOIDCONST, B_VX, void_type_node, 1)
DEF_TYPE (BT_ULONG, B_VX, long_unsigned_type_node, 0)
+DEF_TYPE (BT_INT128, B_VX, intTI_type_node, 0)
DEF_TYPE (BT_USHORTCONST, B_VX, short_unsigned_type_node, 1)
DEF_TYPE (BT_SHORTCONST, B_VX, short_integer_type_node, 1)
DEF_TYPE (BT_INTCONST, B_VX, integer_type_node, 1)
@@ -126,210 +127,212 @@ DEF_OPAQUE_VECTOR_TYPE (BT_OUV4SI, B_VX, BT_UINT, 4)
DEF_OPAQUE_VECTOR_TYPE (BT_BV4SI, B_VX, BT_BINT, 4)
DEF_OPAQUE_VECTOR_TYPE (BT_BV2DI, B_VX, BT_BLONGLONG, 2)
DEF_OPAQUE_VECTOR_TYPE (BT_BV8HI, B_VX, BT_BSHORT, 8)
-DEF_FN_TYPE_1 (BT_FN_INT, B_HTM, BT_INT)
-DEF_FN_TYPE_1 (BT_FN_UINT, 0, BT_UINT)
-DEF_FN_TYPE_2 (BT_FN_INT_INT, B_VX, BT_INT, BT_INT)
-DEF_FN_TYPE_2 (BT_FN_INT_VOIDPTR, B_HTM, BT_INT, BT_VOIDPTR)
-DEF_FN_TYPE_2 (BT_FN_OV4SI_INT, B_VX, BT_OV4SI, BT_INT)
-DEF_FN_TYPE_2 (BT_FN_OV4SI_INTCONSTPTR, B_VX, BT_OV4SI, BT_INTCONSTPTR)
-DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI, B_VX, BT_OV4SI, BT_OV4SI)
-DEF_FN_TYPE_2 (BT_FN_UV16QI_UCHAR, B_VX, BT_UV16QI, BT_UCHAR)
-DEF_FN_TYPE_2 (BT_FN_UV16QI_UCHARCONSTPTR, B_VX, BT_UV16QI, BT_UCHARCONSTPTR)
-DEF_FN_TYPE_2 (BT_FN_UV16QI_USHORT, B_VX, BT_UV16QI, BT_USHORT)
-DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI, B_VX, BT_UV16QI, BT_UV16QI)
-DEF_FN_TYPE_2 (BT_FN_UV2DI_ULONGLONG, B_VX, BT_UV2DI, BT_ULONGLONG)
-DEF_FN_TYPE_2 (BT_FN_UV2DI_ULONGLONGCONSTPTR, B_VX, BT_UV2DI, BT_ULONGLONGCONSTPTR)
-DEF_FN_TYPE_2 (BT_FN_UV2DI_USHORT, B_VX, BT_UV2DI, BT_USHORT)
-DEF_FN_TYPE_2 (BT_FN_UV2DI_UV2DI, B_VX, BT_UV2DI, BT_UV2DI)
-DEF_FN_TYPE_2 (BT_FN_UV2DI_UV4SI, B_VX, BT_UV2DI, BT_UV4SI)
-DEF_FN_TYPE_2 (BT_FN_UV4SI_UINT, B_VX, BT_UV4SI, BT_UINT)
-DEF_FN_TYPE_2 (BT_FN_UV4SI_UINTCONSTPTR, B_VX, BT_UV4SI, BT_UINTCONSTPTR)
-DEF_FN_TYPE_2 (BT_FN_UV4SI_USHORT, B_VX, BT_UV4SI, BT_USHORT)
-DEF_FN_TYPE_2 (BT_FN_UV4SI_UV4SI, B_VX, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_2 (BT_FN_UV4SI_UV8HI, B_VX, BT_UV4SI, BT_UV8HI)
-DEF_FN_TYPE_2 (BT_FN_UV8HI_USHORT, B_VX, BT_UV8HI, BT_USHORT)
-DEF_FN_TYPE_2 (BT_FN_UV8HI_USHORTCONSTPTR, B_VX, BT_UV8HI, BT_USHORTCONSTPTR)
-DEF_FN_TYPE_2 (BT_FN_UV8HI_UV16QI, B_VX, BT_UV8HI, BT_UV16QI)
-DEF_FN_TYPE_2 (BT_FN_UV8HI_UV8HI, B_VX, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_2 (BT_FN_V16QI_SCHAR, B_VX, BT_V16QI, BT_SCHAR)
-DEF_FN_TYPE_2 (BT_FN_V16QI_UCHAR, B_VX, BT_V16QI, BT_UCHAR)
-DEF_FN_TYPE_2 (BT_FN_V16QI_V16QI, B_VX, BT_V16QI, BT_V16QI)
-DEF_FN_TYPE_2 (BT_FN_V2DF_DBL, B_VX, BT_V2DF, BT_DBL)
-DEF_FN_TYPE_2 (BT_FN_V2DF_FLTCONSTPTR, B_VX, BT_V2DF, BT_FLTCONSTPTR)
-DEF_FN_TYPE_2 (BT_FN_V2DF_V2DF, B_VX, BT_V2DF, BT_V2DF)
-DEF_FN_TYPE_2 (BT_FN_V2DI_SHORT, B_VX, BT_V2DI, BT_SHORT)
-DEF_FN_TYPE_2 (BT_FN_V2DI_V16QI, B_VX, BT_V2DI, BT_V16QI)
-DEF_FN_TYPE_2 (BT_FN_V2DI_V2DI, B_VX, BT_V2DI, BT_V2DI)
-DEF_FN_TYPE_2 (BT_FN_V2DI_V4SI, B_VX, BT_V2DI, BT_V4SI)
-DEF_FN_TYPE_2 (BT_FN_V2DI_V8HI, B_VX, BT_V2DI, BT_V8HI)
-DEF_FN_TYPE_2 (BT_FN_V4SI_SHORT, B_VX, BT_V4SI, BT_SHORT)
-DEF_FN_TYPE_2 (BT_FN_V4SI_V4SI, B_VX, BT_V4SI, BT_V4SI)
-DEF_FN_TYPE_2 (BT_FN_V4SI_V8HI, B_VX, BT_V4SI, BT_V8HI)
-DEF_FN_TYPE_2 (BT_FN_V8HI_SHORT, B_VX, BT_V8HI, BT_SHORT)
-DEF_FN_TYPE_2 (BT_FN_V8HI_V16QI, B_VX, BT_V8HI, BT_V16QI)
-DEF_FN_TYPE_2 (BT_FN_V8HI_V8HI, B_VX, BT_V8HI, BT_V8HI)
-DEF_FN_TYPE_2 (BT_FN_VOID_INT, B_HTM, BT_VOID, BT_INT)
-DEF_FN_TYPE_2 (BT_FN_VOID_UINT, 0, BT_VOID, BT_UINT)
-DEF_FN_TYPE_3 (BT_FN_DBL_V2DF_INT, B_VX, BT_DBL, BT_V2DF, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_INT_OV4SI_INT, B_VX, BT_INT, BT_OV4SI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_INT_OV4SI_OV4SI, B_VX, BT_INT, BT_OV4SI, BT_OV4SI)
-DEF_FN_TYPE_3 (BT_FN_INT_UV16QI_UV16QI, B_VX, BT_INT, BT_UV16QI, BT_UV16QI)
-DEF_FN_TYPE_3 (BT_FN_INT_UV2DI_UV2DI, B_VX, BT_INT, BT_UV2DI, BT_UV2DI)
-DEF_FN_TYPE_3 (BT_FN_INT_UV4SI_UV4SI, B_VX, BT_INT, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_3 (BT_FN_INT_UV8HI_UV8HI, B_VX, BT_INT, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_3 (BT_FN_INT_V16QI_V16QI, B_VX, BT_INT, BT_V16QI, BT_V16QI)
-DEF_FN_TYPE_3 (BT_FN_INT_V2DF_V2DF, B_VX, BT_INT, BT_V2DF, BT_V2DF)
-DEF_FN_TYPE_3 (BT_FN_INT_V2DI_V2DI, B_VX, BT_INT, BT_V2DI, BT_V2DI)
-DEF_FN_TYPE_3 (BT_FN_INT_V4SI_V4SI, B_VX, BT_INT, BT_V4SI, BT_V4SI)
-DEF_FN_TYPE_3 (BT_FN_INT_V8HI_V8HI, B_VX, BT_INT, BT_V8HI, BT_V8HI)
-DEF_FN_TYPE_3 (BT_FN_INT_VOIDPTR_INT, B_HTM, BT_INT, BT_VOIDPTR, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_OV2DI_LONGLONG_LONGLONG, B_VX, BT_OV2DI, BT_LONGLONG, BT_LONGLONG)
-DEF_FN_TYPE_3 (BT_FN_OV4SI_INTCONSTPTR_INT, B_VX, BT_OV4SI, BT_INTCONSTPTR, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_OV4SI_INTCONSTPTR_UINT, B_VX, BT_OV4SI, BT_INTCONSTPTR, BT_UINT)
-DEF_FN_TYPE_3 (BT_FN_OV4SI_INT_INT, B_VX, BT_OV4SI, BT_INT, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_INTPTR, B_VX, BT_OV4SI, BT_OV4SI, BT_INTPTR)
-DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_OV4SI, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI)
-DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_UCHAR, B_VX, BT_OV4SI, BT_OV4SI, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_ULONG, B_VX, BT_OV4SI, BT_OV4SI, BT_ULONG)
-DEF_FN_TYPE_3 (BT_FN_UCHAR_UV16QI_INT, B_VX, BT_UCHAR, BT_UV16QI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UINT_UV4SI_INT, B_VX, BT_UINT, BT_UV4SI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UINT_VOIDCONSTPTR_INT, B_VX, BT_UINT, BT_VOIDCONSTPTR, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_ULONGLONG_UV2DI_INT, B_VX, BT_ULONGLONG, BT_UV2DI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_USHORT_UV8HI_INT, B_VX, BT_USHORT, BT_UV8HI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UCHARCONSTPTR_USHORT, B_VX, BT_UV16QI, BT_UCHARCONSTPTR, BT_USHORT)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UCHAR_INT, B_VX, BT_UV16QI, BT_UCHAR, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UCHAR_UCHAR, B_VX, BT_UV16QI, BT_UCHAR, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_INTPTR)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UCHAR, B_VX, BT_UV16QI, BT_UV16QI, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UINT, B_VX, BT_UV16QI, BT_UV16QI, BT_UINT)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UV2DI_UV2DI, B_VX, BT_UV16QI, BT_UV2DI, BT_UV2DI)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UV4SI_UV4SI, B_VX, BT_UV16QI, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_3 (BT_FN_UV16QI_UV8HI_UV8HI, B_VX, BT_UV16QI, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_UCHAR_UCHAR, B_VX, BT_UV2DI, BT_UCHAR, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_ULONGLONG_INT, B_VX, BT_UV2DI, BT_ULONGLONG, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UCHAR, B_VX, BT_UV2DI, BT_UV2DI, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UINT, B_VX, BT_UV2DI, BT_UV2DI, BT_UINT)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UV2DI, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_UV4SI_UV4SI, B_VX, BT_UV2DI, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_UV8HI_UV8HI, B_VX, BT_UV2DI, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_3 (BT_FN_UV2DI_V2DF_INT, B_VX, BT_UV2DI, BT_V2DF, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UCHAR_UCHAR, B_VX, BT_UV4SI, BT_UCHAR, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UINT_INT, B_VX, BT_UV4SI, BT_UINT, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UV16QI_UV16QI, B_VX, BT_UV4SI, BT_UV16QI, BT_UV16QI)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UV2DI_UV2DI, B_VX, BT_UV4SI, BT_UV2DI, BT_UV2DI)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_INTPTR)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UCHAR, B_VX, BT_UV4SI, BT_UV4SI, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UINT, B_VX, BT_UV4SI, BT_UV4SI, BT_UINT)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UV4SI, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_3 (BT_FN_UV4SI_UV8HI_UV8HI, B_VX, BT_UV4SI, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_UCHAR_UCHAR, B_VX, BT_UV8HI, BT_UCHAR, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_USHORT_INT, B_VX, BT_UV8HI, BT_USHORT, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_UV16QI_UV16QI, B_VX, BT_UV8HI, BT_UV16QI, BT_UV16QI)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_UV4SI_UV4SI, B_VX, BT_UV8HI, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_INTPTR)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UCHAR, B_VX, BT_UV8HI, BT_UV8HI, BT_UCHAR)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UINT, B_VX, BT_UV8HI, BT_UV8HI, BT_UINT)
-DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_3 (BT_FN_V16QI_BV16QI_V16QI, B_VX, BT_V16QI, BT_BV16QI, BT_V16QI)
-DEF_FN_TYPE_3 (BT_FN_V16QI_UINT_VOIDCONSTPTR, B_VX, BT_V16QI, BT_UINT, BT_VOIDCONSTPTR)
-DEF_FN_TYPE_3 (BT_FN_V16QI_UV16QI_UV16QI, B_VX, BT_V16QI, BT_UV16QI, BT_UV16QI)
-DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI, B_VX, BT_V16QI, BT_V16QI, BT_V16QI)
-DEF_FN_TYPE_3 (BT_FN_V16QI_V8HI_V8HI, B_VX, BT_V16QI, BT_V8HI, BT_V8HI)
-DEF_FN_TYPE_3 (BT_FN_V2DF_DBL_INT, B_VX, BT_V2DF, BT_DBL, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_V2DF_UV2DI_INT, B_VX, BT_V2DF, BT_UV2DI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_V2DF_UV4SI_INT, B_VX, BT_V2DF, BT_UV4SI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_V2DF, B_VX, BT_V2DF, BT_V2DF, BT_V2DF)
-DEF_FN_TYPE_3 (BT_FN_V2DF_V2DI_INT, B_VX, BT_V2DF, BT_V2DI, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_V2DI_BV2DI_V2DI, B_VX, BT_V2DI, BT_BV2DI, BT_V2DI)
-DEF_FN_TYPE_3 (BT_FN_V2DI_UV2DI_UV2DI, B_VX, BT_V2DI, BT_UV2DI, BT_UV2DI)
-DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_INT, B_VX, BT_V2DI, BT_V2DF, BT_INT)
-DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_V2DF, B_VX, BT_V2DI, BT_V2DF, BT_V2DF)
-DEF_FN_TYPE_3 (BT_FN_V2DI_V2DI_V2DI, B_VX, BT_V2DI, BT_V2DI, BT_V2DI)
-DEF_FN_TYPE_3 (BT_FN_V2DI_V4SI_V4SI, B_VX, BT_V2DI, BT_V4SI, BT_V4SI)
-DEF_FN_TYPE_3 (BT_FN_V4SI_BV4SI_V4SI, B_VX, BT_V4SI, BT_BV4SI, BT_V4SI)
-DEF_FN_TYPE_3 (BT_FN_V4SI_INT_VOIDPTR, B_VX, BT_V4SI, BT_INT, BT_VOIDPTR)
-DEF_FN_TYPE_3 (BT_FN_V4SI_UV4SI_UV4SI, B_VX, BT_V4SI, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_3 (BT_FN_V4SI_V2DI_V2DI, B_VX, BT_V4SI, BT_V2DI, BT_V2DI)
-DEF_FN_TYPE_3 (BT_FN_V4SI_V4SI_V4SI, B_VX, BT_V4SI, BT_V4SI, BT_V4SI)
-DEF_FN_TYPE_3 (BT_FN_V4SI_V8HI_V8HI, B_VX, BT_V4SI, BT_V8HI, BT_V8HI)
-DEF_FN_TYPE_3 (BT_FN_V8HI_BV8HI_V8HI, B_VX, BT_V8HI, BT_BV8HI, BT_V8HI)
-DEF_FN_TYPE_3 (BT_FN_V8HI_UV8HI_UV8HI, B_VX, BT_V8HI, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_3 (BT_FN_V8HI_V16QI_V16QI, B_VX, BT_V8HI, BT_V16QI, BT_V16QI)
-DEF_FN_TYPE_3 (BT_FN_V8HI_V4SI_V4SI, B_VX, BT_V8HI, BT_V4SI, BT_V4SI)
-DEF_FN_TYPE_3 (BT_FN_V8HI_V8HI_V8HI, B_VX, BT_V8HI, BT_V8HI, BT_V8HI)
-DEF_FN_TYPE_3 (BT_FN_VOID_UINT64PTR_UINT64, B_HTM, BT_VOID, BT_UINT64PTR, BT_UINT64)
-DEF_FN_TYPE_3 (BT_FN_VOID_V2DF_FLTPTR, B_VX, BT_VOID, BT_V2DF, BT_FLTPTR)
-DEF_FN_TYPE_4 (BT_FN_INT_OV4SI_OV4SI_INTPTR, B_VX, BT_INT, BT_OV4SI, BT_OV4SI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_OV4SI_INT_OV4SI_INT, B_VX, BT_OV4SI, BT_INT, BT_OV4SI, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_OV4SI_OV4SI_OV4SI_INT, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_OV4SI_OV4SI_OV4SI_INTPTR, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_OV4SI_OV4SI_OV4SI_OV4SI, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_OV4SI)
-DEF_FN_TYPE_4 (BT_FN_OV4SI_OV4SI_OV4SI_UCHAR, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_UCHAR)
-DEF_FN_TYPE_4 (BT_FN_OV4SI_OV4SI_OV4SI_ULONGLONG, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_ULONGLONG)
-DEF_FN_TYPE_4 (BT_FN_UV16QI_UV16QI_UCHAR_INT, B_VX, BT_UV16QI, BT_UV16QI, BT_UCHAR, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV16QI_UV16QI_UV16QI_INT, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV16QI_UV16QI_UV16QI_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI)
-DEF_FN_TYPE_4 (BT_FN_UV16QI_UV2DI_UV2DI_UV16QI, B_VX, BT_UV16QI, BT_UV2DI, BT_UV2DI, BT_UV16QI)
-DEF_FN_TYPE_4 (BT_FN_UV16QI_UV8HI_UV8HI_INTPTR, B_VX, BT_UV16QI, BT_UV8HI, BT_UV8HI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_UV2DI_UV2DI_ULONGLONG_INT, B_VX, BT_UV2DI, BT_UV2DI, BT_ULONGLONG, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV2DI_UV2DI_UV2DI_INT, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV2DI_UV4SI_UV4SI_UV2DI, B_VX, BT_UV2DI, BT_UV4SI, BT_UV4SI, BT_UV2DI)
-DEF_FN_TYPE_4 (BT_FN_UV4SI_UV2DI_UV2DI_INTPTR, B_VX, BT_UV4SI, BT_UV2DI, BT_UV2DI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UINT_INT, B_VX, BT_UV4SI, BT_UV4SI, BT_UINT, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_INT, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UV4SI)
-DEF_FN_TYPE_4 (BT_FN_UV4SI_UV8HI_UV8HI_UV4SI, B_VX, BT_UV4SI, BT_UV8HI, BT_UV8HI, BT_UV4SI)
-DEF_FN_TYPE_4 (BT_FN_UV8HI_UV16QI_UV16QI_UV8HI, B_VX, BT_UV8HI, BT_UV16QI, BT_UV16QI, BT_UV8HI)
-DEF_FN_TYPE_4 (BT_FN_UV8HI_UV4SI_UV4SI_INTPTR, B_VX, BT_UV8HI, BT_UV4SI, BT_UV4SI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_USHORT_INT, B_VX, BT_UV8HI, BT_UV8HI, BT_USHORT, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_INT, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI)
-DEF_FN_TYPE_4 (BT_FN_V16QI_UV16QI_UV16QI_INTPTR, B_VX, BT_V16QI, BT_UV16QI, BT_UV16QI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V16QI_V16QI_V16QI_INTPTR, B_VX, BT_V16QI, BT_V16QI, BT_V16QI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V16QI_V16QI_V16QI_V16QI, B_VX, BT_V16QI, BT_V16QI, BT_V16QI, BT_V16QI)
-DEF_FN_TYPE_4 (BT_FN_V16QI_V8HI_V8HI_INTPTR, B_VX, BT_V16QI, BT_V8HI, BT_V8HI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V2DF_V2DF_DBL_INT, B_VX, BT_V2DF, BT_V2DF, BT_DBL, BT_INT)
-DEF_FN_TYPE_4 (BT_FN_V2DF_V2DF_UCHAR_UCHAR, B_VX, BT_V2DF, BT_V2DF, BT_UCHAR, BT_UCHAR)
-DEF_FN_TYPE_4 (BT_FN_V2DF_V2DF_V2DF_V2DF, B_VX, BT_V2DF, BT_V2DF, BT_V2DF, BT_V2DF)
-DEF_FN_TYPE_4 (BT_FN_V2DI_UV2DI_UV2DI_INTPTR, B_VX, BT_V2DI, BT_UV2DI, BT_UV2DI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V2DI_V2DF_INT_INTPTR, B_VX, BT_V2DI, BT_V2DF, BT_INT, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V2DI_V2DF_V2DF_INTPTR, B_VX, BT_V2DI, BT_V2DF, BT_V2DF, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V2DI_V2DI_V2DI_INTPTR, B_VX, BT_V2DI, BT_V2DI, BT_V2DI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V2DI_V4SI_V4SI_V2DI, B_VX, BT_V2DI, BT_V4SI, BT_V4SI, BT_V2DI)
-DEF_FN_TYPE_4 (BT_FN_V4SI_UV4SI_UV4SI_INTPTR, B_VX, BT_V4SI, BT_UV4SI, BT_UV4SI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V4SI_V2DI_V2DI_INTPTR, B_VX, BT_V4SI, BT_V2DI, BT_V2DI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V4SI_V4SI_V4SI_INTPTR, B_VX, BT_V4SI, BT_V4SI, BT_V4SI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V4SI_V4SI_V4SI_V4SI, B_VX, BT_V4SI, BT_V4SI, BT_V4SI, BT_V4SI)
-DEF_FN_TYPE_4 (BT_FN_V4SI_V8HI_V8HI_V4SI, B_VX, BT_V4SI, BT_V8HI, BT_V8HI, BT_V4SI)
-DEF_FN_TYPE_4 (BT_FN_V8HI_UV8HI_UV8HI_INTPTR, B_VX, BT_V8HI, BT_UV8HI, BT_UV8HI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V8HI_V16QI_V16QI_V8HI, B_VX, BT_V8HI, BT_V16QI, BT_V16QI, BT_V8HI)
-DEF_FN_TYPE_4 (BT_FN_V8HI_V4SI_V4SI_INTPTR, B_VX, BT_V8HI, BT_V4SI, BT_V4SI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V8HI_V8HI_V8HI_INTPTR, B_VX, BT_V8HI, BT_V8HI, BT_V8HI, BT_INTPTR)
-DEF_FN_TYPE_4 (BT_FN_V8HI_V8HI_V8HI_V8HI, B_VX, BT_V8HI, BT_V8HI, BT_V8HI, BT_V8HI)
-DEF_FN_TYPE_4 (BT_FN_VOID_OV4SI_INT_VOIDPTR, B_VX, BT_VOID, BT_OV4SI, BT_INT, BT_VOIDPTR)
-DEF_FN_TYPE_4 (BT_FN_VOID_OV4SI_VOIDPTR_UINT, B_VX, BT_VOID, BT_OV4SI, BT_VOIDPTR, BT_UINT)
-DEF_FN_TYPE_4 (BT_FN_VOID_V16QI_UINT_VOIDPTR, B_VX, BT_VOID, BT_V16QI, BT_UINT, BT_VOIDPTR)
-DEF_FN_TYPE_5 (BT_FN_OV4SI_OV4SI_OUV4SI_INTCONSTPTR_UCHAR, B_VX, BT_OV4SI, BT_OV4SI, BT_OUV4SI, BT_INTCONSTPTR, BT_UCHAR)
-DEF_FN_TYPE_5 (BT_FN_OV4SI_OV4SI_OV4SI_OV4SI_INTPTR, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_INTPTR)
-DEF_FN_TYPE_5 (BT_FN_UV16QI_UV16QI_UV16QI_INT_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT, BT_INTPTR)
-DEF_FN_TYPE_5 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI_INT, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT)
-DEF_FN_TYPE_5 (BT_FN_UV2DI_UV2DI_UV2DI_ULONGLONGCONSTPTR_UCHAR, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_ULONGLONGCONSTPTR, BT_UCHAR)
-DEF_FN_TYPE_5 (BT_FN_UV2DI_UV2DI_UV2DI_UV2DI_INT, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_INT)
-DEF_FN_TYPE_5 (BT_FN_UV4SI_UV4SI_UV4SI_INT_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT, BT_INTPTR)
-DEF_FN_TYPE_5 (BT_FN_UV4SI_UV4SI_UV4SI_UINTCONSTPTR_UCHAR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UINTCONSTPTR, BT_UCHAR)
-DEF_FN_TYPE_5 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT)
-DEF_FN_TYPE_5 (BT_FN_UV8HI_UV8HI_UV8HI_INT_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT, BT_INTPTR)
-DEF_FN_TYPE_5 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT)
-DEF_FN_TYPE_5 (BT_FN_VOID_UV2DI_UV2DI_ULONGLONGPTR_ULONGLONG, B_VX, BT_VOID, BT_UV2DI, BT_UV2DI, BT_ULONGLONGPTR, BT_ULONGLONG)
-DEF_FN_TYPE_5 (BT_FN_VOID_UV4SI_UV4SI_UINTPTR_ULONGLONG, B_VX, BT_VOID, BT_UV4SI, BT_UV4SI, BT_UINTPTR, BT_ULONGLONG)
-DEF_FN_TYPE_5 (BT_FN_VOID_V4SI_V4SI_INTPTR_ULONGLONG, B_VX, BT_VOID, BT_V4SI, BT_V4SI, BT_INTPTR, BT_ULONGLONG)
-DEF_FN_TYPE_6 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI_INT_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT, BT_INTPTR)
-DEF_FN_TYPE_6 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT, BT_INTPTR)
-DEF_FN_TYPE_6 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT, BT_INTPTR)
+DEF_FN_TYPE_0 (BT_FN_INT, B_HTM, BT_INT)
+DEF_FN_TYPE_0 (BT_FN_UINT, 0, BT_UINT)
+DEF_FN_TYPE_1 (BT_FN_INT_INT, B_VX, BT_INT, BT_INT)
+DEF_FN_TYPE_1 (BT_FN_INT_VOIDPTR, B_HTM, BT_INT, BT_VOIDPTR)
+DEF_FN_TYPE_1 (BT_FN_OV4SI_INT, B_VX, BT_OV4SI, BT_INT)
+DEF_FN_TYPE_1 (BT_FN_OV4SI_INTCONSTPTR, B_VX, BT_OV4SI, BT_INTCONSTPTR)
+DEF_FN_TYPE_1 (BT_FN_OV4SI_OV4SI, B_VX, BT_OV4SI, BT_OV4SI)
+DEF_FN_TYPE_1 (BT_FN_UV16QI_UCHAR, B_VX, BT_UV16QI, BT_UCHAR)
+DEF_FN_TYPE_1 (BT_FN_UV16QI_UCHARCONSTPTR, B_VX, BT_UV16QI, BT_UCHARCONSTPTR)
+DEF_FN_TYPE_1 (BT_FN_UV16QI_USHORT, B_VX, BT_UV16QI, BT_USHORT)
+DEF_FN_TYPE_1 (BT_FN_UV16QI_UV16QI, B_VX, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_1 (BT_FN_UV2DI_ULONGLONG, B_VX, BT_UV2DI, BT_ULONGLONG)
+DEF_FN_TYPE_1 (BT_FN_UV2DI_ULONGLONGCONSTPTR, B_VX, BT_UV2DI, BT_ULONGLONGCONSTPTR)
+DEF_FN_TYPE_1 (BT_FN_UV2DI_USHORT, B_VX, BT_UV2DI, BT_USHORT)
+DEF_FN_TYPE_1 (BT_FN_UV2DI_UV2DI, B_VX, BT_UV2DI, BT_UV2DI)
+DEF_FN_TYPE_1 (BT_FN_UV2DI_UV4SI, B_VX, BT_UV2DI, BT_UV4SI)
+DEF_FN_TYPE_1 (BT_FN_UV4SI_UINT, B_VX, BT_UV4SI, BT_UINT)
+DEF_FN_TYPE_1 (BT_FN_UV4SI_UINTCONSTPTR, B_VX, BT_UV4SI, BT_UINTCONSTPTR)
+DEF_FN_TYPE_1 (BT_FN_UV4SI_USHORT, B_VX, BT_UV4SI, BT_USHORT)
+DEF_FN_TYPE_1 (BT_FN_UV4SI_UV4SI, B_VX, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_1 (BT_FN_UV4SI_UV8HI, B_VX, BT_UV4SI, BT_UV8HI)
+DEF_FN_TYPE_1 (BT_FN_UV8HI_USHORT, B_VX, BT_UV8HI, BT_USHORT)
+DEF_FN_TYPE_1 (BT_FN_UV8HI_USHORTCONSTPTR, B_VX, BT_UV8HI, BT_USHORTCONSTPTR)
+DEF_FN_TYPE_1 (BT_FN_UV8HI_UV16QI, B_VX, BT_UV8HI, BT_UV16QI)
+DEF_FN_TYPE_1 (BT_FN_UV8HI_UV8HI, B_VX, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_1 (BT_FN_V16QI_SCHAR, B_VX, BT_V16QI, BT_SCHAR)
+DEF_FN_TYPE_1 (BT_FN_V16QI_UCHAR, B_VX, BT_V16QI, BT_UCHAR)
+DEF_FN_TYPE_1 (BT_FN_V16QI_V16QI, B_VX, BT_V16QI, BT_V16QI)
+DEF_FN_TYPE_1 (BT_FN_V2DF_DBL, B_VX, BT_V2DF, BT_DBL)
+DEF_FN_TYPE_1 (BT_FN_V2DF_FLTCONSTPTR, B_VX, BT_V2DF, BT_FLTCONSTPTR)
+DEF_FN_TYPE_1 (BT_FN_V2DF_V2DF, B_VX, BT_V2DF, BT_V2DF)
+DEF_FN_TYPE_1 (BT_FN_V2DI_SHORT, B_VX, BT_V2DI, BT_SHORT)
+DEF_FN_TYPE_1 (BT_FN_V2DI_V16QI, B_VX, BT_V2DI, BT_V16QI)
+DEF_FN_TYPE_1 (BT_FN_V2DI_V2DI, B_VX, BT_V2DI, BT_V2DI)
+DEF_FN_TYPE_1 (BT_FN_V2DI_V4SI, B_VX, BT_V2DI, BT_V4SI)
+DEF_FN_TYPE_1 (BT_FN_V2DI_V8HI, B_VX, BT_V2DI, BT_V8HI)
+DEF_FN_TYPE_1 (BT_FN_V4SI_SHORT, B_VX, BT_V4SI, BT_SHORT)
+DEF_FN_TYPE_1 (BT_FN_V4SI_V4SI, B_VX, BT_V4SI, BT_V4SI)
+DEF_FN_TYPE_1 (BT_FN_V4SI_V8HI, B_VX, BT_V4SI, BT_V8HI)
+DEF_FN_TYPE_1 (BT_FN_V8HI_SHORT, B_VX, BT_V8HI, BT_SHORT)
+DEF_FN_TYPE_1 (BT_FN_V8HI_V16QI, B_VX, BT_V8HI, BT_V16QI)
+DEF_FN_TYPE_1 (BT_FN_V8HI_V8HI, B_VX, BT_V8HI, BT_V8HI)
+DEF_FN_TYPE_1 (BT_FN_VOID_INT, B_HTM, BT_VOID, BT_INT)
+DEF_FN_TYPE_1 (BT_FN_VOID_UINT, 0, BT_VOID, BT_UINT)
+DEF_FN_TYPE_2 (BT_FN_DBL_V2DF_INT, B_VX, BT_DBL, BT_V2DF, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_INT128_INT128_INT128, B_VX, BT_INT128, BT_INT128, BT_INT128)
+DEF_FN_TYPE_2 (BT_FN_INT_OV4SI_INT, B_VX, BT_INT, BT_OV4SI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_INT_OV4SI_OV4SI, B_VX, BT_INT, BT_OV4SI, BT_OV4SI)
+DEF_FN_TYPE_2 (BT_FN_INT_UV16QI_UV16QI, B_VX, BT_INT, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_2 (BT_FN_INT_UV2DI_UV2DI, B_VX, BT_INT, BT_UV2DI, BT_UV2DI)
+DEF_FN_TYPE_2 (BT_FN_INT_UV4SI_UV4SI, B_VX, BT_INT, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_2 (BT_FN_INT_UV8HI_UV8HI, B_VX, BT_INT, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_2 (BT_FN_INT_V16QI_V16QI, B_VX, BT_INT, BT_V16QI, BT_V16QI)
+DEF_FN_TYPE_2 (BT_FN_INT_V2DF_V2DF, B_VX, BT_INT, BT_V2DF, BT_V2DF)
+DEF_FN_TYPE_2 (BT_FN_INT_V2DI_V2DI, B_VX, BT_INT, BT_V2DI, BT_V2DI)
+DEF_FN_TYPE_2 (BT_FN_INT_V4SI_V4SI, B_VX, BT_INT, BT_V4SI, BT_V4SI)
+DEF_FN_TYPE_2 (BT_FN_INT_V8HI_V8HI, B_VX, BT_INT, BT_V8HI, BT_V8HI)
+DEF_FN_TYPE_2 (BT_FN_INT_VOIDPTR_INT, B_HTM, BT_INT, BT_VOIDPTR, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_OV2DI_LONGLONG_LONGLONG, B_VX, BT_OV2DI, BT_LONGLONG, BT_LONGLONG)
+DEF_FN_TYPE_2 (BT_FN_OV4SI_INTCONSTPTR_INT, B_VX, BT_OV4SI, BT_INTCONSTPTR, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_OV4SI_INTCONSTPTR_UINT, B_VX, BT_OV4SI, BT_INTCONSTPTR, BT_UINT)
+DEF_FN_TYPE_2 (BT_FN_OV4SI_INT_INT, B_VX, BT_OV4SI, BT_INT, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_INTPTR, B_VX, BT_OV4SI, BT_OV4SI, BT_INTPTR)
+DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_OV4SI, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI)
+DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_UCHAR, B_VX, BT_OV4SI, BT_OV4SI, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_OV4SI_OV4SI_ULONG, B_VX, BT_OV4SI, BT_OV4SI, BT_ULONG)
+DEF_FN_TYPE_2 (BT_FN_UCHAR_UV16QI_INT, B_VX, BT_UCHAR, BT_UV16QI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UINT_UV4SI_INT, B_VX, BT_UINT, BT_UV4SI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UINT_VOIDCONSTPTR_INT, B_VX, BT_UINT, BT_VOIDCONSTPTR, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_ULONGLONG_UV2DI_INT, B_VX, BT_ULONGLONG, BT_UV2DI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_USHORT_UV8HI_INT, B_VX, BT_USHORT, BT_UV8HI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UCHARCONSTPTR_USHORT, B_VX, BT_UV16QI, BT_UCHARCONSTPTR, BT_USHORT)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UCHAR_INT, B_VX, BT_UV16QI, BT_UCHAR, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UCHAR_UCHAR, B_VX, BT_UV16QI, BT_UCHAR, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_INTPTR)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_UCHAR, B_VX, BT_UV16QI, BT_UV16QI, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_UINT, B_VX, BT_UV16QI, BT_UV16QI, BT_UINT)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV16QI_UV16QI, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV2DI_UV2DI, B_VX, BT_UV16QI, BT_UV2DI, BT_UV2DI)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV4SI_UV4SI, B_VX, BT_UV16QI, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_2 (BT_FN_UV16QI_UV8HI_UV8HI, B_VX, BT_UV16QI, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_UCHAR_UCHAR, B_VX, BT_UV2DI, BT_UCHAR, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_ULONGLONG_INT, B_VX, BT_UV2DI, BT_ULONGLONG, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_UV2DI_UCHAR, B_VX, BT_UV2DI, BT_UV2DI, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_UV2DI_UINT, B_VX, BT_UV2DI, BT_UV2DI, BT_UINT)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_UV2DI_UV2DI, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_UV4SI_UV4SI, B_VX, BT_UV2DI, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_UV8HI_UV8HI, B_VX, BT_UV2DI, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_2 (BT_FN_UV2DI_V2DF_INT, B_VX, BT_UV2DI, BT_V2DF, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UCHAR_UCHAR, B_VX, BT_UV4SI, BT_UCHAR, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UINT_INT, B_VX, BT_UV4SI, BT_UINT, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UV16QI_UV16QI, B_VX, BT_UV4SI, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UV2DI_UV2DI, B_VX, BT_UV4SI, BT_UV2DI, BT_UV2DI)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UV4SI_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_INTPTR)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UV4SI_UCHAR, B_VX, BT_UV4SI, BT_UV4SI, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UV4SI_UINT, B_VX, BT_UV4SI, BT_UV4SI, BT_UINT)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UV4SI_UV4SI, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_2 (BT_FN_UV4SI_UV8HI_UV8HI, B_VX, BT_UV4SI, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_UCHAR_UCHAR, B_VX, BT_UV8HI, BT_UCHAR, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_USHORT_INT, B_VX, BT_UV8HI, BT_USHORT, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_UV16QI_UV16QI, B_VX, BT_UV8HI, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_UV4SI_UV4SI, B_VX, BT_UV8HI, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_UV8HI_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_INTPTR)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_UV8HI_UCHAR, B_VX, BT_UV8HI, BT_UV8HI, BT_UCHAR)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_UV8HI_UINT, B_VX, BT_UV8HI, BT_UV8HI, BT_UINT)
+DEF_FN_TYPE_2 (BT_FN_UV8HI_UV8HI_UV8HI, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_2 (BT_FN_V16QI_BV16QI_V16QI, B_VX, BT_V16QI, BT_BV16QI, BT_V16QI)
+DEF_FN_TYPE_2 (BT_FN_V16QI_UINT_VOIDCONSTPTR, B_VX, BT_V16QI, BT_UINT, BT_VOIDCONSTPTR)
+DEF_FN_TYPE_2 (BT_FN_V16QI_UV16QI_UV16QI, B_VX, BT_V16QI, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_2 (BT_FN_V16QI_V16QI_V16QI, B_VX, BT_V16QI, BT_V16QI, BT_V16QI)
+DEF_FN_TYPE_2 (BT_FN_V16QI_V8HI_V8HI, B_VX, BT_V16QI, BT_V8HI, BT_V8HI)
+DEF_FN_TYPE_2 (BT_FN_V2DF_DBL_INT, B_VX, BT_V2DF, BT_DBL, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_V2DF_UV2DI_INT, B_VX, BT_V2DF, BT_UV2DI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_V2DF_UV4SI_INT, B_VX, BT_V2DF, BT_UV4SI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_V2DF_V2DF_V2DF, B_VX, BT_V2DF, BT_V2DF, BT_V2DF)
+DEF_FN_TYPE_2 (BT_FN_V2DF_V2DI_INT, B_VX, BT_V2DF, BT_V2DI, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_V2DI_BV2DI_V2DI, B_VX, BT_V2DI, BT_BV2DI, BT_V2DI)
+DEF_FN_TYPE_2 (BT_FN_V2DI_UV2DI_UV2DI, B_VX, BT_V2DI, BT_UV2DI, BT_UV2DI)
+DEF_FN_TYPE_2 (BT_FN_V2DI_V2DF_INT, B_VX, BT_V2DI, BT_V2DF, BT_INT)
+DEF_FN_TYPE_2 (BT_FN_V2DI_V2DF_V2DF, B_VX, BT_V2DI, BT_V2DF, BT_V2DF)
+DEF_FN_TYPE_2 (BT_FN_V2DI_V2DI_V2DI, B_VX, BT_V2DI, BT_V2DI, BT_V2DI)
+DEF_FN_TYPE_2 (BT_FN_V2DI_V4SI_V4SI, B_VX, BT_V2DI, BT_V4SI, BT_V4SI)
+DEF_FN_TYPE_2 (BT_FN_V4SI_BV4SI_V4SI, B_VX, BT_V4SI, BT_BV4SI, BT_V4SI)
+DEF_FN_TYPE_2 (BT_FN_V4SI_INT_VOIDPTR, B_VX, BT_V4SI, BT_INT, BT_VOIDPTR)
+DEF_FN_TYPE_2 (BT_FN_V4SI_UV4SI_UV4SI, B_VX, BT_V4SI, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_2 (BT_FN_V4SI_V2DI_V2DI, B_VX, BT_V4SI, BT_V2DI, BT_V2DI)
+DEF_FN_TYPE_2 (BT_FN_V4SI_V4SI_V4SI, B_VX, BT_V4SI, BT_V4SI, BT_V4SI)
+DEF_FN_TYPE_2 (BT_FN_V4SI_V8HI_V8HI, B_VX, BT_V4SI, BT_V8HI, BT_V8HI)
+DEF_FN_TYPE_2 (BT_FN_V8HI_BV8HI_V8HI, B_VX, BT_V8HI, BT_BV8HI, BT_V8HI)
+DEF_FN_TYPE_2 (BT_FN_V8HI_UV8HI_UV8HI, B_VX, BT_V8HI, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_2 (BT_FN_V8HI_V16QI_V16QI, B_VX, BT_V8HI, BT_V16QI, BT_V16QI)
+DEF_FN_TYPE_2 (BT_FN_V8HI_V4SI_V4SI, B_VX, BT_V8HI, BT_V4SI, BT_V4SI)
+DEF_FN_TYPE_2 (BT_FN_V8HI_V8HI_V8HI, B_VX, BT_V8HI, BT_V8HI, BT_V8HI)
+DEF_FN_TYPE_2 (BT_FN_VOID_UINT64PTR_UINT64, B_HTM, BT_VOID, BT_UINT64PTR, BT_UINT64)
+DEF_FN_TYPE_2 (BT_FN_VOID_V2DF_FLTPTR, B_VX, BT_VOID, BT_V2DF, BT_FLTPTR)
+DEF_FN_TYPE_3 (BT_FN_INT128_INT128_INT128_INT128, B_VX, BT_INT128, BT_INT128, BT_INT128, BT_INT128)
+DEF_FN_TYPE_3 (BT_FN_INT_OV4SI_OV4SI_INTPTR, B_VX, BT_INT, BT_OV4SI, BT_OV4SI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_OV4SI_INT_OV4SI_INT, B_VX, BT_OV4SI, BT_INT, BT_OV4SI, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_OV4SI_INT, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_OV4SI_INTPTR, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_OV4SI_OV4SI, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_OV4SI)
+DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_OV4SI_UCHAR, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_UCHAR)
+DEF_FN_TYPE_3 (BT_FN_OV4SI_OV4SI_OV4SI_ULONGLONG, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_ULONGLONG)
+DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UCHAR_INT, B_VX, BT_UV16QI, BT_UV16QI, BT_UCHAR, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_INT, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI)
+DEF_FN_TYPE_3 (BT_FN_UV16QI_UV2DI_UV2DI_UV16QI, B_VX, BT_UV16QI, BT_UV2DI, BT_UV2DI, BT_UV16QI)
+DEF_FN_TYPE_3 (BT_FN_UV16QI_UV8HI_UV8HI_INTPTR, B_VX, BT_UV16QI, BT_UV8HI, BT_UV8HI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_ULONGLONG_INT, B_VX, BT_UV2DI, BT_UV2DI, BT_ULONGLONG, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UV2DI_INT, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV2DI_UV4SI_UV4SI_UV2DI, B_VX, BT_UV2DI, BT_UV4SI, BT_UV4SI, BT_UV2DI)
+DEF_FN_TYPE_3 (BT_FN_UV4SI_UV2DI_UV2DI_INTPTR, B_VX, BT_UV4SI, BT_UV2DI, BT_UV2DI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UINT_INT, B_VX, BT_UV4SI, BT_UV4SI, BT_UINT, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UV4SI_INT, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UV4SI_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UV4SI)
+DEF_FN_TYPE_3 (BT_FN_UV4SI_UV8HI_UV8HI_UV4SI, B_VX, BT_UV4SI, BT_UV8HI, BT_UV8HI, BT_UV4SI)
+DEF_FN_TYPE_3 (BT_FN_UV8HI_UV16QI_UV16QI_UV8HI, B_VX, BT_UV8HI, BT_UV16QI, BT_UV16QI, BT_UV8HI)
+DEF_FN_TYPE_3 (BT_FN_UV8HI_UV4SI_UV4SI_INTPTR, B_VX, BT_UV8HI, BT_UV4SI, BT_UV4SI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_USHORT_INT, B_VX, BT_UV8HI, BT_UV8HI, BT_USHORT, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INT, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI)
+DEF_FN_TYPE_3 (BT_FN_V16QI_UV16QI_UV16QI_INTPTR, B_VX, BT_V16QI, BT_UV16QI, BT_UV16QI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_INTPTR, B_VX, BT_V16QI, BT_V16QI, BT_V16QI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V16QI_V16QI_V16QI_V16QI, B_VX, BT_V16QI, BT_V16QI, BT_V16QI, BT_V16QI)
+DEF_FN_TYPE_3 (BT_FN_V16QI_V8HI_V8HI_INTPTR, B_VX, BT_V16QI, BT_V8HI, BT_V8HI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_DBL_INT, B_VX, BT_V2DF, BT_V2DF, BT_DBL, BT_INT)
+DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_UCHAR_UCHAR, B_VX, BT_V2DF, BT_V2DF, BT_UCHAR, BT_UCHAR)
+DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_V2DF_V2DF, B_VX, BT_V2DF, BT_V2DF, BT_V2DF, BT_V2DF)
+DEF_FN_TYPE_3 (BT_FN_V2DI_UV2DI_UV2DI_INTPTR, B_VX, BT_V2DI, BT_UV2DI, BT_UV2DI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_INT_INTPTR, B_VX, BT_V2DI, BT_V2DF, BT_INT, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_V2DF_INTPTR, B_VX, BT_V2DI, BT_V2DF, BT_V2DF, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V2DI_V2DI_V2DI_INTPTR, B_VX, BT_V2DI, BT_V2DI, BT_V2DI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V2DI_V4SI_V4SI_V2DI, B_VX, BT_V2DI, BT_V4SI, BT_V4SI, BT_V2DI)
+DEF_FN_TYPE_3 (BT_FN_V4SI_UV4SI_UV4SI_INTPTR, B_VX, BT_V4SI, BT_UV4SI, BT_UV4SI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V4SI_V2DI_V2DI_INTPTR, B_VX, BT_V4SI, BT_V2DI, BT_V2DI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V4SI_V4SI_V4SI_INTPTR, B_VX, BT_V4SI, BT_V4SI, BT_V4SI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V4SI_V4SI_V4SI_V4SI, B_VX, BT_V4SI, BT_V4SI, BT_V4SI, BT_V4SI)
+DEF_FN_TYPE_3 (BT_FN_V4SI_V8HI_V8HI_V4SI, B_VX, BT_V4SI, BT_V8HI, BT_V8HI, BT_V4SI)
+DEF_FN_TYPE_3 (BT_FN_V8HI_UV8HI_UV8HI_INTPTR, B_VX, BT_V8HI, BT_UV8HI, BT_UV8HI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V8HI_V16QI_V16QI_V8HI, B_VX, BT_V8HI, BT_V16QI, BT_V16QI, BT_V8HI)
+DEF_FN_TYPE_3 (BT_FN_V8HI_V4SI_V4SI_INTPTR, B_VX, BT_V8HI, BT_V4SI, BT_V4SI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V8HI_V8HI_V8HI_INTPTR, B_VX, BT_V8HI, BT_V8HI, BT_V8HI, BT_INTPTR)
+DEF_FN_TYPE_3 (BT_FN_V8HI_V8HI_V8HI_V8HI, B_VX, BT_V8HI, BT_V8HI, BT_V8HI, BT_V8HI)
+DEF_FN_TYPE_3 (BT_FN_VOID_OV4SI_INT_VOIDPTR, B_VX, BT_VOID, BT_OV4SI, BT_INT, BT_VOIDPTR)
+DEF_FN_TYPE_3 (BT_FN_VOID_OV4SI_VOIDPTR_UINT, B_VX, BT_VOID, BT_OV4SI, BT_VOIDPTR, BT_UINT)
+DEF_FN_TYPE_3 (BT_FN_VOID_V16QI_UINT_VOIDPTR, B_VX, BT_VOID, BT_V16QI, BT_UINT, BT_VOIDPTR)
+DEF_FN_TYPE_4 (BT_FN_OV4SI_OV4SI_OUV4SI_INTCONSTPTR_UCHAR, B_VX, BT_OV4SI, BT_OV4SI, BT_OUV4SI, BT_INTCONSTPTR, BT_UCHAR)
+DEF_FN_TYPE_4 (BT_FN_OV4SI_OV4SI_OV4SI_OV4SI_INTPTR, B_VX, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_OV4SI, BT_INTPTR)
+DEF_FN_TYPE_4 (BT_FN_UV16QI_UV16QI_UV16QI_INT_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT, BT_INTPTR)
+DEF_FN_TYPE_4 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI_INT, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT)
+DEF_FN_TYPE_4 (BT_FN_UV2DI_UV2DI_UV2DI_ULONGLONGCONSTPTR_UCHAR, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_ULONGLONGCONSTPTR, BT_UCHAR)
+DEF_FN_TYPE_4 (BT_FN_UV2DI_UV2DI_UV2DI_UV2DI_INT, B_VX, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_INT)
+DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_INT_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT, BT_INTPTR)
+DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UINTCONSTPTR_UCHAR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UINTCONSTPTR, BT_UCHAR)
+DEF_FN_TYPE_4 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT)
+DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_INT_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT, BT_INTPTR)
+DEF_FN_TYPE_4 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT)
+DEF_FN_TYPE_4 (BT_FN_VOID_UV2DI_UV2DI_ULONGLONGPTR_ULONGLONG, B_VX, BT_VOID, BT_UV2DI, BT_UV2DI, BT_ULONGLONGPTR, BT_ULONGLONG)
+DEF_FN_TYPE_4 (BT_FN_VOID_UV4SI_UV4SI_UINTPTR_ULONGLONG, B_VX, BT_VOID, BT_UV4SI, BT_UV4SI, BT_UINTPTR, BT_ULONGLONG)
+DEF_FN_TYPE_4 (BT_FN_VOID_V4SI_V4SI_INTPTR_ULONGLONG, B_VX, BT_VOID, BT_V4SI, BT_V4SI, BT_INTPTR, BT_ULONGLONG)
+DEF_FN_TYPE_5 (BT_FN_UV16QI_UV16QI_UV16QI_UV16QI_INT_INTPTR, B_VX, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_UV16QI, BT_INT, BT_INTPTR)
+DEF_FN_TYPE_5 (BT_FN_UV4SI_UV4SI_UV4SI_UV4SI_INT_INTPTR, B_VX, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_UV4SI, BT_INT, BT_INTPTR)
+DEF_FN_TYPE_5 (BT_FN_UV8HI_UV8HI_UV8HI_UV8HI_INT_INTPTR, B_VX, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_UV8HI, BT_INT, BT_INTPTR)
DEF_OV_TYPE (BT_OV_BV16QI_BV16QI, BT_BV16QI, BT_BV16QI)
DEF_OV_TYPE (BT_OV_BV16QI_BV16QI_BV16QI, BT_BV16QI, BT_BV16QI, BT_BV16QI)
DEF_OV_TYPE (BT_OV_BV16QI_BV16QI_BV16QI_BV16QI, BT_BV16QI, BT_BV16QI, BT_BV16QI, BT_BV16QI)
diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def
index 408c51980c..4bcdb22d52 100644
--- a/gcc/config/s390/s390-builtins.def
+++ b/gcc/config/s390/s390-builtins.def
@@ -386,11 +386,11 @@ OB_DEF_VAR (s390_vec_insert_u64, s390_vlvgg, O3_ELEM,
OB_DEF_VAR (s390_vec_insert_b64, s390_vlvgg, O3_ELEM, BT_OV_UV2DI_ULONGLONG_BV2DI_INT)
OB_DEF_VAR (s390_vec_insert_dbl, s390_vlvgg_dbl, O3_ELEM, BT_OV_V2DF_DBL_V2DF_INT)
-B_DEF (s390_vlvgb, vec_insertv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UCHAR_INT)
-B_DEF (s390_vlvgh, vec_insertv8hi, 0, B_VX, 0, BT_FN_UV8HI_UV8HI_USHORT_INT)
-B_DEF (s390_vlvgf, vec_insertv4si, 0, B_VX, 0, BT_FN_UV4SI_UV4SI_UINT_INT)
-B_DEF (s390_vlvgg, vec_insertv2di, 0, B_VX, 0, BT_FN_UV2DI_UV2DI_ULONGLONG_INT)
-B_DEF (s390_vlvgg_dbl, vec_insertv2df, 0, B_VX | B_INT, 0, BT_FN_V2DF_V2DF_DBL_INT)
+B_DEF (s390_vlvgb, vec_insertv16qi, 0, B_VX, O3_ELEM, BT_FN_UV16QI_UV16QI_UCHAR_INT)
+B_DEF (s390_vlvgh, vec_insertv8hi, 0, B_VX, O3_ELEM, BT_FN_UV8HI_UV8HI_USHORT_INT)
+B_DEF (s390_vlvgf, vec_insertv4si, 0, B_VX, O3_ELEM, BT_FN_UV4SI_UV4SI_UINT_INT)
+B_DEF (s390_vlvgg, vec_insertv2di, 0, B_VX, O3_ELEM, BT_FN_UV2DI_UV2DI_ULONGLONG_INT)
+B_DEF (s390_vlvgg_dbl, vec_insertv2df, 0, B_VX | B_INT, O3_ELEM, BT_FN_V2DF_V2DF_DBL_INT)
OB_DEF (s390_vec_promote, s390_vec_promote_s8,s390_vec_promote_dbl,B_VX, BT_FN_OV4SI_INT_INT)
OB_DEF_VAR (s390_vec_promote_s8, s390_vlvgb_noin, O2_ELEM, BT_OV_V16QI_SCHAR_INT) /* vlvgb */
@@ -424,11 +424,11 @@ OB_DEF_VAR (s390_vec_extract_u64, s390_vlgvg, O2_ELEM,
OB_DEF_VAR (s390_vec_extract_b64, s390_vlgvg, O2_ELEM, BT_OV_ULONGLONG_BV2DI_INT)
OB_DEF_VAR (s390_vec_extract_dbl, s390_vlgvg_dbl, O2_ELEM, BT_OV_DBL_V2DF_INT) /* vlgvg */
-B_DEF (s390_vlgvb, vec_extractv16qi, 0, B_VX, 0, BT_FN_UCHAR_UV16QI_INT)
-B_DEF (s390_vlgvh, vec_extractv8hi, 0, B_VX, 0, BT_FN_USHORT_UV8HI_INT)
-B_DEF (s390_vlgvf, vec_extractv4si, 0, B_VX, 0, BT_FN_UINT_UV4SI_INT)
-B_DEF (s390_vlgvg, vec_extractv2di, 0, B_VX, 0, BT_FN_ULONGLONG_UV2DI_INT)
-B_DEF (s390_vlgvg_dbl, vec_extractv2df, 0, B_VX | B_INT, 0, BT_FN_DBL_V2DF_INT)
+B_DEF (s390_vlgvb, vec_extractv16qi, 0, B_VX, O2_ELEM, BT_FN_UCHAR_UV16QI_INT)
+B_DEF (s390_vlgvh, vec_extractv8hi, 0, B_VX, O2_ELEM, BT_FN_USHORT_UV8HI_INT)
+B_DEF (s390_vlgvf, vec_extractv4si, 0, B_VX, O2_ELEM, BT_FN_UINT_UV4SI_INT)
+B_DEF (s390_vlgvg, vec_extractv2di, 0, B_VX, O2_ELEM, BT_FN_ULONGLONG_UV2DI_INT)
+B_DEF (s390_vlgvg_dbl, vec_extractv2df, 0, B_VX | B_INT, O2_ELEM, BT_FN_DBL_V2DF_INT)
OB_DEF (s390_vec_insert_and_zero, s390_vec_insert_and_zero_s8,s390_vec_insert_and_zero_dbl,B_VX,BT_FN_OV4SI_INTCONSTPTR)
OB_DEF_VAR (s390_vec_insert_and_zero_s8,s390_vllezb, 0, BT_OV_V16QI_SCHARCONSTPTR)
@@ -741,7 +741,6 @@ B_DEF (s390_vuplhw, vec_unpacklv8hi, 0,
B_DEF (s390_vupllh, vec_unpackl_lv8hi, 0, B_VX, 0, BT_FN_UV4SI_UV8HI)
B_DEF (s390_vuplf, vec_unpacklv4si, 0, B_VX, 0, BT_FN_V2DI_V4SI)
B_DEF (s390_vupllf, vec_unpackl_lv4si, 0, B_VX, 0, BT_FN_UV2DI_UV4SI)
-B_DEF (s390_vaq, vec_add_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
OB_DEF (s390_vec_addc, s390_vec_addc_u8, s390_vec_addc_u64, B_VX, BT_FN_OV4SI_OV4SI_OV4SI)
OB_DEF_VAR (s390_vec_addc_u8, s390_vaccb, 0, BT_OV_UV16QI_UV16QI_UV16QI)
@@ -749,13 +748,20 @@ OB_DEF_VAR (s390_vec_addc_u16, s390_vacch, 0,
OB_DEF_VAR (s390_vec_addc_u32, s390_vaccf, 0, BT_OV_UV4SI_UV4SI_UV4SI)
OB_DEF_VAR (s390_vec_addc_u64, s390_vaccg, 0, BT_OV_UV2DI_UV2DI_UV2DI)
-B_DEF (s390_vaccb, vec_addcv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
-B_DEF (s390_vacch, vec_addcv8hi, 0, B_VX, 0, BT_FN_UV8HI_UV8HI_UV8HI)
-B_DEF (s390_vaccf, vec_addcv4si, 0, B_VX, 0, BT_FN_UV4SI_UV4SI_UV4SI)
-B_DEF (s390_vaccg, vec_addcv2di, 0, B_VX, 0, BT_FN_UV2DI_UV2DI_UV2DI)
-B_DEF (s390_vaccq, vec_addc_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
-B_DEF (s390_vacq, vec_adde_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
-B_DEF (s390_vacccq, vec_addec_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vaccb, vaccb_v16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vacch, vacch_v8hi, 0, B_VX, 0, BT_FN_UV8HI_UV8HI_UV8HI)
+B_DEF (s390_vaccf, vaccf_v4si, 0, B_VX, 0, BT_FN_UV4SI_UV4SI_UV4SI)
+B_DEF (s390_vaccg, vaccg_v2di, 0, B_VX, 0, BT_FN_UV2DI_UV2DI_UV2DI)
+
+B_DEF (s390_vec_add_u128, addti3, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vec_addc_u128, vaccq_ti, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vec_adde_u128, vacq, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vec_addec_u128, vacccq, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
+
+B_DEF (s390_vaq, addti3, 0, B_VX, 0, BT_FN_INT128_INT128_INT128)
+B_DEF (s390_vaccq, vaccq_ti, 0, B_VX, 0, BT_FN_INT128_INT128_INT128)
+B_DEF (s390_vacq, vacq, 0, B_VX, 0, BT_FN_INT128_INT128_INT128_INT128)
+B_DEF (s390_vacccq, vacccq, 0, B_VX, 0, BT_FN_INT128_INT128_INT128_INT128)
OB_DEF (s390_vec_and, s390_vec_and_b8, s390_vec_and_dbl_c, B_VX, BT_FN_OV4SI_OV4SI_OV4SI)
OB_DEF_VAR (s390_vec_and_b8, s390_vn, 0, BT_OV_BV16QI_BV16QI_BV16QI)
@@ -2051,7 +2057,6 @@ OB_DEF_VAR (s390_vec_srb_dbl_u64, s390_vsrlb, 0,
OB_DEF_VAR (s390_vec_srb_dbl_s64, s390_vsrlb, 0, BT_OV_V2DF_V2DF_V2DI)
B_DEF (s390_vsrlb, vec_srbv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
-B_DEF (s390_vsq, vec_sub_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
OB_DEF (s390_vec_subc, s390_vec_subc_u8, s390_vec_subc_u64, B_VX, BT_FN_OV4SI_OV4SI_OV4SI)
OB_DEF_VAR (s390_vec_subc_u8, s390_vscbib, 0, BT_OV_UV16QI_UV16QI_UV16QI)
@@ -2059,13 +2064,25 @@ OB_DEF_VAR (s390_vec_subc_u16, s390_vscbih, 0,
OB_DEF_VAR (s390_vec_subc_u32, s390_vscbif, 0, BT_OV_UV4SI_UV4SI_UV4SI)
OB_DEF_VAR (s390_vec_subc_u64, s390_vscbig, 0, BT_OV_UV2DI_UV2DI_UV2DI)
-B_DEF (s390_vscbib, vec_subcv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
-B_DEF (s390_vscbih, vec_subcv8hi, 0, B_VX, 0, BT_FN_UV8HI_UV8HI_UV8HI)
-B_DEF (s390_vscbif, vec_subcv4si, 0, B_VX, 0, BT_FN_UV4SI_UV4SI_UV4SI)
-B_DEF (s390_vscbig, vec_subcv2di, 0, B_VX, 0, BT_FN_UV2DI_UV2DI_UV2DI)
-B_DEF (s390_vscbiq, vec_subc_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
-B_DEF (s390_vsbiq, vec_sube_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
-B_DEF (s390_vsbcbiq, vec_subec_u128, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vscbib, vscbib_v16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vscbih, vscbih_v8hi, 0, B_VX, 0, BT_FN_UV8HI_UV8HI_UV8HI)
+B_DEF (s390_vscbif, vscbif_v4si, 0, B_VX, 0, BT_FN_UV4SI_UV4SI_UV4SI)
+B_DEF (s390_vscbig, vscbig_v2di, 0, B_VX, 0, BT_FN_UV2DI_UV2DI_UV2DI)
+
+/* The builtin definitions require these to use vector unsigned char.
+ But we want the GCC low-level builtins and the insn patterns to
+ allow int128_t and TImode. So we rely on s390_expand_builtin to
+ switch modes. */
+
+B_DEF (s390_vec_sub_u128, subti3, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vec_subc_u128, vscbiq_ti, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vec_sube_u128, vsbiq, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
+B_DEF (s390_vec_subec_u128, vsbcbiq, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI)
+
+B_DEF (s390_vsq, subti3, 0, B_VX, 0, BT_FN_INT128_INT128_INT128)
+B_DEF (s390_vscbiq, vscbiq_ti, 0, B_VX, 0, BT_FN_INT128_INT128_INT128)
+B_DEF (s390_vsbiq, vsbiq, 0, B_VX, 0, BT_FN_INT128_INT128_INT128_INT128)
+B_DEF (s390_vsbcbiq, vsbcbiq, 0, B_VX, 0, BT_FN_INT128_INT128_INT128_INT128)
OB_DEF (s390_vec_sum2, s390_vec_sum2_u16, s390_vec_sum2_u32, B_VX, BT_FN_OV4SI_OV4SI_OV4SI)
OB_DEF_VAR (s390_vec_sum2_u16, s390_vsumgh, 0, BT_OV_UV2DI_UV8HI_UV8HI)
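A minimal usage sketch of the reworked 128-bit add/sub builtins, following the vector-unsigned-char versus TImode split described in the comment above. This assumes GCC targeting s390x with -march=z13 -mzvector; the function names add_u128/sub_u128 are illustrative only, and each call should expand to a single vaq/vsq instruction.

#include <vecintrin.h>

/* The zvector API keeps the vector unsigned char prototypes, while the
   low-level __builtin_s390_vaq/vsq variants now take 128-bit integers
   and map to addti3/subti3 internally.  */
vector unsigned char
add_u128 (vector unsigned char a, vector unsigned char b)
{
  return vec_add_u128 (a, b);
}

vector unsigned char
sub_u128 (vector unsigned char a, vector unsigned char b)
{
  return vec_sub_u128 (a, b);
}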
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index a1d0930c07..452b415901 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -791,7 +791,7 @@ s390_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
machine_mode mode ATTRIBUTE_UNUSED,
int ignore ATTRIBUTE_UNUSED)
{
-#define MAX_ARGS 5
+#define MAX_ARGS 6
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
@@ -875,6 +875,7 @@ s390_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
arity = 0;
FOR_EACH_CALL_EXPR_ARG (arg, iter, exp)
{
+ rtx tmp_rtx;
const struct insn_operand_data *insn_op;
unsigned int op_flags = all_op_flags & ((1 << O_SHIFT) - 1);
@@ -950,6 +951,20 @@ s390_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
copy_to_mode_reg (Pmode,
XEXP (op[arity], 0)));
}
+ /* Some of the builtins require different modes/types than the
+ pattern in order to implement a specific API. Instead of
+ adding many expanders which do the mode change, we do it here.
+ E.g. s390_vec_add_u128, which is required to take vector unsigned
+ char arguments, is mapped to addti3. */
+ else if (insn_op->mode != VOIDmode
+ && GET_MODE (op[arity]) != VOIDmode
+ && GET_MODE (op[arity]) != insn_op->mode
+ && ((tmp_rtx = simplify_gen_subreg (insn_op->mode, op[arity],
+ GET_MODE (op[arity]), 0))
+ != NULL_RTX))
+ {
+ op[arity] = tmp_rtx;
+ }
else if (GET_MODE (op[arity]) == insn_op->mode
|| GET_MODE (op[arity]) == VOIDmode
|| (insn_op->predicate == address_operand
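A stand-alone analogy (plain C, not GCC-internal code) for what the simplify_gen_subreg bridge above accomplishes: the builtin prototype supplies a 16-byte vector value, the addti3/subti3 patterns want the same 128 bits as a TImode integer, and the expander reinterprets the bits instead of converting the value. The typedef and function name are illustrative, and the snippet assumes a compiler providing __int128.

#include <string.h>

typedef unsigned char v16qi __attribute__ ((vector_size (16)));

/* Reinterpret the 16 vector bytes as one 128-bit integer, roughly the
   C-level equivalent of taking a TImode subreg of a V16QImode value.  */
unsigned __int128
as_u128 (v16qi v)
{
  unsigned __int128 t;
  memcpy (&t, &v, sizeof t);
  return t;
}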
@@ -6442,11 +6457,17 @@ s390_expand_vec_init (rtx target, rtx vals)
/* Unfortunately the vec_init expander is not allowed to fail. So
we have to implement the fallback ourselves. */
for (i = 0; i < n_elts; i++)
- emit_insn (gen_rtx_SET (target,
- gen_rtx_UNSPEC (mode,
- gen_rtvec (3, XVECEXP (vals, 0, i),
- GEN_INT (i), target),
- UNSPEC_VEC_SET)));
+ {
+ rtx elem = XVECEXP (vals, 0, i);
+ if (!general_operand (elem, GET_MODE (elem)))
+ elem = force_reg (inner_mode, elem);
+
+ emit_insn (gen_rtx_SET (target,
+ gen_rtx_UNSPEC (mode,
+ gen_rtvec (3, elem,
+ GEN_INT (i), target),
+ UNSPEC_VEC_SET)));
+ }
}
/* Structure to hold the initial parameters for a compare_and_swap operation
@@ -12405,17 +12426,13 @@ s390_encode_section_info (tree decl, rtx rtl, int first)
{
/* Store the alignment to be able to check if we can use
a larl/load-relative instruction. We only handle the cases
- that can go wrong (i.e. no FUNC_DECLs). If a symref does
- not have any flag we assume it to be correctly aligned. */
-
- if (DECL_ALIGN (decl) % 64)
- SYMBOL_FLAG_SET_NOTALIGN8 (XEXP (rtl, 0));
-
- if (DECL_ALIGN (decl) % 32)
- SYMBOL_FLAG_SET_NOTALIGN4 (XEXP (rtl, 0));
-
+ that can go wrong (i.e. no FUNC_DECLs). */
if (DECL_ALIGN (decl) == 0 || DECL_ALIGN (decl) % 16)
SYMBOL_FLAG_SET_NOTALIGN2 (XEXP (rtl, 0));
+ else if (DECL_ALIGN (decl) % 32)
+ SYMBOL_FLAG_SET_NOTALIGN4 (XEXP (rtl, 0));
+ else if (DECL_ALIGN (decl) % 64)
+ SYMBOL_FLAG_SET_NOTALIGN8 (XEXP (rtl, 0));
}
/* Literal pool references don't have a decl so they are handled
@@ -12423,18 +12440,14 @@ s390_encode_section_info (tree decl, rtx rtl, int first)
entry to decide upon the alignment. */
if (MEM_P (rtl)
&& GET_CODE (XEXP (rtl, 0)) == SYMBOL_REF
- && TREE_CONSTANT_POOL_ADDRESS_P (XEXP (rtl, 0))
- && MEM_ALIGN (rtl) != 0
- && GET_MODE_BITSIZE (GET_MODE (rtl)) != 0)
+ && TREE_CONSTANT_POOL_ADDRESS_P (XEXP (rtl, 0)))
{
- if (MEM_ALIGN (rtl) % 64)
- SYMBOL_FLAG_SET_NOTALIGN8 (XEXP (rtl, 0));
-
- if (MEM_ALIGN (rtl) % 32)
- SYMBOL_FLAG_SET_NOTALIGN4 (XEXP (rtl, 0));
-
if (MEM_ALIGN (rtl) == 0 || MEM_ALIGN (rtl) % 16)
SYMBOL_FLAG_SET_NOTALIGN2 (XEXP (rtl, 0));
+ else if (MEM_ALIGN (rtl) % 32)
+ SYMBOL_FLAG_SET_NOTALIGN4 (XEXP (rtl, 0));
+ else if (MEM_ALIGN (rtl) % 64)
+ SYMBOL_FLAG_SET_NOTALIGN8 (XEXP (rtl, 0));
}
}
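A stand-alone sketch (plain C, not GCC code) of the reordered alignment checks above: exactly one SYMBOL_FLAG_NOTALIGN flag is now set, naming the smallest alignment the symbol fails, instead of every failing alignment as before. The argument is in bits, matching DECL_ALIGN/MEM_ALIGN.

#include <stdio.h>

static const char *
notalign_flag (unsigned int align_in_bits)
{
  if (align_in_bits == 0 || align_in_bits % 16)
    return "NOTALIGN2";          /* not even 2-byte aligned */
  else if (align_in_bits % 32)
    return "NOTALIGN4";          /* 2- but not 4-byte aligned */
  else if (align_in_bits % 64)
    return "NOTALIGN8";          /* 4- but not 8-byte aligned */
  return "fully aligned";        /* at least 8-byte aligned */
}

int
main (void)
{
  printf ("%s\n", notalign_flag (8));    /* byte aligned   -> NOTALIGN2 */
  printf ("%s\n", notalign_flag (32));   /* 4-byte aligned -> NOTALIGN8 */
  printf ("%s\n", notalign_flag (64));   /* 8-byte aligned -> fully aligned */
  return 0;
}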
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 2b4e8f4b2b..b3d4370196 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -174,7 +174,6 @@
UNSPEC_VEC_UNPACKL
UNSPEC_VEC_UNPACKL_L
UNSPEC_VEC_ADDC
- UNSPEC_VEC_ADDC_U128
UNSPEC_VEC_ADDE_U128
UNSPEC_VEC_ADDEC_U128
UNSPEC_VEC_AVG
@@ -198,9 +197,7 @@
UNSPEC_VEC_SRL
UNSPEC_VEC_SRLB
- UNSPEC_VEC_SUB_U128
UNSPEC_VEC_SUBC
- UNSPEC_VEC_SUBC_U128
UNSPEC_VEC_SUBE_U128
UNSPEC_VEC_SUBEC_U128
@@ -1295,7 +1292,7 @@
(compare:VFCMP (match_operand:DF 0 "register_operand" "v")
(match_operand:DF 1 "register_operand" "v")))
(clobber (match_scratch:V2DI 2 "=v"))]
- "TARGET_Z13 && TARGET_HARD_FLOAT"
+ "TARGET_VX && TARGET_HARD_FLOAT"
"wfc<asm_fcmp>dbs\t%v2,%v0,%v1"
[(set_attr "op_type" "VRR")])
@@ -4649,7 +4646,7 @@
(unsigned_fix:DI (match_operand:DF 1 "register_operand" "f,v")))
(unspec:DI [(match_operand:DI 2 "immediate_operand" "K,K")] UNSPEC_ROUND)
(clobber (reg:CC CC_REGNUM))]
- "TARGET_Z13 && TARGET_HARD_FLOAT"
+ "TARGET_VX && TARGET_HARD_FLOAT"
"@
clgdbr\t%0,%h2,%1,0
wclgdb\t%v0,%v1,0,%h2"
@@ -4664,7 +4661,7 @@
(unspec:GPR [(match_operand:GPR 2 "immediate_operand" "K")] UNSPEC_ROUND)
(clobber (reg:CC CC_REGNUM))]
"TARGET_Z196 && TARGET_HARD_FLOAT
- && (!TARGET_Z13 || <GPR:MODE>mode != DImode || <FP:MODE>mode != DFmode)"
+ && (!TARGET_VX || <GPR:MODE>mode != DImode || <FP:MODE>mode != DFmode)"
"cl<GPR:gf><FP:xde><FP:bt>r\t%0,%h2,%1,0"
[(set_attr "op_type" "RRF")
(set_attr "type" "ftoi")])
@@ -4684,7 +4681,7 @@
(fix:DI (match_operand:DF 1 "register_operand" "f,v")))
(unspec:DI [(match_operand:DI 2 "immediate_operand" "K,K")] UNSPEC_ROUND)
(clobber (reg:CC CC_REGNUM))]
- "TARGET_Z13 && TARGET_HARD_FLOAT"
+ "TARGET_VX && TARGET_HARD_FLOAT"
"@
cgdbr\t%0,%h2,%1
wcgdb\t%v0,%v1,0,%h2"
@@ -4792,7 +4789,7 @@
(define_insn "*floatunsdidf2_z13"
[(set (match_operand:DF 0 "register_operand" "=f,v")
(unsigned_float:DF (match_operand:DI 1 "register_operand" "d,v")))]
- "TARGET_Z13 && TARGET_HARD_FLOAT"
+ "TARGET_VX && TARGET_HARD_FLOAT"
"@
cdlgbr\t%0,0,%1,0
wcdlgb\t%v0,%v1,0,0"
@@ -4896,7 +4893,7 @@
(define_insn "*extendsfdf2_z13"
[(set (match_operand:DF 0 "register_operand" "=f,f,v")
(float_extend:DF (match_operand:SF 1 "nonimmediate_operand" "f,R,v")))]
- "TARGET_Z13 && TARGET_HARD_FLOAT"
+ "TARGET_VX && TARGET_HARD_FLOAT"
"@
ldebr\t%0,%1
ldeb\t%0,%1
@@ -5651,7 +5648,7 @@
(clobber (reg:CC CC_REGNUM))])]
"TARGET_ZARCH"
{
- /* For z13 we have vaq which doesn't set CC. */
+ /* For z13 we have vsq which doesn't set CC. */
if (TARGET_VX)
{
emit_insn (gen_rtx_SET (operands[0],
diff --git a/gcc/config/s390/vecintrin.h b/gcc/config/s390/vecintrin.h
index ab82e7aed0..52e46ff381 100644
--- a/gcc/config/s390/vecintrin.h
+++ b/gcc/config/s390/vecintrin.h
@@ -1,4 +1,4 @@
-/* GNU compiler hardware transactional execution intrinsics
+/* GNU compiler vector extension intrinsics
Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Andreas Krebbel (Andreas.Krebbel@de.ibm.com)
@@ -73,17 +73,9 @@ __lcbb(const void *ptr, int bndry)
#define vec_splat_s32 __builtin_s390_vec_splat_s32
#define vec_splat_u64 __builtin_s390_vec_splat_u64
#define vec_splat_s64 __builtin_s390_vec_splat_s64
-#define vec_add_u128 __builtin_s390_vaq
-#define vec_addc_u128 __builtin_s390_vaccq
-#define vec_adde_u128 __builtin_s390_vacq
-#define vec_addec_u128 __builtin_s390_vacccq
#define vec_checksum __builtin_s390_vcksm
#define vec_gfmsum_128 __builtin_s390_vgfmg
#define vec_gfmsum_accum_128 __builtin_s390_vgfmag
-#define vec_sub_u128 __builtin_s390_vsq
-#define vec_subc_u128 __builtin_s390_vscbiq
-#define vec_sube_u128 __builtin_s390_vsbiq
-#define vec_subec_u128 __builtin_s390_vsbcbiq
#define vec_ceil(X) __builtin_s390_vfidb((X), 4, 6)
#define vec_roundp(X) __builtin_s390_vfidb((X), 4, 6)
#define vec_floor(X) __builtin_s390_vfidb((X), 4, 7)
@@ -169,6 +161,10 @@ __lcbb(const void *ptr, int bndry)
#define vec_unpackh __builtin_s390_vec_unpackh
#define vec_unpackl __builtin_s390_vec_unpackl
#define vec_addc __builtin_s390_vec_addc
+#define vec_add_u128 __builtin_s390_vec_add_u128
+#define vec_addc_u128 __builtin_s390_vec_addc_u128
+#define vec_adde_u128 __builtin_s390_vec_adde_u128
+#define vec_addec_u128 __builtin_s390_vec_addec_u128
#define vec_and __builtin_s390_vec_and
#define vec_andc __builtin_s390_vec_andc
#define vec_avg __builtin_s390_vec_avg
@@ -219,6 +215,10 @@ __lcbb(const void *ptr, int bndry)
#define vec_srl __builtin_s390_vec_srl
#define vec_srb __builtin_s390_vec_srb
#define vec_subc __builtin_s390_vec_subc
+#define vec_sub_u128 __builtin_s390_vec_sub_u128
+#define vec_subc_u128 __builtin_s390_vec_subc_u128
+#define vec_sube_u128 __builtin_s390_vec_sube_u128
+#define vec_subec_u128 __builtin_s390_vec_subec_u128
#define vec_sum2 __builtin_s390_vec_sum2
#define vec_sum_u128 __builtin_s390_vec_sum_u128
#define vec_sum4 __builtin_s390_vec_sum4
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index c4a837b177..34fd6300f1 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -550,56 +550,25 @@
; Vector add
-; vaq
-
-; zvector builtins uses V16QI operands. So replace the modes in order
-; to map this to a TImode add. We have to keep the V16QI mode
-; operands in the expander in order to allow some operand type
-; checking when expanding the builtin.
-(define_expand "vec_add_u128"
- [(match_operand:V16QI 0 "register_operand" "")
- (match_operand:V16QI 1 "register_operand" "")
- (match_operand:V16QI 2 "register_operand" "")]
- "TARGET_VX"
-{
- rtx op0 = gen_rtx_SUBREG (TImode, operands[0], 0);
- rtx op1 = gen_rtx_SUBREG (TImode, operands[1], 0);
- rtx op2 = gen_rtx_SUBREG (TImode, operands[2], 0);
-
- emit_insn (gen_rtx_SET (op0,
- gen_rtx_PLUS (TImode, op1, op2)));
- DONE;
-})
-
; Vector add compute carry
-(define_insn "vec_addc<mode>"
- [(set (match_operand:VI_HW 0 "register_operand" "=v")
- (unspec:VI_HW [(match_operand:VI_HW 1 "register_operand" "%v")
- (match_operand:VI_HW 2 "register_operand" "v")]
- UNSPEC_VEC_ADDC))]
+(define_insn "vacc<bhfgq>_<mode>"
+ [(set (match_operand:VIT_HW 0 "register_operand" "=v")
+ (unspec:VIT_HW [(match_operand:VIT_HW 1 "register_operand" "%v")
+ (match_operand:VIT_HW 2 "register_operand" "v")]
+ UNSPEC_VEC_ADDC))]
"TARGET_VX"
"vacc<bhfgq>\t%v0,%v1,%v2"
[(set_attr "op_type" "VRR")])
-(define_insn "vec_addc_u128"
- [(set (match_operand:V16QI 0 "register_operand" "=v")
- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%v")
- (match_operand:V16QI 2 "register_operand" "v")]
- UNSPEC_VEC_ADDC_U128))]
- "TARGET_VX"
- "vaccq\t%v0,%v1,%v2"
- [(set_attr "op_type" "VRR")])
-
-
; Vector add with carry
-(define_insn "vec_adde_u128"
- [(set (match_operand:V16QI 0 "register_operand" "=v")
- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%v")
- (match_operand:V16QI 2 "register_operand" "v")
- (match_operand:V16QI 3 "register_operand" "v")]
- UNSPEC_VEC_ADDE_U128))]
+(define_insn "vacq"
+ [(set (match_operand:TI 0 "register_operand" "=v")
+ (unspec:TI [(match_operand:TI 1 "register_operand" "%v")
+ (match_operand:TI 2 "register_operand" "v")
+ (match_operand:TI 3 "register_operand" "v")]
+ UNSPEC_VEC_ADDE_U128))]
"TARGET_VX"
"vacq\t%v0,%v1,%v2,%v3"
[(set_attr "op_type" "VRR")])
@@ -607,12 +576,12 @@
; Vector add with carry compute carry
-(define_insn "vec_addec_u128"
- [(set (match_operand:V16QI 0 "register_operand" "=v")
- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%v")
- (match_operand:V16QI 2 "register_operand" "v")
- (match_operand:V16QI 3 "register_operand" "v")]
- UNSPEC_VEC_ADDEC_U128))]
+(define_insn "vacccq"
+ [(set (match_operand:TI 0 "register_operand" "=v")
+ (unspec:TI [(match_operand:TI 1 "register_operand" "%v")
+ (match_operand:TI 2 "register_operand" "v")
+ (match_operand:TI 3 "register_operand" "v")]
+ UNSPEC_VEC_ADDEC_U128))]
"TARGET_VX"
"vacccq\t%v0,%v1,%v2,%v3"
[(set_attr "op_type" "VRR")])
@@ -1145,44 +1114,24 @@
; Vector subtract
-(define_insn "vec_sub_u128"
- [(set (match_operand:V16QI 0 "register_operand" "=v")
- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
- (match_operand:V16QI 2 "register_operand" "v")]
- UNSPEC_VEC_SUB_U128))]
- "TARGET_VX"
- "vsq\t%v0,%v1,%v2"
- [(set_attr "op_type" "VRR")])
-
-
; Vector subtract compute borrow indication
-(define_insn "vec_subc<mode>"
- [(set (match_operand:VI_HW 0 "register_operand" "=v")
- (unspec:VI_HW [(match_operand:VI_HW 1 "register_operand" "v")
- (match_operand:VI_HW 2 "register_operand" "v")]
+(define_insn "vscbi<bhfgq>_<mode>"
+ [(set (match_operand:VIT_HW 0 "register_operand" "=v")
+ (unspec:VIT_HW [(match_operand:VIT_HW 1 "register_operand" "v")
+ (match_operand:VIT_HW 2 "register_operand" "v")]
UNSPEC_VEC_SUBC))]
"TARGET_VX"
"vscbi<bhfgq>\t%v0,%v1,%v2"
[(set_attr "op_type" "VRR")])
-(define_insn "vec_subc_u128"
- [(set (match_operand:V16QI 0 "register_operand" "=v")
- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
- (match_operand:V16QI 2 "register_operand" "v")]
- UNSPEC_VEC_SUBC_U128))]
- "TARGET_VX"
- "vscbiq\t%v0,%v1,%v2"
- [(set_attr "op_type" "VRR")])
-
-
; Vector subtract with borrow indication
-(define_insn "vec_sube_u128"
- [(set (match_operand:V16QI 0 "register_operand" "=v")
- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
- (match_operand:V16QI 2 "register_operand" "v")
- (match_operand:V16QI 3 "register_operand" "v")]
+(define_insn "vsbiq"
+ [(set (match_operand:TI 0 "register_operand" "=v")
+ (unspec:TI [(match_operand:TI 1 "register_operand" "v")
+ (match_operand:TI 2 "register_operand" "v")
+ (match_operand:TI 3 "register_operand" "v")]
UNSPEC_VEC_SUBE_U128))]
"TARGET_VX"
"vsbiq\t%v0,%v1,%v2,%v3"
@@ -1191,11 +1140,11 @@
; Vector subtract with borrow compute and borrow indication
-(define_insn "vec_subec_u128"
- [(set (match_operand:V16QI 0 "register_operand" "=v")
- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
- (match_operand:V16QI 2 "register_operand" "v")
- (match_operand:V16QI 3 "register_operand" "v")]
+(define_insn "vsbcbiq"
+ [(set (match_operand:TI 0 "register_operand" "=v")
+ (unspec:TI [(match_operand:TI 1 "register_operand" "v")
+ (match_operand:TI 2 "register_operand" "v")
+ (match_operand:TI 3 "register_operand" "v")]
UNSPEC_VEC_SUBEC_U128))]
"TARGET_VX"
"vsbcbiq\t%v0,%v1,%v2,%v3"
diff --git a/gcc/config/sparc/driver-sparc.c b/gcc/config/sparc/driver-sparc.c
index 7e9ee24ac2..ea174bf3dc 100644
--- a/gcc/config/sparc/driver-sparc.c
+++ b/gcc/config/sparc/driver-sparc.c
@@ -75,6 +75,8 @@ static const struct cpu_names {
{ "UltraSparc T4", "niagara4" },
{ "LEON", "leon3" },
#endif
+ { "SPARC-M7", "niagara7" },
+ { "SPARC-S7", "niagara7" },
{ NULL, NULL }
};
diff --git a/gcc/config/sparc/linux64.h b/gcc/config/sparc/linux64.h
index a1ef325946..9d53c29ede 100644
--- a/gcc/config/sparc/linux64.h
+++ b/gcc/config/sparc/linux64.h
@@ -164,22 +164,42 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#endif
/* Support for a compile-time default CPU, et cetera. The rules are:
- --with-cpu is ignored if -mcpu is specified.
- --with-tune is ignored if -mtune is specified.
+ --with-cpu is ignored if -mcpu is specified; likewise --with-cpu-32
+ and --with-cpu-64.
+ --with-tune is ignored if -mtune is specified; likewise --with-tune-32
+ and --with-tune-64.
--with-float is ignored if -mhard-float, -msoft-float, -mfpu, or -mno-fpu
are specified.
In the SPARC_BI_ARCH compiler we cannot pass %{!mcpu=*:-mcpu=%(VALUE)}
here, otherwise say -mcpu=v7 would be passed even when -m64.
- CC1_SPEC above takes care of this instead. */
+ CC1_SPEC above takes care of this instead.
+
+ Note that the order of the cpu* and tune* options matters: the
+ config.gcc file always sets with_cpu to some value, even if the
+ user didn't use --with-cpu when invoking the configure script.
+ This value is based on the target name. Therefore we have to make
+ sure that --with-cpu-32 takes precedence over --with-cpu on < v9
+ systems, and that --with-cpu-64 takes precedence over --with-cpu on
+ >= v9 systems. As for the tune* options, on some platforms
+ config.gcc also sets a default value for them if the user didn't use
+ --with-tune when invoking the configure script. */
#undef OPTION_DEFAULT_SPECS
#if DEFAULT_ARCH32_P
#define OPTION_DEFAULT_SPECS \
+ {"cpu_32", "%{!m64:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"cpu_64", "%{m64:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
{"cpu", "%{!m64:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"tune_32", "%{!m64:%{!mtune=*:-mtune=%(VALUE)}}" }, \
+ {"tune_64", "%{m64:%{!mtune=*:-mtune=%(VALUE)}}" }, \
{"tune", "%{!mtune=*:-mtune=%(VALUE)}" }, \
{"float", "%{!msoft-float:%{!mhard-float:%{!mfpu:%{!mno-fpu:-m%(VALUE)-float}}}}" }
#else
#define OPTION_DEFAULT_SPECS \
+ {"cpu_32", "%{m32:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"cpu_64", "%{!m32:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
{"cpu", "%{!m32:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"tune_32", "%{m32:%{!mtune=*:-mtune=%(VALUE)}}" }, \
+ {"tune_64", "%{!m32:%{!mtune=*:-mtune=%(VALUE)}}" }, \
{"tune", "%{!mtune=*:-mtune=%(VALUE)}" }, \
{"float", "%{!msoft-float:%{!mhard-float:%{!mfpu:%{!mno-fpu:-m%(VALUE)-float}}}}" }
#endif
diff --git a/gcc/config/sparc/niagara4.md b/gcc/config/sparc/niagara4.md
index a826fb458b..925fc6c800 100644
--- a/gcc/config/sparc/niagara4.md
+++ b/gcc/config/sparc/niagara4.md
@@ -75,6 +75,13 @@
(eq_attr "fptype" "double")))
"n4_slot1")
+;; The latency numbers for VIS instructions in the reservations below
+;; reflect empirical results, and don't match the documented
+;; latency numbers in the T4 Processor Supplement. This is because
+;; the HW chaps didn't feel it necessary to document the complexity in
+;; the PRM, and just assigned a latency of 11 to all/most of the VIS
+;; instructions.
+
(define_insn_reservation "n4_vis_move_11cycle" 11
(and (eq_attr "cpu" "niagara4")
(and (eq_attr "type" "vismv")
diff --git a/gcc/config/sparc/niagara7.md b/gcc/config/sparc/niagara7.md
new file mode 100644
index 0000000000..56a4edb80c
--- /dev/null
+++ b/gcc/config/sparc/niagara7.md
@@ -0,0 +1,136 @@
+;; Scheduling description for Niagara-7
+;; Copyright (C) 2016 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "niagara7_0")
+
+(define_cpu_unit "n7_slot0,n7_slot1,n7_slot2" "niagara7_0")
+(define_reservation "n7_single_issue" "n7_slot0 + n7_slot1 + n7_slot2")
+
+(define_cpu_unit "n7_load_store" "niagara7_0")
+
+(define_insn_reservation "n7_single" 1
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "multi,savew,flushw,trap"))
+ "n7_single_issue")
+
+(define_insn_reservation "n7_iflush" 27
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "iflush"))
+ "(n7_slot0 | n7_slot1), nothing*26")
+
+(define_insn_reservation "n7_integer" 1
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
+ "(n7_slot0 | n7_slot1)")
+
+(define_insn_reservation "n7_imul" 12
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "imul"))
+ "n7_slot1, nothing*11")
+
+(define_insn_reservation "n7_idiv" 35
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "idiv"))
+ "n7_slot1, nothing*34")
+
+(define_insn_reservation "n7_load" 5
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "load,fpload,sload"))
+ "(n7_slot0 + n7_load_store), nothing*4")
+
+(define_insn_reservation "n7_store" 1
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "store,fpstore"))
+ "(n7_slot0 | n7_slot2) + n7_load_store")
+
+(define_insn_reservation "n7_cti" 1
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
+ "n7_slot1")
+
+(define_insn_reservation "n7_fp" 11
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul"))
+ "n7_slot1, nothing*10")
+
+(define_insn_reservation "n7_array" 12
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "array,edge,edgen"))
+ "n7_slot1, nothing*11")
+
+(define_insn_reservation "n7_fpdivs" 24
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "fpdivs,fpsqrts"))
+ "n7_slot1, nothing*23")
+
+(define_insn_reservation "n7_fpdivd" 37
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "fpdivd,fpsqrtd"))
+ "n7_slot1, nothing*36")
+
+(define_insn_reservation "n7_lzd" 12
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "lzd"))
+ "(n7_slot0 | n7_slot1), nothing*11")
+
+;; There is an internal unit called the "V3 pipe", which was originally
+;; intended to process some of the short cryptographic instructions.
+;; However, starting with the T4, several of the VIS instructions
+;; (notably the non-FP ones) were moved to the V3 pipe.
+;; Consequently, these instructions feature a latency of 3 instead of
+;; 11 or 12 cycles, provided their consumers also execute in the V3
+;; pipe.
+;;
+;; This is modelled here with a bypass.
+
+(define_insn_reservation "n7_vis_fga" 11
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "fga,gsr"))
+ "n7_slot1, nothing*10")
+
+(define_insn_reservation "n7_vis_fgm" 11
+ (and (eq_attr "cpu" "niagara7")
+ (eq_attr "type" "fgm_pack,fgm_mul,pdist"))
+ "n7_slot1, nothing*10")
+
+(define_insn_reservation "n7_vis_move_v3pipe" 11
+ (and (eq_attr "cpu" "niagara7")
+ (and (eq_attr "type" "vismv")
+ (eq_attr "v3pipe" "true")))
+ "n7_slot1")
+
+(define_insn_reservation "n7_vis_move_11cycle" 11
+ (and (eq_attr "cpu" "niagara7")
+ (and (eq_attr "type" "vismv")
+ (eq_attr "v3pipe" "false")))
+ "n7_slot1, nothing*10")
+
+(define_insn_reservation "n7_vis_logical_v3pipe" 11
+ (and (eq_attr "cpu" "niagara7")
+ (and (eq_attr "type" "visl,pdistn")
+ (eq_attr "v3pipe" "true")))
+ "n7_slot1, nothing*2")
+
+(define_insn_reservation "n7_vis_logical_11cycle" 11
+ (and (eq_attr "cpu" "niagara7")
+ (and (eq_attr "type" "visl")
+ (eq_attr "v3pipe" "false")))
+ "n7_slot1, nothing*10")
+
+(define_bypass 3 "*_v3pipe" "*_v3pipe")
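
The v3pipe insn attribute (added to sparc.md later in this patch) marks which
VIS patterns execute in the V3 pipe; the define_bypass above then lowers the
latency between a v3pipe producer and a v3pipe consumer to 3 cycles.  A
minimal C sketch of a dependent chain that should benefit (the v4hi typedef
and the function name are illustrative only; GCC is expected to select
fpadd16, an "fga"/v3pipe-true pattern, for both additions when compiling
with something like -O2 -mvis -mcpu=niagara7):

    typedef short v4hi __attribute__ ((vector_size (8)));

    v4hi
    chain (v4hi a, v4hi b, v4hi c)
    {
      v4hi t = a + b;   /* fpadd16: producer in the V3 pipe.  */
      return t + c;     /* fpadd16: consumer, the 3-cycle bypass applies.  */
    }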
diff --git a/gcc/config/sparc/sol2.h b/gcc/config/sparc/sol2.h
index 07e6368cae..2a843c5c26 100644
--- a/gcc/config/sparc/sol2.h
+++ b/gcc/config/sparc/sol2.h
@@ -165,13 +165,22 @@ along with GCC; see the file COPYING3. If not see
#define ASM_CPU64_DEFAULT_SPEC AS_SPARC64_FLAG AS_NIAGARA4_FLAG
#endif
+#if TARGET_CPU_DEFAULT == TARGET_CPU_niagara7
+#undef CPP_CPU64_DEFAULT_SPEC
+#define CPP_CPU64_DEFAULT_SPEC ""
+#undef ASM_CPU32_DEFAULT_SPEC
+#define ASM_CPU32_DEFAULT_SPEC AS_SPARC32_FLAG AS_NIAGARA7_FLAG
+#undef ASM_CPU64_DEFAULT_SPEC
+#define ASM_CPU64_DEFAULT_SPEC AS_SPARC64_FLAG AS_NIAGARA7_FLAG
+#endif
+
#undef CPP_CPU_SPEC
#define CPP_CPU_SPEC "\
%{mcpu=sparclet|mcpu=tsc701:-D__sparclet__} \
%{mcpu=sparclite|mcpu-f930|mcpu=f934:-D__sparclite__} \
%{mcpu=v8:" DEF_ARCH32_SPEC("-D__sparcv8") "} \
%{mcpu=supersparc:-D__supersparc__ " DEF_ARCH32_SPEC("-D__sparcv8") "} \
-%{mcpu=v9|mcpu=ultrasparc|mcpu=ultrasparc3|mcpu=niagara|mcpu=niagara2|mcpu=niagara3|mcpu=niagara4:" DEF_ARCH32_SPEC("-D__sparcv8") "} \
+%{mcpu=v9|mcpu=ultrasparc|mcpu=ultrasparc3|mcpu=niagara|mcpu=niagara2|mcpu=niagara3|mcpu=niagara4|mcpu=niagara7:" DEF_ARCH32_SPEC("-D__sparcv8") "} \
%{!mcpu*:%(cpp_cpu_default)} \
"
@@ -231,22 +240,42 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#endif
/* Support for a compile-time default CPU, et cetera. The rules are:
- --with-cpu is ignored if -mcpu is specified.
- --with-tune is ignored if -mtune is specified.
+ --with-cpu is ignored if -mcpu is specified; likewise --with-cpu-32
+ and --with-cpu-64.
+ --with-tune is ignored if -mtune is specified; likewise --with-tune-32
+ and --with-tune-64.
--with-float is ignored if -mhard-float, -msoft-float, -mfpu, or -mno-fpu
are specified.
In the SPARC_BI_ARCH compiler we cannot pass %{!mcpu=*:-mcpu=%(VALUE)}
here, otherwise say -mcpu=v7 would be passed even when -m64.
- CC1_SPEC above takes care of this instead. */
+ CC1_SPEC above takes care of this instead.
+
+ Note that the order of the cpu* and tune* options matters: the
+ config.gcc file always sets with_cpu to some value, even if the
+ user didn't use --with-cpu when invoking the configure script.
+ This value is based on the target name. Therefore we have to make
+ sure that --with-cpu-32 takes precedence over --with-cpu in < v9
+ systems, and that --with-cpu-64 takes precedence over --with-cpu in
+ >= v9 systems. As for the tune* options, on some platforms
+ config.gcc also sets a default value for them if the user didn't use
+ --with-tune when invoking the configure script. */
#undef OPTION_DEFAULT_SPECS
#if DEFAULT_ARCH32_P
#define OPTION_DEFAULT_SPECS \
+ {"cpu_32", "%{!m64:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"cpu_64", "%{m64:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
{"cpu", "%{!m64:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"tune_32", "%{!m64:%{!mtune=*:-mtune=%(VALUE)}}" }, \
+ {"tune_64", "%{m64:%{!mtune=*:-mtune=%(VALUE)}}" }, \
{"tune", "%{!mtune=*:-mtune=%(VALUE)}" }, \
{"float", "%{!msoft-float:%{!mhard-float:%{!mfpu:%{!mno-fpu:-m%(VALUE)-float}}}}" }
#else
#define OPTION_DEFAULT_SPECS \
+ {"cpu_32", "%{m32:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"cpu_64", "%{!m32:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
{"cpu", "%{!m32:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \
+ {"tune_32", "%{m32:%{!mtune=*:-mtune=%(VALUE)}}" }, \
+ {"tune_64", "%{!m32:%{!mtune=*:-mtune=%(VALUE)}}" }, \
{"tune", "%{!mtune=*:-mtune=%(VALUE)}" }, \
{"float", "%{!msoft-float:%{!mhard-float:%{!mfpu:%{!mno-fpu:-m%(VALUE)-float}}}}" }
#endif
@@ -260,7 +289,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
%{mcpu=niagara2:" DEF_ARCH32_SPEC("-xarch=v8plusb") DEF_ARCH64_SPEC("-xarch=v9b") "} \
%{mcpu=niagara3:" DEF_ARCH32_SPEC("-xarch=v8plus" AS_NIAGARA3_FLAG) DEF_ARCH64_SPEC("-xarch=v9" AS_NIAGARA3_FLAG) "} \
%{mcpu=niagara4:" DEF_ARCH32_SPEC(AS_SPARC32_FLAG AS_NIAGARA4_FLAG) DEF_ARCH64_SPEC(AS_SPARC64_FLAG AS_NIAGARA4_FLAG) "} \
-%{!mcpu=niagara4:%{!mcpu=niagara3:%{!mcpu=niagara2:%{!mcpu=niagara:%{!mcpu=ultrasparc3:%{!mcpu=ultrasparc:%{!mcpu=v9:%{mcpu*:" DEF_ARCH32_SPEC("-xarch=v8") DEF_ARCH64_SPEC("-xarch=v9") "}}}}}}}} \
+%{mcpu=niagara7:" DEF_ARCH32_SPEC(AS_SPARC32_FLAG AS_NIAGARA7_FLAG) DEF_ARCH64_SPEC(AS_SPARC64_FLAG AS_NIAGARA7_FLAG) "} \
+%{!mcpu=niagara7:%{!mcpu=niagara4:%{!mcpu=niagara3:%{!mcpu=niagara2:%{!mcpu=niagara:%{!mcpu=ultrasparc3:%{!mcpu=ultrasparc:%{!mcpu=v9:%{mcpu*:" DEF_ARCH32_SPEC("-xarch=v8") DEF_ARCH64_SPEC("-xarch=v9") "}}}}}}}}} \
%{!mcpu*:%(asm_cpu_default)} \
"
diff --git a/gcc/config/sparc/sparc-c.c b/gcc/config/sparc/sparc-c.c
index d3fd60e403..d9f9c15d2e 100644
--- a/gcc/config/sparc/sparc-c.c
+++ b/gcc/config/sparc/sparc-c.c
@@ -40,7 +40,12 @@ sparc_target_macros (void)
cpp_assert (parse_in, "machine=sparc");
}
- if (TARGET_VIS3)
+ if (TARGET_VIS4)
+ {
+ cpp_define (parse_in, "__VIS__=0x400");
+ cpp_define (parse_in, "__VIS=0x400");
+ }
+ else if (TARGET_VIS3)
{
cpp_define (parse_in, "__VIS__=0x300");
cpp_define (parse_in, "__VIS=0x300");
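
With the change above, compiling for a VIS 4.0 target (e.g. -mcpu=niagara7
or -mvis4) predefines __VIS__ and __VIS to 0x400, so user code can
feature-test the ISA level at preprocessing time.  A minimal sketch (the
HAVE_VIS4 name is illustrative only):

    #if defined (__VIS__) && __VIS__ >= 0x400
    # define HAVE_VIS4 1
    #else
    # define HAVE_VIS4 0
    #endif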
diff --git a/gcc/config/sparc/sparc-opts.h b/gcc/config/sparc/sparc-opts.h
index ce48f182c3..40d23a223e 100644
--- a/gcc/config/sparc/sparc-opts.h
+++ b/gcc/config/sparc/sparc-opts.h
@@ -45,6 +45,7 @@ enum processor_type {
PROCESSOR_NIAGARA2,
PROCESSOR_NIAGARA3,
PROCESSOR_NIAGARA4,
+ PROCESSOR_NIAGARA7,
PROCESSOR_NATIVE
};
diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index 082af3cc9b..1d2ecaaf3d 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -423,6 +423,30 @@ struct processor_costs niagara4_costs = {
0, /* shift penalty */
};
+static const
+struct processor_costs niagara7_costs = {
+ COSTS_N_INSNS (5), /* int load */
+ COSTS_N_INSNS (5), /* int signed load */
+ COSTS_N_INSNS (5), /* int zeroed load */
+ COSTS_N_INSNS (5), /* float load */
+ COSTS_N_INSNS (11), /* fmov, fneg, fabs */
+ COSTS_N_INSNS (11), /* fadd, fsub */
+ COSTS_N_INSNS (11), /* fcmp */
+ COSTS_N_INSNS (11), /* fmov, fmovr */
+ COSTS_N_INSNS (11), /* fmul */
+ COSTS_N_INSNS (24), /* fdivs */
+ COSTS_N_INSNS (37), /* fdivd */
+ COSTS_N_INSNS (24), /* fsqrts */
+ COSTS_N_INSNS (37), /* fsqrtd */
+ COSTS_N_INSNS (12), /* imul */
+ COSTS_N_INSNS (12), /* imulX */
+ 0, /* imul bit factor */
+ COSTS_N_INSNS (51), /* idiv, average of 42 - 61 cycle range */
+ COSTS_N_INSNS (35), /* idivX, average of 26 - 44 cycle range */
+ COSTS_N_INSNS (1), /* movcc/movr */
+ 0, /* shift penalty */
+};
+
static const struct processor_costs *sparc_costs = &cypress_costs;
#ifdef HAVE_AS_RELAX_OPTION
@@ -1175,6 +1199,8 @@ dump_target_flag_bits (const int flags)
fprintf (stderr, "VIS2 ");
if (flags & MASK_VIS3)
fprintf (stderr, "VIS3 ");
+ if (flags & MASK_VIS4)
+ fprintf (stderr, "VIS4 ");
if (flags & MASK_CBCOND)
fprintf (stderr, "CBCOND ");
if (flags & MASK_DEPRECATED_V8_INSNS)
@@ -1238,6 +1264,7 @@ sparc_option_override (void)
{ TARGET_CPU_niagara2, PROCESSOR_NIAGARA2 },
{ TARGET_CPU_niagara3, PROCESSOR_NIAGARA3 },
{ TARGET_CPU_niagara4, PROCESSOR_NIAGARA4 },
+ { TARGET_CPU_niagara7, PROCESSOR_NIAGARA7 },
{ -1, PROCESSOR_V7 }
};
const struct cpu_default *def;
@@ -1287,6 +1314,9 @@ sparc_option_override (void)
/* UltraSPARC T4 */
{ "niagara4", MASK_ISA,
MASK_V9|MASK_POPC|MASK_VIS2|MASK_VIS3|MASK_FMAF|MASK_CBCOND },
+ /* UltraSPARC M7 */
+ { "niagara7", MASK_ISA,
+ MASK_V9|MASK_POPC|MASK_VIS2|MASK_VIS3|MASK_VIS4|MASK_FMAF|MASK_CBCOND },
};
const struct cpu_table *cpu;
unsigned int i;
@@ -1416,6 +1446,9 @@ sparc_option_override (void)
#ifndef HAVE_AS_SPARC4
& ~MASK_CBCOND
#endif
+#ifndef HAVE_AS_SPARC5_VIS4
+ & ~MASK_VIS4
+#endif
#ifndef HAVE_AS_LEON
& ~(MASK_LEON | MASK_LEON3)
#endif
@@ -1434,10 +1467,15 @@ sparc_option_override (void)
if (TARGET_VIS3)
target_flags |= MASK_VIS2 | MASK_VIS;
- /* Don't allow -mvis, -mvis2, -mvis3, or -mfmaf if FPU is
+ /* -mvis4 implies -mvis3, -mvis2 and -mvis */
+ if (TARGET_VIS4)
+ target_flags |= MASK_VIS3 | MASK_VIS2 | MASK_VIS;
+
+ /* Don't allow -mvis, -mvis2, -mvis3, -mvis4 or -mfmaf if FPU is
disabled. */
if (! TARGET_FPU)
- target_flags &= ~(MASK_VIS | MASK_VIS2 | MASK_VIS3 | MASK_FMAF);
+ target_flags &= ~(MASK_VIS | MASK_VIS2 | MASK_VIS3 | MASK_VIS4
+ | MASK_FMAF);
/* -mvis assumes UltraSPARC+, so we are sure v9 instructions
are available.
@@ -1471,7 +1509,8 @@ sparc_option_override (void)
|| sparc_cpu == PROCESSOR_NIAGARA
|| sparc_cpu == PROCESSOR_NIAGARA2
|| sparc_cpu == PROCESSOR_NIAGARA3
- || sparc_cpu == PROCESSOR_NIAGARA4))
+ || sparc_cpu == PROCESSOR_NIAGARA4
+ || sparc_cpu == PROCESSOR_NIAGARA7))
align_functions = 32;
/* Validate PCC_STRUCT_RETURN. */
@@ -1535,6 +1574,9 @@ sparc_option_override (void)
case PROCESSOR_NIAGARA4:
sparc_costs = &niagara4_costs;
break;
+ case PROCESSOR_NIAGARA7:
+ sparc_costs = &niagara7_costs;
+ break;
case PROCESSOR_NATIVE:
gcc_unreachable ();
};
@@ -1566,6 +1608,29 @@ sparc_option_override (void)
if (TARGET_DEBUG_OPTIONS)
dump_target_flags ("Final target_flags", target_flags);
+ /* PARAM_SIMULTANEOUS_PREFETCHES is the number of prefetches that
+ can run at the same time. More important, it is the threshold
+ defining when additional prefetches will be dropped by the
+ hardware.
+
+ The UltraSPARC-III features a documented prefetch queue with a
+ size of 8. Additional prefetches issued in the cpu are
+ dropped.
+
+ Niagara processors are different. In these processors prefetches
+ are handled much like regular loads. The L1 miss buffer is 32
+ entries, but prefetches start getting affected when 30 entries
+ become occupied. That occupation could be a mix of regular loads
+ and prefetches though. And that buffer is shared by all threads.
+ Once the threshold is reached, if the core is running a single
+ thread the prefetch will retry. If more than one thread is
+ running, the prefetch will be dropped.
+
+ All this makes it very difficult to determine how many prefetches
+ can be issued simultaneously, even in a
+ single-threaded program. Experimental results show that setting
+ this parameter to 32 works well when the number of threads is not
+ high. */
maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
((sparc_cpu == PROCESSOR_ULTRASPARC
|| sparc_cpu == PROCESSOR_NIAGARA
@@ -1574,20 +1639,55 @@ sparc_option_override (void)
|| sparc_cpu == PROCESSOR_NIAGARA4)
? 2
: (sparc_cpu == PROCESSOR_ULTRASPARC3
- ? 8 : 3)),
+ ? 8 : (sparc_cpu == PROCESSOR_NIAGARA7
+ ? 32 : 3))),
global_options.x_param_values,
global_options_set.x_param_values);
- maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+
+ /* For PARAM_L1_CACHE_LINE_SIZE we use the default 32 bytes (see
+ params.def), so no maybe_set_param_value is needed.
+
+ The Oracle SPARC Architecture (previously the UltraSPARC
+ Architecture) specification states that when a PREFETCH[A]
+ instruction is executed an implementation-specific amount of data
+ is prefetched, and that it is at least 64 bytes long (aligned to
+ at least 64 bytes).
+
+ However, this is not correct. The M7 (and implementations prior
+ to that) does not guarantee a 64B prefetch into a cache if the
+ line size is smaller. A single cache line is all that is ever
+ prefetched. So for the M7, where the L1D$ has 32B lines and the
+ L2D$ and L3 have 64B lines, a prefetch will prefetch 64B into the
+ L2 and L3, but only 32B are brought into the L1D$. (Assuming it
+ is a read_n prefetch, which is the only type which allocates to
+ the L1.) */
+
+ /* PARAM_L1_CACHE_SIZE is the size of the L1D$ (most SPARC chips use
+ Harvard level-1 caches) in kilobytes. Both UltraSPARC and
+ Niagara processors feature an L1D$ of 16KB. */
+ maybe_set_param_value (PARAM_L1_CACHE_SIZE,
((sparc_cpu == PROCESSOR_ULTRASPARC
|| sparc_cpu == PROCESSOR_ULTRASPARC3
|| sparc_cpu == PROCESSOR_NIAGARA
|| sparc_cpu == PROCESSOR_NIAGARA2
|| sparc_cpu == PROCESSOR_NIAGARA3
- || sparc_cpu == PROCESSOR_NIAGARA4)
- ? 64 : 32),
+ || sparc_cpu == PROCESSOR_NIAGARA4
+ || sparc_cpu == PROCESSOR_NIAGARA7)
+ ? 16 : 64),
global_options.x_param_values,
global_options_set.x_param_values);
+
+ /* PARAM_L2_CACHE_SIZE is the size of the L2 in kilobytes. Note
+ that 512 is the default in params.def. */
+ maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+ (sparc_cpu == PROCESSOR_NIAGARA4
+ ? 128 : (sparc_cpu == PROCESSOR_NIAGARA7
+ ? 256 : 512)),
+ global_options.x_param_values,
+ global_options_set.x_param_values);
+
/* Disable save slot sharing for call-clobbered registers by default.
The IRA sharing algorithm works on single registers only and this
pessimizes for double floating-point registers. */
@@ -9178,7 +9278,8 @@ sparc32_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt)
&& sparc_cpu != PROCESSOR_NIAGARA
&& sparc_cpu != PROCESSOR_NIAGARA2
&& sparc_cpu != PROCESSOR_NIAGARA3
- && sparc_cpu != PROCESSOR_NIAGARA4)
+ && sparc_cpu != PROCESSOR_NIAGARA4
+ && sparc_cpu != PROCESSOR_NIAGARA7)
emit_insn (gen_flushsi (validize_mem (adjust_address (m_tramp, SImode, 8))));
/* Call __enable_execute_stack after writing onto the stack to make sure
@@ -9223,7 +9324,8 @@ sparc64_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt)
&& sparc_cpu != PROCESSOR_NIAGARA
&& sparc_cpu != PROCESSOR_NIAGARA2
&& sparc_cpu != PROCESSOR_NIAGARA3
- && sparc_cpu != PROCESSOR_NIAGARA4)
+ && sparc_cpu != PROCESSOR_NIAGARA4
+ && sparc_cpu != PROCESSOR_NIAGARA7)
emit_insn (gen_flushdi (validize_mem (adjust_address (m_tramp, DImode, 8))));
/* Call __enable_execute_stack after writing onto the stack to make sure
@@ -9419,7 +9521,8 @@ sparc_use_sched_lookahead (void)
|| sparc_cpu == PROCESSOR_NIAGARA2
|| sparc_cpu == PROCESSOR_NIAGARA3)
return 0;
- if (sparc_cpu == PROCESSOR_NIAGARA4)
+ if (sparc_cpu == PROCESSOR_NIAGARA4
+ || sparc_cpu == PROCESSOR_NIAGARA7)
return 2;
if (sparc_cpu == PROCESSOR_ULTRASPARC
|| sparc_cpu == PROCESSOR_ULTRASPARC3)
@@ -9442,6 +9545,7 @@ sparc_issue_rate (void)
default:
return 1;
case PROCESSOR_NIAGARA4:
+ case PROCESSOR_NIAGARA7:
case PROCESSOR_V9:
/* Assume V9 processors are capable of at least dual-issue. */
return 2;
@@ -10007,6 +10111,34 @@ enum sparc_builtins
SPARC_BUILTIN_XMULX,
SPARC_BUILTIN_XMULXHI,
+ /* VIS 4.0 builtins. */
+ SPARC_BUILTIN_FPADD8,
+ SPARC_BUILTIN_FPADDS8,
+ SPARC_BUILTIN_FPADDUS8,
+ SPARC_BUILTIN_FPADDUS16,
+ SPARC_BUILTIN_FPCMPLE8,
+ SPARC_BUILTIN_FPCMPGT8,
+ SPARC_BUILTIN_FPCMPULE16,
+ SPARC_BUILTIN_FPCMPUGT16,
+ SPARC_BUILTIN_FPCMPULE32,
+ SPARC_BUILTIN_FPCMPUGT32,
+ SPARC_BUILTIN_FPMAX8,
+ SPARC_BUILTIN_FPMAX16,
+ SPARC_BUILTIN_FPMAX32,
+ SPARC_BUILTIN_FPMAXU8,
+ SPARC_BUILTIN_FPMAXU16,
+ SPARC_BUILTIN_FPMAXU32,
+ SPARC_BUILTIN_FPMIN8,
+ SPARC_BUILTIN_FPMIN16,
+ SPARC_BUILTIN_FPMIN32,
+ SPARC_BUILTIN_FPMINU8,
+ SPARC_BUILTIN_FPMINU16,
+ SPARC_BUILTIN_FPMINU32,
+ SPARC_BUILTIN_FPSUB8,
+ SPARC_BUILTIN_FPSUBS8,
+ SPARC_BUILTIN_FPSUBUS8,
+ SPARC_BUILTIN_FPSUBUS16,
+
SPARC_BUILTIN_MAX
};
@@ -10483,6 +10615,83 @@ sparc_vis_init_builtins (void)
def_builtin_const ("__builtin_vis_xmulxhi", CODE_FOR_xmulxhi_vis,
SPARC_BUILTIN_XMULXHI, di_ftype_di_di);
}
+
+ if (TARGET_VIS4)
+ {
+ def_builtin_const ("__builtin_vis_fpadd8", CODE_FOR_addv8qi3,
+ SPARC_BUILTIN_FPADD8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpadds8", CODE_FOR_ssaddv8qi3,
+ SPARC_BUILTIN_FPADDS8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpaddus8", CODE_FOR_usaddv8qi3,
+ SPARC_BUILTIN_FPADDUS8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpaddus16", CODE_FOR_usaddv4hi3,
+ SPARC_BUILTIN_FPADDUS16, v4hi_ftype_v4hi_v4hi);
+
+ if (TARGET_ARCH64)
+ {
+ def_builtin_const ("__builtin_vis_fpcmple8", CODE_FOR_fpcmple8di_vis,
+ SPARC_BUILTIN_FPCMPLE8, di_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpcmpgt8", CODE_FOR_fpcmpgt8di_vis,
+ SPARC_BUILTIN_FPCMPGT8, di_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpcmpule16", CODE_FOR_fpcmpule16di_vis,
+ SPARC_BUILTIN_FPCMPULE16, di_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpcmpugt16", CODE_FOR_fpcmpugt16di_vis,
+ SPARC_BUILTIN_FPCMPUGT16, di_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpcmpule32", CODE_FOR_fpcmpule32di_vis,
+ SPARC_BUILTIN_FPCMPULE32, di_ftype_v2si_v2si);
+ def_builtin_const ("__builtin_vis_fpcmpugt32", CODE_FOR_fpcmpugt32di_vis,
+ SPARC_BUILTIN_FPCMPUGT32, di_ftype_v2si_v2si);
+ }
+ else
+ {
+ def_builtin_const ("__builtin_vis_fpcmple8", CODE_FOR_fpcmple8si_vis,
+ SPARC_BUILTIN_FPCMPLE8, si_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpcmpgt8", CODE_FOR_fpcmpgt8si_vis,
+ SPARC_BUILTIN_FPCMPGT8, si_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpcmpule16", CODE_FOR_fpcmpule16si_vis,
+ SPARC_BUILTIN_FPCMPULE16, si_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpcmpugt16", CODE_FOR_fpcmpugt16si_vis,
+ SPARC_BUILTIN_FPCMPUGT16, si_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpcmpule32", CODE_FOR_fpcmpule32si_vis,
+ SPARC_BUILTIN_FPCMPULE32, di_ftype_v2si_v2si);
+ def_builtin_const ("__builtin_vis_fpcmpugt32", CODE_FOR_fpcmpugt32si_vis,
+ SPARC_BUILTIN_FPCMPUGT32, di_ftype_v2si_v2si);
+ }
+
+ def_builtin_const ("__builtin_vis_fpmax8", CODE_FOR_maxv8qi3,
+ SPARC_BUILTIN_FPMAX8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpmax16", CODE_FOR_maxv4hi3,
+ SPARC_BUILTIN_FPMAX16, v4hi_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpmax32", CODE_FOR_maxv2si3,
+ SPARC_BUILTIN_FPMAX32, v2si_ftype_v2si_v2si);
+ def_builtin_const ("__builtin_vis_fpmaxu8", CODE_FOR_maxuv8qi3,
+ SPARC_BUILTIN_FPMAXU8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpmaxu16", CODE_FOR_maxuv4hi3,
+ SPARC_BUILTIN_FPMAXU16, v4hi_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpmaxu32", CODE_FOR_maxuv2si3,
+ SPARC_BUILTIN_FPMAXU32, v2si_ftype_v2si_v2si);
+ def_builtin_const ("__builtin_vis_fpmin8", CODE_FOR_minv8qi3,
+ SPARC_BUILTIN_FPMIN8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpmin16", CODE_FOR_minv4hi3,
+ SPARC_BUILTIN_FPMIN16, v4hi_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpmin32", CODE_FOR_minv2si3,
+ SPARC_BUILTIN_FPMIN32, v2si_ftype_v2si_v2si);
+ def_builtin_const ("__builtin_vis_fpminu8", CODE_FOR_minuv8qi3,
+ SPARC_BUILTIN_FPMINU8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpminu16", CODE_FOR_minuv4hi3,
+ SPARC_BUILTIN_FPMINU16, v4hi_ftype_v4hi_v4hi);
+ def_builtin_const ("__builtin_vis_fpminu32", CODE_FOR_minuv2si3,
+ SPARC_BUILTIN_FPMINU32, v2si_ftype_v2si_v2si);
+ def_builtin_const ("__builtin_vis_fpsub8", CODE_FOR_subv8qi3,
+ SPARC_BUILTIN_FPSUB8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpsubs8", CODE_FOR_sssubv8qi3,
+ SPARC_BUILTIN_FPSUBS8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpsubus8", CODE_FOR_ussubv8qi3,
+ SPARC_BUILTIN_FPSUBUS8, v8qi_ftype_v8qi_v8qi);
+ def_builtin_const ("__builtin_vis_fpsubus16", CODE_FOR_ussubv4hi3,
+ SPARC_BUILTIN_FPSUBUS16, v4hi_ftype_v4hi_v4hi);
+ }
}
/* Implement TARGET_BUILTIN_DECL hook. */
@@ -11042,7 +11251,8 @@ sparc_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
|| sparc_cpu == PROCESSOR_NIAGARA
|| sparc_cpu == PROCESSOR_NIAGARA2
|| sparc_cpu == PROCESSOR_NIAGARA3
- || sparc_cpu == PROCESSOR_NIAGARA4)
+ || sparc_cpu == PROCESSOR_NIAGARA4
+ || sparc_cpu == PROCESSOR_NIAGARA7)
return 12;
return 6;
diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h
index ebfe87db04..d91496ab6a 100644
--- a/gcc/config/sparc/sparc.h
+++ b/gcc/config/sparc/sparc.h
@@ -142,6 +142,7 @@ extern enum cmodel sparc_cmodel;
#define TARGET_CPU_niagara2 14
#define TARGET_CPU_niagara3 15
#define TARGET_CPU_niagara4 16
+#define TARGET_CPU_niagara7 19
#if TARGET_CPU_DEFAULT == TARGET_CPU_v9 \
|| TARGET_CPU_DEFAULT == TARGET_CPU_ultrasparc \
@@ -149,7 +150,8 @@ extern enum cmodel sparc_cmodel;
|| TARGET_CPU_DEFAULT == TARGET_CPU_niagara \
|| TARGET_CPU_DEFAULT == TARGET_CPU_niagara2 \
|| TARGET_CPU_DEFAULT == TARGET_CPU_niagara3 \
- || TARGET_CPU_DEFAULT == TARGET_CPU_niagara4
+ || TARGET_CPU_DEFAULT == TARGET_CPU_niagara4 \
+ || TARGET_CPU_DEFAULT == TARGET_CPU_niagara7
#define CPP_CPU32_DEFAULT_SPEC ""
#define ASM_CPU32_DEFAULT_SPEC ""
@@ -186,6 +188,10 @@ extern enum cmodel sparc_cmodel;
#define CPP_CPU64_DEFAULT_SPEC "-D__sparc_v9__"
#define ASM_CPU64_DEFAULT_SPEC AS_NIAGARA4_FLAG
#endif
+#if TARGET_CPU_DEFAULT == TARGET_CPU_niagara7
+#define CPP_CPU64_DEFAULT_SPEC "-D__sparc_v9__"
+#define ASM_CPU64_DEFAULT_SPEC AS_NIAGARA7_FLAG
+#endif
#else
@@ -288,6 +294,7 @@ extern enum cmodel sparc_cmodel;
%{mcpu=niagara2:-D__sparc_v9__} \
%{mcpu=niagara3:-D__sparc_v9__} \
%{mcpu=niagara4:-D__sparc_v9__} \
+%{mcpu=niagara7:-D__sparc_v9__} \
%{!mcpu*:%(cpp_cpu_default)} \
"
#define CPP_ARCH32_SPEC ""
@@ -339,6 +346,7 @@ extern enum cmodel sparc_cmodel;
%{mcpu=niagara2:%{!mv8plus:-Av9b}} \
%{mcpu=niagara3:%{!mv8plus:-Av9" AS_NIAGARA3_FLAG "}} \
%{mcpu=niagara4:%{!mv8plus:" AS_NIAGARA4_FLAG "}} \
+%{mcpu=niagara7:%{!mv8plus:" AS_NIAGARA7_FLAG "}} \
%{!mcpu*:%(asm_cpu_default)} \
"
@@ -1777,6 +1785,12 @@ extern int sparc_indent_opcode;
#define AS_NIAGARA4_FLAG "-Av9" AS_NIAGARA3_FLAG
#endif
+#ifdef HAVE_AS_SPARC5_VIS4
+#define AS_NIAGARA7_FLAG "-xarch=sparc5"
+#else
+#define AS_NIAGARA7_FLAG AS_NIAGARA4_FLAG
+#endif
+
#ifdef HAVE_AS_LEON
#define AS_LEON_FLAG "-Aleon"
#define AS_LEONV7_FLAG "-Aleon"
diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md
index 56d4f63017..29e4966fcc 100644
--- a/gcc/config/sparc/sparc.md
+++ b/gcc/config/sparc/sparc.md
@@ -234,7 +234,8 @@
niagara,
niagara2,
niagara3,
- niagara4"
+ niagara4,
+ niagara7"
(const (symbol_ref "sparc_cpu_attr")))
;; Attribute for the instruction set.
@@ -247,7 +248,7 @@
(symbol_ref "TARGET_SPARCLET") (const_string "sparclet")]
(const_string "v7"))))
-(define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3" (const_string "none"))
+(define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3,vis4" (const_string "none"))
(define_attr "enabled" ""
(cond [(eq_attr "cpu_feature" "none") (const_int 1)
@@ -255,7 +256,8 @@
(eq_attr "cpu_feature" "fpunotv9") (symbol_ref "TARGET_FPU && ! TARGET_V9")
(eq_attr "cpu_feature" "v9") (symbol_ref "TARGET_V9")
(eq_attr "cpu_feature" "vis") (symbol_ref "TARGET_VIS")
- (eq_attr "cpu_feature" "vis3") (symbol_ref "TARGET_VIS3")]
+ (eq_attr "cpu_feature" "vis3") (symbol_ref "TARGET_VIS3")
+ (eq_attr "cpu_feature" "vis4") (symbol_ref "TARGET_VIS4")]
(const_int 0)))
;; Insn type.
@@ -274,7 +276,7 @@
fga,visl,vismv,fgm_pack,fgm_mul,pdist,pdistn,edge,edgen,gsr,array,
cmove,
ialuX,
- multi,savew,flushw,iflush,trap"
+ multi,savew,flushw,iflush,trap,lzd"
(const_string "ialu"))
;; True if branch/call has empty delay slot and will emit a nop in it
@@ -476,6 +478,10 @@
(const_string "true")
] (const_string "false")))
+;; True if the instruction executes in the V3 pipeline, in M7 and
+;; later processors.
+(define_attr "v3pipe" "false,true" (const_string "false"))
+
(define_delay (eq_attr "type" "call")
[(eq_attr "in_call_delay" "true") (nil) (nil)])
@@ -504,6 +510,7 @@
(include "niagara.md")
(include "niagara2.md")
(include "niagara4.md")
+(include "niagara7.md")
;; Operand and operator predicates and constraints
@@ -1457,6 +1464,7 @@
fzeros\t%0
fones\t%0"
[(set_attr "type" "*,*,load,store,vismv,vismv,fpmove,fpload,fpstore,visl,visl")
+ (set_attr "v3pipe" "*,*,*,*,true,true,*,*,*,true,true")
(set_attr "cpu_feature" "*,*,*,*,vis3,vis3,*,*,*,vis,vis")])
(define_insn "*movsi_lo_sum"
@@ -1622,6 +1630,7 @@
fzero\t%0
fone\t%0"
[(set_attr "type" "store,store,store,load,*,*,*,*,fpstore,fpload,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl")
+ (set_attr "v3pipe" "false, false, false, false,false,false,false,false,false,false,false,false,false,false,false,false,false,false, true, true")
(set_attr "length" "*,2,*,*,2,2,2,2,*,*,2,2,*,2,2,2,*,*,*,*")
(set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double")
(set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis")])
@@ -1645,6 +1654,7 @@
fzero\t%0
fone\t%0"
[(set_attr "type" "*,*,load,store,vismv,vismv,fpmove,fpload,fpstore,visl,visl")
+ (set_attr "v3pipe" "*, *, *, *, *, *, *, *, *, true, true")
(set_attr "fptype" "*,*,*,*,*,*,double,*,*,double,double")
(set_attr "cpu_feature" "*,*,*,*,vis3,vis3,*,*,*,vis,vis")])
@@ -2208,6 +2218,7 @@
}
}
[(set_attr "type" "visl,visl,fpmove,*,*,*,vismv,vismv,fpload,load,fpstore,store")
+ (set_attr "v3pipe" "true, true, *, *, *, *, true, true, *, *, *, *")
(set_attr "cpu_feature" "vis,vis,fpu,*,*,*,vis3,vis3,fpu,*,fpu,*")])
;; The following 3 patterns build SFmode constants in integer registers.
@@ -2276,6 +2287,7 @@
#
#"
[(set_attr "type" "visl,visl,fpmove,*,*,*,fpload,store,fpstore,load,store,*,*,*,*")
+ (set_attr "v3pipe" "true, true, *, *, *, *, *, *, *, *, *, *, *, *, *")
(set_attr "length" "*,*,*,2,2,2,*,*,*,*,*,2,2,2,2")
(set_attr "fptype" "double,double,double,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "cpu_feature" "vis,vis,v9,fpunotv9,vis3,vis3,fpu,v9,fpu,*,*,fpu,*,*,fpu")])
@@ -2299,6 +2311,7 @@
stx\t%r1, %0
#"
[(set_attr "type" "visl,visl,fpmove,vismv,vismv,load,store,*,load,store,*")
+ (set_attr "v3pipe" "true, true, *, *, *, *, *, *, *, *, *")
(set_attr "length" "*,*,*,*,*,*,*,*,*,*,2")
(set_attr "fptype" "double,double,double,double,double,*,*,*,*,*,*")
(set_attr "cpu_feature" "vis,vis,fpu,vis3,vis3,fpu,fpu,*,*,*,*")])
@@ -2980,6 +2993,7 @@
lduw\t%1, %0
movstouw\t%1, %0"
[(set_attr "type" "shift,load,*")
+ (set_attr "v3pipe" "*,*,true")
(set_attr "cpu_feature" "*,*,vis3")])
(define_insn_and_split "*zero_extendsidi2_insn_sp32"
@@ -3294,6 +3308,7 @@
ldsw\t%1, %0
movstosw\t%1, %0"
[(set_attr "type" "shift,sload,*")
+ (set_attr "v3pipe" "*,*,true")
(set_attr "us3load_type" "*,3cycle,*")
(set_attr "cpu_feature" "*,*,vis3")])
@@ -6770,7 +6785,8 @@
[(set (match_operand:DI 0 "register_operand" "=r")
(clz:DI (match_operand:DI 1 "register_operand" "r")))]
"TARGET_VIS3 && TARGET_ARCH64"
- "lzd\t%1, %0")
+ "lzd\t%1, %0"
+ [(set_attr "type" "lzd")])
(define_insn "clzdi_v8plus"
[(set (match_operand:DI 0 "register_operand" "=r")
@@ -6811,7 +6827,8 @@
(truncate:SI
(clz:DI (match_operand:DI 1 "register_operand" "r"))))]
"TARGET_VIS3 && TARGET_ARCH64"
- "lzd\t%1, %0")
+ "lzd\t%1, %0"
+ [(set_attr "type" "lzd")])
(define_insn "clzsi_v8plus"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -7777,7 +7794,7 @@
(define_mode_iterator VM64 [V1DI V2SI V4HI V8QI])
(define_mode_iterator VMALL [V1SI V2HI V4QI V1DI V2SI V4HI V8QI])
-(define_mode_attr vbits [(V2SI "32") (V4HI "16") (V1SI "32s") (V2HI "16s")])
+(define_mode_attr vbits [(V2SI "32") (V4HI "16") (V1SI "32s") (V2HI "16s") (V8QI "8")])
(define_mode_attr vconstr [(V1SI "f") (V2HI "f") (V4QI "f")
(V1DI "e") (V2SI "e") (V4HI "e") (V8QI "e")])
(define_mode_attr vfptype [(V1SI "single") (V2HI "single") (V4QI "single")
@@ -7812,6 +7829,7 @@
movstouw\t%1, %0
movwtos\t%1, %0"
[(set_attr "type" "visl,visl,vismv,fpload,fpstore,store,load,store,*,vismv,vismv")
+ (set_attr "v3pipe" "true,true,true,false,false,false,false,false,false,true,true")
(set_attr "cpu_feature" "vis,vis,vis,*,*,*,*,*,*,vis3,vis3")])
(define_insn "*mov<VM64:mode>_insn_sp64"
@@ -7834,6 +7852,7 @@
movxtod\t%1, %0
mov\t%1, %0"
[(set_attr "type" "visl,visl,vismv,fpload,fpstore,store,load,store,vismv,vismv,*")
+ (set_attr "v3pipe" "true, true, true, false, false, false, false, false, false, false, false")
(set_attr "cpu_feature" "vis,vis,vis,*,*,*,*,*,vis3,vis3,*")])
(define_insn "*mov<VM64:mode>_insn_sp32"
@@ -7857,6 +7876,7 @@
#
#"
[(set_attr "type" "visl,visl,vismv,*,*,fpload,fpstore,store,load,store,*,*")
+ (set_attr "v3pipe" "true, true, true, false, false, false, false, false, false, false, false, false")
(set_attr "length" "*,*,*,2,2,*,*,*,*,*,2,2")
(set_attr "cpu_feature" "vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*")])
@@ -7936,7 +7956,8 @@
"TARGET_VIS"
"fp<plusminus_insn><vbits>\t%1, %2, %0"
[(set_attr "type" "fga")
- (set_attr "fptype" "<vfptype>")])
+ (set_attr "fptype" "<vfptype>")
+ (set_attr "v3pipe" "true")])
(define_mode_iterator VL [V1SI V2HI V4QI V1DI V2SI V4HI V8QI])
(define_mode_attr vlsuf [(V1SI "s") (V2HI "s") (V4QI "s")
@@ -7952,6 +7973,7 @@
"TARGET_VIS"
"f<vlinsn><vlsuf>\t%1, %2, %0"
[(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")
(set_attr "fptype" "<vfptype>")])
(define_insn "*not_<code><mode>3"
@@ -7961,6 +7983,7 @@
"TARGET_VIS"
"f<vlninsn><vlsuf>\t%1, %2, %0"
[(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")
(set_attr "fptype" "<vfptype>")])
;; (ior (not (op1)) (not (op2))) is the canonical form of NAND.
@@ -7971,6 +7994,7 @@
"TARGET_VIS"
"fnand<vlsuf>\t%1, %2, %0"
[(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")
(set_attr "fptype" "<vfptype>")])
(define_code_iterator vlnotop [ior and])
@@ -7982,6 +8006,7 @@
"TARGET_VIS"
"f<vlinsn>not1<vlsuf>\t%1, %2, %0"
[(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")
(set_attr "fptype" "<vfptype>")])
(define_insn "*<code>_not2<mode>_vis"
@@ -7991,6 +8016,7 @@
"TARGET_VIS"
"f<vlinsn>not2<vlsuf>\t%1, %2, %0"
[(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")
(set_attr "fptype" "<vfptype>")])
(define_insn "one_cmpl<mode>2"
@@ -7999,6 +8025,7 @@
"TARGET_VIS"
"fnot1<vlsuf>\t%1, %0"
[(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")
(set_attr "fptype" "<vfptype>")])
;; Hard to generate VIS instructions. We have builtins for these.
@@ -8225,7 +8252,8 @@
"TARGET_VIS"
"faligndata\t%1, %2, %0"
[(set_attr "type" "fga")
- (set_attr "fptype" "double")])
+ (set_attr "fptype" "double")
+ (set_attr "v3pipe" "true")])
(define_insn "alignaddrsi_vis"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -8235,7 +8263,8 @@
(zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))]
"TARGET_VIS"
"alignaddr\t%r1, %r2, %0"
- [(set_attr "type" "gsr")])
+ [(set_attr "type" "gsr")
+ (set_attr "v3pipe" "true")])
(define_insn "alignaddrdi_vis"
[(set (match_operand:DI 0 "register_operand" "=r")
@@ -8245,7 +8274,8 @@
(plus:DI (match_dup 1) (match_dup 2)))]
"TARGET_VIS"
"alignaddr\t%r1, %r2, %0"
- [(set_attr "type" "gsr")])
+ [(set_attr "type" "gsr")
+ (set_attr "v3pipe" "true")])
(define_insn "alignaddrlsi_vis"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -8256,7 +8286,8 @@
(const_int 7)))]
"TARGET_VIS"
"alignaddrl\t%r1, %r2, %0"
- [(set_attr "type" "gsr")])
+ [(set_attr "type" "gsr")
+ (set_attr "v3pipe" "true")])
(define_insn "alignaddrldi_vis"
[(set (match_operand:DI 0 "register_operand" "=r")
@@ -8267,7 +8298,8 @@
(const_int 7)))]
"TARGET_VIS"
"alignaddrl\t%r1, %r2, %0"
- [(set_attr "type" "gsr")])
+ [(set_attr "type" "gsr")
+ (set_attr "v3pipe" "true")])
(define_insn "pdist_vis"
[(set (match_operand:DI 0 "register_operand" "=e")
@@ -8360,6 +8392,17 @@
"TARGET_VIS"
"fcmp<code><GCM:gcm_name>\t%1, %2, %0"
[(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")
+ (set_attr "fptype" "double")])
+
+(define_insn "fpcmp<code>8<P:mode>_vis"
+ [(set (match_operand:P 0 "register_operand" "=r")
+ (unspec:P [(gcond:V8QI (match_operand:V8QI 1 "register_operand" "e")
+ (match_operand:V8QI 2 "register_operand" "e"))]
+ UNSPEC_FCMP))]
+ "TARGET_VIS4"
+ "fpcmp<code>8\t%1, %2, %0"
+ [(set_attr "type" "visl")
(set_attr "fptype" "double")])
(define_expand "vcond<mode><mode>"
@@ -8427,7 +8470,8 @@
(plus:DI (match_dup 1) (match_dup 2)))]
"TARGET_VIS2"
"bmask\t%r1, %r2, %0"
- [(set_attr "type" "array")])
+ [(set_attr "type" "array")
+ (set_attr "v3pipe" "true")])
(define_insn "bmasksi_vis"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -8437,7 +8481,8 @@
(zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))]
"TARGET_VIS2"
"bmask\t%r1, %r2, %0"
- [(set_attr "type" "array")])
+ [(set_attr "type" "array")
+ (set_attr "v3pipe" "true")])
(define_insn "bshuffle<VM64:mode>_vis"
[(set (match_operand:VM64 0 "register_operand" "=e")
@@ -8448,7 +8493,8 @@
"TARGET_VIS2"
"bshuffle\t%1, %2, %0"
[(set_attr "type" "fga")
- (set_attr "fptype" "double")])
+ (set_attr "fptype" "double")
+ (set_attr "v3pipe" "true")])
;; The rtl expanders will happily convert constant permutations on other
;; modes down to V8QI. Rely on this to avoid the complexity of the byte
@@ -8550,7 +8596,8 @@
UNSPEC_CMASK8))]
"TARGET_VIS3"
"cmask8\t%r0"
- [(set_attr "type" "fga")])
+ [(set_attr "type" "fga")
+ (set_attr "v3pipe" "true")])
(define_insn "cmask16<P:mode>_vis"
[(set (reg:DI GSR_REG)
@@ -8559,7 +8606,8 @@
UNSPEC_CMASK16))]
"TARGET_VIS3"
"cmask16\t%r0"
- [(set_attr "type" "fga")])
+ [(set_attr "type" "fga")
+ (set_attr "v3pipe" "true")])
(define_insn "cmask32<P:mode>_vis"
[(set (reg:DI GSR_REG)
@@ -8568,7 +8616,8 @@
UNSPEC_CMASK32))]
"TARGET_VIS3"
"cmask32\t%r0"
- [(set_attr "type" "fga")])
+ [(set_attr "type" "fga")
+ (set_attr "v3pipe" "true")])
(define_insn "fchksm16_vis"
[(set (match_operand:V4HI 0 "register_operand" "=e")
@@ -8601,6 +8650,7 @@
"TARGET_VIS3"
"pdistn\t%1, %2, %0"
[(set_attr "type" "pdistn")
+ (set_attr "v3pipe" "true")
(set_attr "fptype" "double")])
(define_insn "fmean16_vis"
@@ -8628,6 +8678,14 @@
"fp<plusminus_insn>64\t%1, %2, %0"
[(set_attr "type" "fga")])
+(define_insn "<plusminus_insn>v8qi3"
+ [(set (match_operand:V8QI 0 "register_operand" "=e")
+ (plusminus:V8QI (match_operand:V8QI 1 "register_operand" "e")
+ (match_operand:V8QI 2 "register_operand" "e")))]
+ "TARGET_VIS4"
+ "fp<plusminus_insn>8\t%1, %2, %0"
+ [(set_attr "type" "fga")])
+
(define_mode_iterator VASS [V4HI V2SI V2HI V1SI])
(define_code_iterator vis3_addsub_ss [ss_plus ss_minus])
(define_code_attr vis3_addsub_ss_insn
@@ -8641,8 +8699,63 @@
(match_operand:VASS 2 "register_operand" "<vconstr>")))]
"TARGET_VIS3"
"<vis3_addsub_ss_insn><vbits>\t%1, %2, %0"
+ [(set_attr "type" "fga")
+ (set_attr "v3pipe" "true")])
+
+(define_mode_iterator VMMAX [V8QI V4HI V2SI])
+(define_code_iterator vis4_minmax [smin smax])
+(define_code_attr vis4_minmax_insn
+ [(smin "fpmin") (smax "fpmax")])
+(define_code_attr vis4_minmax_patname
+ [(smin "min") (smax "max")])
+
+(define_insn "<vis4_minmax_patname><mode>3"
+ [(set (match_operand:VMMAX 0 "register_operand" "=<vconstr>")
+ (vis4_minmax:VMMAX (match_operand:VMMAX 1 "register_operand" "<vconstr>")
+ (match_operand:VMMAX 2 "register_operand" "<vconstr>")))]
+ "TARGET_VIS4"
+ "<vis4_minmax_insn><vbits>\t%1, %2, %0"
+ [(set_attr "type" "fga")])
+
+(define_code_iterator vis4_uminmax [umin umax])
+(define_code_attr vis4_uminmax_insn
+ [(umin "fpminu") (umax "fpmaxu")])
+(define_code_attr vis4_uminmax_patname
+ [(umin "minu") (umax "maxu")])
+
+(define_insn "<vis4_uminmax_patname><mode>3"
+ [(set (match_operand:VMMAX 0 "register_operand" "=<vconstr>")
+ (vis4_uminmax:VMMAX (match_operand:VMMAX 1 "register_operand" "<vconstr>")
+ (match_operand:VMMAX 2 "register_operand" "<vconstr>")))]
+ "TARGET_VIS4"
+ "<vis4_uminmax_insn><vbits>\t%1, %2, %0"
[(set_attr "type" "fga")])
+;; The use of vis3_addsub_ss_patname in the VIS4 instruction below is
+;; intended.
+(define_insn "<vis3_addsub_ss_patname>v8qi3"
+ [(set (match_operand:V8QI 0 "register_operand" "=e")
+ (vis3_addsub_ss:V8QI (match_operand:V8QI 1 "register_operand" "e")
+ (match_operand:V8QI 2 "register_operand" "e")))]
+ "TARGET_VIS4"
+ "<vis3_addsub_ss_insn>8\t%1, %2, %0"
+ [(set_attr "type" "fga")])
+
+(define_mode_iterator VAUS [V4HI V8QI])
+(define_code_iterator vis4_addsub_us [us_plus us_minus])
+(define_code_attr vis4_addsub_us_insn
+ [(us_plus "fpaddus") (us_minus "fpsubus")])
+(define_code_attr vis4_addsub_us_patname
+ [(us_plus "usadd") (us_minus "ussub")])
+
+(define_insn "<vis4_addsub_us_patname><mode>3"
+ [(set (match_operand:VAUS 0 "register_operand" "=<vconstr>")
+ (vis4_addsub_us:VAUS (match_operand:VAUS 1 "register_operand" "<vconstr>")
+ (match_operand:VAUS 2 "register_operand" "<vconstr>")))]
+ "TARGET_VIS4"
+ "<vis4_addsub_us_insn><vbits>\t%1, %2, %0"
+ [(set_attr "type" "fga")])
+
(define_insn "fucmp<code>8<P:mode>_vis"
[(set (match_operand:P 0 "register_operand" "=r")
(unspec:P [(gcond:V8QI (match_operand:V8QI 1 "register_operand" "e")
@@ -8650,7 +8763,18 @@
UNSPEC_FUCMP))]
"TARGET_VIS3"
"fucmp<code>8\t%1, %2, %0"
- [(set_attr "type" "visl")])
+ [(set_attr "type" "visl")
+ (set_attr "v3pipe" "true")])
+
+(define_insn "fpcmpu<code><GCM:gcm_name><P:mode>_vis"
+ [(set (match_operand:P 0 "register_operand" "=r")
+ (unspec:P [(gcond:GCM (match_operand:GCM 1 "register_operand" "e")
+ (match_operand:GCM 2 "register_operand" "e"))]
+ UNSPEC_FUCMP))]
+ "TARGET_VIS4"
+ "fpcmpu<code><GCM:gcm_name>\t%1, %2, %0"
+ [(set_attr "type" "visl")
+ (set_attr "fptype" "double")])
(define_insn "*naddsf3"
[(set (match_operand:SF 0 "register_operand" "=f")
diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt
index 25eaa1ae5f..13d41515f0 100644
--- a/gcc/config/sparc/sparc.opt
+++ b/gcc/config/sparc/sparc.opt
@@ -73,6 +73,10 @@ mvis3
Target Report Mask(VIS3)
Use UltraSPARC Visual Instruction Set version 3.0 extensions.
+mvis4
+Target Report Mask(VIS4)
+Use UltraSPARC Visual Instruction Set version 4.0 extensions.
+
mcbcond
Target Report Mask(CBCOND)
Use UltraSPARC Compare-and-Branch extensions.
@@ -194,6 +198,9 @@ Enum(sparc_processor_type) String(niagara3) Value(PROCESSOR_NIAGARA3)
EnumValue
Enum(sparc_processor_type) String(niagara4) Value(PROCESSOR_NIAGARA4)
+EnumValue
+Enum(sparc_processor_type) String(niagara7) Value(PROCESSOR_NIAGARA7)
+
mcmodel=
Target RejectNegative Joined Var(sparc_cmodel_string)
Use given SPARC-V9 code model.
diff --git a/gcc/config/sparc/visintrin.h b/gcc/config/sparc/visintrin.h
index 51ef73906a..8b80cc1e56 100644
--- a/gcc/config/sparc/visintrin.h
+++ b/gcc/config/sparc/visintrin.h
@@ -704,6 +704,192 @@ __vis_xmulxhi (__i64 __A, __i64 __B)
#endif /* __VIS__ >= 0x300 */
+#if __VIS__ >= 0x400
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpadd8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpadd8 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpadds8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpadds8 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpaddus8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpaddus8 (__A, __B);
+}
+
+extern __inline __v4hi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpaddus16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpaddus16 (__A, __B);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpcmple8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpcmple8 (__A, __B);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpcmpgt8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpcmpgt8 (__A, __B);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpcmpule16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpcmpule16 (__A, __B);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpcmpugt16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpcmpugt16 (__A, __B);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpcmpule32 (__v2si __A, __v2si __B)
+{
+ return __builtin_vis_fpcmpule32 (__A, __B);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpcmpugt32 (__v2si __A, __v2si __B)
+{
+ return __builtin_vis_fpcmpugt32 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmax8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpmax8 (__A, __B);
+}
+
+extern __inline __v4hi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmax16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpmax16 (__A, __B);
+}
+
+extern __inline __v2si
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmax32 (__v2si __A, __v2si __B)
+{
+ return __builtin_vis_fpmax32 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmaxu8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpmaxu8 (__A, __B);
+}
+
+extern __inline __v4hi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmaxu16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpmaxu16 (__A, __B);
+}
+
+extern __inline __v2si
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmaxu32 (__v2si __A, __v2si __B)
+{
+ return __builtin_vis_fpmaxu32 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmin8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpmin8 (__A, __B);
+}
+
+extern __inline __v4hi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmin16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpmin16 (__A, __B);
+}
+
+extern __inline __v2si
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpmin32 (__v2si __A, __v2si __B)
+{
+ return __builtin_vis_fpmin32 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpminu8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpminu8 (__A, __B);
+}
+
+extern __inline __v4hi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpminu16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpminu16 (__A, __B);
+}
+
+extern __inline __v2si
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpminu32 (__v2si __A, __v2si __B)
+{
+ return __builtin_vis_fpminu32 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpsub8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpsub8 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpsubs8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpsubs8 (__A, __B);
+}
+
+extern __inline __v8qi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpsubus8 (__v8qi __A, __v8qi __B)
+{
+ return __builtin_vis_fpsubus8 (__A, __B);
+}
+
+extern __inline __v4hi
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__vis_fpsubus16 (__v4hi __A, __v4hi __B)
+{
+ return __builtin_vis_fpsubus16 (__A, __B);
+}
+
+#endif /* __VIS__ >= 0x400 */
+
#endif /* __VIS__ */
#endif /* _VISINTRIN_H_INCLUDED */
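
The intrinsics added above are thin wrappers around the new VIS 4.0
builtins registered in sparc.c.  A short usage sketch, assuming a target
compiled with -mvis4 (or -mcpu=niagara7) and the __v8qi vector typedef that
visintrin.h already provides; the function name is illustrative:

    #include <visintrin.h>

    __v8qi
    saturating_range (__v8qi a, __v8qi b)
    {
      __v8qi hi = __vis_fpmax8 (a, b);   /* per-byte signed maximum */
      __v8qi lo = __vis_fpmin8 (a, b);   /* per-byte signed minimum */
      return __vis_fpsubs8 (hi, lo);     /* saturating byte subtraction */
    }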
diff --git a/gcc/config/visium/visium-protos.h b/gcc/config/visium/visium-protos.h
index 484d01e477..9dcbc67035 100644
--- a/gcc/config/visium/visium-protos.h
+++ b/gcc/config/visium/visium-protos.h
@@ -49,7 +49,8 @@ extern void visium_split_cbranch (enum rtx_code, rtx, rtx, rtx);
extern const char *output_ubranch (rtx, rtx_insn *);
extern const char *output_cbranch (rtx, enum rtx_code, enum machine_mode, int,
rtx_insn *);
-extern void split_double_move (rtx *, enum machine_mode);
+extern void visium_split_double_move (rtx *, enum machine_mode);
+extern void visium_split_double_add (enum rtx_code, rtx, rtx, rtx);
extern void visium_expand_copysign (rtx *, enum machine_mode);
extern void visium_expand_int_cstore (rtx *, enum machine_mode);
extern void visium_expand_fp_cstore (rtx *, enum machine_mode);
diff --git a/gcc/config/visium/visium.c b/gcc/config/visium/visium.c
index cd28f9bf90..6712fed72b 100644
--- a/gcc/config/visium/visium.c
+++ b/gcc/config/visium/visium.c
@@ -2026,7 +2026,7 @@ visium_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED,
/* Split a double move of OPERANDS in MODE. */
void
-split_double_move (rtx *operands, enum machine_mode mode)
+visium_split_double_move (rtx *operands, enum machine_mode mode)
{
bool swap = false;
@@ -2076,14 +2076,74 @@ split_double_move (rtx *operands, enum machine_mode mode)
}
}
+/* Split a double addition or subtraction of operands. */
+
+void
+visium_split_double_add (enum rtx_code code, rtx op0, rtx op1, rtx op2)
+{
+ rtx op3 = gen_lowpart (SImode, op0);
+ rtx op4 = gen_lowpart (SImode, op1);
+ rtx op5;
+ rtx op6 = gen_highpart (SImode, op0);
+ rtx op7 = (op1 == const0_rtx ? op1 : gen_highpart (SImode, op1));
+ rtx op8;
+ rtx x, pat, flags;
+
+ /* If operand #2 is a small constant, then its high part is zero. */
+ if (CONST_INT_P (op2))
+ {
+ HOST_WIDE_INT val = INTVAL (op2);
+
+ if (val < 0)
+ {
+ code = (code == MINUS ? PLUS : MINUS);
+ val = -val;
+ }
+
+ op5 = gen_int_mode (val, SImode);
+ op8 = const0_rtx;
+ }
+ else
+ {
+ op5 = gen_lowpart (SImode, op2);
+ op8 = gen_highpart (SImode, op2);
+ }
+
+ /* This is the {add,sub,neg}si3_insn_set_flags pattern. */
+ if (op4 == const0_rtx)
+ x = gen_rtx_NEG (SImode, op5);
+ else
+ x = gen_rtx_fmt_ee (code, SImode, op4, op5);
+ pat = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
+ XVECEXP (pat, 0, 0) = gen_rtx_SET (op3, x);
+ flags = gen_rtx_REG (CC_NOOVmode, FLAGS_REGNUM);
+ x = gen_rtx_COMPARE (CC_NOOVmode, shallow_copy_rtx (x), const0_rtx);
+ XVECEXP (pat, 0, 1) = gen_rtx_SET (flags, x);
+ emit_insn (pat);
+
+ /* This is the plus_[plus_]sltu_flags or minus_[minus_]sltu_flags pattern. */
+ if (op8 == const0_rtx)
+ x = op7;
+ else
+ x = gen_rtx_fmt_ee (code, SImode, op7, op8);
+ x = gen_rtx_fmt_ee (code, SImode, x, gen_rtx_LTU (SImode, flags, const0_rtx));
+ pat = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
+ XVECEXP (pat, 0, 0) = gen_rtx_SET (op6, x);
+ flags = gen_rtx_REG (CCmode, FLAGS_REGNUM);
+ XVECEXP (pat, 0, 1) = gen_rtx_CLOBBER (VOIDmode, flags);
+ emit_insn (pat);
+
+ visium_flags_exposed = true;
+}
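
visium_split_double_add splits a DImode add, subtract or negate into a
low-word operation that sets the flags and a high-word operation that
consumes the carry (the adc.l/subc.l patterns added to visium.md later in
this patch).  A rough C sketch of the arithmetic being performed, not GCC
code (names are illustrative):

    #include <stdint.h>

    static inline uint64_t
    add64_by_halves (uint64_t x, uint64_t y)
    {
      uint32_t lo = (uint32_t) x + (uint32_t) y;
      uint32_t carry = lo < (uint32_t) x;   /* carry out of the low add */
      uint32_t hi = (uint32_t) (x >> 32) + (uint32_t) (y >> 32) + carry;
      return ((uint64_t) hi << 32) | lo;
    }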
+
/* Expand a copysign of OPERANDS in MODE. */
void
visium_expand_copysign (rtx *operands, enum machine_mode mode)
{
- rtx dest = operands[0];
- rtx op0 = operands[1];
- rtx op1 = operands[2];
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
rtx mask = force_reg (SImode, GEN_INT (0x7fffffff));
rtx x;
@@ -2091,37 +2151,37 @@ visium_expand_copysign (rtx *operands, enum machine_mode mode)
the FPU on the MCM have a non-standard behavior wrt NaNs. */
gcc_assert (mode == SFmode);
- /* First get all the non-sign bits of OP0. */
- if (GET_CODE (op0) == CONST_DOUBLE)
+ /* First get all the non-sign bits of op1. */
+ if (GET_CODE (op1) == CONST_DOUBLE)
{
- if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
- op0 = simplify_unary_operation (ABS, mode, op0, mode);
- if (op0 != CONST0_RTX (mode))
+ if (real_isneg (CONST_DOUBLE_REAL_VALUE (op1)))
+ op1 = simplify_unary_operation (ABS, mode, op1, mode);
+ if (op1 != CONST0_RTX (mode))
{
long l;
- REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (op0), l);
- op0 = force_reg (SImode, GEN_INT (trunc_int_for_mode (l, SImode)));
+ REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (op1), l);
+ op1 = force_reg (SImode, gen_int_mode (l, SImode));
}
}
else
{
- op0 = copy_to_mode_reg (SImode, gen_lowpart (SImode, op0));
- op0 = force_reg (SImode, gen_rtx_AND (SImode, op0, mask));
+ op1 = copy_to_mode_reg (SImode, gen_lowpart (SImode, op1));
+ op1 = force_reg (SImode, gen_rtx_AND (SImode, op1, mask));
}
- /* Then get the sign bit of OP1. */
+ /* Then get the sign bit of op2. */
mask = force_reg (SImode, gen_rtx_NOT (SImode, mask));
- op1 = copy_to_mode_reg (SImode, gen_lowpart (SImode, op1));
- op1 = force_reg (SImode, gen_rtx_AND (SImode, op1, mask));
+ op2 = copy_to_mode_reg (SImode, gen_lowpart (SImode, op2));
+ op2 = force_reg (SImode, gen_rtx_AND (SImode, op2, mask));
/* Finally OR the two values. */
- if (op0 == CONST0_RTX (SFmode))
- x = op1;
+ if (op1 == CONST0_RTX (SFmode))
+ x = op2;
else
- x = force_reg (SImode, gen_rtx_IOR (SImode, op0, op1));
+ x = force_reg (SImode, gen_rtx_IOR (SImode, op1, op2));
/* And move the result to the destination. */
- emit_insn (gen_rtx_SET (dest, gen_lowpart (SFmode, x)));
+ emit_insn (gen_rtx_SET (op0, gen_lowpart (SFmode, x)));
}
/* Expand a cstore of OPERANDS in MODE for EQ/NE/LTU/GTU/GEU/LEU. We generate
@@ -3537,18 +3597,15 @@ visium_compute_frame_size (int size)
int
visium_initial_elimination_offset (int from, int to ATTRIBUTE_UNUSED)
{
- const int frame_size = visium_compute_frame_size (get_frame_size ());
const int save_fp = current_frame_info.save_fp;
const int save_lr = current_frame_info.save_lr;
const int lr_slot = current_frame_info.lr_slot;
- const int local_frame_offset
- = (save_fp + save_lr + lr_slot) * UNITS_PER_WORD;
int offset;
if (from == FRAME_POINTER_REGNUM)
- offset = local_frame_offset;
+ offset = (save_fp + save_lr + lr_slot) * UNITS_PER_WORD;
else if (from == ARG_POINTER_REGNUM)
- offset = frame_size;
+ offset = visium_compute_frame_size (get_frame_size ());
else
gcc_unreachable ();
diff --git a/gcc/config/visium/visium.md b/gcc/config/visium/visium.md
index 09d136f5f0..41e3e5c571 100644
--- a/gcc/config/visium/visium.md
+++ b/gcc/config/visium/visium.md
@@ -627,7 +627,7 @@
[(set (match_dup 2) (match_dup 3))
(set (match_dup 4) (match_dup 5))]
{
- split_double_move (operands, DImode);
+ visium_split_double_move (operands, DImode);
})
;;
@@ -726,7 +726,7 @@
[(set (match_dup 2) (match_dup 3))
(set (match_dup 4) (match_dup 5))]
{
- split_double_move (operands, DFmode);
+ visium_split_double_move (operands, DFmode);
})
;;
@@ -815,31 +815,20 @@
(match_operand:DI 2 "add_operand" "")))]
"")
+; Disfavour the use of add.l because of the early clobber.
+
(define_insn_and_split "*addi3_insn"
[(set (match_operand:DI 0 "register_operand" "=r,r,&r")
(plus:DI (match_operand:DI 1 "register_operand" "%0,0, r")
- (match_operand:DI 2 "add_operand" " J,L, r")))]
+ (match_operand:DI 2 "add_operand" " L,J, r")))]
"ok_for_simple_arith_logic_operands (operands, DImode)"
"#"
"reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:DI (match_dup 1) (match_dup 2)))
- (clobber (reg:CC R_FLAGS))])]
- ""
- [(set_attr "type" "arith2")])
-
-; Disfavour the use of add.l because of the early clobber.
-
-(define_insn "*adddi3_insn_flags"
- [(set (match_operand:DI 0 "register_operand" "=r,r,&r")
- (plus:DI (match_operand:DI 1 "register_operand" "%0,0, r")
- (match_operand:DI 2 "add_operand" " J,L, r")))
- (clobber (reg:CC R_FLAGS))]
- "reload_completed"
- "@
- addi %d0,%2\n\tadc.l %0,%0,r0
- subi %d0,%n2\n\tsubc.l %0,%0,r0
- add.l %d0,%d1,%d2\n\tadc.l %0,%1,%2"
+ [(const_int 0)]
+{
+ visium_split_double_add (PLUS, operands[0], operands[1], operands[2]);
+ DONE;
+}
[(set_attr "type" "arith2")])
;;
@@ -847,7 +836,7 @@
;;
;; Integer Add with Carry
;;
-;; Only SI mode is supported as slt[u] for the sake of cstore.
+;; Only SI mode is supported.
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
@@ -869,6 +858,16 @@
"adc.l %0,%1,r0"
[(set_attr "type" "arith")])
+(define_insn "*plus_plus_sltu<subst_arith>"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (plus:SI (plus:SI (match_operand:SI 1 "register_operand" "r")
+ (match_operand:SI 2 "register_operand" "r"))
+ (ltu:SI (reg R_FLAGS) (const_int 0))))
+ (clobber (reg:CC R_FLAGS))]
+ "reload_completed"
+ "adc.l %0,%1,%2"
+ [(set_attr "type" "arith")])
+
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
@@ -955,31 +954,20 @@
(match_operand:DI 2 "add_operand" "")))]
"")
+; Disfavour the use of the sub.l because of the early clobber.
+
(define_insn_and_split "*subdi3_insn"
[(set (match_operand:DI 0 "register_operand" "=r,r,&r")
(minus:DI (match_operand:DI 1 "register_operand" " 0,0, r")
- (match_operand:DI 2 "add_operand" " J,L, r")))]
+ (match_operand:DI 2 "add_operand" " L,J, r")))]
"ok_for_simple_arith_logic_operands (operands, DImode)"
"#"
"reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:DI (match_dup 1) (match_dup 2)))
- (clobber (reg:CC R_FLAGS))])]
- ""
- [(set_attr "type" "arith2")])
-
-; Disfavour the use of the sub.l because of the early clobber.
-
-(define_insn "*subdi3_insn_flags"
- [(set (match_operand:DI 0 "register_operand" "=r,r,&r")
- (minus:DI (match_operand:DI 1 "register_operand" " 0,0, r")
- (match_operand:DI 2 "add_operand" " J,L, r")))
- (clobber (reg:CC R_FLAGS))]
- "reload_completed"
- "@
- subi %d0,%2\n\tsubc.l %0,%0,r0
- addi %d0,%n2\n\tadc.l %0,%0,r0
- sub.l %d0,%d1,%d2\n\tsubc.l %0,%1,%2"
+ [(const_int 0)]
+{
+ visium_split_double_add (MINUS, operands[0], operands[1], operands[2]);
+ DONE;
+}
[(set_attr "type" "arith2")])
;;
@@ -987,7 +975,7 @@
;;
;; Integer Subtract with Carry
;;
-;; Only SI mode is supported as neg<slt[u]> for the sake of cstore.
+;; Only SI mode is supported.
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
@@ -1009,6 +997,16 @@
"subc.l %0,%1,r0"
[(set_attr "type" "arith")])
+(define_insn "*minus_minus_sltu<subst_arith>"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (minus:SI (minus:SI (match_operand:SI 1 "reg_or_0_operand" "rO")
+ (match_operand:SI 2 "register_operand" "r"))
+ (ltu:SI (reg R_FLAGS) (const_int 0))))
+ (clobber (reg:CC R_FLAGS))]
+ "reload_completed"
+ "subc.l %0,%r1,%2"
+ [(set_attr "type" "arith")])
+
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
@@ -1054,17 +1052,11 @@
"ok_for_simple_arith_logic_operands (operands, DImode)"
"#"
"reload_completed"
- [(parallel [(set (match_dup 0) (neg:DI (match_dup 1)))
- (clobber (reg:CC R_FLAGS))])]
- ""
- [(set_attr "type" "arith2")])
-
-(define_insn "*negdi2_insn_flags"
- [(set (match_operand:DI 0 "register_operand" "=&r")
- (neg:DI (match_operand:DI 1 "register_operand" "r")))
- (clobber (reg:CC R_FLAGS))]
- "reload_completed"
- "sub.l %d0,r0,%d1\n\tsubc.l %0,r0,%1"
+ [(const_int 0)]
+{
+ visium_split_double_add (MINUS, operands[0], const0_rtx, operands[1]);
+ DONE;
+}
[(set_attr "type" "arith2")])
;;