Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/aarch64/aarch64-elf-raw.h | 7
-rw-r--r--  gcc/config/aarch64/aarch64-linux.h | 5
-rw-r--r--  gcc/config/aarch64/aarch64.c | 86
-rw-r--r--  gcc/config/aarch64/aarch64.h | 2
-rw-r--r--  gcc/config/aarch64/aarch64.md | 16
-rw-r--r--  gcc/config/aarch64/atomics.md | 44
-rw-r--r--  gcc/config/aarch64/iterators.md | 13
-rw-r--r--  gcc/config/aarch64/predicates.md | 2
-rw-r--r--  gcc/config/alpha/alpha.c | 16
-rw-r--r--  gcc/config/arm/arm-protos.h | 1
-rw-r--r--  gcc/config/arm/arm.c | 171
-rw-r--r--  gcc/config/arm/arm.md | 11
-rw-r--r--  gcc/config/arm/constraints.md | 6
-rw-r--r--  gcc/config/arm/predicates.md | 2
-rw-r--r--  gcc/config/arm/sync.md | 30
-rw-r--r--  gcc/config/arm/vfp.md | 23
-rw-r--r--  gcc/config/avr/avr-dimode.md | 6
-rw-r--r--  gcc/config/avr/avr.c | 6
-rw-r--r--  gcc/config/i386/freebsd.h | 25
-rw-r--r--  gcc/config/i386/i386.c | 278
-rw-r--r--  gcc/config/i386/i386.h | 4
-rw-r--r--  gcc/config/i386/i386.md | 247
-rw-r--r--  gcc/config/i386/intelmic-mkoffload.c | 9
-rw-r--r--  gcc/config/i386/predicates.md | 6
-rw-r--r--  gcc/config/i386/sol2.h | 9
-rw-r--r--  gcc/config/i386/sse.md | 191
-rw-r--r--  gcc/config/i386/sync.md | 10
-rw-r--r--  gcc/config/ia64/ia64.c | 9
-rw-r--r--  gcc/config/ia64/sync.md | 18
-rw-r--r--  gcc/config/mips/mips.c | 2
-rw-r--r--  gcc/config/nios2/nios2.c | 4
-rw-r--r--  gcc/config/pa/pa-linux.h | 1
-rw-r--r--  gcc/config/pa/pa-protos.h | 1
-rw-r--r--  gcc/config/pa/pa.c | 102
-rw-r--r--  gcc/config/pa/pa.md | 224
-rw-r--r--  gcc/config/rs6000/freebsd64.h | 2
-rw-r--r--  gcc/config/rs6000/htm.md | 183
-rw-r--r--  gcc/config/rs6000/predicates.md | 12
-rw-r--r--  gcc/config/rs6000/rs6000-c.c | 6
-rw-r--r--  gcc/config/rs6000/rs6000-cpus.def | 2
-rw-r--r--  gcc/config/rs6000/rs6000.c | 111
-rw-r--r--  gcc/config/rs6000/rs6000.md | 175
-rw-r--r--  gcc/config/rs6000/rs6000.opt | 2
-rw-r--r--  gcc/config/rs6000/sync.md | 15
-rw-r--r--  gcc/config/rs6000/sysv4.h | 6
-rw-r--r--  gcc/config/rs6000/sysv4le.h | 4
-rw-r--r--  gcc/config/rs6000/vector.md | 23
-rw-r--r--  gcc/config/s390/s390-builtins.def | 18
-rw-r--r--  gcc/config/s390/s390.c | 7
-rw-r--r--  gcc/config/s390/s390.md | 50
-rw-r--r--  gcc/config/s390/vx-builtins.md | 34
-rw-r--r--  gcc/config/sh/sh-protos.h | 28
-rw-r--r--  gcc/config/sh/sh.c | 140
-rw-r--r--  gcc/config/sh/sh.md | 171
-rw-r--r--  gcc/config/sol2.h | 65
-rw-r--r--  gcc/config/sparc/driver-sparc.c | 1
-rw-r--r--  gcc/config/sparc/sol2.h | 2
-rw-r--r--  gcc/config/sparc/sp-elf.h | 4
-rw-r--r--  gcc/config/sparc/sparc.c | 17
-rw-r--r--  gcc/config/sparc/sparc.md | 26
-rw-r--r--  gcc/config/sparc/sparc.opt | 4
-rw-r--r--  gcc/config/sparc/sync.md | 6
-rw-r--r--  gcc/config/sparc/t-rtems | 25
63 files changed, 1960 insertions, 766 deletions
diff --git a/gcc/config/aarch64/aarch64-elf-raw.h b/gcc/config/aarch64/aarch64-elf-raw.h
index bd5e51c862..d8c682f433 100644
--- a/gcc/config/aarch64/aarch64-elf-raw.h
+++ b/gcc/config/aarch64/aarch64-elf-raw.h
@@ -44,7 +44,12 @@
#endif
#ifndef LINK_SPEC
-#define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X \
+#define LINK_SPEC "%{h*} \
+ %{static:-Bstatic} \
+ %{shared:-shared} \
+ %{symbolic:-Bsymbolic} \
+ %{!static:%{rdynamic:-export-dynamic}} \
+ %{mbig-endian:-EB} %{mlittle-endian:-EL} -X \
-maarch64elf%{mabi=ilp32*:32}%{mbig-endian:b}" \
CA53_ERR_835769_SPEC \
CA53_ERR_843419_SPEC
diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
index ba7fc3b4d1..257acf05ee 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -35,8 +35,9 @@
%{static:-Bstatic} \
%{shared:-shared} \
%{symbolic:-Bsymbolic} \
- %{rdynamic:-export-dynamic} \
- -dynamic-linker " GNU_USER_DYNAMIC_LINKER " \
+ %{!static: \
+ %{rdynamic:-export-dynamic} \
+ %{!shared:-dynamic-linker " GNU_USER_DYNAMIC_LINKER "}} \
-X \
%{mbig-endian:-EB} %{mlittle-endian:-EL} \
-maarch64linux%{mabi=ilp32:32}%{mbig-endian:b}"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6f86ede742..efa54a3881 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4290,11 +4290,10 @@ aarch64_print_operand (FILE *f, rtx x, char code)
break;
case CONST_DOUBLE:
- /* CONST_DOUBLE can represent a double-width integer.
- In this case, the mode of x is VOIDmode. */
- if (GET_MODE (x) == VOIDmode)
- ; /* Do Nothing. */
- else if (aarch64_float_const_zero_rtx_p (x))
+ /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
+ be getting CONST_DOUBLEs holding integers. */
+ gcc_assert (GET_MODE (x) != VOIDmode);
+ if (aarch64_float_const_zero_rtx_p (x))
{
fputc ('0', f);
break;
@@ -5246,11 +5245,17 @@ aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
if (speed)
{
/* Floating-point FMA/FMUL can also support negations of the
- operands. */
- if (GET_CODE (op0) == NEG)
- op0 = XEXP (op0, 0);
- if (GET_CODE (op1) == NEG)
- op1 = XEXP (op1, 0);
+ operands, unless the rounding mode is upward or downward in
+ which case FNMUL is different from FMUL with operand negation. */
+ bool neg0 = GET_CODE (op0) == NEG;
+ bool neg1 = GET_CODE (op1) == NEG;
+ if (maybe_fma || !flag_rounding_math || (neg0 && neg1))
+ {
+ if (neg0)
+ op0 = XEXP (op0, 0);
+ if (neg1)
+ op1 = XEXP (op1, 0);
+ }
if (maybe_fma)
/* FMADD/FNMADD/FNMSUB/FMSUB. */
@@ -5694,6 +5699,12 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
*cost = rtx_cost (op0, NEG, 0, speed);
return true;
}
+ if (GET_CODE (op0) == MULT)
+ {
+ /* FNMUL. */
+ *cost = rtx_cost (op0, NEG, 0, speed);
+ return true;
+ }
if (speed)
/* FNEG. */
*cost += extra_cost->fp[mode == DFmode].neg;
@@ -8121,7 +8132,7 @@ aarch64_madd_needs_nop (rtx_insn* insn)
if (!aarch64_fix_a53_err835769)
return false;
- if (recog_memoized (insn) < 0)
+ if (!INSN_P (insn) || recog_memoized (insn) < 0)
return false;
attr_type = get_attr_type (insn);
@@ -9020,8 +9031,8 @@ aarch64_expand_compare_and_swap (rtx operands[])
unlikely event of fail being ACQUIRE and succ being RELEASE we need to
promote succ to ACQ_REL so that we don't lose the acquire semantics. */
- if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
- && INTVAL (mod_s) == MEMMODEL_RELEASE)
+ if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
+ && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
mod_s = GEN_INT (MEMMODEL_ACQ_REL);
switch (mode)
@@ -9066,6 +9077,23 @@ aarch64_expand_compare_and_swap (rtx operands[])
emit_insn (gen_rtx_SET (VOIDmode, bval, x));
}
+/* Emit a barrier that is appropriate for memory model MODEL at the end of a
+ sequence implementing an atomic operation. */
+
+static void
+aarch64_emit_post_barrier (enum memmodel model)
+{
+ const enum memmodel base_model = memmodel_base (model);
+
+ if (is_mm_sync (model)
+ && (base_model == MEMMODEL_ACQUIRE
+ || base_model == MEMMODEL_ACQ_REL
+ || base_model == MEMMODEL_SEQ_CST))
+ {
+ emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
+ }
+}
+
/* Split a compare and swap pattern. */
void
@@ -9076,14 +9104,18 @@ aarch64_split_compare_and_swap (rtx operands[])
bool is_weak;
rtx_code_label *label1, *label2;
rtx x, cond;
+ enum memmodel model;
+ rtx model_rtx;
rval = operands[0];
mem = operands[1];
oldval = operands[2];
newval = operands[3];
is_weak = (operands[4] != const0_rtx);
+ model_rtx = operands[5];
scratch = operands[7];
mode = GET_MODE (mem);
+ model = memmodel_from_int (INTVAL (model_rtx));
label1 = NULL;
if (!is_weak)
@@ -9093,7 +9125,13 @@ aarch64_split_compare_and_swap (rtx operands[])
}
label2 = gen_label_rtx ();
- aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
+ /* The initial load can be relaxed for a __sync operation since a final
+ barrier will be emitted to stop code hoisting. */
+ if (is_mm_sync (model))
+ aarch64_emit_load_exclusive (mode, rval, mem,
+ GEN_INT (MEMMODEL_RELAXED));
+ else
+ aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
cond = aarch64_gen_compare_reg (NE, rval, oldval);
x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
@@ -9101,7 +9139,7 @@ aarch64_split_compare_and_swap (rtx operands[])
gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
- aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
+ aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
if (!is_weak)
{
@@ -9118,6 +9156,10 @@ aarch64_split_compare_and_swap (rtx operands[])
}
emit_label (label2);
+
+ /* Emit any final barrier needed for a __sync operation. */
+ if (is_mm_sync (model))
+ aarch64_emit_post_barrier (model);
}
/* Split an atomic operation. */
@@ -9128,6 +9170,8 @@ aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
{
machine_mode mode = GET_MODE (mem);
machine_mode wmode = (mode == DImode ? DImode : SImode);
+ const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
+ const bool is_sync = is_mm_sync (model);
rtx_code_label *label;
rtx x;
@@ -9142,7 +9186,13 @@ aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
old_out = new_out;
value = simplify_gen_subreg (wmode, value, mode, 0);
- aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
+ /* The initial load can be relaxed for a __sync operation since a final
+ barrier will be emitted to stop code hoisting. */
+ if (is_sync)
+ aarch64_emit_load_exclusive (mode, old_out, mem,
+ GEN_INT (MEMMODEL_RELAXED));
+ else
+ aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
switch (code)
{
@@ -9178,6 +9228,10 @@ aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
+
+ /* Emit any final barrier needed for a __sync operation. */
+ if (is_sync)
+ aarch64_emit_post_barrier (model);
}
static void
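The memmodel_from_int / memmodel_base / is_mm_* helpers used throughout this commit come from GCC's shared memory-model handling. A minimal sketch of them, assuming the GCC 6-era encoding in which the __sync_* builtins set a high "sync" flag bit on top of the base C11 model (values here are illustrative, simplified from gcc/coretypes.h):

  /* Sketch, not part of the patch.  */
  #define MEMMODEL_SYNC       (1 << 15)  /* set for __sync_* builtins */
  #define MEMMODEL_BASE_MASK  (MEMMODEL_SYNC - 1)

  static inline enum memmodel
  memmodel_from_int (unsigned HOST_WIDE_INT val)
  {
    return (enum memmodel) (val & ((1 << 16) - 1));
  }

  static inline enum memmodel
  memmodel_base (enum memmodel model)
  {
    return (enum memmodel) (model & MEMMODEL_BASE_MASK);
  }

  static inline bool
  is_mm_sync (enum memmodel model)
  {
    return (model & MEMMODEL_SYNC) != 0;
  }

This is what lets aarch64_split_compare_and_swap relax the initial load for a __sync operation: the trailing full barrier from aarch64_emit_post_barrier restores the ordering the relaxed load gives up.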
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index bf59e40a64..b49e4561cc 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -151,6 +151,8 @@
#define LONG_DOUBLE_TYPE_SIZE 128
+#define TARGET_SUPPORTS_WIDE_INT 1
+
/* The architecture reserves all bits of the address for hardware use,
so the vbit must go into the delta field of pointers to member
functions. This is the same config as that in the AArch32
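For context, an assumption drawn from GCC's RTL constant representation rather than from this commit: defining TARGET_SUPPORTS_WIDE_INT makes the middle end represent integer constants wider than a HOST_WIDE_INT as CONST_WIDE_INT instead of VOIDmode CONST_DOUBLE, which is what licenses the new gcc_assert in aarch64_print_operand and the tightened const0_operand predicate later in this commit. A hypothetical helper stating the resulting invariant:

  /* Sketch: with TARGET_SUPPORTS_WIDE_INT, every surviving
     CONST_DOUBLE carries a floating-point mode.  */
  static bool
  aarch64_const_double_is_fp (const_rtx x)  /* hypothetical name */
  {
    return CONST_DOUBLE_P (x) && GET_MODE (x) != VOIDmode;
  }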
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 534a862b4a..4665279446 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -382,7 +382,7 @@
)
(define_insn "prefetch"
- [(prefetch (match_operand:DI 0 "address_operand" "r")
+ [(prefetch (match_operand:DI 0 "register_operand" "r")
(match_operand:QI 1 "const_int_operand" "")
(match_operand:QI 2 "const_int_operand" ""))]
""
@@ -872,7 +872,7 @@
fmov\\t%w0, %s1
fmov\\t%s0, %s1"
"CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
- && GP_REGNUM_P (REGNO (operands[0]))"
+ && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
[(const_int 0)]
"{
aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -905,7 +905,7 @@
fmov\\t%d0, %d1
movi\\t%d0, %1"
"(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
- && GP_REGNUM_P (REGNO (operands[0]))"
+ && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
[(const_int 0)]
"{
aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -3971,6 +3971,16 @@
(mult:GPF
(neg:GPF (match_operand:GPF 1 "register_operand" "w"))
(match_operand:GPF 2 "register_operand" "w")))]
+ "TARGET_FLOAT && !flag_rounding_math"
+ "fnmul\\t%<s>0, %<s>1, %<s>2"
+ [(set_attr "type" "fmul<s>")]
+)
+
+(define_insn "*fnmul<mode>3"
+ [(set (match_operand:GPF 0 "register_operand" "=w")
+ (neg:GPF (mult:GPF
+ (match_operand:GPF 1 "register_operand" "w")
+ (match_operand:GPF 2 "register_operand" "w"))))]
"TARGET_FLOAT"
"fnmul\\t%<s>0, %<s>1, %<s>2"
[(set_attr "type" "fmul<s>")]
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 939a11e011..6e6be99a30 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -119,7 +119,7 @@
[(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
(unspec_volatile:ALLI
[(atomic_op:ALLI (match_dup 0)
- (match_operand:ALLI 1 "<atomic_op_operand>" "r<lconst_atomic>"))
+ (match_operand:ALLI 1 "<atomic_op_operand>" "r<const_atomic>"))
(match_operand:SI 2 "const_int_operand")] ;; model
UNSPECV_ATOMIC_OP))
(clobber (reg:CC CC_REGNUM))
@@ -164,7 +164,7 @@
(set (match_dup 1)
(unspec_volatile:ALLI
[(atomic_op:ALLI (match_dup 1)
- (match_operand:ALLI 2 "<atomic_op_operand>" "r<lconst_atomic>"))
+ (match_operand:ALLI 2 "<atomic_op_operand>" "r<const_atomic>"))
(match_operand:SI 3 "const_int_operand")] ;; model
UNSPECV_ATOMIC_OP))
(clobber (reg:CC CC_REGNUM))
@@ -209,7 +209,7 @@
[(set (match_operand:ALLI 0 "register_operand" "=&r")
(atomic_op:ALLI
(match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")
- (match_operand:ALLI 2 "<atomic_op_operand>" "r<lconst_atomic>")))
+ (match_operand:ALLI 2 "<atomic_op_operand>" "r<const_atomic>")))
(set (match_dup 1)
(unspec_volatile:ALLI
[(match_dup 1) (match_dup 2)
@@ -260,10 +260,8 @@
UNSPECV_LDA))]
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
- if (model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_RELEASE)
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
return "ldr<atomic_sfx>\t%<w>0, %1";
else
return "ldar<atomic_sfx>\t%<w>0, %1";
@@ -278,10 +276,8 @@
UNSPECV_STL))]
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
- if (model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_ACQUIRE)
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
return "str<atomic_sfx>\t%<w>1, %0";
else
return "stlr<atomic_sfx>\t%<w>1, %0";
@@ -297,10 +293,8 @@
UNSPECV_LX)))]
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
- if (model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_RELEASE)
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
return "ldxr<atomic_sfx>\t%w0, %1";
else
return "ldaxr<atomic_sfx>\t%w0, %1";
@@ -315,10 +309,8 @@
UNSPECV_LX))]
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
- if (model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_RELEASE)
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
return "ldxr\t%<w>0, %1";
else
return "ldaxr\t%<w>0, %1";
@@ -335,10 +327,8 @@
UNSPECV_SX))]
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[3]);
- if (model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_ACQUIRE)
+ enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
return "stxr<atomic_sfx>\t%w0, %<w>2, %1";
else
return "stlxr<atomic_sfx>\t%w0, %<w>2, %1";
@@ -349,8 +339,8 @@
[(match_operand:SI 0 "const_int_operand" "")]
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[0]);
- if (model != MEMMODEL_RELAXED && model != MEMMODEL_CONSUME)
+ enum memmodel model = memmodel_from_int (INTVAL (operands[0]));
+ if (!(is_mm_relaxed (model) || is_mm_consume (model)))
emit_insn (gen_dmb (operands[0]));
DONE;
}
@@ -373,8 +363,8 @@
UNSPEC_MB))]
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[1]);
- if (model == MEMMODEL_ACQUIRE)
+ enum memmodel model = memmodel_from_int (INTVAL (operands[1]));
+ if (is_mm_acquire (model))
return "dmb\\tishld";
else
return "dmb\\tish";
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 498358a635..7c8bbfa894 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -342,9 +342,6 @@
;; Attribute to describe constants acceptable in logical operations
(define_mode_attr lconst [(SI "K") (DI "L")])
-;; Attribute to describe constants acceptable in atomic logical operations
-(define_mode_attr lconst_atomic [(QI "K") (HI "K") (SI "K") (DI "L")])
-
;; Map a mode to a specific constraint character.
(define_mode_attr cmode [(QI "q") (HI "h") (SI "s") (DI "d")])
@@ -845,6 +842,16 @@
(plus "aarch64_plus_operand")
(minus "aarch64_plus_operand")])
+;; Constants acceptable for atomic operations.
+;; This definition must appear in this file before the iterators it refers to.
+(define_code_attr const_atomic
+ [(plus "IJ") (minus "IJ")
+ (xor "<lconst_atomic>") (ior "<lconst_atomic>")
+ (and "<lconst_atomic>")])
+
+;; Attribute to describe constants acceptable in atomic logical operations
+(define_mode_attr lconst_atomic [(QI "K") (HI "K") (SI "K") (DI "L")])
+
;; -------------------------------------------------------------------
;; Int Iterators.
;; -------------------------------------------------------------------
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 3979209722..8e07a17ce1 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -32,7 +32,7 @@
;; Return true if OP a (const_int 0) operand.
(define_predicate "const0_operand"
- (and (match_code "const_int, const_double")
+ (and (match_code "const_int")
(match_test "op == CONST0_RTX (mode)")))
(define_predicate "aarch64_ccmp_immediate"
diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c
index 864a8fc4cb..731a966cdf 100644
--- a/gcc/config/alpha/alpha.c
+++ b/gcc/config/alpha/alpha.c
@@ -4548,8 +4548,8 @@ alpha_split_compare_and_swap (rtx operands[])
oldval = operands[3];
newval = operands[4];
is_weak = (operands[5] != const0_rtx);
- mod_s = (enum memmodel) INTVAL (operands[6]);
- mod_f = (enum memmodel) INTVAL (operands[7]);
+ mod_s = memmodel_from_int (INTVAL (operands[6]));
+ mod_f = memmodel_from_int (INTVAL (operands[7]));
mode = GET_MODE (mem);
alpha_pre_atomic_barrier (mod_s);
@@ -4587,12 +4587,12 @@ alpha_split_compare_and_swap (rtx operands[])
emit_unlikely_jump (x, label1);
}
- if (mod_f != MEMMODEL_RELAXED)
+ if (!is_mm_relaxed (mod_f))
emit_label (XEXP (label2, 0));
alpha_post_atomic_barrier (mod_s);
- if (mod_f == MEMMODEL_RELAXED)
+ if (is_mm_relaxed (mod_f))
emit_label (XEXP (label2, 0));
}
@@ -4653,8 +4653,8 @@ alpha_split_compare_and_swap_12 (rtx operands[])
newval = operands[4];
align = operands[5];
is_weak = (operands[6] != const0_rtx);
- mod_s = (enum memmodel) INTVAL (operands[7]);
- mod_f = (enum memmodel) INTVAL (operands[8]);
+ mod_s = memmodel_from_int (INTVAL (operands[7]));
+ mod_f = memmodel_from_int (INTVAL (operands[8]));
scratch = operands[9];
mode = GET_MODE (orig_mem);
addr = XEXP (orig_mem, 0);
@@ -4706,12 +4706,12 @@ alpha_split_compare_and_swap_12 (rtx operands[])
emit_unlikely_jump (x, label1);
}
- if (mod_f != MEMMODEL_RELAXED)
+ if (!is_mm_relaxed (mod_f))
emit_label (XEXP (label2, 0));
alpha_post_atomic_barrier (mod_s);
- if (mod_f == MEMMODEL_RELAXED)
+ if (is_mm_relaxed (mod_f))
emit_label (XEXP (label2, 0));
}
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 16eb854605..ebaf746227 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -312,6 +312,7 @@ extern int vfp3_const_double_for_bits (rtx);
extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
rtx);
+extern bool arm_valid_symbolic_address_p (rtx);
extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
#endif /* RTX_CODE */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index fb14891613..bfa58c3d92 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -10269,7 +10269,7 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
*cost = COSTS_N_INSNS (1);
- if (GET_CODE (op0) == NEG)
+ if (GET_CODE (op0) == NEG && !flag_rounding_math)
op0 = XEXP (op0, 0);
if (speed_p)
@@ -10345,6 +10345,13 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
+ if (GET_CODE (XEXP (x, 0)) == MULT)
+ {
+ /* VNMUL. */
+ *cost = rtx_cost (XEXP (x, 0), NEG, 0, speed_p);
+ return true;
+ }
+
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].neg;
@@ -27537,25 +27544,36 @@ vfp3_const_double_for_fract_bits (rtx operand)
return 0;
}
+/* If X is a CONST_DOUBLE with a value that is a power of 2 whose
+ log2 is in [1, 32], return that log2. Otherwise return -1.
+ This is used in the patterns for vcvt.s32.f32 floating-point to
+ fixed-point conversions. */
+
int
-vfp3_const_double_for_bits (rtx operand)
+vfp3_const_double_for_bits (rtx x)
{
- REAL_VALUE_TYPE r0;
+ if (!CONST_DOUBLE_P (x))
+ return -1;
- if (!CONST_DOUBLE_P (operand))
- return 0;
+ REAL_VALUE_TYPE r;
- REAL_VALUE_FROM_CONST_DOUBLE (r0, operand);
- if (exact_real_truncate (DFmode, &r0))
- {
- HOST_WIDE_INT value = real_to_integer (&r0);
- value = value & 0xffffffff;
- if ((value != 0) && ( (value & (value - 1)) == 0))
- return int_log2 (value);
- }
+ REAL_VALUE_FROM_CONST_DOUBLE (r, x);
+ if (REAL_VALUE_NEGATIVE (r)
+ || REAL_VALUE_ISNAN (r)
+ || REAL_VALUE_ISINF (r)
+ || !real_isinteger (&r, SFmode))
+ return -1;
- return 0;
+ HOST_WIDE_INT hwint = exact_log2 (real_to_integer (&r));
+
+ /* The exact_log2 above will have returned -1 if this is
+ not an exact log2. */
+ if (!IN_RANGE (hwint, 1, 32))
+ return -1;
+
+ return hwint;
}
+
/* Emit a memory barrier around an atomic sequence according to MODEL. */
@@ -27678,8 +27696,8 @@ arm_expand_compare_and_swap (rtx operands[])
promote succ to ACQ_REL so that we don't lose the acquire semantics. */
if (TARGET_HAVE_LDACQ
- && INTVAL (mod_f) == MEMMODEL_ACQUIRE
- && INTVAL (mod_s) == MEMMODEL_RELEASE)
+ && is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
+ && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
mod_s = GEN_INT (MEMMODEL_ACQ_REL);
switch (mode)
@@ -27752,20 +27770,25 @@ arm_split_compare_and_swap (rtx operands[])
oldval = operands[2];
newval = operands[3];
is_weak = (operands[4] != const0_rtx);
- mod_s = (enum memmodel) INTVAL (operands[5]);
- mod_f = (enum memmodel) INTVAL (operands[6]);
+ mod_s = memmodel_from_int (INTVAL (operands[5]));
+ mod_f = memmodel_from_int (INTVAL (operands[6]));
scratch = operands[7];
mode = GET_MODE (mem);
- bool use_acquire = TARGET_HAVE_LDACQ
- && !(mod_s == MEMMODEL_RELAXED
- || mod_s == MEMMODEL_CONSUME
- || mod_s == MEMMODEL_RELEASE);
+ bool is_armv8_sync = arm_arch8 && is_mm_sync (mod_s);
+ bool use_acquire = TARGET_HAVE_LDACQ
+ && !(is_mm_relaxed (mod_s) || is_mm_consume (mod_s)
+ || is_mm_release (mod_s));
+
bool use_release = TARGET_HAVE_LDACQ
- && !(mod_s == MEMMODEL_RELAXED
- || mod_s == MEMMODEL_CONSUME
- || mod_s == MEMMODEL_ACQUIRE);
+ && !(is_mm_relaxed (mod_s) || is_mm_consume (mod_s)
+ || is_mm_acquire (mod_s));
+
+ /* For ARMv8, the load-acquire is too weak for __sync memory orders. Instead,
+ a full barrier is emitted after the store-release. */
+ if (is_armv8_sync)
+ use_acquire = false;
/* Checks whether a barrier is needed and emits one accordingly. */
if (!(use_acquire || use_release))
@@ -27803,14 +27826,15 @@ arm_split_compare_and_swap (rtx operands[])
emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
}
- if (mod_f != MEMMODEL_RELAXED)
+ if (!is_mm_relaxed (mod_f))
emit_label (label2);
/* Checks whether a barrier is needed and emits one accordingly. */
- if (!(use_acquire || use_release))
+ if (is_armv8_sync
+ || !(use_acquire || use_release))
arm_post_atomic_barrier (mod_s);
- if (mod_f == MEMMODEL_RELAXED)
+ if (is_mm_relaxed (mod_f))
emit_label (label2);
}
@@ -27818,21 +27842,26 @@ void
arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
rtx value, rtx model_rtx, rtx cond)
{
- enum memmodel model = (enum memmodel) INTVAL (model_rtx);
+ enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
machine_mode mode = GET_MODE (mem);
machine_mode wmode = (mode == DImode ? DImode : SImode);
rtx_code_label *label;
rtx x;
+ bool is_armv8_sync = arm_arch8 && is_mm_sync (model);
+
bool use_acquire = TARGET_HAVE_LDACQ
- && !(model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_RELEASE);
+ && !(is_mm_relaxed (model) || is_mm_consume (model)
+ || is_mm_release (model));
bool use_release = TARGET_HAVE_LDACQ
- && !(model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_ACQUIRE);
+ && !(is_mm_relaxed (model) || is_mm_consume (model)
+ || is_mm_acquire (model));
+
+ /* For ARMv8, a load-acquire is too weak for __sync memory orders. Instead,
+ a full barrier is emitted after the store-release. */
+ if (is_armv8_sync)
+ use_acquire = false;
/* Checks whether a barrier is needed and emits one accordingly. */
if (!(use_acquire || use_release))
@@ -27904,7 +27933,8 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
emit_unlikely_jump (gen_cbranchsi4 (x, cond, const0_rtx, label));
/* Checks whether a barrier is needed and emits one accordingly. */
- if (!(use_acquire || use_release))
+ if (is_armv8_sync
+ || !(use_acquire || use_release))
arm_post_atomic_barrier (model);
}
@@ -28792,6 +28822,38 @@ arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
#undef BRANCH
}
+/* Returns true if the pattern is a valid symbolic address, which is either a
+ symbol_ref or (symbol_ref + addend).
+
+ According to the ARM ELF ABI, the initial addend of REL-type relocations
+ processing MOVW and MOVT instructions is formed by interpreting the 16-bit
+ literal field of the instruction as a 16-bit signed value in the range
+ -32768 <= A < 32768. */
+
+bool
+arm_valid_symbolic_address_p (rtx addr)
+{
+ rtx xop0, xop1 = NULL_RTX;
+ rtx tmp = addr;
+
+ if (GET_CODE (tmp) == SYMBOL_REF || GET_CODE (tmp) == LABEL_REF)
+ return true;
+
+ /* (const (plus: symbol_ref const_int)) */
+ if (GET_CODE (addr) == CONST)
+ tmp = XEXP (addr, 0);
+
+ if (GET_CODE (tmp) == PLUS)
+ {
+ xop0 = XEXP (tmp, 0);
+ xop1 = XEXP (tmp, 1);
+
+ if (GET_CODE (xop0) == SYMBOL_REF && CONST_INT_P (xop1))
+ return IN_RANGE (INTVAL (xop1), -0x8000, 0x7fff);
+ }
+
+ return false;
+}
/* Returns true if this is a valid comparison operation, and puts
the operands into a valid form. */
@@ -28942,7 +29004,7 @@ arm_block_set_unaligned_vect (rtx dstbase,
rtx (*gen_func) (rtx, rtx);
machine_mode mode;
unsigned HOST_WIDE_INT v = value;
-
+ unsigned int offset = 0;
gcc_assert ((align & 0x3) != 0);
nelt_v8 = GET_MODE_NUNITS (V8QImode);
nelt_v16 = GET_MODE_NUNITS (V16QImode);
@@ -28963,7 +29025,7 @@ arm_block_set_unaligned_vect (rtx dstbase,
return false;
dst = copy_addr_to_reg (XEXP (dstbase, 0));
- mem = adjust_automodify_address (dstbase, mode, dst, 0);
+ mem = adjust_automodify_address (dstbase, mode, dst, offset);
v = sext_hwi (v, BITS_PER_WORD);
val_elt = GEN_INT (v);
@@ -28980,7 +29042,11 @@ arm_block_set_unaligned_vect (rtx dstbase,
{
emit_insn ((*gen_func) (mem, reg));
if (i + 2 * nelt_mode <= length)
- emit_insn (gen_add2_insn (dst, GEN_INT (nelt_mode)));
+ {
+ emit_insn (gen_add2_insn (dst, GEN_INT (nelt_mode)));
+ offset += nelt_mode;
+ mem = adjust_automodify_address (dstbase, mode, dst, offset);
+ }
}
/* If there are not less than nelt_v8 bytes leftover, we must be in
@@ -28991,6 +29057,9 @@ arm_block_set_unaligned_vect (rtx dstbase,
if (i + nelt_v8 < length)
{
emit_insn (gen_add2_insn (dst, GEN_INT (length - i)));
+ offset += length - i;
+ mem = adjust_automodify_address (dstbase, mode, dst, offset);
+
/* We are shifting bytes back, set the alignment accordingly. */
if ((length & 1) != 0 && align >= 2)
set_mem_align (mem, BITS_PER_UNIT);
@@ -29001,12 +29070,13 @@ arm_block_set_unaligned_vect (rtx dstbase,
else if (i < length && i + nelt_v8 >= length)
{
if (mode == V16QImode)
- {
- reg = gen_lowpart (V8QImode, reg);
- mem = adjust_automodify_address (dstbase, V8QImode, dst, 0);
- }
+ reg = gen_lowpart (V8QImode, reg);
+
emit_insn (gen_add2_insn (dst, GEN_INT ((length - i)
+ (nelt_mode - nelt_v8))));
+ offset += (length - i) + (nelt_mode - nelt_v8);
+ mem = adjust_automodify_address (dstbase, V8QImode, dst, offset);
+
/* We are shifting bytes back, set the alignment accordingly. */
if ((length & 1) != 0 && align >= 2)
set_mem_align (mem, BITS_PER_UNIT);
@@ -29033,6 +29103,7 @@ arm_block_set_aligned_vect (rtx dstbase,
rtx rval[MAX_VECT_LEN];
machine_mode mode;
unsigned HOST_WIDE_INT v = value;
+ unsigned int offset = 0;
gcc_assert ((align & 0x3) == 0);
nelt_v8 = GET_MODE_NUNITS (V8QImode);
@@ -29064,14 +29135,15 @@ arm_block_set_aligned_vect (rtx dstbase,
/* Handle first 16 bytes specially using vst1:v16qi instruction. */
if (mode == V16QImode)
{
- mem = adjust_automodify_address (dstbase, mode, dst, 0);
+ mem = adjust_automodify_address (dstbase, mode, dst, offset);
emit_insn (gen_movmisalignv16qi (mem, reg));
i += nelt_mode;
/* Handle (8, 16) bytes leftover using vst1:v16qi again. */
if (i + nelt_v8 < length && i + nelt_v16 > length)
{
emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode)));
- mem = adjust_automodify_address (dstbase, mode, dst, 0);
+ offset += length - nelt_mode;
+ mem = adjust_automodify_address (dstbase, mode, dst, offset);
/* We are shifting bytes back, set the alignment accordingly. */
if ((length & 0x3) == 0)
set_mem_align (mem, BITS_PER_UNIT * 4);
@@ -29093,7 +29165,7 @@ arm_block_set_aligned_vect (rtx dstbase,
for (; (i + nelt_mode <= length); i += nelt_mode)
{
addr = plus_constant (Pmode, dst, i);
- mem = adjust_automodify_address (dstbase, mode, addr, i);
+ mem = adjust_automodify_address (dstbase, mode, addr, offset + i);
emit_move_insn (mem, reg);
}
@@ -29102,8 +29174,8 @@ arm_block_set_aligned_vect (rtx dstbase,
if (i + UNITS_PER_WORD == length)
{
addr = plus_constant (Pmode, dst, i - UNITS_PER_WORD);
- mem = adjust_automodify_address (dstbase, mode,
- addr, i - UNITS_PER_WORD);
+ offset += i - UNITS_PER_WORD;
+ mem = adjust_automodify_address (dstbase, mode, addr, offset);
/* We are shifting 4 bytes back, set the alignment accordingly. */
if (align > UNITS_PER_WORD)
set_mem_align (mem, BITS_PER_UNIT * UNITS_PER_WORD);
@@ -29115,7 +29187,8 @@ arm_block_set_aligned_vect (rtx dstbase,
else if (i < length)
{
emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode)));
- mem = adjust_automodify_address (dstbase, mode, dst, 0);
+ offset += length - nelt_mode;
+ mem = adjust_automodify_address (dstbase, mode, dst, offset);
/* We are shifting bytes back, set the alignment accordingly. */
if ((length & 1) == 0)
set_mem_align (mem, BITS_PER_UNIT * 2);
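A standalone sketch of the addend check that arm_valid_symbolic_address_p introduces above, assuming the REL-relocation round trip described in its comment (helper name is hypothetical):

  #include <stdbool.h>
  #include <stdint.h>

  /* A movw/movt REL relocation re-reads its addend from the 16-bit
     literal field as a signed value, so only this range survives
     the encode/decode round trip.  */
  static bool
  movw_movt_addend_ok (int64_t addend)
  {
    return addend >= -0x8000 && addend <= 0x7fff;
  }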
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index f63fc39e63..f4c9dea9e5 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -5415,7 +5415,7 @@
if (!REG_P (operands[0]))
operands[1] = force_reg (DImode, operands[1]);
}
- if (REG_P (operands[0]) && REGNO (operands[0]) < FIRST_VIRTUAL_REGISTER
+ if (REG_P (operands[0]) && REGNO (operands[0]) <= LAST_ARM_REGNUM
&& !HARD_REGNO_MODE_OK (REGNO (operands[0]), DImode))
{
/* Avoid LDRD's into an odd-numbered register pair in ARM state
@@ -5434,7 +5434,7 @@
gen_highpart (SImode, operands[1]));
DONE;
}
- else if (REG_P (operands[1]) && REGNO (operands[1]) < FIRST_VIRTUAL_REGISTER
+ else if (REG_P (operands[1]) && REGNO (operands[1]) <= LAST_ARM_REGNUM
&& !HARD_REGNO_MODE_OK (REGNO (operands[1]), DImode))
{
/* Avoid STRD's from an odd-numbered register pair in ARM state
@@ -5662,7 +5662,7 @@
[(set (match_operand:SI 0 "nonimmediate_operand" "=r")
(lo_sum:SI (match_operand:SI 1 "nonimmediate_operand" "0")
(match_operand:SI 2 "general_operand" "i")))]
- "arm_arch_thumb2"
+ "arm_arch_thumb2 && arm_valid_symbolic_address_p (operands[2])"
"movt%?\t%0, #:upper16:%c2"
[(set_attr "predicable" "yes")
(set_attr "predicable_short_it" "no")
@@ -6508,7 +6508,7 @@
(define_insn "*arm32_movhf"
[(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,r,r")
(match_operand:HF 1 "general_operand" " m,r,r,F"))]
- "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_FP16) && !arm_restrict_it
+ "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_FP16)
&& ( s_register_operand (operands[0], HFmode)
|| s_register_operand (operands[1], HFmode))"
"*
@@ -6546,7 +6546,8 @@
[(set_attr "conds" "unconditional")
(set_attr "type" "load1,store1,mov_reg,multiple")
(set_attr "length" "4,4,4,8")
- (set_attr "predicable" "yes")]
+ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")]
)
(define_expand "movsf"
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 42935a4ca6..d7d08266a4 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -67,7 +67,8 @@
(define_constraint "j"
"A constant suitable for a MOVW instruction. (ARM/Thumb-2)"
(and (match_test "TARGET_32BIT && arm_arch_thumb2")
- (ior (match_code "high")
+ (ior (and (match_code "high")
+ (match_test "arm_valid_symbolic_address_p (XEXP (op, 0))"))
(and (match_code "const_int")
(match_test "(ival & 0xffff0000) == 0")))))
@@ -338,7 +339,8 @@
"@internal
In ARM/ Thumb2 a const_double which can be used with a vcvt.s32.f32 with bits operation"
(and (match_code "const_double")
- (match_test "TARGET_32BIT && TARGET_VFP && vfp3_const_double_for_bits (op)")))
+ (match_test "TARGET_32BIT && TARGET_VFP
+ && vfp3_const_double_for_bits (op) > 0")))
(define_register_constraint "Ts" "(arm_restrict_it) ? LO_REGS : GENERAL_REGS"
"For arm_restrict_it the core registers @code{r0}-@code{r7}. GENERAL_REGS otherwise.")
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 08cc89924d..48e4ba86e7 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -668,7 +668,7 @@
(define_predicate "const_double_vcvt_power_of_two"
(and (match_code "const_double")
(match_test "TARGET_32BIT && TARGET_VFP
- && vfp3_const_double_for_bits (op)")))
+ && vfp3_const_double_for_bits (op) > 0")))
(define_predicate "neon_struct_operand"
(and (match_code "mem")
diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
index 78bdafc705..75dd52ea3a 100644
--- a/gcc/config/arm/sync.md
+++ b/gcc/config/arm/sync.md
@@ -73,15 +73,14 @@
VUNSPEC_LDA))]
"TARGET_HAVE_LDACQ"
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
- if (model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_RELEASE)
- return \"ldr<sync_sfx>\\t%0, %1\";
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
+ return \"ldr%(<sync_sfx>%)\\t%0, %1\";
else
- return \"lda<sync_sfx>\\t%0, %1\";
+ return \"lda<sync_sfx>%?\\t%0, %1\";
}
-)
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
(define_insn "atomic_store<mode>"
[(set (match_operand:QHSI 0 "memory_operand" "=Q")
@@ -91,15 +90,14 @@
VUNSPEC_STL))]
"TARGET_HAVE_LDACQ"
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
- if (model == MEMMODEL_RELAXED
- || model == MEMMODEL_CONSUME
- || model == MEMMODEL_ACQUIRE)
- return \"str<sync_sfx>\t%1, %0\";
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
+ return \"str%(<sync_sfx>%)\t%1, %0\";
else
- return \"stl<sync_sfx>\t%1, %0\";
+ return \"stl<sync_sfx>%?\t%1, %0\";
}
-)
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
;; Note that ldrd and vldr are *not* guaranteed to be single-copy atomic,
;; even for a 64-bit aligned address. Instead we use a ldrexd unpaired
@@ -110,10 +108,10 @@
(match_operand:SI 2 "const_int_operand")] ;; model
"TARGET_HAVE_LDREXD && ARM_DOUBLEWORD_ALIGN"
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
expand_mem_thread_fence (model);
emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1]));
- if (model == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (model))
expand_mem_thread_fence (model);
DONE;
})
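The atomic_loaddi change likewise only rephrases the model tests; the expansion strategy is unchanged: a leading fence, a single-copy-atomic ldrexd, and a trailing fence only for seq_cst. Illustrative usage (a sketch using the standard builtin):

  long long
  load64 (long long *p)
  {
    /* Expands as: fence; ldrexd; fence (the second fence appears
       only for __ATOMIC_SEQ_CST).  */
    return __atomic_load_n (p, __ATOMIC_SEQ_CST);
  }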
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index ea136e82f3..c297ed927b 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -770,6 +770,17 @@
[(set (match_operand:SF 0 "s_register_operand" "=t")
(mult:SF (neg:SF (match_operand:SF 1 "s_register_operand" "t"))
(match_operand:SF 2 "s_register_operand" "t")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && !flag_rounding_math"
+ "vnmul%?.f32\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+ (set_attr "type" "fmuls")]
+)
+
+(define_insn "*negmulsf3_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=t")
+ (neg:SF (mult:SF (match_operand:SF 1 "s_register_operand" "t")
+ (match_operand:SF 2 "s_register_operand" "t"))))]
"TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
"vnmul%?.f32\\t%0, %1, %2"
[(set_attr "predicable" "yes")
@@ -781,6 +792,18 @@
[(set (match_operand:DF 0 "s_register_operand" "=w")
(mult:DF (neg:DF (match_operand:DF 1 "s_register_operand" "w"))
(match_operand:DF 2 "s_register_operand" "w")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE
+ && !flag_rounding_math"
+ "vnmul%?.f64\\t%P0, %P1, %P2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+ (set_attr "type" "fmuld")]
+)
+
+(define_insn "*negmuldf3_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (neg:DF (mult:DF (match_operand:DF 1 "s_register_operand" "w")
+ (match_operand:DF 2 "s_register_operand" "w"))))]
"TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE"
"vnmul%?.f64\\t%P0, %P1, %P2"
[(set_attr "predicable" "yes")
diff --git a/gcc/config/avr/avr-dimode.md b/gcc/config/avr/avr-dimode.md
index 1243360eed..a1cd6d91c6 100644
--- a/gcc/config/avr/avr-dimode.md
+++ b/gcc/config/avr/avr-dimode.md
@@ -461,7 +461,8 @@
(match_operand:SI 2 "general_operand" "")
;; Just to mention the iterator
(clobber (any_extend:SI (match_dup 1)))])]
- "avr_have_dimode"
+ "avr_have_dimode
+ && AVR_HAVE_MUL"
{
avr_fix_inputs (operands, 1 << 2, regmask (SImode, 22));
emit_move_insn (gen_rtx_REG (SImode, 22), operands[1]);
@@ -480,7 +481,8 @@
(any_extend:DI (reg:SI 22))))
(clobber (reg:HI REG_X))
(clobber (reg:HI REG_Z))]
- "avr_have_dimode"
+ "avr_have_dimode
+ && AVR_HAVE_MUL"
"%~call __<extend_u>mulsidi3"
[(set_attr "adjust_len" "call")
(set_attr "cc" "clobber")])
diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c
index 68d5ddc9ee..75cdd3b30c 100644
--- a/gcc/config/avr/avr.c
+++ b/gcc/config/avr/avr.c
@@ -9272,10 +9272,10 @@ avr_pgm_check_var_decl (tree node)
{
if (TYPE_P (node))
error ("%qT uses address space %qs beyond flash of %d KiB",
- node, avr_addrspace[as].name, avr_n_flash);
+ node, avr_addrspace[as].name, 64 * avr_n_flash);
else
error ("%s %q+D uses address space %qs beyond flash of %d KiB",
- reason, node, avr_addrspace[as].name, avr_n_flash);
+ reason, node, avr_addrspace[as].name, 64 * avr_n_flash);
}
else
{
@@ -9322,7 +9322,7 @@ avr_insert_attributes (tree node, tree *attributes)
if (avr_addrspace[as].segment >= avr_n_flash)
{
error ("variable %q+D located in address space %qs beyond flash "
- "of %d KiB", node, avr_addrspace[as].name, avr_n_flash);
+ "of %d KiB", node, avr_addrspace[as].name, 64 * avr_n_flash);
}
else if (!AVR_HAVE_LPM && avr_addrspace[as].pointer_size > 2)
{
diff --git a/gcc/config/i386/freebsd.h b/gcc/config/i386/freebsd.h
index 6ce160ed40..78d5e19857 100644
--- a/gcc/config/i386/freebsd.h
+++ b/gcc/config/i386/freebsd.h
@@ -98,11 +98,21 @@ along with GCC; see the file COPYING3. If not see
#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
#undef ASM_OUTPUT_MAX_SKIP_ALIGN
-#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE, LOG, MAX_SKIP) \
- if ((LOG) != 0) { \
- if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG)); \
- else fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \
- }
+#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \
+ do { \
+ if ((LOG) != 0) { \
+ if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG)); \
+ else { \
+ fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \
+ /* Make sure that we have at least 8 byte alignment if > 8 byte \
+ alignment is preferred. */ \
+ if ((LOG) > 3 \
+ && (1 << (LOG)) > ((MAX_SKIP) + 1) \
+ && (MAX_SKIP) >= 7) \
+ fputs ("\t.p2align 3\n", (FILE)); \
+ } \
+ } \
+ } while (0)
#endif
/* Don't default to pcc-struct-return, we want to retain compatibility with
@@ -121,10 +131,7 @@ along with GCC; see the file COPYING3. If not see
/* Static stack checking is supported by means of probes. */
#define STACK_CHECK_STATIC_BUILTIN 1
-/* Support for i386 has been removed from FreeBSD 6.0 onward. */
-#if FBSD_MAJOR >= 6
-#define SUBTARGET32_DEFAULT_CPU "i486"
-#endif
+#define SUBTARGET32_DEFAULT_CPU "i586"
#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
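The new guard in ASM_OUTPUT_MAX_SKIP_ALIGN can be read in isolation: when more than 8-byte alignment is requested but the padding budget means the request may be skipped entirely, a following .p2align 3 still guarantees 8-byte alignment. A sketch of just that condition (hypothetical helper, same logic as the macro):

  /* True when ".p2align LOG,,MAX_SKIP" must be followed by the
     fallback ".p2align 3".  */
  static int
  needs_8byte_fallback (int log, int max_skip)
  {
    return log > 3                       /* more than 8 bytes wanted */
           && (1 << log) > max_skip + 1  /* request may be skipped   */
           && max_skip >= 7;             /* 8-byte padding always ok */
  }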
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7c28a559a4..3b19caa413 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -4055,11 +4055,11 @@ ix86_option_override_internal (bool main_args_p,
if (opts_set->x_ix86_incoming_stack_boundary_arg)
{
if (opts->x_ix86_incoming_stack_boundary_arg
- < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
+ < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2)
|| opts->x_ix86_incoming_stack_boundary_arg > 12)
error ("-mincoming-stack-boundary=%d is not between %d and 12",
opts->x_ix86_incoming_stack_boundary_arg,
- TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
+ TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2);
else
{
ix86_user_incoming_stack_boundary
@@ -5201,6 +5201,14 @@ ix86_set_current_function (tree fndecl)
TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}
ix86_previous_fndecl = fndecl;
+
+ /* 64-bit MS and SYSV ABI have different sets of call-used registers.
+ Avoid expensive re-initialization of init_regs each time we switch
+ function context. */
+ if (TARGET_64BIT
+ && (call_used_regs[SI_REG]
+ == (cfun->machine->call_abi == MS_ABI)))
+ reinit_regs ();
}
@@ -6344,17 +6352,6 @@ ix86_call_abi_override (const_tree fndecl)
cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
}
-/* 64-bit MS and SYSV ABI have different set of call used registers. Avoid
- expensive re-initialization of init_regs each time we switch function context
- since this is needed only during RTL expansion. */
-static void
-ix86_maybe_switch_abi (void)
-{
- if (TARGET_64BIT &&
- call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
- reinit_regs ();
-}
-
/* Return 1 if pseudo register should be created and used to hold
GOT address for PIC code. */
bool
@@ -8211,7 +8208,8 @@ ix86_function_value_regno_p (const unsigned int regno)
case SI_REG:
return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
- case FIRST_BND_REG:
+ case BND0_REG:
+ case BND1_REG:
return chkp_function_instrumented_p (current_function_decl);
/* Complex values are returned in %st(0)/%st(1) pair. */
@@ -10104,11 +10102,14 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
frame->nregs = ix86_nsaved_regs ();
frame->nsseregs = ix86_nsaved_sseregs ();
- /* 64-bit MS ABI seem to require stack alignment to be always 16 except for
- function prologues and leaf. */
+ /* 64-bit MS ABI seems to require stack alignment to be always 16,
+ except for function prologues, leaf functions and when the default
+ incoming stack boundary is overridden at the command line or via
+ the force_align_arg_pointer attribute. */
if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
&& (!crtl->is_leaf || cfun->calls_alloca != 0
- || ix86_current_function_calls_tls_descriptor))
+ || ix86_current_function_calls_tls_descriptor
+ || ix86_incoming_stack_boundary < 128))
{
crtl->preferred_stack_boundary = 128;
crtl->stack_alignment_needed = 128;
@@ -10207,10 +10208,14 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
if (frame->nsseregs)
{
/* The only ABI that has saved SSE registers (Win64) also has a
- 16-byte aligned default stack, and thus we don't need to be
- within the re-aligned local stack frame to save them. */
- gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
- offset = (offset + 16 - 1) & -16;
+ 16-byte aligned default stack, and thus we don't need to be
+ within the re-aligned local stack frame to save them. In case the
+ incoming stack boundary is aligned to less than 16 bytes, an
+ unaligned move of the SSE register will be emitted, so there is
+ no point in rounding up the SSE register save area outside the
+ re-aligned local stack frame to 16 bytes. */
+ if (ix86_incoming_stack_boundary >= 128)
+ offset = (offset + 16 - 1) & -16;
offset += frame->nsseregs * 16;
}
frame->sse_reg_save_offset = offset;
@@ -10220,7 +10225,7 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
sure that no value happens to be the same before and after, force
the alignment computation below to add a non-zero value. */
if (stack_realign_fp)
- offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
+ offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
/* Va-arg area */
frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
@@ -10433,15 +10438,24 @@ ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
{
struct machine_function *m = cfun->machine;
rtx reg = gen_rtx_REG (mode, regno);
+ rtx unspec = NULL_RTX;
rtx mem, addr, base, insn;
+ unsigned int align;
addr = choose_baseaddr (cfa_offset);
mem = gen_frame_mem (mode, addr);
- /* For SSE saves, we need to indicate the 128-bit alignment. */
- set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
+ /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
+ align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
+ set_mem_align (mem, align);
- insn = emit_move_insn (mem, reg);
+ /* SSE saves are not within the re-aligned local stack frame.
+ In case INCOMING_STACK_BOUNDARY is misaligned, we have
+ to emit an unaligned store. */
+ if (mode == V4SFmode && align < 128)
+ unspec = gen_rtx_UNSPEC (mode, gen_rtvec (1, reg), UNSPEC_STOREU);
+
+ insn = emit_insn (gen_rtx_SET (VOIDmode, mem, unspec ? unspec : reg));
RTX_FRAME_RELATED_P (insn) = 1;
base = addr;
@@ -10489,6 +10503,9 @@ ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
mem = gen_rtx_MEM (mode, addr);
add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
}
+ else if (unspec)
+ add_reg_note (insn, REG_CFA_EXPRESSION,
+ gen_rtx_SET (VOIDmode, mem, reg));
}
/* Emit code to save registers using MOV insns.
@@ -10705,6 +10722,25 @@ find_drap_reg (void)
}
}
+/* Handle a "force_align_arg_pointer" attribute. */
+
+static tree
+ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
+ tree, int, bool *no_add_attrs)
+{
+ if (TREE_CODE (*node) != FUNCTION_TYPE
+ && TREE_CODE (*node) != METHOD_TYPE
+ && TREE_CODE (*node) != FIELD_DECL
+ && TREE_CODE (*node) != TYPE_DECL)
+ {
+ warning (OPT_Wattributes, "%qE attribute only applies to functions",
+ name);
+ *no_add_attrs = true;
+ }
+
+ return NULL_TREE;
+}
+
/* Return minimum incoming stack alignment. */
static unsigned int
@@ -10719,7 +10755,6 @@ ix86_minimum_incoming_stack_boundary (bool sibcall)
if -mstackrealign is used, it isn't used for sibcall check and
estimated stack alignment is 128bit. */
else if (!sibcall
- && !TARGET_64BIT
&& ix86_force_align_arg_pointer
&& crtl->stack_alignment_estimated == 128)
incoming_stack_boundary = MIN_STACK_BOUNDARY;
@@ -11026,8 +11061,6 @@ ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
release_scratch_register_on_entry (&sr);
}
- gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
-
/* Even if the stack pointer isn't the CFA register, we need to correctly
describe the adjustments made to it, in particular differentiate the
frame-related ones from the frame-unrelated ones. */
@@ -11251,7 +11284,11 @@ ix86_finalize_stack_realign_flags (void)
&& !crtl->accesses_prior_frames
&& !cfun->calls_alloca
&& !crtl->calls_eh_return
- && !(flag_stack_check && STACK_CHECK_MOVING_SP)
+ /* See ira_setup_eliminable_regset for the rationale. */
+ && !(STACK_CHECK_MOVING_SP
+ && flag_stack_check
+ && flag_exceptions
+ && cfun->can_throw_non_call_exceptions)
&& !ix86_frame_pointer_required ()
&& get_frame_size () == 0
&& ix86_nsaved_sseregs () == 0
@@ -11578,7 +11615,7 @@ ix86_expand_prologue (void)
pointer is no longer valid. As for the value of sp_offset,
see ix86_compute_frame_layout, which we need to match in order
to pass verification of stack_pointer_offset at the end. */
- m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
+ m->fs.sp_offset = (m->fs.sp_offset + align_bytes - 1) & -align_bytes;
m->fs.sp_valid = false;
}
@@ -11991,11 +12028,26 @@ ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
{
rtx reg = gen_rtx_REG (V4SFmode, regno);
rtx mem;
+ unsigned int align;
mem = choose_baseaddr (cfa_offset);
mem = gen_rtx_MEM (V4SFmode, mem);
- set_mem_align (mem, 128);
- emit_move_insn (reg, mem);
+
+ /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
+ align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
+ set_mem_align (mem, align);
+
+ /* SSE saves are not within the re-aligned local stack frame.
+ In case INCOMING_STACK_BOUNDARY is misaligned, we have
+ to emit an unaligned load. */
+ if (align < 128)
+ {
+ rtx unspec = gen_rtx_UNSPEC (V4SFmode, gen_rtvec (1, mem),
+ UNSPEC_LOADU);
+ emit_insn (gen_rtx_SET (VOIDmode, reg, unspec));
+ }
+ else
+ emit_insn (gen_rtx_SET (VOIDmode, reg, mem));
ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
@@ -25140,7 +25192,8 @@ ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
dst = change_address (dst, BLKmode, destreg);
set_mem_align (dst, desired_align * BITS_PER_UNIT);
epilogue_size_needed = 0;
- if (need_zero_guard && !min_size)
+ if (need_zero_guard
+ && min_size < (unsigned HOST_WIDE_INT) size_needed)
{
/* It is possible that we copied enough so the main loop will not
execute. */
@@ -25272,7 +25325,7 @@ ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
max_size -= align_bytes;
}
if (need_zero_guard
- && !min_size
+ && min_size < (unsigned HOST_WIDE_INT) size_needed
&& (count < (unsigned HOST_WIDE_INT) size_needed
|| (align_bytes == 0
&& count < ((unsigned HOST_WIDE_INT) size_needed
@@ -25557,7 +25610,7 @@ ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
/* Avoid branch in fixing the byte. */
tmpreg = gen_lowpart (QImode, tmpreg);
- emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
+ emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
tmp = gen_rtx_REG (CCmode, FLAGS_REG);
cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
@@ -39522,60 +39575,57 @@ rdseed_step:
return target;
case IX86_BUILTIN_SBB32:
- icode = CODE_FOR_subsi3_carry;
+ icode = CODE_FOR_subborrowsi;
mode0 = SImode;
- goto addcarryx;
+ goto handlecarry;
case IX86_BUILTIN_SBB64:
- icode = CODE_FOR_subdi3_carry;
+ icode = CODE_FOR_subborrowdi;
mode0 = DImode;
- goto addcarryx;
+ goto handlecarry;
case IX86_BUILTIN_ADDCARRYX32:
- icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
+ icode = CODE_FOR_addcarrysi;
mode0 = SImode;
- goto addcarryx;
+ goto handlecarry;
case IX86_BUILTIN_ADDCARRYX64:
- icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
+ icode = CODE_FOR_addcarrydi;
mode0 = DImode;
-addcarryx:
+ handlecarry:
arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
- op0 = gen_reg_rtx (QImode);
-
- /* Generate CF from input operand. */
op1 = expand_normal (arg0);
op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
- emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
- /* Gen ADCX instruction to compute X+Y+CF. */
op2 = expand_normal (arg1);
- op3 = expand_normal (arg2);
-
- if (!REG_P (op2))
+ if (!register_operand (op2, mode0))
op2 = copy_to_mode_reg (mode0, op2);
- if (!REG_P (op3))
- op3 = copy_to_mode_reg (mode0, op3);
-
- op0 = gen_reg_rtx (mode0);
- op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
- pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
- emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
+ op3 = expand_normal (arg2);
+ if (!register_operand (op3, mode0))
+ op3 = copy_to_mode_reg (mode0, op3);
- /* Store the result. */
op4 = expand_normal (arg3);
if (!address_operand (op4, VOIDmode))
{
op4 = convert_memory_address (Pmode, op4);
op4 = copy_addr_to_reg (op4);
}
- emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
+
+ /* Generate CF from input operand. */
+ emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+
+ /* Generate instruction that consumes CF. */
+ op0 = gen_reg_rtx (mode0);
+
+ op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
+ pat = gen_rtx_LTU (mode0, op1, const0_rtx);
+ emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
/* Return current CF value. */
if (target == 0)
@@ -39583,6 +39633,10 @@ addcarryx:
PUT_MODE (pat, QImode);
emit_insn (gen_rtx_SET (VOIDmode, target, pat));
+
+ /* Store the result. */
+ emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
+
return target;
case IX86_BUILTIN_READ_FLAGS:
@@ -41758,11 +41812,24 @@ ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
{
+ int from_size = GET_MODE_SIZE (from);
+ int to_size = GET_MODE_SIZE (to);
+
/* Vector registers do not support QI or HImode loads. If we don't
disallow a change to these modes, reload will assume it's ok to
drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
the vec_dupv4hi pattern. */
- if (GET_MODE_SIZE (from) < 4)
+ if (from_size < 4)
+ return true;
+
+ /* Further, we cannot allow word_mode subregs of full vector modes.
+ Otherwise the middle-end will assume it's ok to store to
+ (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits
+ of the 128-bit register. However, after reload the subreg will
+ be dropped leaving a plain DImode store. This is indistinguishable
+ from a "normal" DImode move, and so we're justified to use movsd,
+ which modifies the entire 128-bit register. */
+ if (to_size == UNITS_PER_WORD && from_size > UNITS_PER_WORD)
return true;
}
@@ -46836,7 +46903,7 @@ static const struct attribute_spec ix86_attribute_table[] =
true },
/* force_align_arg_pointer says this function realigns the stack at entry. */
{ (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
- false, true, true, ix86_handle_cconv_attribute, false },
+ false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
{ "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
{ "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
@@ -48957,6 +49024,62 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
return true;
}
+/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
+ and extract-odd permutations of two V64QI operands
+ with two "shifts", two "truncs" and one "concat" insns for "odd"
+ and two "truncs" and one concat insn for "even."
+ We have already failed all two-instruction sequences. */
+
+static bool
+expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
+{
+ rtx t1, t2, t3, t4;
+ unsigned i, odd, nelt = d->nelt;
+
+ if (!TARGET_AVX512BW
+ || d->one_operand_p
+ || d->vmode != V64QImode)
+ return false;
+
+ /* Check that permutation is even or odd. */
+ odd = d->perm[0];
+ if (odd > 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+
+ if (odd)
+ {
+ t1 = gen_reg_rtx (V32HImode);
+ t2 = gen_reg_rtx (V32HImode);
+ emit_insn (gen_lshrv32hi3 (t1,
+ gen_lowpart (V32HImode, d->op0),
+ GEN_INT (8)));
+ emit_insn (gen_lshrv32hi3 (t2,
+ gen_lowpart (V32HImode, d->op1),
+ GEN_INT (8)));
+ }
+ else
+ {
+ t1 = gen_lowpart (V32HImode, d->op0);
+ t2 = gen_lowpart (V32HImode, d->op1);
+ }
+
+ t3 = gen_reg_rtx (V32QImode);
+ t4 = gen_reg_rtx (V32QImode);
+ emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
+ emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
+ emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
and extract-odd permutations. */
@@ -49059,6 +49182,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
case V32QImode:
return expand_vec_perm_even_odd_pack (d);
+ case V64QImode:
+ return expand_vec_perm_even_odd_trunc (d);
+
case V4DImode:
if (!TARGET_AVX2)
{
@@ -49520,6 +49646,8 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
/* Try sequences of four instructions. */
+ if (expand_vec_perm_even_odd_trunc (d))
+ return true;
if (expand_vec_perm_vpshufb2_vpermq (d))
return true;
@@ -50335,15 +50463,20 @@ ix86_expand_pinsr (rtx *operands)
unsigned int size = INTVAL (operands[1]);
unsigned int pos = INTVAL (operands[2]);
+ if (GET_CODE (src) == SUBREG)
+ {
+ /* Reject non-lowpart subregs. */
+ if (SUBREG_BYTE (src) != 0)
+ return false;
+ src = SUBREG_REG (src);
+ }
+
if (GET_CODE (dst) == SUBREG)
{
pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
dst = SUBREG_REG (dst);
}
- if (GET_CODE (src) == SUBREG)
- src = SUBREG_REG (src);
-
switch (GET_MODE (dst))
{
case V16QImode:
@@ -50391,6 +50524,10 @@ ix86_expand_pinsr (rtx *operands)
return false;
}
+ /* Reject insertions to misaligned positions. */
+ if (pos & (size-1))
+ return false;
+
rtx d = dst;
if (GET_MODE (dst) != dstmode)
d = gen_reg_rtx (dstmode);
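The misalignment check added above rejects insertion positions that are not a multiple of the inserted element size; a minimal sketch, assuming size is a power of two (helper name hypothetical):

    /* (pos & (size - 1)) == 0 iff pos is a multiple of size, e.g.
       pos = 24, size = 8 passes while pos = 12, size = 8 fails.  */
    static int insertion_aligned_p (unsigned int pos, unsigned int size)
    {
      return (pos & (size - 1)) == 0;
    }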
@@ -51516,7 +51653,7 @@ ix86_destroy_cost_data (void *data)
static unsigned HOST_WIDE_INT
ix86_memmodel_check (unsigned HOST_WIDE_INT val)
{
- unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
+ enum memmodel model = memmodel_from_int (val);
bool strong;
if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
@@ -51527,14 +51664,14 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
"Unknown architecture specific memory model");
return MEMMODEL_SEQ_CST;
}
- strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
- if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
+ strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
+ if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
{
warning (OPT_Winvalid_memory_model,
"HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
}
- if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
+ if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
{
warning (OPT_Winvalid_memory_model,
"HLE_RELEASE not used with RELEASE or stronger memory model");
@@ -52307,9 +52444,6 @@ ix86_binds_local_p (const_tree exp)
#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p
-#undef TARGET_EXPAND_TO_RTL_HOOK
-#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
-
#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index de43f06bef..d5a5263108 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -747,7 +747,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define MAIN_STACK_BOUNDARY (TARGET_64BIT ? 128 : 32)
/* Minimum stack boundary. */
-#define MIN_STACK_BOUNDARY (TARGET_64BIT ? (TARGET_SSE ? 128 : 64) : 32)
+#define MIN_STACK_BOUNDARY BITS_PER_WORD
/* Boundary (in *bits*) on which the stack pointer prefers to be
aligned; the compiler cannot rely on having this alignment. */
@@ -1429,7 +1429,7 @@ enum reg_class
{ 0x1ff1ffff,0xffffffe0, 0x1f }, /* FLOAT_INT_SSE_REGS */ \
{ 0x0, 0x0, 0x1fc0 }, /* MASK_EVEX_REGS */ \
{ 0x0, 0x0, 0x1fe0 }, /* MASK_REGS */ \
-{ 0xffffffff,0xffffffff, 0x1fff } \
+{ 0xffffffff,0xffffffff,0x1ffff } \
}
/* The same information, inverted:
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6b6f44c8a1..8b1d372c58 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -102,7 +102,6 @@
UNSPEC_SAHF
UNSPEC_PARITY
UNSPEC_FSTCW
- UNSPEC_ADD_CARRY
UNSPEC_FLDCW
UNSPEC_REP
UNSPEC_LD_MPIC ; load_macho_picbase
@@ -783,7 +782,8 @@
(define_attr "isa" "base,x64,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
sse2,sse2_noavx,sse3,sse4,sse4_noavx,avx,noavx,
avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
- fma_avx512f,avx512bw,noavx512bw,avx512dq,noavx512dq"
+ fma_avx512f,avx512bw,noavx512bw,avx512dq,noavx512dq,
+ avx512vl,noavx512vl"
(const_string "base"))
(define_attr "enabled" ""
@@ -818,6 +818,8 @@
(eq_attr "isa" "noavx512bw") (symbol_ref "!TARGET_AVX512BW")
(eq_attr "isa" "avx512dq") (symbol_ref "TARGET_AVX512DQ")
(eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ")
+ (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")
+ (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")
]
(const_int 1)))
@@ -844,8 +846,6 @@
(define_code_attr plusminus_mnemonic
[(plus "add") (ss_plus "adds") (us_plus "addus")
(minus "sub") (ss_minus "subs") (us_minus "subus")])
-(define_code_attr plusminus_carry_mnemonic
- [(plus "adc") (minus "sbb")])
(define_code_attr multdiv_mnemonic
[(mult "mul") (div "div")])
@@ -5051,11 +5051,11 @@
/* The DImode arrived in a pair of integral registers (e.g. %edx:%eax).
Assemble the 64-bit DImode value in an xmm register. */
emit_insn (gen_sse2_loadld (operands[3], CONST0_RTX (V4SImode),
- gen_rtx_SUBREG (SImode, operands[1], 0)));
+ gen_lowpart (SImode, operands[1])));
emit_insn (gen_sse2_loadld (operands[4], CONST0_RTX (V4SImode),
- gen_rtx_SUBREG (SImode, operands[1], 4)));
+ gen_highpart (SImode, operands[1])));
emit_insn (gen_vec_interleave_lowv4si (operands[3], operands[3],
- operands[4]));
+ operands[4]));
operands[3] = gen_rtx_REG (DImode, REGNO (operands[3]));
})
@@ -5213,46 +5213,21 @@
"ix86_binary_operator_ok (PLUS, <DWI>mode, operands)"
"#"
"reload_completed"
- [(parallel [(set (reg:CC FLAGS_REG)
- (unspec:CC [(match_dup 1) (match_dup 2)]
- UNSPEC_ADD_CARRY))
+ [(parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (plus:DWIH (match_dup 1) (match_dup 2))
+ (match_dup 1)))
(set (match_dup 0)
(plus:DWIH (match_dup 1) (match_dup 2)))])
(parallel [(set (match_dup 3)
(plus:DWIH
- (match_dup 4)
(plus:DWIH
(ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
- (match_dup 5))))
+ (match_dup 4))
+ (match_dup 5)))
(clobber (reg:CC FLAGS_REG))])]
"split_double_mode (<DWI>mode, &operands[0], 3, &operands[0], &operands[3]);")
-(define_insn "*add<mode>3_cc"
- [(set (reg:CC FLAGS_REG)
- (unspec:CC
- [(match_operand:SWI48 1 "nonimmediate_operand" "%0,0")
- (match_operand:SWI48 2 "<general_operand>" "r<i>,rm")]
- UNSPEC_ADD_CARRY))
- (set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
- (plus:SWI48 (match_dup 1) (match_dup 2)))]
- "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
- "add{<imodesuffix>}\t{%2, %0|%0, %2}"
- [(set_attr "type" "alu")
- (set_attr "mode" "<MODE>")])
-
-(define_insn "addqi3_cc"
- [(set (reg:CC FLAGS_REG)
- (unspec:CC
- [(match_operand:QI 1 "nonimmediate_operand" "%0,0")
- (match_operand:QI 2 "general_operand" "qn,qm")]
- UNSPEC_ADD_CARRY))
- (set (match_operand:QI 0 "nonimmediate_operand" "=qm,q")
- (plus:QI (match_dup 1) (match_dup 2)))]
- "ix86_binary_operator_ok (PLUS, QImode, operands)"
- "add{b}\t{%2, %0|%0, %2}"
- [(set_attr "type" "alu")
- (set_attr "mode" "QI")])
-
(define_insn "*add<mode>_1"
[(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,rm,r,r")
(plus:SWI48
@@ -6160,10 +6135,10 @@
(minus:DWIH (match_dup 1) (match_dup 2)))])
(parallel [(set (match_dup 3)
(minus:DWIH
- (match_dup 4)
- (plus:DWIH
- (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
- (match_dup 5))))
+ (minus:DWIH
+ (match_dup 4)
+ (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0)))
+ (match_dup 5)))
(clobber (reg:CC FLAGS_REG))])]
"split_double_mode (<DWI>mode, &operands[0], 3, &operands[0], &operands[3]);")
@@ -6327,29 +6302,17 @@
;; Add with carry and subtract with borrow
-(define_expand "<plusminus_insn><mode>3_carry"
- [(parallel
- [(set (match_operand:SWI 0 "nonimmediate_operand")
- (plusminus:SWI
- (match_operand:SWI 1 "nonimmediate_operand")
- (plus:SWI (match_operator:SWI 4 "ix86_carry_flag_operator"
- [(match_operand 3 "flags_reg_operand")
- (const_int 0)])
- (match_operand:SWI 2 "<general_operand>"))))
- (clobber (reg:CC FLAGS_REG))])]
- "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)")
-
-(define_insn "*<plusminus_insn><mode>3_carry"
+(define_insn "add<mode>3_carry"
[(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
- (plusminus:SWI
- (match_operand:SWI 1 "nonimmediate_operand" "<comm>0,0")
+ (plus:SWI
(plus:SWI
- (match_operator 3 "ix86_carry_flag_operator"
- [(reg FLAGS_REG) (const_int 0)])
- (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m"))))
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (match_operand:SWI 1 "nonimmediate_operand" "%0,0"))
+ (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m")))
(clobber (reg:CC FLAGS_REG))]
"ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
- "<plusminus_carry_mnemonic>{<imodesuffix>}\t{%2, %0|%0, %2}"
+ "adc{<imodesuffix>}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
@@ -6358,10 +6321,11 @@
(define_insn "*addsi3_carry_zext"
[(set (match_operand:DI 0 "register_operand" "=r")
(zero_extend:DI
- (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0")
- (plus:SI (match_operator 3 "ix86_carry_flag_operator"
- [(reg FLAGS_REG) (const_int 0)])
- (match_operand:SI 2 "x86_64_general_operand" "rme")))))
+ (plus:SI
+ (plus:SI (match_operator:SI 3 "ix86_carry_flag_operator"
+ [(reg FLAGS_REG) (const_int 0)])
+ (match_operand:SI 1 "register_operand" "%0"))
+ (match_operand:SI 2 "x86_64_general_operand" "rme"))))
(clobber (reg:CC FLAGS_REG))]
"TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)"
"adc{l}\t{%2, %k0|%k0, %2}"
@@ -6370,45 +6334,96 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "SI")])
+;; There is no point in generating the ADCX instruction: ADC is shorter and faster.
+
+(define_insn "addcarry<mode>"
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (plus:SWI48
+ (plus:SWI48
+ (match_operator:SWI48 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (match_operand:SWI48 1 "nonimmediate_operand" "%0"))
+ (match_operand:SWI48 2 "nonimmediate_operand" "rm"))
+ (match_dup 1)))
+ (set (match_operand:SWI48 0 "register_operand" "=r")
+ (plus:SWI48 (plus:SWI48 (match_op_dup 4
+ [(match_dup 3) (const_int 0)])
+ (match_dup 1))
+ (match_dup 2)))]
+ "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+ "adc{<imodesuffix>}\t{%2, %0|%0, %2}"
+ [(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "sub<mode>3_carry"
+ [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
+ (minus:SWI
+ (minus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "0,0")
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)]))
+ (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m")))
+ (clobber (reg:CC FLAGS_REG))]
+ "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)"
+ "sbb{<imodesuffix>}\t{%2, %0|%0, %2}"
+ [(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
(define_insn "*subsi3_carry_zext"
[(set (match_operand:DI 0 "register_operand" "=r")
(zero_extend:DI
- (minus:SI (match_operand:SI 1 "register_operand" "0")
- (plus:SI (match_operator 3 "ix86_carry_flag_operator"
- [(reg FLAGS_REG) (const_int 0)])
- (match_operand:SI 2 "x86_64_general_operand" "rme")))))
+ (minus:SI
+ (minus:SI
+ (match_operand:SI 1 "register_operand" "0")
+ (match_operator:SI 3 "ix86_carry_flag_operator"
+ [(reg FLAGS_REG) (const_int 0)]))
+ (match_operand:SI 2 "x86_64_general_operand" "rme"))))
(clobber (reg:CC FLAGS_REG))]
"TARGET_64BIT && ix86_binary_operator_ok (MINUS, SImode, operands)"
"sbb{l}\t{%2, %k0|%k0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "SI")])
-
-;; ADCX instruction
-(define_insn "adcx<mode>3"
+(define_insn "subborrow<mode>"
[(set (reg:CCC FLAGS_REG)
(compare:CCC
+ (match_operand:SWI48 1 "nonimmediate_operand" "0")
(plus:SWI48
- (match_operand:SWI48 1 "nonimmediate_operand" "%0")
- (plus:SWI48
- (match_operator 4 "ix86_carry_flag_operator"
- [(match_operand 3 "flags_reg_operand") (const_int 0)])
- (match_operand:SWI48 2 "nonimmediate_operand" "rm")))
- (const_int 0)))
+ (match_operator:SWI48 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (match_operand:SWI48 2 "nonimmediate_operand" "rm"))))
(set (match_operand:SWI48 0 "register_operand" "=r")
- (plus:SWI48 (match_dup 1)
- (plus:SWI48 (match_op_dup 4
- [(match_dup 3) (const_int 0)])
- (match_dup 2))))]
- "TARGET_ADX && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
- "adcx\t{%2, %0|%0, %2}"
+ (minus:SWI48 (minus:SWI48 (match_dup 1)
+ (match_op_dup 4
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 2)))]
+ "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)"
+ "sbb{<imodesuffix>}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
;; Overflow setting add instructions
+(define_expand "addqi3_cconly_overflow"
+ [(parallel
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (plus:QI
+ (match_operand:QI 0 "nonimmediate_operand")
+ (match_operand:QI 1 "general_operand"))
+ (match_dup 0)))
+ (clobber (match_scratch:QI 2))])]
+ "!(MEM_P (operands[0]) && MEM_P (operands[1]))")
+
(define_insn "*add<mode>3_cconly_overflow"
[(set (reg:CCC FLAGS_REG)
(compare:CCC
@@ -8462,11 +8477,11 @@
(clobber (reg:CC FLAGS_REG))]
"TARGET_AVX512F && reload_completed"
[(parallel [(set (match_dup 0)
- (xor:HI (match_dup 0)
- (match_dup 1)))
+ (xor:SWI1248x (match_dup 0)
+ (match_dup 1)))
(clobber (reg:CC FLAGS_REG))])
(set (match_dup 0)
- (not:HI (match_dup 0)))])
+ (not:SWI1248x (match_dup 0)))])
;; There are kortest[bdq] but no intrinsics for them.
;; We probably don't need to implement them.
@@ -8500,7 +8515,7 @@
[(set (match_operand:HI 0 "register_operand" "=k")
(ior:HI
(ashift:HI
- (match_operand:HI 1 "register_operand" "k")
+ (zero_extend:HI (match_operand:QI 1 "register_operand" "k"))
(const_int 8))
(zero_extend:HI (match_operand:QI 2 "register_operand" "k"))))]
"TARGET_AVX512F"
@@ -8513,9 +8528,9 @@
[(set (match_operand:SI 0 "register_operand" "=k")
(ior:SI
(ashift:SI
- (match_operand:SI 1 "register_operand" "k")
+ (zero_extend:SI (match_operand:HI 1 "register_operand" "k"))
(const_int 16))
- (zero_extend:SI (subreg:HI (match_operand:SI 2 "register_operand" "k") 0))))]
+ (zero_extend:SI (match_operand:HI 2 "register_operand" "k"))))]
"TARGET_AVX512BW"
"kunpckwd\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "mode" "SI")])
@@ -8524,9 +8539,9 @@
[(set (match_operand:DI 0 "register_operand" "=k")
(ior:DI
(ashift:DI
- (match_operand:DI 1 "register_operand" "k")
+ (zero_extend:DI (match_operand:SI 1 "register_operand" "k"))
(const_int 32))
- (zero_extend:DI (subreg:SI (match_operand:DI 2 "register_operand" "k") 0))))]
+ (zero_extend:DI (match_operand:SI 2 "register_operand" "k"))))]
"TARGET_AVX512BW"
"kunpckdq\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "mode" "DI")])
@@ -8754,9 +8769,9 @@
(set (match_dup 0) (neg:DWIH (match_dup 1)))])
(parallel
[(set (match_dup 2)
- (plus:DWIH (match_dup 3)
- (plus:DWIH (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
- (const_int 0))))
+ (plus:DWIH (plus:DWIH (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+ (match_dup 3))
+ (const_int 0)))
(clobber (reg:CC FLAGS_REG))])
(parallel
[(set (match_dup 2)
@@ -12123,10 +12138,14 @@
;; We need to disable this for TARGET_SEH, as otherwise
;; shrink-wrapped prologue gets enabled too. This might exceed
;; the maximum size of prologue in unwind information.
+;; Also disallow shrink-wrapping if using stack slot to pass the
+;; static chain pointer - the first instruction has to be pushl %esi
+;; and it can't be moved around, as we use alternate entry points
+;; in that case.
(define_expand "simple_return"
[(simple_return)]
- "!TARGET_SEH"
+ "!TARGET_SEH && !ix86_static_chain_on_stack"
{
if (crtl->args.pops_args)
{
@@ -13237,7 +13256,8 @@
(call:P
(mem:QI (match_operand 2 "constant_call_address_operand" "Bz"))
(match_operand 3)))
- (unspec:P [(match_operand 1 "tls_symbolic_operand")]
+ (unspec:P [(match_operand 1 "tls_symbolic_operand")
+ (reg:P SP_REG)]
UNSPEC_TLS_GD)]
"TARGET_64BIT"
{
@@ -13261,8 +13281,9 @@
(mem:QI (plus:DI (match_operand:DI 2 "register_operand" "b")
(match_operand:DI 3 "immediate_operand" "i")))
(match_operand 4)))
- (unspec:DI [(match_operand 1 "tls_symbolic_operand")]
- UNSPEC_TLS_GD)]
+ (unspec:DI [(match_operand 1 "tls_symbolic_operand")
+ (reg:DI SP_REG)]
+ UNSPEC_TLS_GD)]
"TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
&& GET_CODE (operands[3]) == CONST
&& GET_CODE (XEXP (operands[3], 0)) == UNSPEC
@@ -13283,7 +13304,8 @@
(call:P
(mem:QI (match_operand 2))
(const_int 0)))
- (unspec:P [(match_operand 1 "tls_symbolic_operand")]
+ (unspec:P [(match_operand 1 "tls_symbolic_operand")
+ (reg:P SP_REG)]
UNSPEC_TLS_GD)])]
"TARGET_64BIT"
"ix86_tls_descriptor_calls_expanded_in_cfun = true;")
@@ -13333,7 +13355,7 @@
(call:P
(mem:QI (match_operand 1 "constant_call_address_operand" "Bz"))
(match_operand 2)))
- (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)]
+ (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)]
"TARGET_64BIT"
{
output_asm_insn
@@ -13351,7 +13373,7 @@
(mem:QI (plus:DI (match_operand:DI 1 "register_operand" "b")
(match_operand:DI 2 "immediate_operand" "i")))
(match_operand 3)))
- (unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)]
+ (unspec:DI [(reg:DI SP_REG)] UNSPEC_TLS_LD_BASE)]
"TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
&& GET_CODE (operands[2]) == CONST
&& GET_CODE (XEXP (operands[2], 0)) == UNSPEC
@@ -13372,7 +13394,7 @@
(call:P
(mem:QI (match_operand 1))
(const_int 0)))
- (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])]
+ (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)])]
"TARGET_64BIT"
"ix86_tls_descriptor_calls_expanded_in_cfun = true;")
@@ -17431,8 +17453,8 @@
;; lifetime information then.
(define_peephole2
- [(set (match_operand:SWI124 0 "nonimmediate_operand")
- (not:SWI124 (match_operand:SWI124 1 "nonimmediate_operand")))]
+ [(set (match_operand:SWI124 0 "nonimmediate_gr_operand")
+ (not:SWI124 (match_operand:SWI124 1 "nonimmediate_gr_operand")))]
"optimize_insn_for_speed_p ()
&& ((TARGET_NOT_UNPAIRABLE
&& (!MEM_P (operands[0])
@@ -17576,8 +17598,10 @@
[(match_dup 0)
(match_operand 2 "memory_operand")]))]
"REGNO (operands[0]) != REGNO (operands[1])
- && ((MMX_REG_P (operands[0]) && MMX_REG_P (operands[1]))
- || (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1])))"
+ && ((MMX_REGNO_P (REGNO (operands[0]))
+ && MMX_REGNO_P (REGNO (operands[1])))
+ || (SSE_REGNO_P (REGNO (operands[0]))
+ && SSE_REGNO_P (REGNO (operands[1]))))"
[(set (match_dup 0) (match_dup 2))
(set (match_dup 0)
(match_op_dup 3 [(match_dup 0) (match_dup 1)]))])
@@ -17725,7 +17749,7 @@
(match_operand 1 "const0_operand"))]
"GET_MODE_SIZE (GET_MODE (operands[0])) <= UNITS_PER_WORD
&& (! TARGET_USE_MOV0 || optimize_insn_for_size_p ())
- && GENERAL_REG_P (operands[0])
+ && GENERAL_REGNO_P (REGNO (operands[0]))
&& peep2_regno_dead_p (0, FLAGS_REG)"
[(parallel [(set (match_dup 0) (const_int 0))
(clobber (reg:CC FLAGS_REG))])]
@@ -17746,6 +17770,7 @@
[(set (match_operand:SWI248 0 "register_operand")
(const_int -1))]
"(optimize_insn_for_size_p () || TARGET_MOVE_M1_VIA_OR)
+ && GENERAL_REGNO_P (REGNO (operands[0]))
&& peep2_regno_dead_p (0, FLAGS_REG)"
[(parallel [(set (match_dup 0) (const_int -1))
(clobber (reg:CC FLAGS_REG))])]
@@ -18113,11 +18138,13 @@
operands[1] = gen_rtx_PLUS (word_mode, base,
gen_rtx_MULT (word_mode, index, GEN_INT (scale)));
- operands[5] = base;
if (mode != word_mode)
operands[1] = gen_rtx_SUBREG (mode, operands[1], 0);
+
+ operands[5] = base;
if (op1mode != word_mode)
- operands[5] = gen_rtx_SUBREG (op1mode, operands[5], 0);
+ operands[5] = gen_lowpart (op1mode, operands[5]);
+
operands[0] = dest;
})
diff --git a/gcc/config/i386/intelmic-mkoffload.c b/gcc/config/i386/intelmic-mkoffload.c
index e5e5c35fc4..f8aad905a4 100644
--- a/gcc/config/i386/intelmic-mkoffload.c
+++ b/gcc/config/i386/intelmic-mkoffload.c
@@ -453,17 +453,18 @@ prepare_target_image (const char *target_compiler, int argc, char **argv)
fork_execute (objcopy_argv[0], CONST_CAST (char **, objcopy_argv), false);
/* Objcopy has created symbols, containing the input file name with
- special characters replaced with '_'. We are going to rename these
- new symbols. */
+ non-alphanumeric characters replaced by underscores.
+ We are going to rename these new symbols. */
size_t symbol_name_len = strlen (target_so_filename);
char *symbol_name = XALLOCAVEC (char, symbol_name_len + 1);
- for (size_t i = 0; i <= symbol_name_len; i++)
+ for (size_t i = 0; i < symbol_name_len; i++)
{
char c = target_so_filename[i];
- if ((c == '/') || (c == '.'))
+ if (!ISALNUM (c))
c = '_';
symbol_name[i] = c;
}
+ symbol_name[symbol_name_len] = '\0';
char *opt_for_objcopy[3];
opt_for_objcopy[0] = XALLOCAVEC (char, sizeof ("_binary__start=")
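A standalone sketch of the renaming rule, with the termination fix made explicit (function name hypothetical):

    #include <ctype.h>
    #include <string.h>

    /* "libfoo-1.so" becomes "libfoo_1_so"; the NUL is written explicitly
       instead of being copied inside the loop.  */
    static void
    sanitize_symbol (char *dst, const char *src)
    {
      size_t len = strlen (src);
      size_t i;
      for (i = 0; i < len; i++)
        dst[i] = isalnum ((unsigned char) src[i]) ? src[i] : '_';
      dst[len] = '\0';
    }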
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 0f314ccb0e..f0c999cb18 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -37,6 +37,12 @@
(and (match_code "reg")
(match_test "GENERAL_REG_P (op)")))
+;; True for a nonimmediate operand that, if a register, is a GENERAL class register.
+(define_predicate "nonimmediate_gr_operand"
+ (if_then_else (match_code "reg")
+ (match_test "GENERAL_REGNO_P (REGNO (op))")
+ (match_operand 0 "nonimmediate_operand")))
+
;; Return true if OP is a register operand other than an i387 fp register.
(define_predicate "register_and_not_fp_reg_operand"
(and (match_code "reg")
diff --git a/gcc/config/i386/sol2.h b/gcc/config/i386/sol2.h
index 9b725adc9d..ed963f8920 100644
--- a/gcc/config/i386/sol2.h
+++ b/gcc/config/i386/sol2.h
@@ -86,13 +86,10 @@ along with GCC; see the file COPYING3. If not see
#endif
#endif
-#undef ENDFILE_SPEC
-#define ENDFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
- %{mpc32:crtprec32.o%s} \
+#define ENDFILE_ARCH_SPEC \
+ "%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
- %{mpc80:crtprec80.o%s} \
- crtend.o%s crtn.o%s"
+ %{mpc80:crtprec80.o%s}"
#define SUBTARGET_CPU_EXTRA_SPECS \
{ "cpp_subtarget", CPP_SUBTARGET_SPEC }, \
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 58caf1aeb9..dc7f6a7bab 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -375,8 +375,8 @@
[(V16HI "TARGET_AVX2") V8HI
(V8SI "TARGET_AVX2") V4SI])
-(define_mode_iterator VI124_AVX512F
- [(V32QI "TARGET_AVX2") V16QI
+(define_mode_iterator VI124_AVX2_24_AVX512F_1_AVX512BW
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI
(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI])
@@ -392,9 +392,9 @@
[(V8SI "TARGET_AVX2") V4SI
(V4DI "TARGET_AVX2") V2DI])
-(define_mode_iterator VI248_AVX2_8_AVX512F
- [(V16HI "TARGET_AVX2") V8HI
- (V8SI "TARGET_AVX2") V4SI
+(define_mode_iterator VI248_AVX2_8_AVX512F_24_AVX512BW
+ [(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI
+ (V16SI "TARGET_AVX512BW") (V8SI "TARGET_AVX2") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
(define_mode_iterator VI248_AVX512BW_AVX512VL
@@ -410,6 +410,14 @@
[(V16SI "TARGET_AVX512F") V8SI V4SI
(V8DI "TARGET_AVX512F") V4DI V2DI])
+(define_mode_iterator VI48_AVX_AVX512F
+ [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+ (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
+
+(define_mode_iterator VI12_AVX_AVX512F
+ [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
+ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI])
+
(define_mode_iterator V48_AVX2
[V4SF V2DF
V8SF V4DF
@@ -1078,9 +1086,9 @@
/* The DImode arrived in a pair of integral registers (e.g. %edx:%eax).
Assemble the 64-bit DImode value in an xmm register. */
emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode),
- gen_rtx_SUBREG (SImode, operands[1], 0)));
+ gen_lowpart (SImode, operands[1])));
emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode),
- gen_rtx_SUBREG (SImode, operands[1], 4)));
+ gen_highpart (SImode, operands[1])));
emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
operands[2]));
}
@@ -8490,42 +8498,48 @@
(set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")])
(define_insn "vec_dupv2df<mask_name>"
- [(set (match_operand:V2DF 0 "register_operand" "=x,v")
+ [(set (match_operand:V2DF 0 "register_operand" "=x,x,v")
(vec_duplicate:V2DF
- (match_operand:DF 1 "nonimmediate_operand" " 0,vm")))]
+ (match_operand:DF 1 "nonimmediate_operand" " 0,xm,vm")))]
"TARGET_SSE2 && <mask_avx512vl_condition>"
"@
unpcklpd\t%0, %0
- %vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
- [(set_attr "isa" "noavx,sse3")
+ %vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}
+ vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+ [(set_attr "isa" "noavx,sse3,avx512vl")
(set_attr "type" "sselog1")
- (set_attr "prefix" "orig,maybe_vex")
- (set_attr "mode" "V2DF,DF")])
+ (set_attr "prefix" "orig,maybe_vex,evex")
+ (set_attr "mode" "V2DF,DF,DF")])
(define_insn "*vec_concatv2df"
- [(set (match_operand:V2DF 0 "register_operand" "=x,v,v,x,x,v,x,x")
+ [(set (match_operand:V2DF 0 "register_operand" "=x,x,v,x,v,x,x,v,x,x")
(vec_concat:V2DF
- (match_operand:DF 1 "nonimmediate_operand" " 0,v,m,0,x,m,0,0")
- (match_operand:DF 2 "vector_move_operand" " x,v,1,m,m,C,x,m")))]
+ (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,m,0,0")
+ (match_operand:DF 2 "vector_move_operand" " x,x,v,1,1,m,m,C,x,m")))]
"TARGET_SSE"
"@
unpcklpd\t{%2, %0|%0, %2}
vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+ vunpcklpd\t{%2, %1, %0|%0, %1, %2}
%vmovddup\t{%1, %0|%0, %1}
+ vmovddup\t{%1, %0|%0, %1}
movhpd\t{%2, %0|%0, %2}
vmovhpd\t{%2, %1, %0|%0, %1, %2}
%vmovsd\t{%1, %0|%0, %1}
movlhps\t{%2, %0|%0, %2}
movhps\t{%2, %0|%0, %2}"
- [(set_attr "isa" "sse2_noavx,avx,sse3,sse2_noavx,avx,sse2,noavx,noavx")
+ [(set_attr "isa" "sse2_noavx,avx,avx512vl,sse3,avx512vl,sse2_noavx,avx,sse2,noavx,noavx")
(set (attr "type")
(if_then_else
- (eq_attr "alternative" "0,1,2")
+ (eq_attr "alternative" "0,1,2,3,4")
(const_string "sselog")
(const_string "ssemov")))
- (set_attr "prefix_data16" "*,*,*,1,*,*,*,*")
- (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig")
- (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
+ (set (attr "prefix_data16")
+ (if_then_else (eq_attr "alternative" "5")
+ (const_string "1")
+ (const_string "*")))
+ (set_attr "prefix" "orig,vex,evex,maybe_vex,evex,orig,vex,maybe_vex,orig,orig")
+ (set_attr "mode" "V2DF,V2DF,V2DF, DF, DF, V1DF,V1DF,DF,V4SF,V2SF")])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
@@ -8575,7 +8589,7 @@
(match_operand:<avx512fmaskmode> 2 "register_operand")))]
"TARGET_AVX512F")
-(define_insn "*avx512bw_<code>v32hiv32qi2"
+(define_insn "avx512bw_<code>v32hiv32qi2"
[(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
(any_truncate:V32QI
(match_operand:V32HI 1 "register_operand" "v,v")))]
@@ -10911,10 +10925,10 @@
})
(define_insn "<mask_codefor><code><mode>3<mask_name>"
- [(set (match_operand:VI 0 "register_operand" "=x,v")
- (any_logic:VI
- (match_operand:VI 1 "nonimmediate_operand" "%0,v")
- (match_operand:VI 2 "nonimmediate_operand" "xm,vm")))]
+ [(set (match_operand:VI48_AVX_AVX512F 0 "register_operand" "=x,v")
+ (any_logic:VI48_AVX_AVX512F
+ (match_operand:VI48_AVX_AVX512F 1 "nonimmediate_operand" "%0,v")
+ (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand" "xm,vm")))]
"TARGET_SSE && <mask_mode512bit_condition>
&& ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
{
@@ -10943,24 +10957,120 @@
case V4DImode:
case V4SImode:
case V2DImode:
- if (TARGET_AVX512VL)
+ tmp = TARGET_AVX512VL ? "p<logic><ssemodesuffix>" : "p<logic>";
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case MODE_V8SF:
+ gcc_assert (TARGET_AVX);
+ case MODE_V4SF:
+ gcc_assert (TARGET_SSE);
+ gcc_assert (!<mask_applied>);
+ tmp = "<logic>ps";
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ switch (which_alternative)
+ {
+ case 0:
+ if (<mask_applied>)
+ ops = "v%s\t{%%2, %%0, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%0, %%2}";
+ else
+ ops = "%s\t{%%2, %%0|%%0, %%2}";
+ break;
+ case 1:
+ ops = "v%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}";
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ snprintf (buf, sizeof (buf), ops, tmp);
+ return buf;
+}
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sselog")
+ (set (attr "prefix_data16")
+ (if_then_else
+ (and (eq_attr "alternative" "0")
+ (eq_attr "mode" "TI"))
+ (const_string "1")
+ (const_string "*")))
+ (set_attr "prefix" "<mask_prefix3>")
+ (set (attr "mode")
+ (cond [(and (match_test "<MODE_SIZE> == 16")
+ (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+ (const_string "<ssePSmode>")
+ (match_test "TARGET_AVX2")
+ (const_string "<sseinsnmode>")
+ (match_test "TARGET_AVX")
+ (if_then_else
+ (match_test "<MODE_SIZE> > 16")
+ (const_string "V8SF")
+ (const_string "<sseinsnmode>"))
+ (ior (not (match_test "TARGET_SSE2"))
+ (match_test "optimize_function_for_size_p (cfun)"))
+ (const_string "V4SF")
+ ]
+ (const_string "<sseinsnmode>")))])
+
+(define_insn "*<code><mode>3"
+ [(set (match_operand:VI12_AVX_AVX512F 0 "register_operand" "=x,v")
+ (any_logic:VI12_AVX_AVX512F
+ (match_operand:VI12_AVX_AVX512F 1 "nonimmediate_operand" "%0,v")
+ (match_operand:VI12_AVX_AVX512F 2 "nonimmediate_operand" "xm,vm")))]
+ "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+ static char buf[64];
+ const char *ops;
+ const char *tmp;
+ const char *ssesuffix;
+
+ switch (get_attr_mode (insn))
+ {
+ case MODE_XI:
+ gcc_assert (TARGET_AVX512F);
+ case MODE_OI:
+ gcc_assert (TARGET_AVX2 || TARGET_AVX512VL);
+ case MODE_TI:
+ gcc_assert (TARGET_SSE2 || TARGET_AVX512VL);
+ switch (<MODE>mode)
+ {
+ case V64QImode:
+ case V32HImode:
+ if (TARGET_AVX512F)
{
- tmp = "p<logic><ssemodesuffix>";
+ tmp = "p<logic>";
+ ssesuffix = "q";
+ break;
+ }
+ case V32QImode:
+ case V16HImode:
+ case V16QImode:
+ case V8HImode:
+ if (TARGET_AVX512VL || TARGET_AVX2 || TARGET_SSE2)
+ {
+ tmp = "p<logic>";
+ ssesuffix = TARGET_AVX512VL ? "q" : "";
break;
}
default:
- tmp = TARGET_AVX512VL ? "p<logic>q" : "p<logic>";
+ gcc_unreachable ();
}
break;
- case MODE_V16SF:
- gcc_assert (TARGET_AVX512F);
case MODE_V8SF:
gcc_assert (TARGET_AVX);
case MODE_V4SF:
gcc_assert (TARGET_SSE);
-
tmp = "<logic>ps";
+ ssesuffix = "";
break;
default:
@@ -10971,15 +11081,16 @@
{
case 0:
ops = "%s\t{%%2, %%0|%%0, %%2}";
+ snprintf (buf, sizeof (buf), ops, tmp);
break;
case 1:
- ops = "v%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}";
+ ops = "v%s%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+ snprintf (buf, sizeof (buf), ops, tmp, ssesuffix);
break;
default:
gcc_unreachable ();
}
- snprintf (buf, sizeof (buf), ops, tmp);
return buf;
}
[(set_attr "isa" "noavx,avx")
@@ -11060,8 +11171,8 @@
(define_expand "vec_pack_trunc_<mode>"
[(match_operand:<ssepackmode> 0 "register_operand")
- (match_operand:VI248_AVX2_8_AVX512F 1 "register_operand")
- (match_operand:VI248_AVX2_8_AVX512F 2 "register_operand")]
+ (match_operand:VI248_AVX2_8_AVX512F_24_AVX512BW 1 "register_operand")
+ (match_operand:VI248_AVX2_8_AVX512F_24_AVX512BW 2 "register_operand")]
"TARGET_SSE2"
{
rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]);
@@ -12971,25 +13082,25 @@
(define_expand "vec_unpacks_lo_<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
- (match_operand:VI124_AVX512F 1 "register_operand")]
+ (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
"TARGET_SSE2"
"ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;")
(define_expand "vec_unpacks_hi_<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
- (match_operand:VI124_AVX512F 1 "register_operand")]
+ (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
"TARGET_SSE2"
"ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;")
(define_expand "vec_unpacku_lo_<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
- (match_operand:VI124_AVX512F 1 "register_operand")]
+ (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
"TARGET_SSE2"
"ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
(define_expand "vec_unpacku_hi_<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
- (match_operand:VI124_AVX512F 1 "register_operand")]
+ (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
"TARGET_SSE2"
"ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 61a2a81926..59573d40a9 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -105,11 +105,11 @@
[(match_operand:SI 0 "const_int_operand")] ;; model
""
{
- enum memmodel model = (enum memmodel) (INTVAL (operands[0]) & MEMMODEL_MASK);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[0]));
/* Unless this is a SEQ_CST fence, the i386 memory model is strong
enough not to require barriers of any kind. */
- if (model == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (model))
{
rtx (*mfence_insn)(rtx);
rtx mem;
@@ -217,7 +217,7 @@
UNSPEC_STA))]
""
{
- enum memmodel model = (enum memmodel) (INTVAL (operands[2]) & MEMMODEL_MASK);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
if (<MODE>mode == DImode && !TARGET_64BIT)
{
@@ -233,7 +233,7 @@
operands[1] = force_reg (<MODE>mode, operands[1]);
/* For seq-cst stores, when we lack MFENCE, use XCHG. */
- if (model == MEMMODEL_SEQ_CST && !(TARGET_64BIT || TARGET_SSE2))
+ if (is_mm_seq_cst (model) && !(TARGET_64BIT || TARGET_SSE2))
{
emit_insn (gen_atomic_exchange<mode> (gen_reg_rtx (<MODE>mode),
operands[0], operands[1],
@@ -246,7 +246,7 @@
operands[2]));
}
/* ... followed by an MFENCE, if required. */
- if (model == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (model))
emit_insn (gen_mem_thread_fence (operands[2]));
DONE;
})
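In source terms, the case handled here is a seq-cst store compiled for 32-bit x86 without SSE2, where no MFENCE exists and the store is emitted as a locked exchange instead; a minimal sketch:

    int shared;

    void
    store_seq_cst (int v)
    {
      __atomic_store_n (&shared, v, __ATOMIC_SEQ_CST);
    }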
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index 5132d2f94e..21da9e212c 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -2389,10 +2389,12 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
{
case MEMMODEL_ACQ_REL:
case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
emit_insn (gen_memory_barrier ());
/* FALLTHRU */
case MEMMODEL_RELAXED:
case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
case MEMMODEL_CONSUME:
if (mode == SImode)
icode = CODE_FOR_fetchadd_acq_si;
@@ -2400,6 +2402,7 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
icode = CODE_FOR_fetchadd_acq_di;
break;
case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
if (mode == SImode)
icode = CODE_FOR_fetchadd_rel_si;
else
@@ -2426,8 +2429,7 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
front half of the full barrier. The end half is the cmpxchg.rel.
For relaxed and release memory models, we don't need this. But we
also don't bother trying to prevent it either. */
- gcc_assert (model == MEMMODEL_RELAXED
- || model == MEMMODEL_RELEASE
+ gcc_assert (is_mm_relaxed (model) || is_mm_release (model)
|| MEM_VOLATILE_P (mem));
old_reg = gen_reg_rtx (DImode);
@@ -2471,6 +2473,7 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
{
case MEMMODEL_RELAXED:
case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
case MEMMODEL_CONSUME:
switch (mode)
{
@@ -2484,8 +2487,10 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
break;
case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
case MEMMODEL_ACQ_REL:
case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
switch (mode)
{
case QImode: icode = CODE_FOR_cmpxchg_rel_qi; break;
diff --git a/gcc/config/ia64/sync.md b/gcc/config/ia64/sync.md
index 75d746d74c..9c178b826b 100644
--- a/gcc/config/ia64/sync.md
+++ b/gcc/config/ia64/sync.md
@@ -33,7 +33,7 @@
[(match_operand:SI 0 "const_int_operand" "")] ;; model
""
{
- if (INTVAL (operands[0]) == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (memmodel_from_int (INTVAL (operands[0]))))
emit_insn (gen_memory_barrier ());
DONE;
})
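Only the seq-cst model reaches gen_memory_barrier here; a minimal sketch of source that does:

    void
    full_fence (void)
    {
      __atomic_thread_fence (__ATOMIC_SEQ_CST);  /* expands to the barrier above */
    }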
@@ -60,11 +60,11 @@
(match_operand:SI 2 "const_int_operand" "")] ;; model
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
/* Unless the memory model is relaxed, we want to emit ld.acq, which
will happen automatically for volatile memories. */
- gcc_assert (model == MEMMODEL_RELAXED || MEM_VOLATILE_P (operands[1]));
+ gcc_assert (is_mm_relaxed (model) || MEM_VOLATILE_P (operands[1]));
emit_move_insn (operands[0], operands[1]);
DONE;
})
@@ -75,17 +75,17 @@
(match_operand:SI 2 "const_int_operand" "")] ;; model
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
/* Unless the memory model is relaxed, we want to emit st.rel, which
will happen automatically for volatile memories. */
- gcc_assert (model == MEMMODEL_RELAXED || MEM_VOLATILE_P (operands[0]));
+ gcc_assert (is_mm_relaxed (model) || MEM_VOLATILE_P (operands[0]));
emit_move_insn (operands[0], operands[1]);
/* Sequentially consistent stores need a subsequent MF. See
http://www.decadent.org.uk/pipermail/cpp-threads/2008-December/001952.html
for a discussion of why an MF is needed here, but not for atomic_load. */
- if (model == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (model))
emit_insn (gen_memory_barrier ());
DONE;
})
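The cited discussion reduces to the store-buffering litmus test; a hedged C11 sketch (thread functions hypothetical, run concurrently):

    int x, y;

    /* Seq-cst forbids both threads returning 0.  Without the MF after
       st.rel, the store could still sit in the store buffer when the
       other thread's load executes, allowing that outcome.  */
    int t1 (void) { __atomic_store_n (&x, 1, __ATOMIC_SEQ_CST);
                    return __atomic_load_n (&y, __ATOMIC_SEQ_CST); }
    int t2 (void) { __atomic_store_n (&y, 1, __ATOMIC_SEQ_CST);
                    return __atomic_load_n (&x, __ATOMIC_SEQ_CST); }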
@@ -101,7 +101,8 @@
(match_operand:SI 7 "const_int_operand" "")] ;; fail model
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[6]);
+ /* No need to distinguish __sync from __atomic, so get base value. */
+ enum memmodel model = memmodel_base (INTVAL (operands[6]));
rtx ccv = gen_rtx_REG (DImode, AR_CCV_REGNUM);
rtx dval, eval;
@@ -200,7 +201,8 @@
(match_operand:SI 3 "const_int_operand" "")] ;; succ model
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[3]);
+ /* No need to distinguish __sync from __atomic, so get base value. */
+ enum memmodel model = memmodel_base (INTVAL (operands[3]));
switch (model)
{
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 1733457e73..67ec6b9709 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -13111,7 +13111,7 @@ mips_process_sync_loop (rtx_insn *insn, rtx *operands)
model = MEMMODEL_ACQUIRE;
break;
default:
- model = (enum memmodel) INTVAL (operands[memmodel_attr]);
+ model = memmodel_from_int (INTVAL (operands[memmodel_attr]));
}
mips_multi_start ();
diff --git a/gcc/config/nios2/nios2.c b/gcc/config/nios2/nios2.c
index 848cc51353..7b6b389989 100644
--- a/gcc/config/nios2/nios2.c
+++ b/gcc/config/nios2/nios2.c
@@ -1894,15 +1894,15 @@ nios2_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
Which will be output as '%tls_le(var+48)(r23)' in assembly. */
if (GET_CODE (x) == PLUS
- && GET_CODE (XEXP (x, 0)) == REG
&& GET_CODE (XEXP (x, 1)) == CONST)
{
- rtx unspec, offset, reg = XEXP (x, 0);
+ rtx unspec, offset;
split_const (XEXP (x, 1), &unspec, &offset);
if (GET_CODE (unspec) == UNSPEC
&& !nios2_large_offset_p (XINT (unspec, 1))
&& offset != const0_rtx)
{
+ rtx reg = force_reg (Pmode, XEXP (x, 0));
unspec = copy_rtx (unspec);
XVECEXP (unspec, 0, 0)
= plus_constant (Pmode, XVECEXP (unspec, 0, 0), INTVAL (offset));
diff --git a/gcc/config/pa/pa-linux.h b/gcc/config/pa/pa-linux.h
index f8da185688..957a274249 100644
--- a/gcc/config/pa/pa-linux.h
+++ b/gcc/config/pa/pa-linux.h
@@ -140,3 +140,4 @@ along with GCC; see the file COPYING3. If not see
#define HAVE_sync_compare_and_swapqi 1
#define HAVE_sync_compare_and_swaphi 1
#define HAVE_sync_compare_and_swapsi 1
+#define HAVE_sync_compare_and_swapdi 1
diff --git a/gcc/config/pa/pa-protos.h b/gcc/config/pa/pa-protos.h
index 4a44dab671..3dfa6fb640 100644
--- a/gcc/config/pa/pa-protos.h
+++ b/gcc/config/pa/pa-protos.h
@@ -79,6 +79,7 @@ extern enum direction pa_function_arg_padding (machine_mode, const_tree);
#endif /* ARGS_SIZE_RTX */
extern int pa_insn_refs_are_delayed (rtx_insn *);
extern rtx pa_get_deferred_plabel (rtx);
+extern rtx pa_maybe_emit_compare_and_swap_exchange_loop (rtx, rtx, rtx);
#endif /* RTX_CODE */
extern int pa_and_mask_p (unsigned HOST_WIDE_INT);
diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c
index 49af050b9f..219560c6d3 100644
--- a/gcc/config/pa/pa.c
+++ b/gcc/config/pa/pa.c
@@ -2443,6 +2443,7 @@ pa_output_move_double (rtx *operands)
enum { REGOP, OFFSOP, MEMOP, CNSTOP, RNDOP } optype0, optype1;
rtx latehalf[2];
rtx addreg0 = 0, addreg1 = 0;
+ int highonly = 0;
/* First classify both operands. */
@@ -2653,7 +2654,14 @@ pa_output_move_double (rtx *operands)
else if (optype1 == OFFSOP)
latehalf[1] = adjust_address_nv (operands[1], SImode, 4);
else if (optype1 == CNSTOP)
- split_double (operands[1], &operands[1], &latehalf[1]);
+ {
+ if (GET_CODE (operands[1]) == HIGH)
+ {
+ operands[1] = XEXP (operands[1], 0);
+ highonly = 1;
+ }
+ split_double (operands[1], &operands[1], &latehalf[1]);
+ }
else
latehalf[1] = operands[1];
@@ -2706,8 +2714,11 @@ pa_output_move_double (rtx *operands)
if (addreg1)
output_asm_insn ("ldo 4(%0),%0", &addreg1);
- /* Do that word. */
- output_asm_insn (pa_singlemove_string (latehalf), latehalf);
+ /* Do high-numbered word. */
+ if (highonly)
+ output_asm_insn ("ldil L'%1,%0", latehalf);
+ else
+ output_asm_insn (pa_singlemove_string (latehalf), latehalf);
/* Undo the adds we just did. */
if (addreg0)
@@ -5712,7 +5723,7 @@ pa_init_libfuncs (void)
}
if (TARGET_SYNC_LIBCALL)
- init_sync_libfuncs (UNITS_PER_WORD);
+ init_sync_libfuncs (8);
}
/* HP's millicode routines mean something special to the assembler.
@@ -8473,14 +8484,6 @@ pa_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
if (TARGET_PORTABLE_RUNTIME)
return false;
- /* Sibcalls are ok for TARGET_ELF32 as along as the linker is used in
- single subspace mode and the call is not indirect. As far as I know,
- there is no operating system support for the multiple subspace mode.
- It might be possible to support indirect calls if we didn't use
- $$dyncall (see the indirect sequence generated in pa_output_call). */
- if (TARGET_ELF32)
- return (decl != NULL_TREE);
-
/* Sibcalls are not ok because the arg pointer register is not a fixed
register. This prevents the sibcall optimization from occurring. In
addition, there are problems with stub placement using GNU ld. This
@@ -10515,4 +10518,79 @@ pa_output_addr_diff_vec (rtx lab, rtx body)
fputs ("\t.end_brtab\n", asm_out_file);
}
+/* This is a helper function for the other atomic operations. This function
+ emits a loop that contains SEQ that iterates until a compare-and-swap
+ operation at the end succeeds. MEM is the memory to be modified. SEQ is
+ a set of instructions that takes a value from OLD_REG as an input and
+ produces a value in NEW_REG as an output. Before SEQ, OLD_REG will be
+ set to the current contents of MEM. After SEQ, a compare-and-swap will
+ attempt to update MEM with NEW_REG. The function returns true when the
+ loop was generated successfully. */
+
+static bool
+pa_expand_compare_and_swap_loop (rtx mem, rtx old_reg, rtx new_reg, rtx seq)
+{
+ machine_mode mode = GET_MODE (mem);
+ rtx_code_label *label;
+ rtx cmp_reg, success, oldval;
+
+ /* The loop we want to generate looks like
+
+ cmp_reg = mem;
+ label:
+ old_reg = cmp_reg;
+ seq;
+ (success, cmp_reg) = compare-and-swap(mem, old_reg, new_reg)
+ if (success)
+ goto label;
+
+ Note that we only do the plain load from memory once. Subsequent
+ iterations use the value loaded by the compare-and-swap pattern. */
+
+ label = gen_label_rtx ();
+ cmp_reg = gen_reg_rtx (mode);
+
+ emit_move_insn (cmp_reg, mem);
+ emit_label (label);
+ emit_move_insn (old_reg, cmp_reg);
+ if (seq)
+ emit_insn (seq);
+
+ success = NULL_RTX;
+ oldval = cmp_reg;
+ if (!expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
+ new_reg, false, MEMMODEL_SYNC_SEQ_CST,
+ MEMMODEL_RELAXED))
+ return false;
+
+ if (oldval != cmp_reg)
+ emit_move_insn (cmp_reg, oldval);
+
+ /* Mark this jump predicted not taken. */
+ emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
+ GET_MODE (success), 1, label, 0);
+ return true;
+}
+
+/* This function tries to implement an atomic exchange operation using a
+ compare_and_swap loop. VAL is written to *MEM. The previous contents of
+ *MEM are returned, using TARGET if possible. No memory model is required
+ since a compare_and_swap loop is seq-cst. */
+
+rtx
+pa_maybe_emit_compare_and_swap_exchange_loop (rtx target, rtx mem, rtx val)
+{
+ machine_mode mode = GET_MODE (mem);
+
+ if (can_compare_and_swap_p (mode, true))
+ {
+ if (!target || !register_operand (target, mode))
+ target = gen_reg_rtx (mode);
+ if (pa_expand_compare_and_swap_loop (mem, target, val, NULL_RTX))
+ return target;
+ }
+
+ return NULL_RTX;
+}
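A source-level analogue of the RTL loop built above (a sketch; the expander emits the compare-and-swap pattern directly rather than calling a builtin):

    static int
    exchange_via_cas (int *mem, int val)
    {
      int old = *mem;
      /* On failure the builtin refreshes OLD with the observed value,
         mirroring the cmp_reg update in the RTL loop.  */
      while (!__atomic_compare_exchange_n (mem, &old, val, 0,
                                           __ATOMIC_SEQ_CST,
                                           __ATOMIC_RELAXED))
        ;
      return old;
    }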
+
#include "gt-pa.h"
diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md
index 158b789677..271a72e837 100644
--- a/gcc/config/pa/pa.md
+++ b/gcc/config/pa/pa.md
@@ -699,59 +699,229 @@
;; doubleword loads and stores are not guaranteed to be atomic
;; when referencing the I/O address space.
-;; Implement atomic DImode load using 64-bit floating point load and copy.
+;; The kernel cmpxchg operation on Linux is not atomic with respect to
+;; memory stores on SMP machines, so we must do stores using a cmpxchg
+;; operation.
+
+;; Implement atomic QImode store using exchange.
+
+(define_expand "atomic_storeqi"
+ [(match_operand:QI 0 "memory_operand") ;; memory
+ (match_operand:QI 1 "register_operand") ;; val in
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ ""
+{
+ if (TARGET_SYNC_LIBCALL)
+ {
+ rtx mem = operands[0];
+ rtx val = operands[1];
+ if (pa_maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val))
+ DONE;
+ }
+ FAIL;
+})
+
+;; Implement atomic HImode stores using exchange.
+
+(define_expand "atomic_storehi"
+ [(match_operand:HI 0 "memory_operand") ;; memory
+ (match_operand:HI 1 "register_operand") ;; val in
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ ""
+{
+ if (TARGET_SYNC_LIBCALL)
+ {
+ rtx mem = operands[0];
+ rtx val = operands[1];
+ if (pa_maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val))
+ DONE;
+ }
+ FAIL;
+})
+
+;; Implement atomic SImode store using exchange.
+
+(define_expand "atomic_storesi"
+ [(match_operand:SI 0 "memory_operand") ;; memory
+ (match_operand:SI 1 "register_operand") ;; val in
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ ""
+{
+ if (TARGET_SYNC_LIBCALL)
+ {
+ rtx mem = operands[0];
+ rtx val = operands[1];
+ if (pa_maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val))
+ DONE;
+ }
+ FAIL;
+})
+
+;; Implement atomic SFmode store using exchange.
+
+(define_expand "atomic_storesf"
+ [(match_operand:SF 0 "memory_operand") ;; memory
+ (match_operand:SF 1 "register_operand") ;; val in
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ ""
+{
+ if (TARGET_SYNC_LIBCALL)
+ {
+ rtx mem = operands[0];
+ rtx val = operands[1];
+ if (pa_maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val))
+ DONE;
+ }
+ FAIL;
+})
+
+;; Implement atomic DImode load using 64-bit floating point load.
(define_expand "atomic_loaddi"
[(match_operand:DI 0 "register_operand") ;; val out
(match_operand:DI 1 "memory_operand") ;; memory
(match_operand:SI 2 "const_int_operand")] ;; model
- "!TARGET_64BIT && !TARGET_SOFT_FLOAT"
+ ""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model;
+
+ if (TARGET_64BIT || TARGET_SOFT_FLOAT)
+ FAIL;
+
+ model = memmodel_from_int (INTVAL (operands[2]));
operands[1] = force_reg (SImode, XEXP (operands[1], 0));
- operands[2] = gen_reg_rtx (DImode);
expand_mem_thread_fence (model);
- emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1], operands[2]));
- if ((model & MEMMODEL_MASK) == MEMMODEL_SEQ_CST)
+ emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1]));
+ if (is_mm_seq_cst (model))
expand_mem_thread_fence (model);
DONE;
})
(define_insn "atomic_loaddi_1"
- [(set (match_operand:DI 0 "register_operand" "=r")
- (mem:DI (match_operand:SI 1 "register_operand" "r")))
- (clobber (match_operand:DI 2 "register_operand" "=&f"))]
+ [(set (match_operand:DI 0 "register_operand" "=f,r")
+ (mem:DI (match_operand:SI 1 "register_operand" "r,r")))
+ (clobber (match_scratch:DI 2 "=X,f"))]
"!TARGET_64BIT && !TARGET_SOFT_FLOAT"
- "{fldds|fldd} 0(%1),%2\;{fstds|fstd} %2,-16(%%sp)\;{ldws|ldw} -16(%%sp),%0\;{ldws|ldw} -12(%%sp),%R0"
- [(set_attr "type" "move")
- (set_attr "length" "16")])
+ "@
+ {fldds|fldd} 0(%1),%0
+ {fldds|fldd} 0(%1),%2\n\t{fstds|fstd} %2,-16(%%sp)\n\t{ldws|ldw} -16(%%sp),%0\n\t{ldws|ldw} -12(%%sp),%R0"
+ [(set_attr "type" "move,move")
+ (set_attr "length" "4,16")])
-;; Implement atomic DImode store using copy and 64-bit floating point store.
+;; Implement atomic DImode store.
(define_expand "atomic_storedi"
[(match_operand:DI 0 "memory_operand") ;; memory
(match_operand:DI 1 "register_operand") ;; val in
(match_operand:SI 2 "const_int_operand")] ;; model
- "!TARGET_64BIT && !TARGET_SOFT_FLOAT"
+ ""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model;
+
+ if (TARGET_SYNC_LIBCALL)
+ {
+ rtx mem = operands[0];
+ rtx val = operands[1];
+ if (pa_maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val))
+ DONE;
+ }
+
+ if (TARGET_64BIT || TARGET_SOFT_FLOAT)
+ FAIL;
+
+ model = memmodel_from_int (INTVAL (operands[2]));
operands[0] = force_reg (SImode, XEXP (operands[0], 0));
- operands[2] = gen_reg_rtx (DImode);
expand_mem_thread_fence (model);
- emit_insn (gen_atomic_storedi_1 (operands[0], operands[1], operands[2]));
- if ((model & MEMMODEL_MASK) == MEMMODEL_SEQ_CST)
+ emit_insn (gen_atomic_storedi_1 (operands[0], operands[1]));
+ if (is_mm_seq_cst (model))
expand_mem_thread_fence (model);
DONE;
})
(define_insn "atomic_storedi_1"
- [(set (mem:DI (match_operand:SI 0 "register_operand" "r"))
- (match_operand:DI 1 "register_operand" "r"))
- (clobber (match_operand:DI 2 "register_operand" "=&f"))]
+ [(set (mem:DI (match_operand:SI 0 "register_operand" "r,r"))
+ (match_operand:DI 1 "register_operand" "f,r"))
+ (clobber (match_scratch:DI 2 "=X,f"))]
+ "!TARGET_64BIT && !TARGET_SOFT_FLOAT && !TARGET_SYNC_LIBCALL"
+ "@
+ {fstds|fstd} %1,0(%0)
+ {stws|stw} %1,-16(%%sp)\n\t{stws|stw} %R1,-12(%%sp)\n\t{fldds|fldd} -16(%%sp),%2\n\t{fstds|fstd} %2,0(%0)"
+ [(set_attr "type" "move,move")
+ (set_attr "length" "4,16")])
+
+;; Implement atomic DFmode load using 64-bit floating point load.
+
+(define_expand "atomic_loaddf"
+ [(match_operand:DF 0 "register_operand") ;; val out
+ (match_operand:DF 1 "memory_operand") ;; memory
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ ""
+{
+ enum memmodel model;
+
+ if (TARGET_64BIT || TARGET_SOFT_FLOAT)
+ FAIL;
+
+ model = memmodel_from_int (INTVAL (operands[2]));
+ operands[1] = force_reg (SImode, XEXP (operands[1], 0));
+ expand_mem_thread_fence (model);
+ emit_insn (gen_atomic_loaddf_1 (operands[0], operands[1]));
+ if (is_mm_seq_cst (model))
+ expand_mem_thread_fence (model);
+ DONE;
+})
+
+(define_insn "atomic_loaddf_1"
+ [(set (match_operand:DF 0 "register_operand" "=f,r")
+ (mem:DF (match_operand:SI 1 "register_operand" "r,r")))
+ (clobber (match_scratch:DF 2 "=X,f"))]
"!TARGET_64BIT && !TARGET_SOFT_FLOAT"
- "{stws|stw} %1,-16(%%sp)\;{stws|stw} %R1,-12(%%sp)\;{fldds|fldd} -16(%%sp),%2\;{fstds|fstd} %2,0(%0)"
- [(set_attr "type" "move")
- (set_attr "length" "16")])
+ "@
+ {fldds|fldd} 0(%1),%0
+ {fldds|fldd} 0(%1),%2\n\t{fstds|fstd} %2,-16(%%sp)\n\t{ldws|ldw} -16(%%sp),%0\n\t{ldws|ldw} -12(%%sp),%R0"
+ [(set_attr "type" "move,move")
+ (set_attr "length" "4,16")])
+
+;; Implement atomic DFmode store using 64-bit floating point store.
+
+(define_expand "atomic_storedf"
+ [(match_operand:DF 0 "memory_operand") ;; memory
+ (match_operand:DF 1 "register_operand") ;; val in
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ ""
+{
+ enum memmodel model;
+
+ if (TARGET_SYNC_LIBCALL)
+ {
+ rtx mem = operands[0];
+ rtx val = operands[1];
+ if (pa_maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val))
+ DONE;
+ }
+
+ if (TARGET_64BIT || TARGET_SOFT_FLOAT)
+ FAIL;
+
+ model = memmodel_from_int (INTVAL (operands[2]));
+ operands[0] = force_reg (SImode, XEXP (operands[0], 0));
+ expand_mem_thread_fence (model);
+ emit_insn (gen_atomic_storedf_1 (operands[0], operands[1]));
+ if (is_mm_seq_cst (model))
+ expand_mem_thread_fence (model);
+ DONE;
+})
+
+(define_insn "atomic_storedf_1"
+ [(set (mem:DF (match_operand:SI 0 "register_operand" "r,r"))
+ (match_operand:DF 1 "register_operand" "f,r"))
+ (clobber (match_scratch:DF 2 "=X,f"))]
+ "!TARGET_64BIT && !TARGET_SOFT_FLOAT"
+ "@
+ {fstds|fstd} %1,0(%0)
+ {stws|stw} %1,-16(%%sp)\n\t{stws|stw} %R1,-12(%%sp)\n\t{fldds|fldd} -16(%%sp),%2\n\t{fstds|fstd} %2,0(%0)"
+ [(set_attr "type" "move,move")
+ (set_attr "length" "4,16")])
;; Compare instructions.
;; This controls RTL generation and register allocation.
@@ -7516,7 +7686,6 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
(define_insn "call_reg_64bit"
[(call (mem:SI (match_operand:DI 0 "register_operand" "r"))
(match_operand 1 "" "i"))
- (clobber (reg:DI 1))
(clobber (reg:DI 2))
(clobber (match_operand 2))
(use (reg:DI 27))
@@ -7537,7 +7706,6 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
(define_split
[(parallel [(call (mem:SI (match_operand 0 "register_operand" ""))
(match_operand 1 "" ""))
- (clobber (reg:DI 1))
(clobber (reg:DI 2))
(clobber (match_operand 2))
(use (reg:DI 27))
@@ -7548,7 +7716,6 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
[(set (match_dup 2) (reg:DI 27))
(parallel [(call (mem:SI (match_dup 0))
(match_dup 1))
- (clobber (reg:DI 1))
(clobber (reg:DI 2))
(use (reg:DI 27))
(use (reg:DI 29))
@@ -7558,7 +7725,6 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
(define_split
[(parallel [(call (mem:SI (match_operand 0 "register_operand" ""))
(match_operand 1 "" ""))
- (clobber (reg:DI 1))
(clobber (reg:DI 2))
(clobber (match_operand 2))
(use (reg:DI 27))
@@ -7568,7 +7734,6 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
[(set (match_dup 2) (reg:DI 27))
(parallel [(call (mem:SI (match_dup 0))
(match_dup 1))
- (clobber (reg:DI 1))
(clobber (reg:DI 2))
(use (reg:DI 27))
(use (reg:DI 29))
@@ -7579,7 +7744,6 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
(define_insn "*call_reg_64bit_post_reload"
[(call (mem:SI (match_operand:DI 0 "register_operand" "r"))
(match_operand 1 "" "i"))
- (clobber (reg:DI 1))
(clobber (reg:DI 2))
(use (reg:DI 27))
(use (reg:DI 29))
diff --git a/gcc/config/rs6000/freebsd64.h b/gcc/config/rs6000/freebsd64.h
index 84066f60e1..9fc929dff5 100644
--- a/gcc/config/rs6000/freebsd64.h
+++ b/gcc/config/rs6000/freebsd64.h
@@ -130,7 +130,7 @@ extern int dot_symbols;
#define LINK_OS_FREEBSD_SPEC "%{m32:%(link_os_freebsd_spec32)}%{!m32:%(link_os_freebsd_spec64)}"
#define ASM_SPEC32 "-a32 \
-%{mrelocatable} %{mrelocatable-lib} %{fpic:-K PIC} %{fPIC:-K PIC} \
+%{mrelocatable} %{mrelocatable-lib} %{fpic|fpie|fPIC|fPIE:-K PIC} \
%{memb} %{!memb: %{msdata=eabi: -memb}} \
%{!mlittle: %{!mlittle-endian: %{!mbig: %{!mbig-endian: \
%{mcall-freebsd: -mbig} \
diff --git a/gcc/config/rs6000/htm.md b/gcc/config/rs6000/htm.md
index dbfd0db596..098723f630 100644
--- a/gcc/config/rs6000/htm.md
+++ b/gcc/config/rs6000/htm.md
@@ -27,6 +27,14 @@
])
;;
+;; UNSPEC usage
+;;
+
+(define_c_enum "unspec"
+ [UNSPEC_HTM_FENCE
+ ])
+
+;;
;; UNSPEC_VOLATILE usage
;;
@@ -45,96 +53,223 @@
UNSPECV_HTM_MTSPR
])
+(define_expand "tabort"
+ [(parallel
+ [(set (match_operand:CC 1 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(match_operand:SI 0 "base_reg_operand" "b")]
+ UNSPECV_HTM_TABORT))
+ (set (match_dup 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[2] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[2]) = 1;
+})
-(define_insn "tabort"
+(define_insn "*tabort"
[(set (match_operand:CC 1 "cc_reg_operand" "=x")
- (unspec_volatile:CC [(match_operand:SI 0 "gpc_reg_operand" "r")]
- UNSPECV_HTM_TABORT))]
+ (unspec_volatile:CC [(match_operand:SI 0 "base_reg_operand" "b")]
+ UNSPECV_HTM_TABORT))
+ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabort. %0"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "tabort<wd>c"
+(define_expand "tabort<wd>c"
+ [(parallel
+ [(set (match_operand:CC 3 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(match_operand 0 "u5bit_cint_operand" "n")
+ (match_operand:GPR 1 "gpc_reg_operand" "r")
+ (match_operand:GPR 2 "gpc_reg_operand" "r")]
+ UNSPECV_HTM_TABORTXC))
+ (set (match_dup 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[4] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[4]) = 1;
+})
+
+(define_insn "*tabort<wd>c"
[(set (match_operand:CC 3 "cc_reg_operand" "=x")
(unspec_volatile:CC [(match_operand 0 "u5bit_cint_operand" "n")
(match_operand:GPR 1 "gpc_reg_operand" "r")
(match_operand:GPR 2 "gpc_reg_operand" "r")]
- UNSPECV_HTM_TABORTXC))]
+ UNSPECV_HTM_TABORTXC))
+ (set (match_operand:BLK 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabort<wd>c. %0,%1,%2"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "tabort<wd>ci"
+(define_expand "tabort<wd>ci"
+ [(parallel
+ [(set (match_operand:CC 3 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(match_operand 0 "u5bit_cint_operand" "n")
+ (match_operand:GPR 1 "gpc_reg_operand" "r")
+ (match_operand 2 "s5bit_cint_operand" "n")]
+ UNSPECV_HTM_TABORTXCI))
+ (set (match_dup 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[4] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[4]) = 1;
+})
+
+(define_insn "*tabort<wd>ci"
[(set (match_operand:CC 3 "cc_reg_operand" "=x")
(unspec_volatile:CC [(match_operand 0 "u5bit_cint_operand" "n")
(match_operand:GPR 1 "gpc_reg_operand" "r")
(match_operand 2 "s5bit_cint_operand" "n")]
- UNSPECV_HTM_TABORTXCI))]
+ UNSPECV_HTM_TABORTXCI))
+ (set (match_operand:BLK 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabort<wd>ci. %0,%1,%2"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "tbegin"
+(define_expand "tbegin"
+ [(parallel
+ [(set (match_operand:CC 1 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(match_operand 0 "const_0_to_1_operand" "n")]
+ UNSPECV_HTM_TBEGIN))
+ (set (match_dup 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[2] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[2]) = 1;
+})
+
+(define_insn "*tbegin"
[(set (match_operand:CC 1 "cc_reg_operand" "=x")
(unspec_volatile:CC [(match_operand 0 "const_0_to_1_operand" "n")]
- UNSPECV_HTM_TBEGIN))]
+ UNSPECV_HTM_TBEGIN))
+ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tbegin. %0"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "tcheck"
+(define_expand "tcheck"
+ [(parallel
+ [(set (match_operand:CC 0 "cc_reg_operand" "=y")
+ (unspec_volatile:CC [(const_int 0)] UNSPECV_HTM_TCHECK))
+ (set (match_dup 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[1] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[1]) = 1;
+})
+
+(define_insn "*tcheck"
[(set (match_operand:CC 0 "cc_reg_operand" "=y")
- (unspec_volatile:CC [(const_int 0)]
- UNSPECV_HTM_TCHECK))]
+ (unspec_volatile:CC [(const_int 0)] UNSPECV_HTM_TCHECK))
+ (set (match_operand:BLK 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tcheck %0"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "tend"
+(define_expand "tend"
+ [(parallel
+ [(set (match_operand:CC 1 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(match_operand 0 "const_0_to_1_operand" "n")]
+ UNSPECV_HTM_TEND))
+ (set (match_dup 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[2] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[2]) = 1;
+})
+
+(define_insn "*tend"
[(set (match_operand:CC 1 "cc_reg_operand" "=x")
(unspec_volatile:CC [(match_operand 0 "const_0_to_1_operand" "n")]
- UNSPECV_HTM_TEND))]
+ UNSPECV_HTM_TEND))
+ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tend. %0"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "trechkpt"
+(define_expand "trechkpt"
+ [(parallel
+ [(set (match_operand:CC 0 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(const_int 0)] UNSPECV_HTM_TRECHKPT))
+ (set (match_dup 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[1] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[1]) = 1;
+})
+
+(define_insn "*trechkpt"
[(set (match_operand:CC 0 "cc_reg_operand" "=x")
- (unspec_volatile:CC [(const_int 0)]
- UNSPECV_HTM_TRECHKPT))]
+ (unspec_volatile:CC [(const_int 0)] UNSPECV_HTM_TRECHKPT))
+ (set (match_operand:BLK 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"trechkpt."
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "treclaim"
+(define_expand "treclaim"
+ [(parallel
+ [(set (match_operand:CC 1 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(match_operand:SI 0 "gpc_reg_operand" "r")]
+ UNSPECV_HTM_TRECLAIM))
+ (set (match_dup 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[2] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[2]) = 1;
+})
+
+(define_insn "*treclaim"
[(set (match_operand:CC 1 "cc_reg_operand" "=x")
(unspec_volatile:CC [(match_operand:SI 0 "gpc_reg_operand" "r")]
- UNSPECV_HTM_TRECLAIM))]
+ UNSPECV_HTM_TRECLAIM))
+ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"treclaim. %0"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "tsr"
+(define_expand "tsr"
+ [(parallel
+ [(set (match_operand:CC 1 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(match_operand 0 "const_0_to_1_operand" "n")]
+ UNSPECV_HTM_TSR))
+ (set (match_dup 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[2] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[2]) = 1;
+})
+
+(define_insn "*tsr"
[(set (match_operand:CC 1 "cc_reg_operand" "=x")
(unspec_volatile:CC [(match_operand 0 "const_0_to_1_operand" "n")]
- UNSPECV_HTM_TSR))]
+ UNSPECV_HTM_TSR))
+ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tsr. %0"
[(set_attr "type" "htm")
(set_attr "length" "4")])
-(define_insn "ttest"
+(define_expand "ttest"
+ [(parallel
+ [(set (match_operand:CC 0 "cc_reg_operand" "=x")
+ (unspec_volatile:CC [(const_int 0)] UNSPECV_HTM_TTEST))
+ (set (match_dup 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))])]
+ "TARGET_HTM"
+{
+ operands[1] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[1]) = 1;
+})
+
+(define_insn "*ttest"
[(set (match_operand:CC 0 "cc_reg_operand" "=x")
- (unspec_volatile:CC [(const_int 0)]
- UNSPECV_HTM_TTEST))]
+ (unspec_volatile:CC [(const_int 0)] UNSPECV_HTM_TTEST))
+ (set (match_operand:BLK 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))]
"TARGET_HTM"
"tabortwci. 0,1,0"
[(set_attr "type" "htm")
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 887263c5d0..3a23dfe031 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -1048,12 +1048,12 @@
(define_predicate "current_file_function_operand"
(and (match_code "symbol_ref")
(match_test "(DEFAULT_ABI != ABI_AIX || SYMBOL_REF_FUNCTION_P (op))
- && ((SYMBOL_REF_LOCAL_P (op)
- && ((DEFAULT_ABI != ABI_AIX
- && DEFAULT_ABI != ABI_ELFv2)
- || !SYMBOL_REF_EXTERNAL_P (op)))
- || (op == XEXP (DECL_RTL (current_function_decl),
- 0)))")))
+ && (SYMBOL_REF_LOCAL_P (op)
+ || op == XEXP (DECL_RTL (current_function_decl), 0))
+ && !((DEFAULT_ABI == ABI_AIX
+ || DEFAULT_ABI == ABI_ELFv2)
+ && (SYMBOL_REF_EXTERNAL_P (op)
+ || SYMBOL_REF_WEAK (op)))")))
;; Return 1 if this operand is a valid input for a move insn.
(define_predicate "input_operand"
diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 9f22b4c428..06afab3e3c 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -381,7 +381,11 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags,
if ((flags & OPTION_MASK_VSX) != 0)
rs6000_define_or_undefine_macro (define_p, "__VSX__");
if ((flags & OPTION_MASK_HTM) != 0)
- rs6000_define_or_undefine_macro (define_p, "__HTM__");
+ {
+ rs6000_define_or_undefine_macro (define_p, "__HTM__");
+ /* Tell the user that our HTM insn patterns act as memory barriers. */
+ rs6000_define_or_undefine_macro (define_p, "__TM_FENCE__");
+ }
if ((flags & OPTION_MASK_P8_VECTOR) != 0)
rs6000_define_or_undefine_macro (define_p, "__POWER8_VECTOR__");
if ((flags & OPTION_MASK_QUAD_MEMORY) != 0)
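For illustration, a minimal user-side sketch of what the new __TM_FENCE__ macro enables (assumes code built with -mhtm; the fallback asm barrier is an assumption about older compilers, not part of this change):

#include <htmintrin.h>

long
counter_add (long *counter, long value)
{
  if (__builtin_tbegin (0))
    {
#ifndef __TM_FENCE__
      /* Assumed fallback: if the HTM builtins are not guaranteed to be
         barriers, stop the compiler from moving the update out of the
         transaction.  */
      __asm__ volatile ("" ::: "memory");
#endif
      *counter += value;
      __builtin_tend (0);
      return 1;
    }
  return 0;
}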
diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def
index 9fd565286f..03764aef74 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -53,6 +53,7 @@
| OPTION_MASK_P8_VECTOR \
| OPTION_MASK_CRYPTO \
| OPTION_MASK_DIRECT_MOVE \
+ | OPTION_MASK_EFFICIENT_UNALIGNED_VSX \
| OPTION_MASK_HTM \
| OPTION_MASK_QUAD_MEMORY \
| OPTION_MASK_QUAD_MEMORY_ATOMIC \
@@ -78,6 +79,7 @@
| OPTION_MASK_DFP \
| OPTION_MASK_DIRECT_MOVE \
| OPTION_MASK_DLMZB \
+ | OPTION_MASK_EFFICIENT_UNALIGNED_VSX \
| OPTION_MASK_FPRND \
| OPTION_MASK_HTM \
| OPTION_MASK_ISEL \
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 97c5842f49..f5c2d422a8 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -3692,6 +3692,45 @@ rs6000_option_override_internal (bool global_init_p)
&& optimize >= 3)
rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN;
+  /* Set -mallow-movmisalign explicitly on if we have full ISA 2.07
+     support.  If we only have ISA 2.06 support and the user did not specify
+     the switch, leave it set to -1 so the movmisalign patterns are enabled,
+     but we don't enable the full vectorization support.  */
+ if (TARGET_ALLOW_MOVMISALIGN == -1 && TARGET_P8_VECTOR && TARGET_DIRECT_MOVE)
+ TARGET_ALLOW_MOVMISALIGN = 1;
+
+ else if (TARGET_ALLOW_MOVMISALIGN && !TARGET_VSX)
+ {
+ if (TARGET_ALLOW_MOVMISALIGN > 0)
+ error ("-mallow-movmisalign requires -mvsx");
+
+ TARGET_ALLOW_MOVMISALIGN = 0;
+ }
+
+ /* Determine when unaligned vector accesses are permitted, and when
+ they are preferred over masked Altivec loads. Note that if
+ TARGET_ALLOW_MOVMISALIGN has been disabled by the user, then
+ TARGET_EFFICIENT_UNALIGNED_VSX must be as well. The converse is
+ not true. */
+ if (TARGET_EFFICIENT_UNALIGNED_VSX)
+ {
+ if (!TARGET_VSX)
+ {
+ if (rs6000_isa_flags_explicit & OPTION_MASK_EFFICIENT_UNALIGNED_VSX)
+ error ("-mefficient-unaligned-vsx requires -mvsx");
+
+ rs6000_isa_flags &= ~OPTION_MASK_EFFICIENT_UNALIGNED_VSX;
+ }
+
+ else if (!TARGET_ALLOW_MOVMISALIGN)
+ {
+ if (rs6000_isa_flags_explicit & OPTION_MASK_EFFICIENT_UNALIGNED_VSX)
+ error ("-mefficient-unaligned-vsx requires -mallow-movmisalign");
+
+ rs6000_isa_flags &= ~OPTION_MASK_EFFICIENT_UNALIGNED_VSX;
+ }
+ }
+
if (TARGET_DEBUG_REG || TARGET_DEBUG_TARGET)
rs6000_print_isa_options (stderr, 0, "after defaults", rs6000_isa_flags);
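The flag handling above boils down to one implication; a hypothetical helper (not in GCC) stating the invariant being enforced:

/* -mefficient-unaligned-vsx is only valid when both -mvsx and
   -mallow-movmisalign are in effect; the converse need not hold.  */
static int
unaligned_vsx_flags_ok (int vsx, int allow_movmisalign,
                        int efficient_unaligned_vsx)
{
  return !efficient_unaligned_vsx || (vsx && allow_movmisalign);
}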
@@ -4251,22 +4290,6 @@ rs6000_option_override_internal (bool global_init_p)
}
}
- /* Determine when unaligned vector accesses are permitted, and when
- they are preferred over masked Altivec loads. Note that if
- TARGET_ALLOW_MOVMISALIGN has been disabled by the user, then
- TARGET_EFFICIENT_UNALIGNED_VSX must be as well. The converse is
- not true. */
- if (TARGET_EFFICIENT_UNALIGNED_VSX == -1) {
- if (TARGET_VSX && rs6000_cpu == PROCESSOR_POWER8
- && TARGET_ALLOW_MOVMISALIGN != 0)
- TARGET_EFFICIENT_UNALIGNED_VSX = 1;
- else
- TARGET_EFFICIENT_UNALIGNED_VSX = 0;
- }
-
- if (TARGET_ALLOW_MOVMISALIGN == -1 && rs6000_cpu == PROCESSOR_POWER8)
- TARGET_ALLOW_MOVMISALIGN = 1;
-
/* Set the builtin mask of the various options used that could affect which
builtins were used. In the past we used target_flags, but we've run out
of bits, and some options like SPE and PAIRED are no longer in
@@ -17678,8 +17701,21 @@ rs6000_secondary_reload_gpr (rtx reg, rtx mem, rtx scratch, bool store_p)
if (GET_CODE (addr) == PRE_MODIFY)
{
+ gcc_assert (REG_P (XEXP (addr, 0))
+ && GET_CODE (XEXP (addr, 1)) == PLUS
+ && XEXP (XEXP (addr, 1), 0) == XEXP (addr, 0));
scratch_or_premodify = XEXP (addr, 0);
- gcc_assert (REG_P (scratch_or_premodify));
+ if (!HARD_REGISTER_P (scratch_or_premodify))
+ /* If we have a pseudo here then reload will have arranged
+ to have it replaced, but only in the original insn.
+ Use the replacement here too. */
+ scratch_or_premodify = find_replacement (&XEXP (addr, 0));
+
+ /* RTL emitted by rs6000_secondary_reload_gpr uses RTL
+ expressions from the original insn, without unsharing them.
+ Any RTL that points into the original insn will of course
+ have register replacements applied. That is why we don't
+ need to look for replacements under the PLUS. */
addr = XEXP (addr, 1);
}
gcc_assert (GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM);
@@ -20537,12 +20573,15 @@ rs6000_pre_atomic_barrier (rtx mem, enum memmodel model)
case MEMMODEL_RELAXED:
case MEMMODEL_CONSUME:
case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
break;
case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
case MEMMODEL_ACQ_REL:
emit_insn (gen_lwsync ());
break;
case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
emit_insn (gen_hwsync ());
break;
default:
@@ -20559,10 +20598,13 @@ rs6000_post_atomic_barrier (enum memmodel model)
case MEMMODEL_RELAXED:
case MEMMODEL_CONSUME:
case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
break;
case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
case MEMMODEL_ACQ_REL:
case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
emit_insn (gen_isync ());
break;
default:
@@ -20662,8 +20704,8 @@ rs6000_expand_atomic_compare_and_swap (rtx operands[])
oldval = operands[3];
newval = operands[4];
is_weak = (INTVAL (operands[5]) != 0);
- mod_s = (enum memmodel) INTVAL (operands[6]);
- mod_f = (enum memmodel) INTVAL (operands[7]);
+ mod_s = memmodel_from_int (INTVAL (operands[6]));
+ mod_f = memmodel_from_int (INTVAL (operands[7]));
orig_mode = mode = GET_MODE (mem);
mask = shift = NULL_RTX;
@@ -20751,12 +20793,12 @@ rs6000_expand_atomic_compare_and_swap (rtx operands[])
emit_unlikely_jump (x, label1);
}
- if (mod_f != MEMMODEL_RELAXED)
+ if (!is_mm_relaxed (mod_f))
emit_label (XEXP (label2, 0));
rs6000_post_atomic_barrier (mod_s);
- if (mod_f == MEMMODEL_RELAXED)
+ if (is_mm_relaxed (mod_f))
emit_label (XEXP (label2, 0));
if (shift)
@@ -22311,6 +22353,7 @@ rs6000_function_ok_for_sibcall (tree decl, tree exp)
|| ((DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2)
&& decl
&& !DECL_EXTERNAL (decl)
+ && !DECL_WEAK (decl)
&& (*targetm.binds_local_p) (decl))
|| (DEFAULT_ABI == ABI_V4
&& (!TARGET_SECURE_PLT
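A short C example of why the DECL_WEAK check matters (illustrative, not from this change): a weak definition can be replaced at link time, so even a callee defined in this translation unit is not guaranteed to be the function finally called, and a sibcall bound to the local definition under the AIX/ELFv2 ABIs would be wrong:

__attribute__ ((weak)) int
callee (int x)
{
  return x + 1;
}

int
caller (int x)
{
  /* Must use the normal call path; another object file may override
     the weak definition of callee.  */
  return callee (x);
}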
@@ -30220,13 +30263,15 @@ rs6000_declare_alias (struct symtab_node *n, void *d)
if (dollar_inside) {
if (data->function_descriptor)
fprintf(data->file, "\t.rename .%s,\".%s\"\n", buffer, name);
- else
- fprintf(data->file, "\t.rename %s,\"%s\"\n", buffer, name);
+ fprintf(data->file, "\t.rename %s,\"%s\"\n", buffer, name);
}
if (data->function_descriptor)
- fputs ("\t.globl .", data->file);
- else
- fputs ("\t.globl ", data->file);
+ {
+ fputs ("\t.globl .", data->file);
+ RS6000_OUTPUT_BASENAME (data->file, buffer);
+ putc ('\n', data->file);
+ }
+ fputs ("\t.globl ", data->file);
RS6000_OUTPUT_BASENAME (data->file, buffer);
putc ('\n', data->file);
}
@@ -30240,14 +30285,16 @@ rs6000_declare_alias (struct symtab_node *n, void *d)
if (dollar_inside)
{
if (data->function_descriptor)
- fprintf(data->file, "\t.rename %s,\"%s\"\n", buffer, name);
- else
fprintf(data->file, "\t.rename .%s,\".%s\"\n", buffer, name);
+ fprintf(data->file, "\t.rename %s,\"%s\"\n", buffer, name);
}
if (data->function_descriptor)
- fputs ("\t.lglobl .", data->file);
- else
- fputs ("\t.lglobl ", data->file);
+ {
+ fputs ("\t.lglobl .", data->file);
+ RS6000_OUTPUT_BASENAME (data->file, buffer);
+ putc ('\n', data->file);
+ }
+ fputs ("\t.lglobl ", data->file);
RS6000_OUTPUT_BASENAME (data->file, buffer);
putc ('\n', data->file);
}
@@ -32274,6 +32321,8 @@ static struct rs6000_opt_mask const rs6000_opt_masks[] =
{ "crypto", OPTION_MASK_CRYPTO, false, true },
{ "direct-move", OPTION_MASK_DIRECT_MOVE, false, true },
{ "dlmzb", OPTION_MASK_DLMZB, false, true },
+ { "efficient-unaligned-vsx", OPTION_MASK_EFFICIENT_UNALIGNED_VSX,
+ false, true },
{ "fprnd", OPTION_MASK_FPRND, false, true },
{ "hard-dfp", OPTION_MASK_DFP, false, true },
{ "htm", OPTION_MASK_HTM, false, true },
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 0178bf45b0..0e5883c7fc 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -77,6 +77,7 @@
UNSPEC_FRIN
UNSPEC_FRIP
UNSPEC_FRIZ
+ UNSPEC_XSRDPI
UNSPEC_LD_MPIC ; load_macho_picbase
UNSPEC_RELD_MPIC ; re-load_macho_picbase
UNSPEC_MPIC_CORRECT ; macho_correct_pic
@@ -438,8 +439,16 @@
; SF/DF constraint for arithmetic on traditional floating point registers
(define_mode_attr Ff [(SF "f") (DF "d")])
-; SF/DF constraint for arithmetic on VSX registers
-(define_mode_attr Fv [(SF "wy") (DF "ws")])
+; SF/DF constraint for arithmetic on VSX registers using instructions added in
+; ISA 2.06 (power7). This includes instructions that normally target DF mode,
+; but are used on SFmode, since internally SFmode values are kept in the DFmode
+; format.
+(define_mode_attr Fv [(SF "ww") (DF "ws") (DI "wi")])
+
+; SF/DF constraint for arithmetic on VSX registers. This is intended to be
+; used for DFmode instructions added in ISA 2.06 (power7) and SFmode
+; instructions added in ISA 2.07 (power8).
+(define_mode_attr Fv2 [(SF "wy") (DF "ws") (DI "wi")])
; SF/DF constraint for arithmetic on altivec registers
(define_mode_attr Fa [(SF "wu") (DF "wv")])
@@ -5099,9 +5108,9 @@
"")
(define_insn "*add<mode>3_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (plus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")))]
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
+ (plus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>")))]
"TARGET_<MODE>_FPR"
"@
fadd<Ftrad> %0,%1,%2
@@ -5117,9 +5126,9 @@
"")
(define_insn "*sub<mode>3_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (minus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")))]
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
+ (minus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>")))]
"TARGET_<MODE>_FPR"
"@
fsub<Ftrad> %0,%1,%2
@@ -5135,9 +5144,9 @@
"")
(define_insn "*mul<mode>3_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (mult:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")))]
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
+ (mult:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>")))]
"TARGET_<MODE>_FPR"
"@
fmul<Ftrad> %0,%1,%2
@@ -5153,9 +5162,9 @@
"")
(define_insn "*div<mode>3_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (div:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")))]
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
+ (div:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>")))]
"TARGET_<MODE>_FPR && !TARGET_SIMPLE_FPU"
"@
fdiv<Ftrad> %0,%1,%2
@@ -5164,8 +5173,8 @@
(set_attr "fp_type" "fp_div_<Fs>")])
(define_insn "sqrt<mode>2"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (sqrt:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")))]
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
+ (sqrt:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>")))]
"TARGET_<MODE>_FPR && !TARGET_SIMPLE_FPU
&& (TARGET_PPC_GPOPT || (<MODE>mode == SFmode && TARGET_XILINX_FPU))"
"@
@@ -5176,8 +5185,8 @@
;; Floating point reciprocal approximation
(define_insn "fre<Fs>"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")]
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
+ (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>")]
UNSPEC_FRES))]
"TARGET_<FFRE>"
"@
@@ -5186,8 +5195,8 @@
[(set_attr "type" "fp")])
(define_insn "*rsqrt<mode>2"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>")
- (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")]
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
+ (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>")]
UNSPEC_RSQRT))]
"RS6000_RECIP_HAVE_RSQRTE_P (<MODE>mode)"
"@
@@ -5198,8 +5207,8 @@
;; Floating point comparisons
(define_insn "*cmp<mode>_fpr"
[(set (match_operand:CCFP 0 "cc_reg_operand" "=y,y")
- (compare:CCFP (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")))]
+ (compare:CCFP (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>")))]
"TARGET_<MODE>_FPR"
"@
fcmpu %0,%1,%2
@@ -6257,6 +6266,27 @@
[(set_attr "type" "fp")
(set_attr "fp_type" "fp_addsub_<Fs>")])
+(define_insn "*xsrdpi<mode>2"
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Fv>")
+ (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Fv>")]
+ UNSPEC_XSRDPI))]
+ "TARGET_<MODE>_FPR && TARGET_VSX"
+ "xsrdpi %x0,%x1"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_addsub_<Fs>")])
+
+(define_expand "lround<mode>di2"
+ [(set (match_dup 2)
+ (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "")]
+ UNSPEC_XSRDPI))
+ (set (match_operand:DI 0 "gpc_reg_operand" "")
+ (unspec:DI [(match_dup 2)]
+ UNSPEC_FCTID))]
+ "TARGET_<MODE>_FPR && TARGET_VSX"
+{
+ operands[2] = gen_reg_rtx (<MODE>mode);
+})
+
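Usage sketch for the new expander: on VSX targets a call like the one below can now expand inline to round-to-nearest (xsrdpi) followed by a float-to-integer conversion instead of a libcall. The values shown are standard lround semantics (ties away from zero), which matches the xsrdpi rounding mode:

#include <math.h>

long
nearest (double x)
{
  return lround (x);   /* lround (2.5) == 3, lround (-2.5) == -3 */
}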
; An UNSPEC is used so we don't have to support SImode in FP registers.
(define_insn "stfiwx"
[(set (match_operand:SI 0 "memory_operand" "=Z")
@@ -8337,8 +8367,8 @@
[(set_attr "length" "20,20,16")])
(define_expand "extenddftf2"
- [(set (match_operand:TF 0 "nonimmediate_operand" "")
- (float_extend:TF (match_operand:DF 1 "input_operand" "")))]
+ [(set (match_operand:TF 0 "gpc_reg_operand" "")
+ (float_extend:TF (match_operand:DF 1 "gpc_reg_operand" "")))]
"!TARGET_IEEEQUAD
&& TARGET_HARD_FLOAT
&& (TARGET_FPRS || TARGET_E500_DOUBLE)
@@ -8346,52 +8376,55 @@
{
if (TARGET_E500_DOUBLE)
emit_insn (gen_spe_extenddftf2 (operands[0], operands[1]));
+ else if (TARGET_VSX)
+ emit_insn (gen_extenddftf2_vsx (operands[0], operands[1]));
else
- emit_insn (gen_extenddftf2_fprs (operands[0], operands[1]));
+ {
+ rtx zero = gen_reg_rtx (DFmode);
+ rs6000_emit_move (zero, CONST0_RTX (DFmode), DFmode);
+ emit_insn (gen_extenddftf2_fprs (operands[0], operands[1], zero));
+ }
DONE;
})
-(define_expand "extenddftf2_fprs"
- [(parallel [(set (match_operand:TF 0 "nonimmediate_operand" "")
- (float_extend:TF (match_operand:DF 1 "input_operand" "")))
- (use (match_dup 2))])]
- "!TARGET_IEEEQUAD
- && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
- && TARGET_LONG_DOUBLE_128"
+;; Allow the combiner to create memory operands for the source.
+(define_insn_and_split "extenddftf2_fprs"
+ [(set (match_operand:TF 0 "gpc_reg_operand" "=d,d,&d")
+ (float_extend:TF (match_operand:DF 1 "nonimmediate_operand" "d,m,d")))
+ (use (match_operand:DF 2 "nonimmediate_operand" "m,m,d"))]
+ "!TARGET_VSX && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
+ && TARGET_LONG_DOUBLE_128 && !TARGET_IEEEQUAD"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 3) (match_dup 1))
+ (set (match_dup 4) (match_dup 2))]
{
- /* VSX can create 0.0 directly, otherwise let rs6000_emit_move create
- the proper constant. */
- if (TARGET_VSX)
- operands[2] = CONST0_RTX (DFmode);
- else
- {
- operands[2] = gen_reg_rtx (DFmode);
- rs6000_emit_move (operands[2], CONST0_RTX (DFmode), DFmode);
- }
+ const int lo_word = LONG_DOUBLE_LARGE_FIRST ? GET_MODE_SIZE (DFmode) : 0;
+ const int hi_word = LONG_DOUBLE_LARGE_FIRST ? 0 : GET_MODE_SIZE (DFmode);
+
+ operands[3] = simplify_gen_subreg (DFmode, operands[0], TFmode, hi_word);
+ operands[4] = simplify_gen_subreg (DFmode, operands[0], TFmode, lo_word);
})
-(define_insn_and_split "*extenddftf2_internal"
- [(set (match_operand:TF 0 "nonimmediate_operand" "=m,Y,ws,d,&d")
- (float_extend:TF (match_operand:DF 1 "input_operand" "d,r,md,md,md")))
- (use (match_operand:DF 2 "zero_reg_mem_operand" "d,r,j,m,d"))]
- "!TARGET_IEEEQUAD
- && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
- && TARGET_LONG_DOUBLE_128"
+(define_insn_and_split "extenddftf2_vsx"
+ [(set (match_operand:TF 0 "gpc_reg_operand" "=d,d")
+ (float_extend:TF (match_operand:DF 1 "nonimmediate_operand" "ws,m")))]
+ "TARGET_LONG_DOUBLE_128 && TARGET_VSX && !TARGET_IEEEQUAD"
"#"
"&& reload_completed"
- [(pc)]
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 3) (match_dup 4))]
{
const int lo_word = LONG_DOUBLE_LARGE_FIRST ? GET_MODE_SIZE (DFmode) : 0;
const int hi_word = LONG_DOUBLE_LARGE_FIRST ? 0 : GET_MODE_SIZE (DFmode);
- emit_move_insn (simplify_gen_subreg (DFmode, operands[0], TFmode, hi_word),
- operands[1]);
- emit_move_insn (simplify_gen_subreg (DFmode, operands[0], TFmode, lo_word),
- operands[2]);
- DONE;
+
+ operands[2] = simplify_gen_subreg (DFmode, operands[0], TFmode, hi_word);
+ operands[3] = simplify_gen_subreg (DFmode, operands[0], TFmode, lo_word);
+ operands[4] = CONST0_RTX (DFmode);
})
(define_expand "extendsftf2"
- [(set (match_operand:TF 0 "nonimmediate_operand" "")
+ [(set (match_operand:TF 0 "gpc_reg_operand" "")
(float_extend:TF (match_operand:SF 1 "gpc_reg_operand" "")))]
"!TARGET_IEEEQUAD
&& TARGET_HARD_FLOAT
@@ -13577,11 +13610,11 @@
"")
(define_insn "*fma<mode>4_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>,<Fv>")
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>,<Fv2>")
(fma:SFDF
- (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>,0")
- (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv>")))]
+ (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,<Fv2>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>,0")
+ (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv2>")))]
"TARGET_<MODE>_FPR"
"@
fmadd<Ftrad> %0,%1,%2,%3
@@ -13601,11 +13634,11 @@
"")
(define_insn "*fms<mode>4_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>,<Fv>")
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>,<Fv2>")
(fma:SFDF
- (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>,0")
- (neg:SFDF (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv>"))))]
+ (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>,0")
+ (neg:SFDF (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv2>"))))]
"TARGET_<MODE>_FPR"
"@
fmsub<Ftrad> %0,%1,%2,%3
@@ -13648,12 +13681,12 @@
"")
(define_insn "*nfma<mode>4_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>,<Fv>")
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>,<Fv2>")
(neg:SFDF
(fma:SFDF
- (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>,0")
- (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv>"))))]
+ (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>,0")
+ (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv2>"))))]
"TARGET_<MODE>_FPR"
"@
fnmadd<Ftrad> %0,%1,%2,%3
@@ -13674,13 +13707,13 @@
"")
(define_insn "*nfmssf4_fpr"
- [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>,<Fv>")
+ [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>,<Fv2>")
(neg:SFDF
(fma:SFDF
- (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>,<Fv>")
- (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>,0")
+ (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>,<Fv2>")
+ (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv2>,0")
(neg:SFDF
- (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv>")))))]
+ (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,<Fv2>")))))]
"TARGET_<MODE>_FPR"
"@
fnmsub<Ftrad> %0,%1,%2,%3
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 79d01d4965..9e89eb7641 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -212,7 +212,7 @@ Target Undocumented Var(TARGET_ALLOW_MOVMISALIGN) Init(-1) Save
; Allow/disallow the movmisalign in DF/DI vectors
mefficient-unaligned-vector
-Target Undocumented Report Var(TARGET_EFFICIENT_UNALIGNED_VSX) Init(-1) Save
+Target Undocumented Report Mask(EFFICIENT_UNALIGNED_VSX) Var(rs6000_isa_flags)
; Consider unaligned VSX accesses to be efficient/inefficient
mallow-df-permute
diff --git a/gcc/config/rs6000/sync.md b/gcc/config/rs6000/sync.md
index 4364c8526d..8ba30b9865 100644
--- a/gcc/config/rs6000/sync.md
+++ b/gcc/config/rs6000/sync.md
@@ -41,18 +41,21 @@
[(match_operand:SI 0 "const_int_operand" "")] ;; model
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[0]);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[0]));
switch (model)
{
case MEMMODEL_RELAXED:
break;
case MEMMODEL_CONSUME:
case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
case MEMMODEL_ACQ_REL:
emit_insn (gen_lwsync ());
break;
case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
emit_insn (gen_hwsync ());
break;
default:
@@ -144,9 +147,9 @@
if (<MODE>mode == TImode && !TARGET_SYNC_TI)
FAIL;
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
- if (model == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (model))
emit_insn (gen_hwsync ());
if (<MODE>mode != TImode)
@@ -182,7 +185,9 @@
break;
case MEMMODEL_CONSUME:
case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
emit_insn (gen_loadsync_<mode> (operands[0]));
break;
default:
@@ -209,15 +214,17 @@
if (<MODE>mode == TImode && !TARGET_SYNC_TI)
FAIL;
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
switch (model)
{
case MEMMODEL_RELAXED:
break;
case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
emit_insn (gen_lwsync ());
break;
case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
emit_insn (gen_hwsync ());
break;
default:
diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h
index 9917c2f557..c6c31dc914 100644
--- a/gcc/config/rs6000/sysv4.h
+++ b/gcc/config/rs6000/sysv4.h
@@ -574,7 +574,6 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
%{R*} \
%(link_shlib) \
%{!T*: %(link_start) } \
-%(link_target) \
%(link_os)"
/* Shared libraries are not default. */
@@ -584,10 +583,6 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
%{shared:-G -dy -z text } \
%{symbolic:-Bsymbolic -G -dy -z text }"
-/* Override the default target of the linker. */
-#define LINK_TARGET_SPEC \
- ENDIAN_SELECT("", " --oformat elf32-powerpcle", "")
-
/* Any specific OS flags. */
#define LINK_OS_SPEC "\
%{mads : %(link_os_ads) ; \
@@ -873,7 +868,6 @@ ncrtn.o%s"
{ "endfile_openbsd", ENDFILE_OPENBSD_SPEC }, \
{ "endfile_default", ENDFILE_DEFAULT_SPEC }, \
{ "link_shlib", LINK_SHLIB_SPEC }, \
- { "link_target", LINK_TARGET_SPEC }, \
{ "link_start", LINK_START_SPEC }, \
{ "link_start_ads", LINK_START_ADS_SPEC }, \
{ "link_start_yellowknife", LINK_START_YELLOWKNIFE_SPEC }, \
diff --git a/gcc/config/rs6000/sysv4le.h b/gcc/config/rs6000/sysv4le.h
index 7b1d6a1b4d..66ee7cadfe 100644
--- a/gcc/config/rs6000/sysv4le.h
+++ b/gcc/config/rs6000/sysv4le.h
@@ -25,10 +25,6 @@
#undef DEFAULT_ASM_ENDIAN
#define DEFAULT_ASM_ENDIAN " -mlittle"
-#undef LINK_TARGET_SPEC
-#define LINK_TARGET_SPEC \
- ENDIAN_SELECT(" --oformat elf32-powerpc", "", "")
-
#undef MULTILIB_DEFAULTS
#define MULTILIB_DEFAULTS { "mlittle", "mcall-sysv" }
diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 668bad1d24..dd950fa137 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -918,6 +918,8 @@
;; General shift amounts can be supported using vsro + vsr. We're
;; not expecting to see these yet (the vectorizer currently
;; generates only shifts by a whole number of vector elements).
+;; Note that the vec_shr operation is actually defined as
+;; 'shift toward element 0', so it is a shr for LE and a shl for BE.
(define_expand "vec_shr_<mode>"
[(match_operand:VEC_L 0 "vlogical_operand" "")
(match_operand:VEC_L 1 "vlogical_operand" "")
@@ -928,6 +930,7 @@
rtx bitshift = operands[2];
rtx shift;
rtx insn;
+ rtx zero_reg, op1, op2;
HOST_WIDE_INT bitshift_val;
HOST_WIDE_INT byteshift_val;
@@ -937,19 +940,29 @@
if (bitshift_val & 0x7)
FAIL;
byteshift_val = (bitshift_val >> 3);
+ zero_reg = gen_reg_rtx (<MODE>mode);
+ emit_move_insn (zero_reg, CONST0_RTX (<MODE>mode));
if (!BYTES_BIG_ENDIAN)
- byteshift_val = 16 - byteshift_val;
+ {
+ byteshift_val = 16 - byteshift_val;
+ op1 = zero_reg;
+ op2 = operands[1];
+ }
+ else
+ {
+ op1 = operands[1];
+ op2 = zero_reg;
+ }
+
if (TARGET_VSX && (byteshift_val & 0x3) == 0)
{
shift = gen_rtx_CONST_INT (QImode, byteshift_val >> 2);
- insn = gen_vsx_xxsldwi_<mode> (operands[0], operands[1], operands[1],
- shift);
+ insn = gen_vsx_xxsldwi_<mode> (operands[0], op1, op2, shift);
}
else
{
shift = gen_rtx_CONST_INT (QImode, byteshift_val);
- insn = gen_altivec_vsldoi_<mode> (operands[0], operands[1], operands[1],
- shift);
+ insn = gen_altivec_vsldoi_<mode> (operands[0], op1, op2, shift);
}
emit_insn (insn);
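A scalar model of the 'shift toward element 0' semantics implemented above (a sketch, not GCC code): elements move toward index 0 and vacated positions fill with zero, which is a byte-wise right shift on LE and a left shift on BE:

void
vec_shr_model (const int *in, int *out, int n, int shift)
{
  /* Element i of the result comes from element i + shift of the input;
     elements shifted in from beyond the vector are zero.  */
  for (int i = 0; i < n; i++)
    out[i] = (i + shift < n) ? in[i + shift] : 0;
}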
diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def
index 0a24da9bcb..b267b04e2a 100644
--- a/gcc/config/s390/s390-builtins.def
+++ b/gcc/config/s390/s390-builtins.def
@@ -438,15 +438,15 @@ B_DEF (s390_vllezf, vec_insert_and_zerov4si,0,
B_DEF (s390_vllezg, vec_insert_and_zerov2di,0, B_VX, 0, BT_FN_UV2DI_ULONGLONGCONSTPTR)
OB_DEF (s390_vec_load_bndry, s390_vec_load_bndry_s8,s390_vec_load_bndry_dbl,B_VX, BT_FN_OV4SI_INTCONSTPTR_INT)
-OB_DEF_VAR (s390_vec_load_bndry_s8, s390_vlbb, O2_U3, BT_OV_V16QI_SCHARCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_u8, s390_vlbb, O2_U3, BT_OV_UV16QI_UCHARCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_s16, s390_vlbb, O2_U3, BT_OV_V8HI_SHORTCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_u16, s390_vlbb, O2_U3, BT_OV_UV8HI_USHORTCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_s32, s390_vlbb, O2_U3, BT_OV_V4SI_INTCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_u32, s390_vlbb, O2_U3, BT_OV_UV4SI_UINTCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_s64, s390_vlbb, O2_U3, BT_OV_V2DI_LONGLONGCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_u64, s390_vlbb, O2_U3, BT_OV_UV2DI_ULONGLONGCONSTPTR_USHORT)
-OB_DEF_VAR (s390_vec_load_bndry_dbl, s390_vlbb, O2_U3, BT_OV_V2DF_DBLCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_s8, s390_vlbb, O2_U16, BT_OV_V16QI_SCHARCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_u8, s390_vlbb, O2_U16, BT_OV_UV16QI_UCHARCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_s16, s390_vlbb, O2_U16, BT_OV_V8HI_SHORTCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_u16, s390_vlbb, O2_U16, BT_OV_UV8HI_USHORTCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_s32, s390_vlbb, O2_U16, BT_OV_V4SI_INTCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_u32, s390_vlbb, O2_U16, BT_OV_UV4SI_UINTCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_s64, s390_vlbb, O2_U16, BT_OV_V2DI_LONGLONGCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_u64, s390_vlbb, O2_U16, BT_OV_UV2DI_ULONGLONGCONSTPTR_USHORT)
+OB_DEF_VAR (s390_vec_load_bndry_dbl, s390_vlbb, O2_U16, BT_OV_V2DF_DBLCONSTPTR_USHORT)
B_DEF (s390_vlbb, vlbb, 0, B_VX, O2_U3, BT_FN_UV16QI_UCHARCONSTPTR_USHORT)
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 538587e8da..ba2a151b93 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -750,7 +750,7 @@ s390_const_operand_ok (tree arg, int argnum, int op_flags, tree decl)
HOST_WIDE_INT_PRINT_DEC ".."
HOST_WIDE_INT_PRINT_DEC ")",
argnum, decl,
- -(HOST_WIDE_INT)1 << (bitwidth - 1),
+ -((HOST_WIDE_INT)1 << (bitwidth - 1)),
((HOST_WIDE_INT)1 << (bitwidth - 1)) - 1);
return false;
}
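The added parentheses fix an operator-precedence problem: in the old form the unary minus bound to the 1 before the shift, left-shifting a negative value, which is undefined behavior in C. A worked illustration (standalone, not GCC code):

#include <stdio.h>

int
main (void)
{
  int bitwidth = 4;
  /* Old form parsed as (-1LL) << 3: a left shift of a negative value,
     formally undefined.  New form negates the well-defined positive
     shift result instead.  */
  long long lo = -((long long) 1 << (bitwidth - 1));     /* -8 */
  long long hi = ((long long) 1 << (bitwidth - 1)) - 1;  /* 7 */
  printf ("range %lld..%lld\n", lo, hi);
  return 0;
}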
@@ -7705,11 +7705,12 @@ replace_ltrel_base (rtx *x)
/* We keep a list of constants which we have to add to internal
constant tables in the middle of large functions. */
-#define NR_C_MODES 31
+#define NR_C_MODES 32
machine_mode constant_modes[NR_C_MODES] =
{
TFmode, TImode, TDmode,
- V16QImode, V8HImode, V4SImode, V2DImode, V4SFmode, V2DFmode, V1TFmode,
+ V16QImode, V8HImode, V4SImode, V2DImode, V1TImode,
+ V4SFmode, V2DFmode, V1TFmode,
DFmode, DImode, DDmode,
V8QImode, V4HImode, V2SImode, V1DImode, V2SFmode, V1DFmode,
SFmode, SImode, SDmode,
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index b8e7b38b78..1f4147ac56 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -301,6 +301,7 @@
[
; General purpose registers
(GPR0_REGNUM 0)
+ (GPR1_REGNUM 1)
; Floating point registers.
(FPR0_REGNUM 16)
(FPR1_REGNUM 20)
@@ -4831,7 +4832,8 @@
[(set (reg:DFP_ALL FPR0_REGNUM)
(float_truncate:DFP_ALL (reg:BFP FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))]
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))]
"TARGET_HARD_DFP"
"pfpo")
@@ -4839,7 +4841,8 @@
[(set (reg:BFP FPR0_REGNUM)
(float_truncate:BFP (reg:DFP_ALL FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))]
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))]
"TARGET_HARD_DFP"
"pfpo")
@@ -4850,7 +4853,8 @@
[(set (reg:DFP_ALL FPR0_REGNUM)
(float_truncate:DFP_ALL (reg:BFP FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))])
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))])
(set (match_operand:DFP_ALL 0 "nonimmediate_operand" "")
(reg:DFP_ALL FPR0_REGNUM))]
"TARGET_HARD_DFP
@@ -4872,7 +4876,8 @@
(parallel
[(set (reg:BFP FPR0_REGNUM) (float_truncate:BFP (reg:DFP_ALL FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))])
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))])
(set (match_operand:BFP 0 "nonimmediate_operand" "") (reg:BFP FPR0_REGNUM))]
"TARGET_HARD_DFP
&& GET_MODE_SIZE (<DFP_ALL:MODE>mode) >= GET_MODE_SIZE (<BFP:MODE>mode)"
@@ -4893,14 +4898,16 @@
(define_insn "*extend<BFP:mode><DFP_ALL:mode>2"
[(set (reg:DFP_ALL FPR0_REGNUM) (float_extend:DFP_ALL (reg:BFP FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))]
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))]
"TARGET_HARD_DFP"
"pfpo")
(define_insn "*extend<DFP_ALL:mode><BFP:mode>2"
[(set (reg:BFP FPR0_REGNUM) (float_extend:BFP (reg:DFP_ALL FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))]
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))]
"TARGET_HARD_DFP"
"pfpo")
@@ -4911,7 +4918,8 @@
[(set (reg:DFP_ALL FPR0_REGNUM)
(float_extend:DFP_ALL (reg:BFP FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))])
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))])
(set (match_operand:DFP_ALL 0 "nonimmediate_operand" "")
(reg:DFP_ALL FPR0_REGNUM))]
"TARGET_HARD_DFP
@@ -4933,7 +4941,8 @@
(parallel
[(set (reg:BFP FPR0_REGNUM) (float_extend:BFP (reg:DFP_ALL FPR4_REGNUM)))
(use (reg:SI GPR0_REGNUM))
- (clobber (reg:CC CC_REGNUM))])
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (reg:SI GPR1_REGNUM))])
(set (match_operand:BFP 0 "nonimmediate_operand" "") (reg:BFP FPR0_REGNUM))]
"TARGET_HARD_DFP
&& GET_MODE_SIZE (<DFP_ALL:MODE>mode) < GET_MODE_SIZE (<BFP:MODE>mode)"
@@ -6044,8 +6053,13 @@
(match_operand:GPR 2 "nonimmediate_operand" "")
(match_operand:GPR 3 "nonimmediate_operand" "")))]
"TARGET_Z196"
- "operands[1] = s390_emit_compare (GET_CODE (operands[1]),
- XEXP (operands[1], 0), XEXP (operands[1], 1));")
+{
+ /* Emit the comparison insn in case we do not already have a comparison result. */
+ if (!s390_comparison (operands[1], VOIDmode))
+ operands[1] = s390_emit_compare (GET_CODE (operands[1]),
+ XEXP (operands[1], 0),
+ XEXP (operands[1], 1));
+})
; locr, loc, stoc, locgr, locg, stocg
(define_insn_and_split "*mov<mode>cc"
@@ -9594,7 +9608,7 @@
{
/* Unless this is a SEQ_CST fence, the s390 memory model is strong
enough not to require barriers of any kind. */
- if (INTVAL (operands[0]) == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (memmodel_from_int (INTVAL (operands[0]))))
{
rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
MEM_VOLATILE_P (mem) = 1;
@@ -9675,7 +9689,7 @@
(match_operand:SI 2 "const_int_operand")] ;; model
""
{
- enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
if (MEM_ALIGN (operands[0]) < GET_MODE_BITSIZE (GET_MODE (operands[0])))
FAIL;
@@ -9686,7 +9700,7 @@
emit_insn (gen_atomic_storedi_1 (operands[0], operands[1]));
else
emit_move_insn (operands[0], operands[1]);
- if (model == MEMMODEL_SEQ_CST)
+ if (is_mm_seq_cst (model))
emit_insn (gen_mem_thread_fence (operands[2]));
DONE;
})
@@ -10370,6 +10384,8 @@
; Byte swap instructions
;
+; FIXME: There is also mvcin but we cannot use it since src and target
+; may overlap.
(define_insn "bswap<mode>2"
[(set (match_operand:GPR 0 "register_operand" "=d, d")
(bswap:GPR (match_operand:GPR 1 "nonimmediate_operand" " d,RT")))]
@@ -10381,6 +10397,14 @@
(set_attr "op_type" "RRE,RXY")
(set_attr "z10prop" "z10_super")])
+(define_insn "bswaphi2"
+ [(set (match_operand:HI 0 "register_operand" "=d")
+ (bswap:HI (match_operand:HI 1 "memory_operand" "RT")))]
+ "TARGET_CPU_ZARCH"
+ "lrvh\t%0,%1"
+ [(set_attr "type" "load")
+ (set_attr "op_type" "RXY")
+ (set_attr "z10prop" "z10_super")])
;
; Population count instruction
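Illustrative C source that the new bswaphi2 pattern targets (whether lrvh is actually selected depends on flags and register allocation):

#include <stdint.h>

uint16_t
load_swapped (const uint16_t *p)
{
  /* A 16-bit byte swap of a memory operand can now be a single
     load-reversed halfword (lrvh) instead of a load plus shuffling.  */
  return __builtin_bswap16 (*p);
}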
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index a3f13ce4a7..fb76534f72 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -414,7 +414,7 @@
(unspec:<non_vec> [(match_operand:V_HW_64 0 "register_operand" "v")
(match_dup 3)] UNSPEC_VEC_EXTRACT))]
"TARGET_VX && !TARGET_64BIT"
- "vsce<V_HW_64:gf>\t%v0,%O2(%v1,%R2),%3"
+ "vsce<V_HW_64:bhfgq>\t%v0,%O2(%v1,%R2),%3"
[(set_attr "op_type" "VRV")])
; Element size and target address size are the same
@@ -428,7 +428,7 @@
(unspec:<non_vec> [(match_operand:V_HW_32_64 0 "register_operand" "v")
(match_dup 3)] UNSPEC_VEC_EXTRACT))]
"TARGET_VX"
- "vsce<gf>\t%v0,%O2(%v1,%R2),%3"
+ "vsce<bhfgq>\t%v0,%O2(%v1,%R2),%3"
[(set_attr "op_type" "VRV")])
; Depending on the address size we have to expand a different pattern.
@@ -870,11 +870,11 @@
; vec_mladd -> vec_vmal
; vmalb, vmalh, vmalf, vmalg
(define_insn "vec_vmal<mode>"
- [(set (match_operand:VI_HW 0 "register_operand" "=v")
- (unspec:VI_HW [(match_operand:VI_HW 1 "register_operand" "v")
- (match_operand:VI_HW 2 "register_operand" "v")
- (match_operand:VI_HW 3 "register_operand" "v")]
- UNSPEC_VEC_VMAL))]
+ [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
+ (unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
+ (match_operand:VI_HW_QHS 2 "register_operand" "v")
+ (match_operand:VI_HW_QHS 3 "register_operand" "v")]
+ UNSPEC_VEC_VMAL))]
"TARGET_VX"
"vmal<bhfgq><w>\t%v0,%v1,%v2,%v3"
[(set_attr "op_type" "VRR")])
@@ -883,22 +883,22 @@
; vmahb; vmahh, vmahf, vmahg
(define_insn "vec_vmah<mode>"
- [(set (match_operand:VI_HW 0 "register_operand" "=v")
- (unspec:VI_HW [(match_operand:VI_HW 1 "register_operand" "v")
- (match_operand:VI_HW 2 "register_operand" "v")
- (match_operand:VI_HW 3 "register_operand" "v")]
- UNSPEC_VEC_VMAH))]
+ [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
+ (unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
+ (match_operand:VI_HW_QHS 2 "register_operand" "v")
+ (match_operand:VI_HW_QHS 3 "register_operand" "v")]
+ UNSPEC_VEC_VMAH))]
"TARGET_VX"
"vmah<bhfgq>\t%v0,%v1,%v2,%v3"
[(set_attr "op_type" "VRR")])
; vmalhb; vmalhh, vmalhf, vmalhg
(define_insn "vec_vmalh<mode>"
- [(set (match_operand:VI_HW 0 "register_operand" "=v")
- (unspec:VI_HW [(match_operand:VI_HW 1 "register_operand" "v")
- (match_operand:VI_HW 2 "register_operand" "v")
- (match_operand:VI_HW 3 "register_operand" "v")]
- UNSPEC_VEC_VMALH))]
+ [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
+ (unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
+ (match_operand:VI_HW_QHS 2 "register_operand" "v")
+ (match_operand:VI_HW_QHS 3 "register_operand" "v")]
+ UNSPEC_VEC_VMALH))]
"TARGET_VX"
"vmalh<bhfgq>\t%v0,%v1,%v2,%v3"
[(set_attr "op_type" "VRR")])
diff --git a/gcc/config/sh/sh-protos.h b/gcc/config/sh/sh-protos.h
index 5a552e2be8..081f1ceed2 100644
--- a/gcc/config/sh/sh-protos.h
+++ b/gcc/config/sh/sh-protos.h
@@ -93,6 +93,7 @@ extern rtx sh_fsca_sf2int (void);
extern rtx sh_fsca_int2sf (void);
/* Declare functions defined in sh.c and used in templates. */
+extern bool sh_lra_p (void);
extern const char *output_branch (int, rtx_insn *, rtx *);
extern const char *output_ieee_ccmpeq (rtx_insn *, rtx *);
@@ -191,18 +192,19 @@ sh_find_set_of_reg (rtx reg, rtx_insn* insn, F stepfunc,
if (!REG_P (reg) || insn == NULL_RTX)
return result;
- rtx_insn* previnsn = insn;
-
- for (result.insn = stepfunc (insn); result.insn != NULL_RTX;
- previnsn = result.insn, result.insn = stepfunc (result.insn))
+ for (rtx_insn* i = stepfunc (insn); i != NULL_RTX; i = stepfunc (i))
{
- if (BARRIER_P (result.insn))
+ if (BARRIER_P (i))
break;
- if (!NONJUMP_INSN_P (result.insn))
- continue;
- if (reg_set_p (reg, result.insn))
+ if (!INSN_P (i) || DEBUG_INSN_P (i))
+ continue;
+ if (reg_set_p (reg, i))
{
- result.set_rtx = set_of (reg, result.insn);
+ if (CALL_P (i))
+ break;
+
+ result.insn = i;
+ result.set_rtx = set_of (reg, i);
if (result.set_rtx == NULL_RTX || GET_CODE (result.set_rtx) != SET)
break;
@@ -225,12 +227,6 @@ sh_find_set_of_reg (rtx reg, rtx_insn* insn, F stepfunc,
}
}
- /* If the loop above stopped at the first insn in the list,
- result.insn will be null. Use the insn from the previous iteration
- in this case. */
- if (result.insn == NULL)
- result.insn = previnsn;
-
if (result.set_src != NULL)
gcc_assert (result.insn != NULL && result.set_rtx != NULL);
@@ -310,6 +306,8 @@ extern bool sh_insn_operands_modified_between_p (rtx_insn* operands_insn,
extern bool sh_reg_dead_or_unused_after_insn (const rtx_insn* i, int regno);
extern void sh_remove_reg_dead_or_unused_notes (rtx_insn* i, int regno);
extern rtx_insn* sh_check_add_incdec_notes (rtx_insn* i);
+extern rtx sh_remove_overlapping_post_inc (rtx dst, rtx src);
+extern rtx_insn* sh_peephole_emit_move_insn (rtx dst, rtx src);
extern bool sh_in_recog_treg_set_expr (void);
extern bool sh_recog_treg_set_expr (rtx op, machine_mode mode);
diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c
index 651ee1d341..3bb81fc1cd 100644
--- a/gcc/config/sh/sh.c
+++ b/gcc/config/sh/sh.c
@@ -222,6 +222,7 @@ static bool noncall_uses_reg (rtx, rtx_insn *, rtx *);
static rtx_insn *gen_block_redirect (rtx_insn *, int, int);
static void sh_reorg (void);
static void sh_option_override (void);
+static void sh_override_options_after_change (void);
static void output_stack_adjust (int, rtx, int, HARD_REG_SET *, bool);
static rtx_insn *frame_insn (rtx);
static rtx push (int);
@@ -236,7 +237,6 @@ static int sh_mode_after (int, int, rtx_insn *);
static int sh_mode_entry (int);
static int sh_mode_exit (int);
static int sh_mode_priority (int entity, int n);
-static bool sh_lra_p (void);
static rtx mark_constant_pool_use (rtx);
static tree sh_handle_interrupt_handler_attribute (tree *, tree, tree,
@@ -413,6 +413,10 @@ static const struct attribute_spec sh_attribute_table[] =
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE sh_option_override
+#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
+#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
+ sh_override_options_after_change
+
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND sh_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
@@ -1065,42 +1069,6 @@ sh_option_override (void)
TARGET_ACCUMULATE_OUTGOING_ARGS = 1;
}
- /* Adjust loop, jump and function alignment values (in bytes), if those
- were not specified by the user using -falign-loops, -falign-jumps
- and -falign-functions options.
- 32 bit alignment is better for speed, because instructions can be
- fetched as a pair from a longword boundary. For size use 16 bit
- alignment to get more compact code.
- Aligning all jumps increases the code size, even if it might
- result in slightly faster code. Thus, it is set to the smallest
- alignment possible if not specified by the user. */
- if (align_loops == 0)
- {
- if (TARGET_SH5)
- align_loops = 8;
- else
- align_loops = optimize_size ? 2 : 4;
- }
-
- if (align_jumps == 0)
- {
- if (TARGET_SHMEDIA)
- align_jumps = 1 << CACHE_LOG;
- else
- align_jumps = 2;
- }
- else if (align_jumps < (TARGET_SHMEDIA ? 4 : 2))
- align_jumps = TARGET_SHMEDIA ? 4 : 2;
-
- if (align_functions == 0)
- {
- if (TARGET_SHMEDIA)
- align_functions = optimize_size
- ? FUNCTION_BOUNDARY/8 : (1 << CACHE_LOG);
- else
- align_functions = optimize_size ? 2 : 4;
- }
-
/* The linker relaxation code breaks when a function contains
alignments that are larger than that at the start of a
compilation unit. */
@@ -1144,6 +1112,8 @@ sh_option_override (void)
if (flag_strict_volatile_bitfields < 0 && abi_version_at_least(2))
flag_strict_volatile_bitfields = 1;
+ sh_override_options_after_change ();
+
/* Parse atomic model option and make sure it is valid for the current
target CPU. */
selected_atomic_model_
@@ -1151,6 +1121,62 @@ sh_option_override (void)
register_sh_passes ();
}
+
+/* Implement targetm.override_options_after_change. */
+
+static void
+sh_override_options_after_change (void)
+{
+ /* Adjust loop, jump and function alignment values (in bytes), if those
+ were not specified by the user using -falign-loops, -falign-jumps
+ and -falign-functions options.
+ 32 bit alignment is better for speed, because instructions can be
+ fetched as a pair from a longword boundary. For size use 16 bit
+ alignment to get more compact code.
+ Aligning all jumps increases the code size, even if it might
+ result in slightly faster code. Thus, it is set to the smallest
+ alignment possible if not specified by the user. */
+ if (align_loops == 0)
+ {
+ if (TARGET_SH5)
+ align_loops = 8;
+ else
+ align_loops = optimize_size ? 2 : 4;
+ }
+
+ if (align_jumps == 0)
+ {
+ if (TARGET_SHMEDIA)
+ align_jumps = 1 << CACHE_LOG;
+ else
+ align_jumps = 2;
+ }
+ else if (align_jumps < (TARGET_SHMEDIA ? 4 : 2))
+ align_jumps = TARGET_SHMEDIA ? 4 : 2;
+
+ if (align_functions == 0)
+ {
+ if (TARGET_SHMEDIA)
+ align_functions = optimize_size
+ ? FUNCTION_BOUNDARY/8 : (1 << CACHE_LOG);
+ else
+ align_functions = optimize_size ? 2 : 4;
+ }
+
+ /* The linker relaxation code breaks when a function contains
+ alignments that are larger than that at the start of a
+ compilation unit. */
+ if (TARGET_RELAX)
+ {
+ int min_align = align_loops > align_jumps ? align_loops : align_jumps;
+
+ /* Also take possible .long constants / mova tables into account. */
+ if (min_align < 4)
+ min_align = 4;
+ if (align_functions < min_align)
+ align_functions = min_align;
+ }
+}
/* Print the operand address in x to the stream. */
static void
@@ -13787,6 +13813,34 @@ sh_check_add_incdec_notes (rtx_insn* i)
return i;
}
+/* Given a move insn destination and a source, make sure that the move source
+ operand is not a post-inc mem load with the same address reg as the
+ destination. Returns the modified source operand with the post-inc removed
+ if necessary. */
+rtx
+sh_remove_overlapping_post_inc (rtx dst, rtx src)
+{
+ if (!MEM_P (src))
+ return src;
+
+ rtx addr = XEXP (src, 0);
+
+ if (GET_CODE (addr) == POST_INC
+ && reg_overlap_mentioned_p (XEXP (addr, 0), dst))
+ return replace_equiv_address (src, XEXP (addr, 0));
+
+ gcc_assert (GET_CODE (addr) != POST_MODIFY);
+ return src;
+}
+
+/* Emit a move insn that is safe to be used in peephole patterns. */
+rtx_insn*
+sh_peephole_emit_move_insn (rtx dst, rtx src)
+{
+ return sh_check_add_incdec_notes (
+ emit_move_insn (dst, sh_remove_overlapping_post_inc (dst, src)));
+}
+
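A sketch of the case the helper above handles, using a hypothetical hard register number:

rtx addr_reg = gen_rtx_REG (SImode, 1);
rtx src = gen_rtx_MEM (SImode, gen_rtx_POST_INC (SImode, addr_reg));
rtx dst = addr_reg;

/* dst overlaps the post-incremented address register, so the POST_INC
   side effect is dropped and a plain (mem:SI (reg:SI 1)) source is
   returned; otherwise src comes back unchanged.  */
src = sh_remove_overlapping_post_inc (dst, src);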
/* Given an op rtx and an insn, try to find out whether the result of the
specified op consists only of logical operations on T bit stores. */
bool
@@ -13886,6 +13940,7 @@ sh_split_movrt_negc_to_movt_xor (rtx_insn* curr_insn, rtx operands[])
&& !sh_insn_operands_modified_between_p (t_before_negc.insn,
t_before_negc.insn,
t_after_negc.insn)
+ && !modified_between_p (get_t_reg_rtx (), curr_insn, t_after_negc.insn)
&& !sh_unspec_insn_p (t_after_negc.insn)
&& !volatile_insn_p (PATTERN (t_after_negc.insn))
&& !side_effects_p (PATTERN (t_after_negc.insn))
@@ -13992,6 +14047,9 @@ sh_extending_set_of_reg::use_as_extended_reg (rtx_insn* use_at_insn) const
else
{
rtx extension_dst = XEXP (set_rtx, 0);
+ if (GET_MODE (extension_dst) != SImode)
+ extension_dst = simplify_gen_subreg (SImode, extension_dst,
+ GET_MODE (extension_dst), 0);
if (modified_between_p (extension_dst, insn, use_at_insn))
{
if (dump_file)
@@ -14162,6 +14220,12 @@ sh_recog_treg_set_expr (rtx op, machine_mode mode)
if (!can_create_pseudo_p ())
return false;
+ /* expand_debug_locations may call this to compute rtx costs at
+     a very early stage.  In that case, don't make new insns here to
+ avoid codegen differences with -g. */
+ if (currently_expanding_to_rtl)
+ return false;
+
/* We are going to invoke recog in a re-entrant way and thus
have to capture its current state and restore it afterwards. */
recog_data_d prev_recog_data = recog_data;
@@ -14446,7 +14510,7 @@ sh_mode_priority (int entity ATTRIBUTE_UNUSED, int n)
*/
/* Return true if we use LRA instead of reload pass. */
-static bool
+bool
sh_lra_p (void)
{
return sh_lra_flag;
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md
index f445e73127..a3154d68ab 100644
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -2082,17 +2082,18 @@
})
(define_expand "addsi3"
- [(set (match_operand:SI 0 "arith_reg_operand" "")
- (plus:SI (match_operand:SI 1 "arith_operand" "")
- (match_operand:SI 2 "arith_or_int_operand" "")))]
+ [(set (match_operand:SI 0 "arith_reg_dest")
+ (plus:SI (match_operand:SI 1 "arith_reg_operand")
+ (match_operand:SI 2 "arith_or_int_operand")))]
""
{
- if (TARGET_SHMEDIA)
- operands[1] = force_reg (SImode, operands[1]);
- else if (! arith_operand (operands[2], SImode))
+ if (TARGET_SH1 && !arith_operand (operands[2], SImode))
{
- if (reg_overlap_mentioned_p (operands[0], operands[1]))
- FAIL;
+ if (!sh_lra_p () || reg_overlap_mentioned_p (operands[0], operands[1]))
+ {
+ emit_insn (gen_addsi3_scr (operands[0], operands[1], operands[2]));
+ DONE;
+ }
}
})
@@ -2128,18 +2129,22 @@
;; copy or constant load before the actual add insn.
;; Use u constraint for that case to avoid the invalid value in the stack
;; pointer.
-(define_insn_and_split "*addsi3_compact"
+;; This also results in better code when LRA is not used. However, we have
+;; to use different sets of patterns and the order of these patterns is
+;; important.
+;; In some cases the constant zero might end up in operands[2] of the
+;; patterns. We have to accept that and convert it into a reg-reg move.
+(define_insn_and_split "*addsi3_compact_lra"
[(set (match_operand:SI 0 "arith_reg_dest" "=r,&u")
- (plus:SI (match_operand:SI 1 "arith_operand" "%0,r")
+ (plus:SI (match_operand:SI 1 "arith_reg_operand" "%0,r")
(match_operand:SI 2 "arith_or_int_operand" "rI08,rn")))]
- "TARGET_SH1
- && ((rtx_equal_p (operands[0], operands[1])
- && arith_operand (operands[2], SImode))
- || ! reg_overlap_mentioned_p (operands[0], operands[1]))"
+ "TARGET_SH1 && sh_lra_p ()
+ && (! reg_overlap_mentioned_p (operands[0], operands[1])
+ || arith_operand (operands[2], SImode))"
"@
add %2,%0
#"
- "reload_completed
+ "&& reload_completed
&& ! reg_overlap_mentioned_p (operands[0], operands[1])"
[(set (match_dup 0) (match_dup 2))
(set (match_dup 0) (plus:SI (match_dup 0) (match_dup 1)))]
@@ -2150,6 +2155,98 @@
}
[(set_attr "type" "arith")])
+(define_insn_and_split "addsi3_scr"
+ [(set (match_operand:SI 0 "arith_reg_dest" "=r,&u,&u")
+ (plus:SI (match_operand:SI 1 "arith_reg_operand" "%0,r,r")
+ (match_operand:SI 2 "arith_or_int_operand" "rI08,r,n")))
+ (clobber (match_scratch:SI 3 "=X,X,&u"))]
+ "TARGET_SH1"
+ "@
+ add %2,%0
+ #
+ #"
+ "&& reload_completed"
+ [(set (match_dup 0) (plus:SI (match_dup 0) (match_dup 2)))]
+{
+ if (operands[2] == const0_rtx)
+ {
+ emit_move_insn (operands[0], operands[1]);
+ DONE;
+ }
+
+ if (CONST_INT_P (operands[2]) && !satisfies_constraint_I08 (operands[2]))
+ {
+ if (reg_overlap_mentioned_p (operands[0], operands[1]))
+ {
+ emit_move_insn (operands[3], operands[2]);
+ emit_move_insn (operands[0], operands[1]);
+ operands[2] = operands[3];
+ }
+ else
+ {
+ emit_move_insn (operands[0], operands[2]);
+ operands[2] = operands[1];
+ }
+ }
+ else if (!reg_overlap_mentioned_p (operands[0], operands[1]))
+ {
+ if (!reg_overlap_mentioned_p (operands[0], operands[2]))
+ emit_move_insn (operands[0], operands[1]);
+ else
+ operands[2] = operands[1];
+ }
+}
+ [(set_attr "type" "arith")])
+
+;; Old reload might generate add insns directly (not through the expander) for
+;; the memory address of complex insns like atomic insns when reloading.
+(define_insn_and_split "*addsi3"
+ [(set (match_operand:SI 0 "arith_reg_dest" "=r")
+ (plus:SI (match_operand:SI 1 "arith_reg_operand" "r")
+ (match_operand:SI 2 "arith_or_int_operand" "rn")))]
+ "TARGET_SH1 && !sh_lra_p ()
+ && reload_completed
+ && !reg_overlap_mentioned_p (operands[0], operands[1])"
+ "#"
+ "&& 1"
+ [(set (match_dup 0) (plus:SI (match_dup 0) (match_dup 2)))]
+{
+ if (operands[2] == const0_rtx)
+ {
+ emit_move_insn (operands[0], operands[1]);
+ DONE;
+ }
+
+ if (CONST_INT_P (operands[2]))
+ {
+ if (satisfies_constraint_I08 (operands[2]))
+ emit_move_insn (operands[0], operands[1]);
+ else
+ {
+ emit_move_insn (operands[0], operands[2]);
+ operands[2] = operands[1];
+ }
+ }
+ else if (!reg_overlap_mentioned_p (operands[0], operands[2]))
+ emit_move_insn (operands[0], operands[1]);
+ else
+ operands[2] = operands[1];
+})
+
+(define_insn_and_split "*addsi3"
+ [(set (match_operand:SI 0 "arith_reg_dest" "=r,r")
+ (plus:SI (match_operand:SI 1 "arith_reg_operand" "%0,r")
+ (match_operand:SI 2 "arith_operand" "rI08,Z")))]
+ "TARGET_SH1 && !sh_lra_p ()"
+ "@
+ add %2,%0
+ #"
+ "&& operands[2] == const0_rtx"
+ [(set (match_dup 0) (match_dup 1))]
+{
+}
+ [(set_attr "type" "arith")])
+
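For illustration, the decision tree these splitters implement can be
sketched in plain C (a sketch only: the real code manipulates rtl and
emits moves, and all names here are invented):

    /* Minimal model of the addsi3 split logic, assuming an "add" that
       only takes a small immediate or a register operand.  */
    #include <stdio.h>

    static void
    split_add (int *dest, const int *src, int cnst, int *scratch)
    {
      if (cnst == 0)
        *dest = *src;        /* constant zero becomes a reg-reg move */
      else if (dest == src)
        {
          /* Overlap: the constant must go through a scratch register,
             then the add happens in place.  */
          *scratch = cnst;
          *dest += *scratch;
        }
      else
        {
          /* No overlap: load the constant into dest, then add src.  */
          *dest = cnst;
          *dest += *src;
        }
    }

    int
    main (void)
    {
      int r1 = 5, scratch;
      split_add (&r1, &r1, 1000, &scratch);  /* overlapping case */
      printf ("%d\n", r1);                   /* prints 1005 */
      return 0;
    }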
;; -------------------------------------------------------------------------
;; Subtraction instructions
;; -------------------------------------------------------------------------
@@ -7851,6 +7948,24 @@ label:
""
{
prepare_move_operands (operands, DImode);
+ if (TARGET_SH1)
+ {
+ /* When the dest operand is the (R0, R1) register pair, split it into
+ two movsi insns whose dests are R1 and R0, respectively, so as to
+ lower the R0 register pressure during the first movsi. Apply this
+ only to simple sources to avoid creating complex rtl here. */
+ if (REG_P (operands[0])
+ && REGNO (operands[0]) == R0_REG
+ && REG_P (operands[1])
+ && REGNO (operands[1]) >= FIRST_PSEUDO_REGISTER)
+ {
+ emit_insn (gen_movsi (gen_rtx_REG (SImode, R1_REG),
+ gen_rtx_SUBREG (SImode, operands[1], 4)));
+ emit_insn (gen_movsi (gen_rtx_REG (SImode, R0_REG),
+ gen_rtx_SUBREG (SImode, operands[1], 0)));
+ DONE;
+ }
+ }
})
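The R0/R1 ordering can be pictured with a plain C stand-in (illustrative
only; which subreg offset holds the high or low word depends on the
endianness of the target):

    #include <stdint.h>

    /* Split one 64-bit move into two 32-bit halves, writing the R1 half
       first so that R0 is live for the shortest possible span.  */
    struct regpair { uint32_t r0, r1; };

    static void
    move_di (struct regpair *dest, uint64_t src)
    {
      dest->r1 = (uint32_t) (src >> 32);  /* one word -> R1 first */
      dest->r0 = (uint32_t) src;          /* other word -> R0 last */
    }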
(define_insn "movdf_media"
@@ -9404,7 +9519,7 @@ label:
(use (reg:SI FPSCR_MODES_REG))
(use (reg:SI PIC_REG))
(clobber (reg:SI PR_REG))
- (clobber (match_scratch:SI 2 "=r"))]
+ (clobber (match_scratch:SI 2 "=&r"))]
"TARGET_SH2"
"#"
"reload_completed"
@@ -9538,7 +9653,7 @@ label:
(use (reg:SI FPSCR_MODES_REG))
(use (reg:SI PIC_REG))
(clobber (reg:SI PR_REG))
- (clobber (match_scratch:SI 3 "=r"))]
+ (clobber (match_scratch:SI 3 "=&r"))]
"TARGET_SH2"
"#"
"reload_completed"
@@ -9939,7 +10054,7 @@ label:
[(call (mem:SI (match_operand:SI 0 "symbol_ref_operand" ""))
(match_operand 1 "" ""))
(use (reg:SI FPSCR_MODES_REG))
- (clobber (match_scratch:SI 2 "=k"))
+ (clobber (match_scratch:SI 2 "=&k"))
(return)]
"TARGET_SH2"
"#"
@@ -10131,7 +10246,7 @@ label:
(call (mem:SI (match_operand:SI 1 "symbol_ref_operand" ""))
(match_operand 2 "" "")))
(use (reg:SI FPSCR_MODES_REG))
- (clobber (match_scratch:SI 3 "=k"))
+ (clobber (match_scratch:SI 3 "=&k"))
(return)]
"TARGET_SH2"
"#"
@@ -10579,7 +10694,7 @@ label:
if (TARGET_SHMEDIA)
{
rtx tr = gen_rtx_REG (Pmode, TR0_REG);
- rtx pic = operands[0];
+ rtx pic = operands[1];
rtx lab = PATTERN (gen_call_site ());
rtx insn, equiv;
@@ -14626,7 +14741,7 @@ label:
[(const_int 0)]
{
emit_insn (gen_addsi3 (operands[1], operands[1], operands[2]));
- sh_check_add_incdec_notes (emit_move_insn (operands[3], operands[1]));
+ sh_peephole_emit_move_insn (operands[3], operands[1]);
})
;; mov.l @(r0,r9),r1
@@ -14639,7 +14754,7 @@ label:
"TARGET_SH1 && peep2_reg_dead_p (2, operands[0])"
[(const_int 0)]
{
- sh_check_add_incdec_notes (emit_move_insn (operands[2], operands[1]));
+ sh_peephole_emit_move_insn (operands[2], operands[1]);
})
(define_peephole2
@@ -14650,7 +14765,7 @@ label:
"TARGET_SH1 && peep2_reg_dead_p (2, operands[0])"
[(const_int 0)]
{
- sh_check_add_incdec_notes (emit_move_insn (operands[2], operands[1]));
+ sh_peephole_emit_move_insn (operands[2], operands[1]);
})
(define_peephole2
@@ -14662,7 +14777,7 @@ label:
[(const_int 0)]
{
sh_check_add_incdec_notes (emit_insn (gen_extend<mode>si2 (operands[2],
- operands[1])));
+ sh_remove_overlapping_post_inc (operands[2], operands[1]))));
})
;; mov.w @(18,r1),r0 (r0 = HImode)
@@ -14692,8 +14807,9 @@ label:
// We don't know what the new set insn will be in detail. Just make sure
// that it still can be recognized and the constraints are satisfied.
- rtx_insn* i = emit_insn (gen_rtx_SET (VOIDmode, operands[2], operands[3]));
-
+ rtx_insn* i = emit_insn (gen_rtx_SET (VOIDmode, operands[2],
+ sh_remove_overlapping_post_inc (operands[2], operands[3])));
+
recog_data_d prev_recog_data = recog_data;
bool i_invalid = insn_invalid_p (i, false);
recog_data = prev_recog_data;
@@ -14731,7 +14847,8 @@ label:
{
// We don't know what the new set insn will be in detail. Just make sure
// that it still can be recognized and the constraints are satisfied.
- rtx_insn* i = emit_insn (gen_rtx_SET (VOIDmode, operands[2], operands[3]));
+ rtx_insn* i = emit_insn (gen_rtx_SET (VOIDmode, operands[2],
+ sh_remove_overlapping_post_inc (operands[2], operands[3])));
recog_data_d prev_recog_data = recog_data;
bool i_invalid = insn_invalid_p (i, false);
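The helper used in the peepholes above strips a post-increment whose
address register is clobbered by the load's destination.  A plain C
sketch of the hazard (illustrative; the helper itself rewrites rtl):

    #include <stdio.h>

    /* A post-increment load: returns the value and bumps the address
       register as a side effect.  */
    static int
    load_postinc (int **addrreg)
    {
      int v = **addrreg;
      ++*addrreg;
      return v;
    }

    int
    main (void)
    {
      int mem[2] = { 10, 20 };
      int *p = mem;
      int r = load_postinc (&p);   /* safe: r and p are distinct */
      printf ("%d %d\n", r, *p);   /* prints "10 20" */

      /* The unsafe shape, as in "mov.l @rN+,rN": the increment of rN is
         dead because the loaded value overwrites rN, so the peephole
         must emit a plain load without the post-increment.  */
      return 0;
    }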
diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
index 4dceb16d4c..a08ad3c155 100644
--- a/gcc/config/sol2.h
+++ b/gcc/config/sol2.h
@@ -154,21 +154,41 @@ along with GCC; see the file COPYING3. If not see
#define STARTFILE_ARCH_SPEC "%{ansi:values-Xc.o%s} \
%{!ansi:values-Xa.o%s}"
+#if defined(HAVE_LD_PIE) && defined(HAVE_SOLARIS_CRTS)
+#define STARTFILE_CRTBEGIN_SPEC "%{shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
+#else
+#define STARTFILE_CRTBEGIN_SPEC "crtbegin.o%s"
+#endif
+
/* We don't use the standard svr4 STARTFILE_SPEC because it's wrong for us. */
#undef STARTFILE_SPEC
-#define STARTFILE_SPEC "%{!shared: \
- %{!symbolic: \
- %{p:mcrt1.o%s} \
- %{!p: \
- %{pg:gcrt1.o%s gmon.o%s} \
- %{!pg:crt1.o%s}}}} \
- crti.o%s %(startfile_arch) \
- crtbegin.o%s"
+#ifdef HAVE_SOLARIS_CRTS
+/* Since Solaris 11.x and Solaris 12, the OS delivers crt1.o, crti.o, and
+ crtn.o, with a hook for compiler-dependent code such as profile handling. */
+#define STARTFILE_SPEC "%{!shared:%{!symbolic: \
+ crt1.o%s \
+ %{p:%e-p is not supported; \
+ pg:crtpg.o%s gmon.o%s; \
+ :crtp.o%s}}} \
+ crti.o%s %(startfile_arch) %(startfile_crtbegin)"
+#else
+#define STARTFILE_SPEC "%{!shared:%{!symbolic: \
+ %{p:mcrt1.o%s; \
+ pg:gcrt1.o%s gmon.o%s; \
+ :crt1.o%s}}} \
+ crti.o%s %(startfile_arch) %(startfile_crtbegin)"
+#endif
+
+#if defined(HAVE_LD_PIE) && defined(HAVE_SOLARIS_CRTS)
+#define ENDFILE_CRTEND_SPEC "%{shared|pie:crtendS.o%s;:crtend.o%s}"
+#else
+#define ENDFILE_CRTEND_SPEC "crtend.o%s"
+#endif
#undef ENDFILE_SPEC
#define ENDFILE_SPEC \
"%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
- crtend.o%s crtn.o%s"
+ %(endfile_arch) %(endfile_crtend) crtn.o%s"
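Roughly, the new specs select startup and termination objects as follows
(illustrative expansions, assuming HAVE_SOLARIS_CRTS and HAVE_LD_PIE are
both defined; values-Xa.o comes from %(startfile_arch)):

    gcc foo.c         ->  crt1.o crtp.o crti.o values-Xa.o crtbegin.o
                          ... crtend.o crtn.o
    gcc -pg foo.c     ->  crt1.o crtpg.o gmon.o crti.o values-Xa.o
                          crtbegin.o ... crtend.o crtn.o
    gcc -shared ...   ->  crti.o values-Xa.o crtbeginS.o ... crtendS.o
                          crtn.o
    gcc -p foo.c      ->  error: -p is not supported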
#undef LINK_ARCH32_SPEC_BASE
#define LINK_ARCH32_SPEC_BASE \
@@ -241,11 +261,14 @@ along with GCC; see the file COPYING3. If not see
#undef SUBTARGET_EXTRA_SPECS
#define SUBTARGET_EXTRA_SPECS \
- { "startfile_arch", STARTFILE_ARCH_SPEC }, \
- { "link_arch32", LINK_ARCH32_SPEC }, \
- { "link_arch64", LINK_ARCH64_SPEC }, \
- { "link_arch_default", LINK_ARCH_DEFAULT_SPEC }, \
- { "link_arch", LINK_ARCH_SPEC }, \
+ { "startfile_arch", STARTFILE_ARCH_SPEC }, \
+ { "startfile_crtbegin", STARTFILE_CRTBEGIN_SPEC }, \
+ { "link_arch32", LINK_ARCH32_SPEC }, \
+ { "link_arch64", LINK_ARCH64_SPEC }, \
+ { "link_arch_default", LINK_ARCH_DEFAULT_SPEC }, \
+ { "link_arch", LINK_ARCH_SPEC }, \
+ { "endfile_arch", ENDFILE_ARCH_SPEC }, \
+ { "endfile_crtend", ENDFILE_CRTEND_SPEC }, \
SUBTARGET_CPU_EXTRA_SPECS
/* C++11 programs need -lrt for nanosleep. */
@@ -300,6 +323,20 @@ along with GCC; see the file COPYING3. If not see
#endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */
#endif
+#if defined(HAVE_LD_PIE) && defined(HAVE_SOLARIS_CRTS)
+#ifdef USE_GLD
+/* Assert -z text by default to match Solaris ld. */
+#define LINK_PIE_SPEC "%{pie:-pie %{!mimpure-text:-z text}} "
+#else
+/* Solaris ld needs -z type=pie instead of -pie. */
+#define LINK_PIE_SPEC "%{pie:-z type=pie %{mimpure-text:-z textoff}} "
+#endif
+#else
+/* Error out if some part of PIE support is missing. */
+#define LINK_PIE_SPEC \
+ "%{no-pie:} %{pie:%e-pie is not supported in this configuration} "
+#endif
+
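With full PIE support configured, -pie thus reaches the linker roughly
as follows (illustrative):

    GNU ld:      gcc -pie foo.c  ->  ld -pie -z text ...
    Solaris ld:  gcc -pie foo.c  ->  ld -z type=pie ...
    otherwise:   gcc -pie foo.c  ->  error: -pie is not supported in
                                     this configuration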
/* collect2.c can only parse GNU nm -n output. Solaris nm needs -png to
produce the same format. */
#define NM_FLAGS "-png"
diff --git a/gcc/config/sparc/driver-sparc.c b/gcc/config/sparc/driver-sparc.c
index 778de2ccde..5969735c41 100644
--- a/gcc/config/sparc/driver-sparc.c
+++ b/gcc/config/sparc/driver-sparc.c
@@ -73,6 +73,7 @@ static const struct cpu_names {
{ "UltraSparc T2", "niagara2" },
{ "UltraSparc T3", "niagara3" },
{ "UltraSparc T4", "niagara4" },
+ { "LEON", "leon3" },
#endif
{ NULL, NULL }
};
diff --git a/gcc/config/sparc/sol2.h b/gcc/config/sparc/sol2.h
index 5a2144261c..0d2570347f 100644
--- a/gcc/config/sparc/sol2.h
+++ b/gcc/config/sparc/sol2.h
@@ -280,6 +280,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define SUBTARGET_CPU_EXTRA_SPECS
+#define ENDFILE_ARCH_SPEC ""
+
/* Register the Solaris-specific #pragma directives. */
diff --git a/gcc/config/sparc/sp-elf.h b/gcc/config/sparc/sp-elf.h
index 85da652a99..4835653e04 100644
--- a/gcc/config/sparc/sp-elf.h
+++ b/gcc/config/sparc/sp-elf.h
@@ -53,6 +53,10 @@ along with GCC; see the file COPYING3. If not see
#undef ASM_GENERATE_INTERNAL_LABEL
#define ASM_GENERATE_INTERNAL_LABEL(LABEL,PREFIX,NUM) \
sprintf ((LABEL), "*.L%s%ld", (PREFIX), (long)(NUM))
+
+/* We use GNU ld, so undefine these so that attribute((init_priority)) works. */
+#undef CTORS_SECTION_ASM_OP
+#undef DTORS_SECTION_ASM_OP
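For context, init_priority orders dynamic initialization across
translation units and needs .init_array handling; the analogous C-level
mechanism is constructor priorities (a minimal sketch, not tied to this
patch; init_priority itself is C++-only):

    #include <stdio.h>

    /* Priority-ordered initializers; lower numbers run first.  With GNU
       ld these land in .init_array, where the ordering is honored.  */
    __attribute__ ((constructor (101))) static void
    early (void)
    {
      puts ("early");
    }

    __attribute__ ((constructor (202))) static void
    late (void)
    {
      puts ("late");
    }

    int
    main (void)
    {
      return 0;  /* "early" then "late" print before main runs */
    }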
/* ??? Inherited from sol2.h. Probably wrong. */
#undef WCHAR_TYPE
diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index 1fd3c1eb35..61c7c311c4 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -7412,9 +7412,10 @@ sparc_function_value_1 (const_tree type, machine_mode mode,
mode = word_mode;
}
- /* We should only have pointer and integer types at this point. This must
- match sparc_promote_function_mode. */
+ /* We should only have pointer and integer types at this point, except with
+ -freg-struct-return. This must match sparc_promote_function_mode. */
else if (TARGET_ARCH32
+ && !(type && AGGREGATE_TYPE_P (type))
&& mclass == MODE_INT
&& GET_MODE_SIZE (mode) < UNITS_PER_WORD)
mode = word_mode;
@@ -7455,7 +7456,7 @@ sparc_libcall_value (machine_mode mode,
static bool
sparc_function_value_regno_p (const unsigned int regno)
{
- return (regno == 8 || regno == 32);
+ return (regno == 8 || (TARGET_FPU && regno == 32));
}
/* Do what is necessary for `va_start'. We look at the current function
@@ -11678,9 +11679,8 @@ sparc_emit_membar_for_model (enum memmodel model,
if (before_after & 1)
{
- if (model == MEMMODEL_RELEASE
- || model == MEMMODEL_ACQ_REL
- || model == MEMMODEL_SEQ_CST)
+ if (is_mm_release (model) || is_mm_acq_rel (model)
+ || is_mm_seq_cst (model))
{
if (load_store & 1)
mm |= LoadLoad | StoreLoad;
@@ -11690,9 +11690,8 @@ sparc_emit_membar_for_model (enum memmodel model,
}
if (before_after & 2)
{
- if (model == MEMMODEL_ACQUIRE
- || model == MEMMODEL_ACQ_REL
- || model == MEMMODEL_SEQ_CST)
+ if (is_mm_acquire (model) || is_mm_acq_rel (model)
+ || is_mm_seq_cst (model))
{
if (load_store & 1)
mm |= LoadLoad | LoadStore;
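The is_mm_* predicates match the base memory model while ignoring flag
bits (such as the one marking __sync-style operations), which the plain
equality tests missed.  A hedged sketch of the idea (the real values and
helpers live in gcc/memmodel.h and may differ):

    /* Memory models carry flag bits on top of a small base value, so
       classification must mask before comparing.  */
    enum
    {
      MM_SEQ_CST = 5,      /* assumed base value, for illustration */
      MM_BASE_MASK = 7,
      MM_SYNC = 8          /* hypothetical flag bit for __sync variants */
    };

    static int
    model_is_seq_cst (int model)
    {
      /* Both MM_SEQ_CST and (MM_SEQ_CST | MM_SYNC) qualify.  */
      return (model & MM_BASE_MASK) == MM_SEQ_CST;
    }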
diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md
index 700704434a..98ff5c999b 100644
--- a/gcc/config/sparc/sparc.md
+++ b/gcc/config/sparc/sparc.md
@@ -6398,7 +6398,6 @@
""
{
rtx valreg1 = gen_rtx_REG (DImode, 8);
- rtx valreg2 = gen_rtx_REG (TARGET_ARCH64 ? TFmode : DFmode, 32);
rtx result = operands[1];
/* Pass constm1 to indicate that it may expect a structure value, but
@@ -6407,8 +6406,12 @@
/* Save the function value registers. */
emit_move_insn (adjust_address (result, DImode, 0), valreg1);
- emit_move_insn (adjust_address (result, TARGET_ARCH64 ? TFmode : DFmode, 8),
- valreg2);
+ if (TARGET_FPU)
+ {
+ rtx valreg2 = gen_rtx_REG (TARGET_ARCH64 ? TFmode : DFmode, 32);
+ emit_move_insn (adjust_address (result, TARGET_ARCH64 ? TFmode : DFmode, 8),
+ valreg2);
+ }
/* The optimizer does not know that the call sets the function value
registers we stored in the result block. We avoid problems by
@@ -6620,7 +6623,6 @@
""
{
rtx valreg1 = gen_rtx_REG (DImode, 24);
- rtx valreg2 = gen_rtx_REG (TARGET_ARCH64 ? TFmode : DFmode, 32);
rtx result = operands[0];
if (! TARGET_ARCH64)
@@ -6637,14 +6639,18 @@
emit_insn (gen_update_return (rtnreg, value));
}
- /* Reload the function value registers. */
+ /* Reload the function value registers.
+ Put USE insns before the return. */
emit_move_insn (valreg1, adjust_address (result, DImode, 0));
- emit_move_insn (valreg2,
- adjust_address (result, TARGET_ARCH64 ? TFmode : DFmode, 8));
-
- /* Put USE insns before the return. */
emit_use (valreg1);
- emit_use (valreg2);
+
+ if (TARGET_FPU)
+ {
+ rtx valreg2 = gen_rtx_REG (TARGET_ARCH64 ? TFmode : DFmode, 32);
+ emit_move_insn (valreg2,
+ adjust_address (result, TARGET_ARCH64 ? TFmode : DFmode, 8));
+ emit_use (valreg2);
+ }
/* Construct the return. */
expand_naked_return ();
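These expanders back GCC's __builtin_apply/__builtin_return machinery;
with -mno-fpu the %f0 part of the return-value block is now skipped.  A
minimal user of that machinery (the 64-byte argument-block size is a
placeholder):

    /* Forward an arbitrary call; this expands through the
       untyped_call/untyped_return patterns.  */
    typedef void (*any_fn) ();

    void
    forward (any_fn callee)
    {
      void *args = __builtin_apply_args ();
      __builtin_return (__builtin_apply (callee, args, 64));
    }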
diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt
index 93d24a69be..85bf0bdb49 100644
--- a/gcc/config/sparc/sparc.opt
+++ b/gcc/config/sparc/sparc.opt
@@ -114,8 +114,8 @@ Target
Optimize tail call instructions in assembler and linker
muser-mode
-Target Report Mask(USER_MODE)
-Do not generate code that can only run in supervisor mode
+Target Report InverseMask(SV_MODE)
+Do not generate code that can only run in supervisor mode (default)
mcpu=
Target RejectNegative Joined Var(sparc_cpu_and_features) Enum(sparc_processor_type) Init(PROCESSOR_V7)
diff --git a/gcc/config/sparc/sync.md b/gcc/config/sparc/sync.md
index 7d00b1080b..2fabff5f67 100644
--- a/gcc/config/sparc/sync.md
+++ b/gcc/config/sparc/sync.md
@@ -222,10 +222,10 @@
UNSPECV_CAS))]
"TARGET_LEON3"
{
- if (TARGET_USER_MODE)
- return "casa\t%1 0xa, %2, %0"; /* ASI for user data space. */
- else
+ if (TARGET_SV_MODE)
return "casa\t%1 0xb, %2, %0"; /* ASI for supervisor data space. */
+ else
+ return "casa\t%1 0xa, %2, %0"; /* ASI for user data space. */
}
[(set_attr "type" "multi")])
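This insn implements the compare-and-swap behind the __atomic builtins
on LEON3; whether the user (0xa) or supervisor (0xb) ASI is emitted now
follows the -muser-mode / -mno-user-mode setting.  A minimal caller
(compile with -mcpu=leon3):

    #include <stdbool.h>

    /* Compiles down to the casa pattern above on LEON3.  */
    bool
    cas_int (int *p, int *expected, int desired)
    {
      return __atomic_compare_exchange_n (p, expected, desired, false,
                                          __ATOMIC_SEQ_CST,
                                          __ATOMIC_SEQ_CST);
    }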
diff --git a/gcc/config/sparc/t-rtems b/gcc/config/sparc/t-rtems
index adb6dcbaac..6f7cc6fabc 100644
--- a/gcc/config/sparc/t-rtems
+++ b/gcc/config/sparc/t-rtems
@@ -17,15 +17,20 @@
# <http://www.gnu.org/licenses/>.
#
-MULTILIB_OPTIONS = msoft-float mcpu=v8/mcpu=leon3/mcpu=leon3v7 muser-mode
-MULTILIB_DIRNAMES = soft v8 leon3 leon3v7 user-mode
+MULTILIB_OPTIONS = msoft-float mcpu=v8/mcpu=leon3/mcpu=leon3v7/mcpu=leon \
+ mfix-ut699/mfix-at697f
+MULTILIB_DIRNAMES = soft v8 leon3 leon3v7 leon ut699 at697f
MULTILIB_MATCHES = msoft-float=mno-fpu
-MULTILIB_EXCEPTIONS = muser-mode
-MULTILIB_EXCEPTIONS += mcpu=leon3
-MULTILIB_EXCEPTIONS += mcpu=leon3v7
-MULTILIB_EXCEPTIONS += msoft-float/mcpu=leon3
-MULTILIB_EXCEPTIONS += msoft-float/mcpu=leon3v7
-MULTILIB_EXCEPTIONS += msoft-float/muser-mode
-MULTILIB_EXCEPTIONS += msoft-float/mcpu=v8/muser-mode
-MULTILIB_EXCEPTIONS += mcpu=v8/muser-mode
+MULTILIB_EXCEPTIONS = mfix-ut699
+MULTILIB_EXCEPTIONS += msoft-float/mfix-ut699
+MULTILIB_EXCEPTIONS += msoft-float/mcpu=v8/mfix-ut699
+MULTILIB_EXCEPTIONS += msoft-float/mcpu=leon3*/mfix-ut699
+MULTILIB_EXCEPTIONS += mcpu=v8/mfix-ut699
+MULTILIB_EXCEPTIONS += mcpu=leon3*/mfix-ut699
+MULTILIB_EXCEPTIONS += mfix-at697f
+MULTILIB_EXCEPTIONS += msoft-float/mfix-at697f
+MULTILIB_EXCEPTIONS += msoft-float/mcpu=v8/mfix-at697f
+MULTILIB_EXCEPTIONS += msoft-float/mcpu=leon3*/mfix-at697f
+MULTILIB_EXCEPTIONS += mcpu=v8/mfix-at697f
+MULTILIB_EXCEPTIONS += mcpu=leon3*/mfix-at697f
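Reading the globs above, the net effect is that the -mfix-ut699 and
-mfix-at697f multilibs are built only in combination with -mcpu=leon
(with or without -msoft-float); every other pairing of the errata flags
is pruned.  For example (illustrative):

    mcpu=leon/mfix-ut699              built (directory leon/ut699)
    msoft-float/mcpu=leon/mfix-ut699  built
    mcpu=leon3/mfix-ut699             pruned by mcpu=leon3*/mfix-ut699
    mfix-ut699                        pruned (bare exception)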