13 files changed, 181 insertions, 137 deletions
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 692496d49d5..d09c6e371de 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -107,6 +107,13 @@ arm_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_none, qualifier_none };
 #define TERNOP_QUALIFIERS (arm_ternop_qualifiers)
 
+/* unsigned T (unsigned T, unsigned T, unsigned T).  */
+static enum arm_type_qualifiers
+arm_unsigned_uternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+      qualifier_unsigned };
+#define UTERNOP_QUALIFIERS (arm_unsigned_uternop_qualifiers)
+
 /* T (T, immediate).  */
 static enum arm_type_qualifiers
 arm_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
@@ -133,6 +140,13 @@ arm_mac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
       qualifier_none, qualifier_lane_index };
 #define MAC_LANE_QUALIFIERS (arm_mac_lane_qualifiers)
 
+/* unsigned T (unsigned T, unsigned T, unsigend T, lane index).  */
+static enum arm_type_qualifiers
+arm_umac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+      qualifier_unsigned, qualifier_lane_index };
+#define UMAC_LANE_QUALIFIERS (arm_umac_lane_qualifiers)
+
 /* T (T, T, immediate).  */
 static enum arm_type_qualifiers
 arm_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c
index b2e9af6c45d..635bc3c1c38 100644
--- a/gcc/config/arm/arm-c.c
+++ b/gcc/config/arm/arm-c.c
@@ -74,11 +74,11 @@ arm_cpu_builtins (struct cpp_reader* pfile)
 
   def_or_undef_macro (pfile, "__ARM_FEATURE_QRDMX", TARGET_NEON_RDMA);
 
-  if (TARGET_CRC32)
-    builtin_define ("__ARM_FEATURE_CRC32");
-
+  def_or_undef_macro (pfile, "__ARM_FEATURE_CRC32", TARGET_CRC32);
+  def_or_undef_macro (pfile, "__ARM_FEATURE_DOTPROD", TARGET_DOTPROD);
   def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT);
 
+  cpp_undef (pfile, "__ARM_FEATURE_CMSE");
   if (arm_arch8 && !arm_arch_notm)
     {
       if (arm_arch_cmse && use_cmse)
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 07de4c9375b..0820ad74c2e 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -156,6 +156,8 @@ define feature crypto
 # FP16 data processing (half-precision float).
 define feature fp16
 
+# Dot Product instructions extension to ARMv8.2-a.
+define feature dotprod
 
 # ISA Quirks (errata?).  Don't forget to add this to the fgroup
 # ALL_QUIRKS below.
@@ -173,6 +175,17 @@ define feature quirk_cm3_ldrd
 define feature smallmul
 
 # Feature groups.  Conventionally all (or mostly) upper case.
+# ALL_FPU lists all the feature bits associated with the floating-point
+# unit; these will all be removed if the floating-point unit is disabled
+# (eg -mfloat-abi=soft).  ALL_FPU_INTERNAL must ONLY contain features that
+# form part of a named -mfpu option; it is used to map the capabilities
+# back to a named FPU for the benefit of the assembler.
+#
+# ALL_SIMD_INTERNAL and ALL_SIMD are similarly defined to help with the
+# construction of ALL_FPU and ALL_FPU_INTERNAL; they describe the SIMD
+# extensions that are either part of a named FPU or optional extensions
+# respectively.
+
 
 # List of all cryptographic extensions to stripout if crypto is
 # disabled.  Currently, that's trivial, but we define it anyway for
@@ -182,11 +195,12 @@ define fgroup ALL_CRYPTO	crypto
 # List of all SIMD bits to strip out if SIMD is disabled.  This does
 # strip off 32 D-registers, but does not remove support for
 # double-precision FP.
-define fgroup ALL_SIMD	fp_d32 neon ALL_CRYPTO
+define fgroup ALL_SIMD_INTERNAL	fp_d32 neon ALL_CRYPTO
+define fgroup ALL_SIMD	ALL_SIMD_INTERNAL dotprod
 
 # List of all FPU bits to strip out if -mfpu is used to override the
 # default.  fp16 is deliberately missing from this list.
-define fgroup ALL_FPU_INTERNAL	vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD
+define fgroup ALL_FPU_INTERNAL	vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD_INTERNAL
 
 # Similarly, but including fp16 and other extensions that aren't part of
 # -mfpu support.
@@ -239,6 +253,7 @@ define fgroup FP_D32	FP_DBL fp_d32
 define fgroup FP_ARMv8	FPv5 FP_D32
 define fgroup NEON	FP_D32 neon
 define fgroup CRYPTO	NEON crypto
+define fgroup DOTPROD	NEON dotprod
 
 # List of all quirk bits to strip out when comparing CPU features with
 # architectures.
@@ -561,6 +576,7 @@ begin arch armv8.2-a
  option crypto add FP_ARMv8 CRYPTO
  option nocrypto remove ALL_CRYPTO
  option nofp remove ALL_FP
+ option dotprod add FP_ARMv8 DOTPROD
 end arch armv8.2-a
 
 begin arch armv8-m.base
@@ -1473,7 +1489,7 @@ begin cpu cortex-a55
  cname cortexa55
  tune for cortex-a53
  tune flags LDSCHED
- architecture armv8.2-a+fp16
+ architecture armv8.2-a+fp16+dotprod
  fpu neon-fp-armv8
  option crypto add FP_ARMv8 CRYPTO
  option nofp remove ALL_FP
@@ -1484,7 +1500,7 @@ begin cpu cortex-a75
  cname cortexa75
  tune for cortex-a57
  tune flags LDSCHED
- architecture armv8.2-a+fp16
+ architecture armv8.2-a+fp16+dotprod
  fpu neon-fp-armv8
  option crypto add FP_ARMv8 CRYPTO
  costs cortex_a73
@@ -1496,7 +1512,7 @@ begin cpu cortex-a75.cortex-a55
  cname cortexa75cortexa55
  tune for cortex-a53
  tune flags LDSCHED
- architecture armv8.2-a+fp16
+ architecture armv8.2-a+fp16+dotprod
  fpu neon-fp-armv8
  option crypto add FP_ARMv8 CRYPTO
  costs cortex_a73
@@ -1516,6 +1532,7 @@ begin cpu cortex-m33
  architecture armv8-m.main+dsp
  fpu fpv5-sp-d16
  option nofp remove ALL_FP
+ option nodsp remove armv7em
  costs v7m
 end cpu cortex-m33
 
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index ce3aaeb04e0..47ba0dd09e3 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -973,6 +973,9 @@ int arm_condexec_masklen = 0;
 /* Nonzero if chip supports the ARMv8 CRC instructions.  */
 int arm_arch_crc = 0;
 
+/* Nonzero if chip supports the AdvSIMD Dot Product instructions.  */
+int arm_arch_dotprod = 0;
+
 /* Nonzero if chip supports the ARMv8-M security extensions.  */
 int arm_arch_cmse = 0;
 
@@ -9429,6 +9432,9 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
 		   + rtx_cost (XEXP (x, 0), mode, code, 0, speed_p));
 	  if (speed_p)
 	    *cost += 2 * extra_cost->alu.shift;
+	  /* Slightly disparage left shift by 1 at so we prefer adddi3.  */
+	  if (code == ASHIFT && XEXP (x, 1) == CONST1_RTX (SImode))
+	    *cost += 1;
 	  return true;
 	}
       else if (mode == SImode)
@@ -11252,9 +11258,11 @@ arm_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
         return current_tune->vec_costs->scalar_to_vec_cost;
 
       case unaligned_load:
+      case vector_gather_load:
         return current_tune->vec_costs->vec_unalign_load_cost;
 
       case unaligned_store:
+      case vector_scatter_store:
         return current_tune->vec_costs->vec_unalign_store_cost;
 
       case cond_branch_taken:
@@ -15293,12 +15301,23 @@ operands_ok_ldrd_strd (rtx rt, rtx rt2, rtx rn, HOST_WIDE_INT offset,
   return true;
 }
 
+/* Return true if a 64-bit access with alignment ALIGN and with a
+   constant offset OFFSET from the base pointer is permitted on this
+   architecture.  */
+static bool
+align_ok_ldrd_strd (HOST_WIDE_INT align, HOST_WIDE_INT offset)
+{
+  return (unaligned_access
+	  ? (align >= BITS_PER_WORD && (offset & 3) == 0)
+	  : (align >= 2 * BITS_PER_WORD && (offset & 7) == 0));
+}
+
 /* Helper for gen_operands_ldrd_strd.  Returns true iff the memory
    operand MEM's address contains an immediate offset from the base
-   register and has no side effects, in which case it sets BASE and
-   OFFSET accordingly.  */
+   register and has no side effects, in which case it sets BASE,
+   OFFSET and ALIGN accordingly.  */
 static bool
-mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset)
+mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset, HOST_WIDE_INT *align)
 {
   rtx addr;
 
@@ -15317,6 +15336,7 @@ mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset)
   gcc_assert (MEM_P (mem));
 
   *offset = const0_rtx;
+  *align = MEM_ALIGN (mem);
 
   addr = XEXP (mem, 0);
 
@@ -15357,7 +15377,7 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
                         bool const_store, bool commute)
 {
   int nops = 2;
-  HOST_WIDE_INT offsets[2], offset;
+  HOST_WIDE_INT offsets[2], offset, align[2];
   rtx base = NULL_RTX;
   rtx cur_base, cur_offset, tmp;
   int i, gap;
@@ -15369,7 +15389,8 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
      registers, and the corresponding memory offsets.  */
   for (i = 0; i < nops; i++)
     {
-      if (!mem_ok_for_ldrd_strd (operands[nops+i], &cur_base, &cur_offset))
+      if (!mem_ok_for_ldrd_strd (operands[nops+i], &cur_base, &cur_offset,
+				 &align[i]))
         return false;
 
       if (i == 0)
@@ -15483,6 +15504,7 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
       /* Swap the instructions such that lower memory is accessed first.  */
       std::swap (operands[0], operands[1]);
       std::swap (operands[2], operands[3]);
+      std::swap (align[0], align[1]);
       if (const_store)
         std::swap (operands[4], operands[5]);
     }
@@ -15496,6 +15518,9 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
   if (gap != 4)
     return false;
 
+  if (!align_ok_ldrd_strd (align[0], offset))
+    return false;
+
   /* Make sure we generate legal instructions.  */
   if (operands_ok_ldrd_strd (operands[0], operands[1], base, offset,
                              false, load))
@@ -30365,6 +30390,8 @@ arm_const_not_ok_for_debug_p (rtx p)
   tree decl_op0 = NULL;
   tree decl_op1 = NULL;
 
+  if (GET_CODE (p) == UNSPEC)
+    return true;
   if (GET_CODE (p) == MINUS)
     {
       if (GET_CODE (XEXP (p, 1)) == SYMBOL_REF)
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 336db4b042d..65d6db4d086 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -210,6 +210,11 @@ extern tree arm_fp16_type_node;
 /* FPU supports ARMv8.1 Adv.SIMD extensions.  */
 #define TARGET_NEON_RDMA (TARGET_NEON && arm_arch8_1)
 
+/* Supports for Dot Product AdvSIMD extensions.  */
+#define TARGET_DOTPROD (TARGET_NEON					\
+			&& bitmap_bit_p (arm_active_target.isa,		\
+					isa_bit_dotprod))
+
 /* FPU supports the floating point FP16 instructions for ARMv8.2 and later.  */
 #define TARGET_VFP_FP16INST \
   (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP5 && arm_fp16_inst)
@@ -1248,7 +1253,7 @@ enum reg_class
    couldn't convert a direct call into an indirect one.  */
 #define CALLER_INTERWORKING_SLOT_SIZE			\
   (TARGET_CALLER_INTERWORKING				\
-   && maybe_nonzero (crtl->outgoing_args_size)		\
+   && may_ne (crtl->outgoing_args_size, 0)		\
    ? UNITS_PER_WORD : 0)
 
 /* If we generate an insn to push BYTES bytes,
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index f241f9d0b7d..ddb9d8f3590 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -4059,12 +4059,6 @@
     {
       rtx scratch1, scratch2;
 
-      if (operands[2] == CONST1_RTX (SImode))
-        {
-          emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
-          DONE;
-        }
-
       /* Ideally we should use iwmmxt here if we could know that operands[1]
          ends up already living in an iwmmxt register. Otherwise it's
          cheaper to have the alternate code being generated than moving
@@ -4081,18 +4075,6 @@
   "
 )
 
-(define_insn "arm_ashldi3_1bit"
-  [(set (match_operand:DI            0 "s_register_operand" "=r,&r")
-        (ashift:DI (match_operand:DI 1 "s_register_operand" "0,r")
-                   (const_int 1)))
-   (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT"
-  "movs\\t%Q0, %Q1, asl #1\;adc\\t%R0, %R1, %R1"
-  [(set_attr "conds" "clob")
-   (set_attr "length" "8")
-   (set_attr "type" "multiple")]
-)
-
 (define_expand "ashlsi3"
   [(set (match_operand:SI            0 "s_register_operand" "")
 	(ashift:SI (match_operand:SI 1 "s_register_operand" "")
@@ -4128,12 +4110,6 @@
     {
       rtx scratch1, scratch2;
 
-      if (operands[2] == CONST1_RTX (SImode))
-        {
-          emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
-          DONE;
-        }
-
       /* Ideally we should use iwmmxt here if we could know that operands[1]
          ends up already living in an iwmmxt register. Otherwise it's
          cheaper to have the alternate code being generated than moving
@@ -4150,18 +4126,6 @@
   "
 )
 
-(define_insn "arm_ashrdi3_1bit"
-  [(set (match_operand:DI              0 "s_register_operand" "=r,&r")
-        (ashiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r")
-                     (const_int 1)))
-   (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT"
-  "movs\\t%R0, %R1, asr #1\;mov\\t%Q0, %Q1, rrx"
-  [(set_attr "conds" "clob")
-   (set_attr "length" "8")
-   (set_attr "type" "multiple")]
-)
-
 (define_expand "ashrsi3"
   [(set (match_operand:SI              0 "s_register_operand" "")
 	(ashiftrt:SI (match_operand:SI 1 "s_register_operand" "")
@@ -4194,12 +4158,6 @@
     {
       rtx scratch1, scratch2;
 
-      if (operands[2] == CONST1_RTX (SImode))
-        {
-          emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
-          DONE;
-        }
-
       /* Ideally we should use iwmmxt here if we could know that operands[1]
          ends up already living in an iwmmxt register. Otherwise it's
          cheaper to have the alternate code being generated than moving
@@ -4216,18 +4174,6 @@
   "
 )
 
-(define_insn "arm_lshrdi3_1bit"
-  [(set (match_operand:DI              0 "s_register_operand" "=r,&r")
-        (lshiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r")
-                     (const_int 1)))
-   (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT"
-  "movs\\t%R0, %R1, lsr #1\;mov\\t%Q0, %Q1, rrx"
-  [(set_attr "conds" "clob")
-   (set_attr "length" "8")
-   (set_attr "type" "multiple")]
-)
-
 (define_expand "lshrsi3"
   [(set (match_operand:SI              0 "s_register_operand" "")
 	(lshiftrt:SI (match_operand:SI 1 "s_register_operand" "")
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 07f0368343a..982eec810da 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -331,3 +331,7 @@ VAR11 (STORE1, vst4,
 	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
 VAR9 (STORE1LANE, vst4_lane,
 	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR2 (TERNOP, sdot, v8qi, v16qi)
+VAR2 (UTERNOP, udot, v8qi, v16qi)
+VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)
+VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 7acbaf1bb40..a4fb234a846 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -410,6 +410,8 @@
 
 (define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE])
 
+(define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -720,6 +722,9 @@
 
 (define_mode_attr pf [(V8QI "p") (V16QI "p") (V2SF "f") (V4SF "f")])
 
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+
 ;;----------------------------------------------------------------------------
 ;; Code attributes
 ;;----------------------------------------------------------------------------
@@ -816,6 +821,7 @@
   (UNSPEC_VSRA_S_N "s") (UNSPEC_VSRA_U_N "u")
   (UNSPEC_VRSRA_S_N "s") (UNSPEC_VRSRA_U_N "u")
   (UNSPEC_VCVTH_S "s") (UNSPEC_VCVTH_U "u")
+  (UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u")
 ])
 
 (define_int_attr vcvth_op
@@ -1003,3 +1009,6 @@
 
 (define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])
 (define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")])
+
+(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
+			   (UNSPEC_DOT_U "u8")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6590e8cd894..073c26580dd 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1221,12 +1221,8 @@
 	gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
 		    || REGNO (operands[0]) == REGNO (operands[1]));
 
-	if (operands[2] == CONST1_RTX (SImode))
-	  /* This clobbers CC.  */
-	  emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
-	else
-	  arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
-					 operands[2], operands[3], operands[4]);
+	arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				       operands[2], operands[3], operands[4]);
       }
     DONE;
   }"
@@ -1325,13 +1321,9 @@
 	gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
 		    || REGNO (operands[0]) == REGNO (operands[1]));
 
-	if (operands[2] == CONST1_RTX (SImode))
-	  /* This clobbers CC.  */
-	  emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1]));
-	else
-	  /* This clobbers CC (ASHIFTRT by register only).  */
-	  arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
-				 	 operands[2], operands[3], operands[4]);
+	/* This clobbers CC (ASHIFTRT by register only).  */
+	arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
+				       operands[2], operands[3], operands[4]);
       }
 
     DONE;
@@ -3044,6 +3036,76 @@
   DONE;
 })
 
+;; These instructions map to the __builtins for the Dot Product operations.
+(define_insn "neon_<sup>dot<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+	(plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
+		    (unspec:VCVTI [(match_operand:<VSI2QI> 2
+							"register_operand" "w")
+				   (match_operand:<VSI2QI> 3
+							"register_operand" "w")]
+		DOTPROD)))]
+  "TARGET_DOTPROD"
+  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  [(set_attr "type" "neon_dot")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "neon_<sup>dot_lane<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+	(plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
+		    (unspec:VCVTI [(match_operand:<VSI2QI> 2
+							"register_operand" "w")
+				   (match_operand:V8QI 3 "register_operand" "t")
+				   (match_operand:SI 4 "immediate_operand" "i")]
+		DOTPROD)))]
+  "TARGET_DOTPROD"
+  {
+    operands[4]
+      = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
+    return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+  }
+  [(set_attr "type" "neon_dot")]
+)
+
+;; These expands map to the Dot Product optab the vectorizer checks for.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;;     c = a[i] * b[i];
+;;     r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations.  However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_expand "<sup>dot_prod<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand")
+	(plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
+							"register_operand")
+				   (match_operand:<VSI2QI> 2
+							"register_operand")]
+		     DOTPROD)
+		    (match_operand:VCVTI 3 "register_operand")))]
+  "TARGET_DOTPROD"
+{
+  emit_insn (
+    gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1],
+				 operands[2]));
+  emit_insn (gen_rtx_SET (operands[0], operands[3]));
+  DONE;
+})
+
 (define_expand "neon_copysignf<mode>"
   [(match_operand:VCVTF 0 "register_operand")
    (match_operand:VCVTF 1 "register_operand")
diff --git a/gcc/config/arm/t-multilib b/gcc/config/arm/t-multilib
index ec4b76dbc8f..47f3673160a 100644
--- a/gcc/config/arm/t-multilib
+++ b/gcc/config/arm/t-multilib
@@ -68,7 +68,7 @@ v7ve_vfpv4_simd_variants := +simd
 v8_a_nosimd_variants	:= +crc
 v8_a_simd_variants	:= $(call all_feat_combs, simd crypto)
 v8_1_a_simd_variants	:= $(call all_feat_combs, simd crypto)
-v8_2_a_simd_variants	:= $(call all_feat_combs, simd fp16 crypto)
+v8_2_a_simd_variants	:= $(call all_feat_combs, simd fp16 crypto dotprod)
 
 
 ifneq (,$(HAS_APROFILE))
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index 22d993d46a3..03e9cdebb75 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -316,6 +316,8 @@
 ; neon_cls_q
 ; neon_cnt
 ; neon_cnt_q
+; neon_dot
+; neon_dot_q
 ; neon_ext
 ; neon_ext_q
 ; neon_rbit
@@ -764,6 +766,8 @@
 \
   neon_abs,\
   neon_abs_q,\
+  neon_dot,\
+  neon_dot_q,\
   neon_neg,\
   neon_neg_q,\
   neon_qneg,\
@@ -1110,8 +1114,8 @@
           neon_sub, neon_sub_q, neon_sub_widen, neon_sub_long, neon_qsub,\
           neon_qsub_q, neon_sub_halve, neon_sub_halve_q,\
           neon_sub_halve_narrow_q,\
-          neon_abs, neon_abs_q, neon_neg, neon_neg_q, neon_qneg,\
-          neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\
+	  neon_abs, neon_abs_q, neon_dot, neon_dot_q, neon_neg, neon_neg_q,\
+	  neon_qneg, neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\
           neon_abd_long, neon_minmax, neon_minmax_q, neon_compare,\
           neon_compare_q, neon_compare_zero, neon_compare_zero_q,\
           neon_arith_acc, neon_arith_acc_q, neon_reduc_add,\
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 99cfa41b08d..c474f4bb5db 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -410,4 +410,6 @@
   UNSPEC_VRNDN
   UNSPEC_VRNDP
   UNSPEC_VRNDX
+  UNSPEC_DOT_S
+  UNSPEC_DOT_U
 ])
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 9521e904d21..a541413c263 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -304,9 +304,9 @@
 ;; DImode moves
 
 (define_insn "*movdi_vfp"
-  [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,r,w,w, Uv")
+  [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,!r,w,w, Uv")
        (match_operand:DI 1 "di_operand"              "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))]
-  "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune != TARGET_CPU_cortexa8
+  "TARGET_32BIT && TARGET_HARD_FLOAT
    && (   register_operand (operands[0], DImode)
        || register_operand (operands[1], DImode))
    && !(TARGET_NEON && CONST_INT_P (operands[1])
@@ -339,71 +339,25 @@
     }
   "
   [(set_attr "type" "multiple,multiple,multiple,multiple,load_8,load_8,store_8,f_mcrr,f_mrrc,ffarithd,f_loadd,f_stored")
-   (set (attr "length") (cond [(eq_attr "alternative" "1,4,5,6") (const_int 8)
+   (set (attr "length") (cond [(eq_attr "alternative" "1") (const_int 8)
                               (eq_attr "alternative" "2") (const_int 12)
                               (eq_attr "alternative" "3") (const_int 16)
+			      (eq_attr "alternative" "4,5,6")
+			       (symbol_ref "arm_count_output_move_double_insns (operands) * 4")
                               (eq_attr "alternative" "9")
                                (if_then_else
                                  (match_test "TARGET_VFP_SINGLE")
                                  (const_int 8)
                                  (const_int 4))]
                               (const_int 4)))
+   (set_attr "predicable"    "yes")
    (set_attr "arm_pool_range"     "*,*,*,*,1020,4096,*,*,*,*,1020,*")
    (set_attr "thumb2_pool_range"     "*,*,*,*,1018,4094,*,*,*,*,1018,*")
    (set_attr "neg_pool_range" "*,*,*,*,1004,0,*,*,*,*,1004,*")
+   (set (attr "ce_count") (symbol_ref "get_attr_length (insn) / 4"))
    (set_attr "arch"           "t2,any,any,any,a,t2,any,any,any,any,any,any")]
 )
 
-(define_insn "*movdi_vfp_cortexa8"
-  [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,!r,w,w, Uv")
-	(match_operand:DI 1 "di_operand"		"r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))]
-  "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune == TARGET_CPU_cortexa8
-    && (   register_operand (operands[0], DImode)
-        || register_operand (operands[1], DImode))
-    && !(TARGET_NEON && CONST_INT_P (operands[1])
-	 && neon_immediate_valid_for_move (operands[1], DImode, NULL, NULL))"
-  "*
-  switch (which_alternative)
-    {
-    case 0: 
-    case 1:
-    case 2:
-    case 3:
-      return \"#\";
-    case 4:
-    case 5:
-    case 6:
-      return output_move_double (operands, true, NULL);
-    case 7:
-      return \"vmov%?\\t%P0, %Q1, %R1\\t%@ int\";
-    case 8:
-      return \"vmov%?\\t%Q0, %R0, %P1\\t%@ int\";
-    case 9:
-      return \"vmov%?.f64\\t%P0, %P1\\t%@ int\";
-    case 10: case 11:
-      return output_move_vfp (operands);
-    default:
-      gcc_unreachable ();
-    }
-  "
-  [(set_attr "type" "multiple,multiple,multiple,multiple,load_8,load_8,store_8,f_mcrr,f_mrrc,ffarithd,f_loadd,f_stored")
-   (set (attr "length") (cond [(eq_attr "alternative" "1") (const_int 8)
-                               (eq_attr "alternative" "2") (const_int 12)
-                               (eq_attr "alternative" "3") (const_int 16)
-                               (eq_attr "alternative" "4,5,6") 
-			       (symbol_ref 
-				"arm_count_output_move_double_insns (operands) \
-                                 * 4")]
-                              (const_int 4)))
-   (set_attr "predicable"    "yes")
-   (set_attr "arm_pool_range"     "*,*,*,*,1018,4094,*,*,*,*,1018,*")
-   (set_attr "thumb2_pool_range"     "*,*,*,*,1018,4094,*,*,*,*,1018,*")
-   (set_attr "neg_pool_range" "*,*,*,*,1004,0,*,*,*,*,1004,*")
-   (set (attr "ce_count") 
-	(symbol_ref "get_attr_length (insn) / 4"))
-   (set_attr "arch"           "t2,any,any,any,a,t2,any,any,any,any,any,any")]
- )
-
 ;; HFmode moves
 
 (define_insn "*movhf_vfp_fp16"