summaryrefslogtreecommitdiff
path: root/gcc/config/arm
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config/arm')
-rw-r--r--gcc/config/arm/arm-builtins.c14
-rw-r--r--gcc/config/arm/arm-c.c6
-rw-r--r--gcc/config/arm/arm-cpus.in27
-rw-r--r--gcc/config/arm/arm.c37
-rw-r--r--gcc/config/arm/arm.h7
-rw-r--r--gcc/config/arm/arm.md54
-rw-r--r--gcc/config/arm/arm_neon_builtins.def4
-rw-r--r--gcc/config/arm/iterators.md9
-rw-r--r--gcc/config/arm/neon.md88
-rw-r--r--gcc/config/arm/t-multilib2
-rw-r--r--gcc/config/arm/types.md8
-rw-r--r--gcc/config/arm/unspecs.md2
-rw-r--r--gcc/config/arm/vfp.md60
13 files changed, 181 insertions, 137 deletions
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 692496d49d5..d09c6e371de 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -107,6 +107,13 @@ arm_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_none, qualifier_none };
#define TERNOP_QUALIFIERS (arm_ternop_qualifiers)
+/* unsigned T (unsigned T, unsigned T, unsigned T). */
+static enum arm_type_qualifiers
+arm_unsigned_uternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+ qualifier_unsigned };
+#define UTERNOP_QUALIFIERS (arm_unsigned_uternop_qualifiers)
+
/* T (T, immediate). */
static enum arm_type_qualifiers
arm_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
@@ -133,6 +140,13 @@ arm_mac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
qualifier_none, qualifier_lane_index };
#define MAC_LANE_QUALIFIERS (arm_mac_lane_qualifiers)
+/* unsigned T (unsigned T, unsigned T, unsigned T, lane index). */
+static enum arm_type_qualifiers
+arm_umac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+ qualifier_unsigned, qualifier_lane_index };
+#define UMAC_LANE_QUALIFIERS (arm_umac_lane_qualifiers)
+
/* T (T, T, immediate). */
static enum arm_type_qualifiers
arm_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c
index b2e9af6c45d..635bc3c1c38 100644
--- a/gcc/config/arm/arm-c.c
+++ b/gcc/config/arm/arm-c.c
@@ -74,11 +74,11 @@ arm_cpu_builtins (struct cpp_reader* pfile)
def_or_undef_macro (pfile, "__ARM_FEATURE_QRDMX", TARGET_NEON_RDMA);
- if (TARGET_CRC32)
- builtin_define ("__ARM_FEATURE_CRC32");
-
+ def_or_undef_macro (pfile, "__ARM_FEATURE_CRC32", TARGET_CRC32);
+ def_or_undef_macro (pfile, "__ARM_FEATURE_DOTPROD", TARGET_DOTPROD);
def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT);
+ cpp_undef (pfile, "__ARM_FEATURE_CMSE");
if (arm_arch8 && !arm_arch_notm)
{
if (arm_arch_cmse && use_cmse)
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 07de4c9375b..0820ad74c2e 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -156,6 +156,8 @@ define feature crypto
# FP16 data processing (half-precision float).
define feature fp16
+# Dot Product instructions extension to ARMv8.2-a.
+define feature dotprod
# ISA Quirks (errata?). Don't forget to add this to the fgroup
# ALL_QUIRKS below.
@@ -173,6 +175,17 @@ define feature quirk_cm3_ldrd
define feature smallmul
# Feature groups. Conventionally all (or mostly) upper case.
+# ALL_FPU lists all the feature bits associated with the floating-point
+# unit; these will all be removed if the floating-point unit is disabled
+# (eg -mfloat-abi=soft). ALL_FPU_INTERNAL must ONLY contain features that
+# form part of a named -mfpu option; it is used to map the capabilities
+# back to a named FPU for the benefit of the assembler.
+#
+# ALL_SIMD_INTERNAL and ALL_SIMD are similarly defined to help with the
+# construction of ALL_FPU and ALL_FPU_INTERNAL; they describe the SIMD
+# extensions that are either part of a named FPU or optional extensions
+# respectively.
+
# List of all cryptographic extensions to stripout if crypto is
# disabled. Currently, that's trivial, but we define it anyway for
@@ -182,11 +195,12 @@ define fgroup ALL_CRYPTO crypto
# List of all SIMD bits to strip out if SIMD is disabled. This does
# strip off 32 D-registers, but does not remove support for
# double-precision FP.
-define fgroup ALL_SIMD fp_d32 neon ALL_CRYPTO
+define fgroup ALL_SIMD_INTERNAL fp_d32 neon ALL_CRYPTO
+define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod
# List of all FPU bits to strip out if -mfpu is used to override the
# default. fp16 is deliberately missing from this list.
-define fgroup ALL_FPU_INTERNAL vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD
+define fgroup ALL_FPU_INTERNAL vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD_INTERNAL
# Similarly, but including fp16 and other extensions that aren't part of
# -mfpu support.
@@ -239,6 +253,7 @@ define fgroup FP_D32 FP_DBL fp_d32
define fgroup FP_ARMv8 FPv5 FP_D32
define fgroup NEON FP_D32 neon
define fgroup CRYPTO NEON crypto
+define fgroup DOTPROD NEON dotprod
# List of all quirk bits to strip out when comparing CPU features with
# architectures.
@@ -561,6 +576,7 @@ begin arch armv8.2-a
option crypto add FP_ARMv8 CRYPTO
option nocrypto remove ALL_CRYPTO
option nofp remove ALL_FP
+ option dotprod add FP_ARMv8 DOTPROD
end arch armv8.2-a
begin arch armv8-m.base
@@ -1473,7 +1489,7 @@ begin cpu cortex-a55
cname cortexa55
tune for cortex-a53
tune flags LDSCHED
- architecture armv8.2-a+fp16
+ architecture armv8.2-a+fp16+dotprod
fpu neon-fp-armv8
option crypto add FP_ARMv8 CRYPTO
option nofp remove ALL_FP
@@ -1484,7 +1500,7 @@ begin cpu cortex-a75
cname cortexa75
tune for cortex-a57
tune flags LDSCHED
- architecture armv8.2-a+fp16
+ architecture armv8.2-a+fp16+dotprod
fpu neon-fp-armv8
option crypto add FP_ARMv8 CRYPTO
costs cortex_a73
@@ -1496,7 +1512,7 @@ begin cpu cortex-a75.cortex-a55
cname cortexa75cortexa55
tune for cortex-a53
tune flags LDSCHED
- architecture armv8.2-a+fp16
+ architecture armv8.2-a+fp16+dotprod
fpu neon-fp-armv8
option crypto add FP_ARMv8 CRYPTO
costs cortex_a73
@@ -1516,6 +1532,7 @@ begin cpu cortex-m33
architecture armv8-m.main+dsp
fpu fpv5-sp-d16
option nofp remove ALL_FP
+ option nodsp remove armv7em
costs v7m
end cpu cortex-m33
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index ce3aaeb04e0..47ba0dd09e3 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -973,6 +973,9 @@ int arm_condexec_masklen = 0;
/* Nonzero if chip supports the ARMv8 CRC instructions. */
int arm_arch_crc = 0;
+/* Nonzero if chip supports the AdvSIMD Dot Product instructions. */
+int arm_arch_dotprod = 0;
+
/* Nonzero if chip supports the ARMv8-M security extensions. */
int arm_arch_cmse = 0;
@@ -9429,6 +9432,9 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
+ rtx_cost (XEXP (x, 0), mode, code, 0, speed_p));
if (speed_p)
*cost += 2 * extra_cost->alu.shift;
+ /* Slightly disparage left shift by 1 so that we prefer adddi3. */
+ if (code == ASHIFT && XEXP (x, 1) == CONST1_RTX (SImode))
+ *cost += 1;
return true;
}
else if (mode == SImode)
@@ -11252,9 +11258,11 @@ arm_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
return current_tune->vec_costs->scalar_to_vec_cost;
case unaligned_load:
+ case vector_gather_load:
return current_tune->vec_costs->vec_unalign_load_cost;
case unaligned_store:
+ case vector_scatter_store:
return current_tune->vec_costs->vec_unalign_store_cost;
case cond_branch_taken:
@@ -15293,12 +15301,23 @@ operands_ok_ldrd_strd (rtx rt, rtx rt2, rtx rn, HOST_WIDE_INT offset,
return true;
}
+/* Return true if a 64-bit access with alignment ALIGN and with a
+ constant offset OFFSET from the base pointer is permitted on this
+ architecture. */
+static bool
+align_ok_ldrd_strd (HOST_WIDE_INT align, HOST_WIDE_INT offset)
+{
+ return (unaligned_access
+ ? (align >= BITS_PER_WORD && (offset & 3) == 0)
+ : (align >= 2 * BITS_PER_WORD && (offset & 7) == 0));
+}
+
/* Helper for gen_operands_ldrd_strd. Returns true iff the memory
operand MEM's address contains an immediate offset from the base
- register and has no side effects, in which case it sets BASE and
- OFFSET accordingly. */
+ register and has no side effects, in which case it sets BASE,
+ OFFSET and ALIGN accordingly. */
static bool
-mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset)
+mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset, HOST_WIDE_INT *align)
{
rtx addr;
@@ -15317,6 +15336,7 @@ mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset)
gcc_assert (MEM_P (mem));
*offset = const0_rtx;
+ *align = MEM_ALIGN (mem);
addr = XEXP (mem, 0);
@@ -15357,7 +15377,7 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
bool const_store, bool commute)
{
int nops = 2;
- HOST_WIDE_INT offsets[2], offset;
+ HOST_WIDE_INT offsets[2], offset, align[2];
rtx base = NULL_RTX;
rtx cur_base, cur_offset, tmp;
int i, gap;
@@ -15369,7 +15389,8 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
registers, and the corresponding memory offsets. */
for (i = 0; i < nops; i++)
{
- if (!mem_ok_for_ldrd_strd (operands[nops+i], &cur_base, &cur_offset))
+ if (!mem_ok_for_ldrd_strd (operands[nops+i], &cur_base, &cur_offset,
+ &align[i]))
return false;
if (i == 0)
@@ -15483,6 +15504,7 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
/* Swap the instructions such that lower memory is accessed first. */
std::swap (operands[0], operands[1]);
std::swap (operands[2], operands[3]);
+ std::swap (align[0], align[1]);
if (const_store)
std::swap (operands[4], operands[5]);
}
@@ -15496,6 +15518,9 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
if (gap != 4)
return false;
+ if (!align_ok_ldrd_strd (align[0], offset))
+ return false;
+
/* Make sure we generate legal instructions. */
if (operands_ok_ldrd_strd (operands[0], operands[1], base, offset,
false, load))
@@ -30365,6 +30390,8 @@ arm_const_not_ok_for_debug_p (rtx p)
tree decl_op0 = NULL;
tree decl_op1 = NULL;
+ if (GET_CODE (p) == UNSPEC)
+ return true;
if (GET_CODE (p) == MINUS)
{
if (GET_CODE (XEXP (p, 1)) == SYMBOL_REF)
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 336db4b042d..65d6db4d086 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -210,6 +210,11 @@ extern tree arm_fp16_type_node;
/* FPU supports ARMv8.1 Adv.SIMD extensions. */
#define TARGET_NEON_RDMA (TARGET_NEON && arm_arch8_1)
+/* Supports the Dot Product AdvSIMD extensions. */
+#define TARGET_DOTPROD (TARGET_NEON \
+ && bitmap_bit_p (arm_active_target.isa, \
+ isa_bit_dotprod))
+
/* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */
#define TARGET_VFP_FP16INST \
(TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP5 && arm_fp16_inst)
@@ -1248,7 +1253,7 @@ enum reg_class
couldn't convert a direct call into an indirect one. */
#define CALLER_INTERWORKING_SLOT_SIZE \
(TARGET_CALLER_INTERWORKING \
- && maybe_nonzero (crtl->outgoing_args_size) \
+ && may_ne (crtl->outgoing_args_size, 0) \
? UNITS_PER_WORD : 0)
/* If we generate an insn to push BYTES bytes,
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index f241f9d0b7d..ddb9d8f3590 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -4059,12 +4059,6 @@
{
rtx scratch1, scratch2;
- if (operands[2] == CONST1_RTX (SImode))
- {
- emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
- DONE;
- }
-
/* Ideally we should use iwmmxt here if we could know that operands[1]
ends up already living in an iwmmxt register. Otherwise it's
cheaper to have the alternate code being generated than moving
@@ -4081,18 +4075,6 @@
"
)
-(define_insn "arm_ashldi3_1bit"
- [(set (match_operand:DI 0 "s_register_operand" "=r,&r")
- (ashift:DI (match_operand:DI 1 "s_register_operand" "0,r")
- (const_int 1)))
- (clobber (reg:CC CC_REGNUM))]
- "TARGET_32BIT"
- "movs\\t%Q0, %Q1, asl #1\;adc\\t%R0, %R1, %R1"
- [(set_attr "conds" "clob")
- (set_attr "length" "8")
- (set_attr "type" "multiple")]
-)
-
(define_expand "ashlsi3"
[(set (match_operand:SI 0 "s_register_operand" "")
(ashift:SI (match_operand:SI 1 "s_register_operand" "")
@@ -4128,12 +4110,6 @@
{
rtx scratch1, scratch2;
- if (operands[2] == CONST1_RTX (SImode))
- {
- emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
- DONE;
- }
-
/* Ideally we should use iwmmxt here if we could know that operands[1]
ends up already living in an iwmmxt register. Otherwise it's
cheaper to have the alternate code being generated than moving
@@ -4150,18 +4126,6 @@
"
)
-(define_insn "arm_ashrdi3_1bit"
- [(set (match_operand:DI 0 "s_register_operand" "=r,&r")
- (ashiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r")
- (const_int 1)))
- (clobber (reg:CC CC_REGNUM))]
- "TARGET_32BIT"
- "movs\\t%R0, %R1, asr #1\;mov\\t%Q0, %Q1, rrx"
- [(set_attr "conds" "clob")
- (set_attr "length" "8")
- (set_attr "type" "multiple")]
-)
-
(define_expand "ashrsi3"
[(set (match_operand:SI 0 "s_register_operand" "")
(ashiftrt:SI (match_operand:SI 1 "s_register_operand" "")
@@ -4194,12 +4158,6 @@
{
rtx scratch1, scratch2;
- if (operands[2] == CONST1_RTX (SImode))
- {
- emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
- DONE;
- }
-
/* Ideally we should use iwmmxt here if we could know that operands[1]
ends up already living in an iwmmxt register. Otherwise it's
cheaper to have the alternate code being generated than moving
@@ -4216,18 +4174,6 @@
"
)
-(define_insn "arm_lshrdi3_1bit"
- [(set (match_operand:DI 0 "s_register_operand" "=r,&r")
- (lshiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r")
- (const_int 1)))
- (clobber (reg:CC CC_REGNUM))]
- "TARGET_32BIT"
- "movs\\t%R0, %R1, lsr #1\;mov\\t%Q0, %Q1, rrx"
- [(set_attr "conds" "clob")
- (set_attr "length" "8")
- (set_attr "type" "multiple")]
-)
-
(define_expand "lshrsi3"
[(set (match_operand:SI 0 "s_register_operand" "")
(lshiftrt:SI (match_operand:SI 1 "s_register_operand" "")
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 07f0368343a..982eec810da 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -331,3 +331,7 @@ VAR11 (STORE1, vst4,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
VAR9 (STORE1LANE, vst4_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR2 (TERNOP, sdot, v8qi, v16qi)
+VAR2 (UTERNOP, udot, v8qi, v16qi)
+VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)
+VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 7acbaf1bb40..a4fb234a846 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -410,6 +410,8 @@
(define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE])
+(define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U])
+
;;----------------------------------------------------------------------------
;; Mode attributes
;;----------------------------------------------------------------------------
@@ -720,6 +722,9 @@
(define_mode_attr pf [(V8QI "p") (V16QI "p") (V2SF "f") (V4SF "f")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+
;;----------------------------------------------------------------------------
;; Code attributes
;;----------------------------------------------------------------------------
@@ -816,6 +821,7 @@
(UNSPEC_VSRA_S_N "s") (UNSPEC_VSRA_U_N "u")
(UNSPEC_VRSRA_S_N "s") (UNSPEC_VRSRA_U_N "u")
(UNSPEC_VCVTH_S "s") (UNSPEC_VCVTH_U "u")
+ (UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u")
])
(define_int_attr vcvth_op
@@ -1003,3 +1009,6 @@
(define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])
(define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")])
+
+(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
+ (UNSPEC_DOT_U "u8")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6590e8cd894..073c26580dd 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1221,12 +1221,8 @@
gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
|| REGNO (operands[0]) == REGNO (operands[1]));
- if (operands[2] == CONST1_RTX (SImode))
- /* This clobbers CC. */
- emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
- else
- arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
- operands[2], operands[3], operands[4]);
+ arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+ operands[2], operands[3], operands[4]);
}
DONE;
}"
@@ -1325,13 +1321,9 @@
gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
|| REGNO (operands[0]) == REGNO (operands[1]));
- if (operands[2] == CONST1_RTX (SImode))
- /* This clobbers CC. */
- emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1]));
- else
- /* This clobbers CC (ASHIFTRT by register only). */
- arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
- operands[2], operands[3], operands[4]);
+ /* This clobbers CC (ASHIFTRT by register only). */
+ arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
+ operands[2], operands[3], operands[4]);
}
DONE;
@@ -3044,6 +3036,76 @@
DONE;
})
+;; These instructions map to the __builtins for the Dot Product operations.
+(define_insn "neon_<sup>dot<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand" "=w")
+ (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
+ (unspec:VCVTI [(match_operand:<VSI2QI> 2
+ "register_operand" "w")
+ (match_operand:<VSI2QI> 3
+ "register_operand" "w")]
+ DOTPROD)))]
+ "TARGET_DOTPROD"
+ "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+ [(set_attr "type" "neon_dot")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "neon_<sup>dot_lane<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand" "=w")
+ (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
+ (unspec:VCVTI [(match_operand:<VSI2QI> 2
+ "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "t")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)))]
+ "TARGET_DOTPROD"
+ {
+ operands[4]
+ = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
+ return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+ }
+ [(set_attr "type" "neon_dot")]
+)
+
+;; These expanders map to the Dot Product optab the vectorizer checks for.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;; c = a[i] * b[i];
+;; r += c;
+;; }
+;; return r;
+;;
+;; This can be auto-vectorized to
+;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations. However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_expand "<sup>dot_prod<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand")
+ (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
+ "register_operand")
+ (match_operand:<VSI2QI> 2
+ "register_operand")]
+ DOTPROD)
+ (match_operand:VCVTI 3 "register_operand")))]
+ "TARGET_DOTPROD"
+{
+ emit_insn (
+ gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1],
+ operands[2]));
+ emit_insn (gen_rtx_SET (operands[0], operands[3]));
+ DONE;
+})
+
(define_expand "neon_copysignf<mode>"
[(match_operand:VCVTF 0 "register_operand")
(match_operand:VCVTF 1 "register_operand")
diff --git a/gcc/config/arm/t-multilib b/gcc/config/arm/t-multilib
index ec4b76dbc8f..47f3673160a 100644
--- a/gcc/config/arm/t-multilib
+++ b/gcc/config/arm/t-multilib
@@ -68,7 +68,7 @@ v7ve_vfpv4_simd_variants := +simd
v8_a_nosimd_variants := +crc
v8_a_simd_variants := $(call all_feat_combs, simd crypto)
v8_1_a_simd_variants := $(call all_feat_combs, simd crypto)
-v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 crypto)
+v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 crypto dotprod)
ifneq (,$(HAS_APROFILE))
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index 22d993d46a3..03e9cdebb75 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -316,6 +316,8 @@
; neon_cls_q
; neon_cnt
; neon_cnt_q
+; neon_dot
+; neon_dot_q
; neon_ext
; neon_ext_q
; neon_rbit
@@ -764,6 +766,8 @@
\
neon_abs,\
neon_abs_q,\
+ neon_dot,\
+ neon_dot_q,\
neon_neg,\
neon_neg_q,\
neon_qneg,\
@@ -1110,8 +1114,8 @@
neon_sub, neon_sub_q, neon_sub_widen, neon_sub_long, neon_qsub,\
neon_qsub_q, neon_sub_halve, neon_sub_halve_q,\
neon_sub_halve_narrow_q,\
- neon_abs, neon_abs_q, neon_neg, neon_neg_q, neon_qneg,\
- neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\
+ neon_abs, neon_abs_q, neon_dot, neon_dot_q, neon_neg, neon_neg_q,\
+ neon_qneg, neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\
neon_abd_long, neon_minmax, neon_minmax_q, neon_compare,\
neon_compare_q, neon_compare_zero, neon_compare_zero_q,\
neon_arith_acc, neon_arith_acc_q, neon_reduc_add,\
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 99cfa41b08d..c474f4bb5db 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -410,4 +410,6 @@
UNSPEC_VRNDN
UNSPEC_VRNDP
UNSPEC_VRNDX
+ UNSPEC_DOT_S
+ UNSPEC_DOT_U
])
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 9521e904d21..a541413c263 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -304,9 +304,9 @@
;; DImode moves
(define_insn "*movdi_vfp"
- [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,r,w,w, Uv")
+ [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,!r,w,w, Uv")
(match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))]
- "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune != TARGET_CPU_cortexa8
+ "TARGET_32BIT && TARGET_HARD_FLOAT
&& ( register_operand (operands[0], DImode)
|| register_operand (operands[1], DImode))
&& !(TARGET_NEON && CONST_INT_P (operands[1])
@@ -339,71 +339,25 @@
}
"
[(set_attr "type" "multiple,multiple,multiple,multiple,load_8,load_8,store_8,f_mcrr,f_mrrc,ffarithd,f_loadd,f_stored")
- (set (attr "length") (cond [(eq_attr "alternative" "1,4,5,6") (const_int 8)
+ (set (attr "length") (cond [(eq_attr "alternative" "1") (const_int 8)
(eq_attr "alternative" "2") (const_int 12)
(eq_attr "alternative" "3") (const_int 16)
+ (eq_attr "alternative" "4,5,6")
+ (symbol_ref "arm_count_output_move_double_insns (operands) * 4")
(eq_attr "alternative" "9")
(if_then_else
(match_test "TARGET_VFP_SINGLE")
(const_int 8)
(const_int 4))]
(const_int 4)))
+ (set_attr "predicable" "yes")
(set_attr "arm_pool_range" "*,*,*,*,1020,4096,*,*,*,*,1020,*")
(set_attr "thumb2_pool_range" "*,*,*,*,1018,4094,*,*,*,*,1018,*")
(set_attr "neg_pool_range" "*,*,*,*,1004,0,*,*,*,*,1004,*")
+ (set (attr "ce_count") (symbol_ref "get_attr_length (insn) / 4"))
(set_attr "arch" "t2,any,any,any,a,t2,any,any,any,any,any,any")]
)
-(define_insn "*movdi_vfp_cortexa8"
- [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,!r,w,w, Uv")
- (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))]
- "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune == TARGET_CPU_cortexa8
- && ( register_operand (operands[0], DImode)
- || register_operand (operands[1], DImode))
- && !(TARGET_NEON && CONST_INT_P (operands[1])
- && neon_immediate_valid_for_move (operands[1], DImode, NULL, NULL))"
- "*
- switch (which_alternative)
- {
- case 0:
- case 1:
- case 2:
- case 3:
- return \"#\";
- case 4:
- case 5:
- case 6:
- return output_move_double (operands, true, NULL);
- case 7:
- return \"vmov%?\\t%P0, %Q1, %R1\\t%@ int\";
- case 8:
- return \"vmov%?\\t%Q0, %R0, %P1\\t%@ int\";
- case 9:
- return \"vmov%?.f64\\t%P0, %P1\\t%@ int\";
- case 10: case 11:
- return output_move_vfp (operands);
- default:
- gcc_unreachable ();
- }
- "
- [(set_attr "type" "multiple,multiple,multiple,multiple,load_8,load_8,store_8,f_mcrr,f_mrrc,ffarithd,f_loadd,f_stored")
- (set (attr "length") (cond [(eq_attr "alternative" "1") (const_int 8)
- (eq_attr "alternative" "2") (const_int 12)
- (eq_attr "alternative" "3") (const_int 16)
- (eq_attr "alternative" "4,5,6")
- (symbol_ref
- "arm_count_output_move_double_insns (operands) \
- * 4")]
- (const_int 4)))
- (set_attr "predicable" "yes")
- (set_attr "arm_pool_range" "*,*,*,*,1018,4094,*,*,*,*,1018,*")
- (set_attr "thumb2_pool_range" "*,*,*,*,1018,4094,*,*,*,*,1018,*")
- (set_attr "neg_pool_range" "*,*,*,*,1004,0,*,*,*,*,1004,*")
- (set (attr "ce_count")
- (symbol_ref "get_attr_length (insn) / 4"))
- (set_attr "arch" "t2,any,any,any,a,t2,any,any,any,any,any,any")]
- )
-
;; HFmode moves
(define_insn "*movhf_vfp_fp16"