Diffstat (limited to 'gcc/config/arm')
-rw-r--r--  gcc/config/arm/arm-builtins.c         | 14
-rw-r--r--  gcc/config/arm/arm-c.c                |  6
-rw-r--r--  gcc/config/arm/arm-cpus.in            | 27
-rw-r--r--  gcc/config/arm/arm.c                  | 37
-rw-r--r--  gcc/config/arm/arm.h                  |  7
-rw-r--r--  gcc/config/arm/arm.md                 | 54
-rw-r--r--  gcc/config/arm/arm_neon_builtins.def  |  4
-rw-r--r--  gcc/config/arm/iterators.md           |  9
-rw-r--r--  gcc/config/arm/neon.md                | 88
-rw-r--r--  gcc/config/arm/t-multilib             |  2
-rw-r--r--  gcc/config/arm/types.md               |  8
-rw-r--r--  gcc/config/arm/unspecs.md             |  2
-rw-r--r--  gcc/config/arm/vfp.md                 | 60
13 files changed, 181 insertions(+), 137 deletions(-)
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index 692496d49d5..d09c6e371de 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -107,6 +107,13 @@ arm_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_none }; #define TERNOP_QUALIFIERS (arm_ternop_qualifiers) +/* unsigned T (unsigned T, unsigned T, unsigned T). */ +static enum arm_type_qualifiers +arm_unsigned_uternop_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, + qualifier_unsigned }; +#define UTERNOP_QUALIFIERS (arm_unsigned_uternop_qualifiers) + /* T (T, immediate). */ static enum arm_type_qualifiers arm_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -133,6 +140,13 @@ arm_mac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] qualifier_none, qualifier_lane_index }; #define MAC_LANE_QUALIFIERS (arm_mac_lane_qualifiers) +/* unsigned T (unsigned T, unsigned T, unsigend T, lane index). */ +static enum arm_type_qualifiers +arm_umac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, + qualifier_unsigned, qualifier_lane_index }; +#define UMAC_LANE_QUALIFIERS (arm_umac_lane_qualifiers) + /* T (T, T, immediate). */ static enum arm_type_qualifiers arm_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index b2e9af6c45d..635bc3c1c38 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -74,11 +74,11 @@ arm_cpu_builtins (struct cpp_reader* pfile) def_or_undef_macro (pfile, "__ARM_FEATURE_QRDMX", TARGET_NEON_RDMA); - if (TARGET_CRC32) - builtin_define ("__ARM_FEATURE_CRC32"); - + def_or_undef_macro (pfile, "__ARM_FEATURE_CRC32", TARGET_CRC32); + def_or_undef_macro (pfile, "__ARM_FEATURE_DOTPROD", TARGET_DOTPROD); def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT); + cpp_undef (pfile, "__ARM_FEATURE_CMSE"); if (arm_arch8 && !arm_arch_notm) { if (arm_arch_cmse && use_cmse) diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index 07de4c9375b..0820ad74c2e 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -156,6 +156,8 @@ define feature crypto # FP16 data processing (half-precision float). define feature fp16 +# Dot Product instructions extension to ARMv8.2-a. +define feature dotprod # ISA Quirks (errata?). Don't forget to add this to the fgroup # ALL_QUIRKS below. @@ -173,6 +175,17 @@ define feature quirk_cm3_ldrd define feature smallmul # Feature groups. Conventionally all (or mostly) upper case. +# ALL_FPU lists all the feature bits associated with the floating-point +# unit; these will all be removed if the floating-point unit is disabled +# (eg -mfloat-abi=soft). ALL_FPU_INTERNAL must ONLY contain features that +# form part of a named -mfpu option; it is used to map the capabilities +# back to a named FPU for the benefit of the assembler. +# +# ALL_SIMD_INTERNAL and ALL_SIMD are similarly defined to help with the +# construction of ALL_FPU and ALL_FPU_INTERNAL; they describe the SIMD +# extensions that are either part of a named FPU or optional extensions +# respectively. + # List of all cryptographic extensions to stripout if crypto is # disabled. Currently, that's trivial, but we define it anyway for @@ -182,11 +195,12 @@ define fgroup ALL_CRYPTO crypto # List of all SIMD bits to strip out if SIMD is disabled. This does # strip off 32 D-registers, but does not remove support for # double-precision FP. 
-define fgroup ALL_SIMD fp_d32 neon ALL_CRYPTO +define fgroup ALL_SIMD_INTERNAL fp_d32 neon ALL_CRYPTO +define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod # List of all FPU bits to strip out if -mfpu is used to override the # default. fp16 is deliberately missing from this list. -define fgroup ALL_FPU_INTERNAL vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD +define fgroup ALL_FPU_INTERNAL vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD_INTERNAL # Similarly, but including fp16 and other extensions that aren't part of # -mfpu support. @@ -239,6 +253,7 @@ define fgroup FP_D32 FP_DBL fp_d32 define fgroup FP_ARMv8 FPv5 FP_D32 define fgroup NEON FP_D32 neon define fgroup CRYPTO NEON crypto +define fgroup DOTPROD NEON dotprod # List of all quirk bits to strip out when comparing CPU features with # architectures. @@ -561,6 +576,7 @@ begin arch armv8.2-a option crypto add FP_ARMv8 CRYPTO option nocrypto remove ALL_CRYPTO option nofp remove ALL_FP + option dotprod add FP_ARMv8 DOTPROD end arch armv8.2-a begin arch armv8-m.base @@ -1473,7 +1489,7 @@ begin cpu cortex-a55 cname cortexa55 tune for cortex-a53 tune flags LDSCHED - architecture armv8.2-a+fp16 + architecture armv8.2-a+fp16+dotprod fpu neon-fp-armv8 option crypto add FP_ARMv8 CRYPTO option nofp remove ALL_FP @@ -1484,7 +1500,7 @@ begin cpu cortex-a75 cname cortexa75 tune for cortex-a57 tune flags LDSCHED - architecture armv8.2-a+fp16 + architecture armv8.2-a+fp16+dotprod fpu neon-fp-armv8 option crypto add FP_ARMv8 CRYPTO costs cortex_a73 @@ -1496,7 +1512,7 @@ begin cpu cortex-a75.cortex-a55 cname cortexa75cortexa55 tune for cortex-a53 tune flags LDSCHED - architecture armv8.2-a+fp16 + architecture armv8.2-a+fp16+dotprod fpu neon-fp-armv8 option crypto add FP_ARMv8 CRYPTO costs cortex_a73 @@ -1516,6 +1532,7 @@ begin cpu cortex-m33 architecture armv8-m.main+dsp fpu fpv5-sp-d16 option nofp remove ALL_FP + option nodsp remove armv7em costs v7m end cpu cortex-m33 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index ce3aaeb04e0..47ba0dd09e3 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -973,6 +973,9 @@ int arm_condexec_masklen = 0; /* Nonzero if chip supports the ARMv8 CRC instructions. */ int arm_arch_crc = 0; +/* Nonzero if chip supports the AdvSIMD Dot Product instructions. */ +int arm_arch_dotprod = 0; + /* Nonzero if chip supports the ARMv8-M security extensions. */ int arm_arch_cmse = 0; @@ -9429,6 +9432,9 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code, + rtx_cost (XEXP (x, 0), mode, code, 0, speed_p)); if (speed_p) *cost += 2 * extra_cost->alu.shift; + /* Slightly disparage left shift by 1 at so we prefer adddi3. */ + if (code == ASHIFT && XEXP (x, 1) == CONST1_RTX (SImode)) + *cost += 1; return true; } else if (mode == SImode) @@ -11252,9 +11258,11 @@ arm_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, return current_tune->vec_costs->scalar_to_vec_cost; case unaligned_load: + case vector_gather_load: return current_tune->vec_costs->vec_unalign_load_cost; case unaligned_store: + case vector_scatter_store: return current_tune->vec_costs->vec_unalign_store_cost; case cond_branch_taken: @@ -15293,12 +15301,23 @@ operands_ok_ldrd_strd (rtx rt, rtx rt2, rtx rn, HOST_WIDE_INT offset, return true; } +/* Return true if a 64-bit access with alignment ALIGN and with a + constant offset OFFSET from the base pointer is permitted on this + architecture. */ +static bool +align_ok_ldrd_strd (HOST_WIDE_INT align, HOST_WIDE_INT offset) +{ + return (unaligned_access + ? 
(align >= BITS_PER_WORD && (offset & 3) == 0) + : (align >= 2 * BITS_PER_WORD && (offset & 7) == 0)); +} + /* Helper for gen_operands_ldrd_strd. Returns true iff the memory operand MEM's address contains an immediate offset from the base - register and has no side effects, in which case it sets BASE and - OFFSET accordingly. */ + register and has no side effects, in which case it sets BASE, + OFFSET and ALIGN accordingly. */ static bool -mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset) +mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset, HOST_WIDE_INT *align) { rtx addr; @@ -15317,6 +15336,7 @@ mem_ok_for_ldrd_strd (rtx mem, rtx *base, rtx *offset) gcc_assert (MEM_P (mem)); *offset = const0_rtx; + *align = MEM_ALIGN (mem); addr = XEXP (mem, 0); @@ -15357,7 +15377,7 @@ gen_operands_ldrd_strd (rtx *operands, bool load, bool const_store, bool commute) { int nops = 2; - HOST_WIDE_INT offsets[2], offset; + HOST_WIDE_INT offsets[2], offset, align[2]; rtx base = NULL_RTX; rtx cur_base, cur_offset, tmp; int i, gap; @@ -15369,7 +15389,8 @@ gen_operands_ldrd_strd (rtx *operands, bool load, registers, and the corresponding memory offsets. */ for (i = 0; i < nops; i++) { - if (!mem_ok_for_ldrd_strd (operands[nops+i], &cur_base, &cur_offset)) + if (!mem_ok_for_ldrd_strd (operands[nops+i], &cur_base, &cur_offset, + &align[i])) return false; if (i == 0) @@ -15483,6 +15504,7 @@ gen_operands_ldrd_strd (rtx *operands, bool load, /* Swap the instructions such that lower memory is accessed first. */ std::swap (operands[0], operands[1]); std::swap (operands[2], operands[3]); + std::swap (align[0], align[1]); if (const_store) std::swap (operands[4], operands[5]); } @@ -15496,6 +15518,9 @@ gen_operands_ldrd_strd (rtx *operands, bool load, if (gap != 4) return false; + if (!align_ok_ldrd_strd (align[0], offset)) + return false; + /* Make sure we generate legal instructions. */ if (operands_ok_ldrd_strd (operands[0], operands[1], base, offset, false, load)) @@ -30365,6 +30390,8 @@ arm_const_not_ok_for_debug_p (rtx p) tree decl_op0 = NULL; tree decl_op1 = NULL; + if (GET_CODE (p) == UNSPEC) + return true; if (GET_CODE (p) == MINUS) { if (GET_CODE (XEXP (p, 1)) == SYMBOL_REF) diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 336db4b042d..65d6db4d086 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -210,6 +210,11 @@ extern tree arm_fp16_type_node; /* FPU supports ARMv8.1 Adv.SIMD extensions. */ #define TARGET_NEON_RDMA (TARGET_NEON && arm_arch8_1) +/* Supports for Dot Product AdvSIMD extensions. */ +#define TARGET_DOTPROD (TARGET_NEON \ + && bitmap_bit_p (arm_active_target.isa, \ + isa_bit_dotprod)) + /* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */ #define TARGET_VFP_FP16INST \ (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP5 && arm_fp16_inst) @@ -1248,7 +1253,7 @@ enum reg_class couldn't convert a direct call into an indirect one. */ #define CALLER_INTERWORKING_SLOT_SIZE \ (TARGET_CALLER_INTERWORKING \ - && maybe_nonzero (crtl->outgoing_args_size) \ + && may_ne (crtl->outgoing_args_size, 0) \ ? 
UNITS_PER_WORD : 0) /* If we generate an insn to push BYTES bytes, diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index f241f9d0b7d..ddb9d8f3590 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -4059,12 +4059,6 @@ { rtx scratch1, scratch2; - if (operands[2] == CONST1_RTX (SImode)) - { - emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1])); - DONE; - } - /* Ideally we should use iwmmxt here if we could know that operands[1] ends up already living in an iwmmxt register. Otherwise it's cheaper to have the alternate code being generated than moving @@ -4081,18 +4075,6 @@ " ) -(define_insn "arm_ashldi3_1bit" - [(set (match_operand:DI 0 "s_register_operand" "=r,&r") - (ashift:DI (match_operand:DI 1 "s_register_operand" "0,r") - (const_int 1))) - (clobber (reg:CC CC_REGNUM))] - "TARGET_32BIT" - "movs\\t%Q0, %Q1, asl #1\;adc\\t%R0, %R1, %R1" - [(set_attr "conds" "clob") - (set_attr "length" "8") - (set_attr "type" "multiple")] -) - (define_expand "ashlsi3" [(set (match_operand:SI 0 "s_register_operand" "") (ashift:SI (match_operand:SI 1 "s_register_operand" "") @@ -4128,12 +4110,6 @@ { rtx scratch1, scratch2; - if (operands[2] == CONST1_RTX (SImode)) - { - emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1])); - DONE; - } - /* Ideally we should use iwmmxt here if we could know that operands[1] ends up already living in an iwmmxt register. Otherwise it's cheaper to have the alternate code being generated than moving @@ -4150,18 +4126,6 @@ " ) -(define_insn "arm_ashrdi3_1bit" - [(set (match_operand:DI 0 "s_register_operand" "=r,&r") - (ashiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r") - (const_int 1))) - (clobber (reg:CC CC_REGNUM))] - "TARGET_32BIT" - "movs\\t%R0, %R1, asr #1\;mov\\t%Q0, %Q1, rrx" - [(set_attr "conds" "clob") - (set_attr "length" "8") - (set_attr "type" "multiple")] -) - (define_expand "ashrsi3" [(set (match_operand:SI 0 "s_register_operand" "") (ashiftrt:SI (match_operand:SI 1 "s_register_operand" "") @@ -4194,12 +4158,6 @@ { rtx scratch1, scratch2; - if (operands[2] == CONST1_RTX (SImode)) - { - emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1])); - DONE; - } - /* Ideally we should use iwmmxt here if we could know that operands[1] ends up already living in an iwmmxt register. 
Otherwise it's cheaper to have the alternate code being generated than moving @@ -4216,18 +4174,6 @@ " ) -(define_insn "arm_lshrdi3_1bit" - [(set (match_operand:DI 0 "s_register_operand" "=r,&r") - (lshiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r") - (const_int 1))) - (clobber (reg:CC CC_REGNUM))] - "TARGET_32BIT" - "movs\\t%R0, %R1, lsr #1\;mov\\t%Q0, %Q1, rrx" - [(set_attr "conds" "clob") - (set_attr "length" "8") - (set_attr "type" "multiple")] -) - (define_expand "lshrsi3" [(set (match_operand:SI 0 "s_register_operand" "") (lshiftrt:SI (match_operand:SI 1 "s_register_operand" "") diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 07f0368343a..982eec810da 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -331,3 +331,7 @@ VAR11 (STORE1, vst4, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR2 (TERNOP, sdot, v8qi, v16qi) +VAR2 (UTERNOP, udot, v8qi, v16qi) +VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi) +VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 7acbaf1bb40..a4fb234a846 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -410,6 +410,8 @@ (define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE]) +(define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U]) + ;;---------------------------------------------------------------------------- ;; Mode attributes ;;---------------------------------------------------------------------------- @@ -720,6 +722,9 @@ (define_mode_attr pf [(V8QI "p") (V16QI "p") (V2SF "f") (V4SF "f")]) +(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")]) +(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")]) + ;;---------------------------------------------------------------------------- ;; Code attributes ;;---------------------------------------------------------------------------- @@ -816,6 +821,7 @@ (UNSPEC_VSRA_S_N "s") (UNSPEC_VSRA_U_N "u") (UNSPEC_VRSRA_S_N "s") (UNSPEC_VRSRA_U_N "u") (UNSPEC_VCVTH_S "s") (UNSPEC_VCVTH_U "u") + (UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u") ]) (define_int_attr vcvth_op @@ -1003,3 +1009,6 @@ (define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")]) (define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")]) + +(define_int_attr opsuffix [(UNSPEC_DOT_S "s8") + (UNSPEC_DOT_U "u8")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 6590e8cd894..073c26580dd 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1221,12 +1221,8 @@ gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1]) || REGNO (operands[0]) == REGNO (operands[1])); - if (operands[2] == CONST1_RTX (SImode)) - /* This clobbers CC. */ - emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1])); - else - arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1], - operands[2], operands[3], operands[4]); + arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1], + operands[2], operands[3], operands[4]); } DONE; }" @@ -1325,13 +1321,9 @@ gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1]) || REGNO (operands[0]) == REGNO (operands[1])); - if (operands[2] == CONST1_RTX (SImode)) - /* This clobbers CC. */ - emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1])); - else - /* This clobbers CC (ASHIFTRT by register only). 
*/ - arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1], - operands[2], operands[3], operands[4]); + /* This clobbers CC (ASHIFTRT by register only). */ + arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1], + operands[2], operands[3], operands[4]); } DONE; @@ -3044,6 +3036,76 @@ DONE; }) +;; These instructions map to the __builtins for the Dot Product operations. +(define_insn "neon_<sup>dot<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") + (unspec:VCVTI [(match_operand:<VSI2QI> 2 + "register_operand" "w") + (match_operand:<VSI2QI> 3 + "register_operand" "w")] + DOTPROD)))] + "TARGET_DOTPROD" + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3" + [(set_attr "type" "neon_dot")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations. +(define_insn "neon_<sup>dot_lane<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") + (unspec:VCVTI [(match_operand:<VSI2QI> 2 + "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD)))] + "TARGET_DOTPROD" + { + operands[4] + = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4]))); + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; + } + [(set_attr "type" "neon_dot")] +) + +;; These expands map to the Dot Product optab the vectorizer checks for. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; i<len; i++) { +;; c = a[i] * b[i]; +;; r += c; +;; } +;; return result; +;; +;; This can be auto-vectorized to +;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; +;; +;; given enough iterations. However the vectorizer can keep unrolling the loop +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; +;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; +;; ... +;; +;; and so the vectorizer provides r, in which the result has to be accumulated. 
+(define_expand "<sup>dot_prod<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand") + (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1 + "register_operand") + (match_operand:<VSI2QI> 2 + "register_operand")] + DOTPROD) + (match_operand:VCVTI 3 "register_operand")))] + "TARGET_DOTPROD" +{ + emit_insn ( + gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1], + operands[2])); + emit_insn (gen_rtx_SET (operands[0], operands[3])); + DONE; +}) + (define_expand "neon_copysignf<mode>" [(match_operand:VCVTF 0 "register_operand") (match_operand:VCVTF 1 "register_operand") diff --git a/gcc/config/arm/t-multilib b/gcc/config/arm/t-multilib index ec4b76dbc8f..47f3673160a 100644 --- a/gcc/config/arm/t-multilib +++ b/gcc/config/arm/t-multilib @@ -68,7 +68,7 @@ v7ve_vfpv4_simd_variants := +simd v8_a_nosimd_variants := +crc v8_a_simd_variants := $(call all_feat_combs, simd crypto) v8_1_a_simd_variants := $(call all_feat_combs, simd crypto) -v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 crypto) +v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 crypto dotprod) ifneq (,$(HAS_APROFILE)) diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md index 22d993d46a3..03e9cdebb75 100644 --- a/gcc/config/arm/types.md +++ b/gcc/config/arm/types.md @@ -316,6 +316,8 @@ ; neon_cls_q ; neon_cnt ; neon_cnt_q +; neon_dot +; neon_dot_q ; neon_ext ; neon_ext_q ; neon_rbit @@ -764,6 +766,8 @@ \ neon_abs,\ neon_abs_q,\ + neon_dot,\ + neon_dot_q,\ neon_neg,\ neon_neg_q,\ neon_qneg,\ @@ -1110,8 +1114,8 @@ neon_sub, neon_sub_q, neon_sub_widen, neon_sub_long, neon_qsub,\ neon_qsub_q, neon_sub_halve, neon_sub_halve_q,\ neon_sub_halve_narrow_q,\ - neon_abs, neon_abs_q, neon_neg, neon_neg_q, neon_qneg,\ - neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\ + neon_abs, neon_abs_q, neon_dot, neon_dot_q, neon_neg, neon_neg_q,\ + neon_qneg, neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\ neon_abd_long, neon_minmax, neon_minmax_q, neon_compare,\ neon_compare_q, neon_compare_zero, neon_compare_zero_q,\ neon_arith_acc, neon_arith_acc_q, neon_reduc_add,\ diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 99cfa41b08d..c474f4bb5db 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -410,4 +410,6 @@ UNSPEC_VRNDN UNSPEC_VRNDP UNSPEC_VRNDX + UNSPEC_DOT_S + UNSPEC_DOT_U ]) diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 9521e904d21..a541413c263 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -304,9 +304,9 @@ ;; DImode moves (define_insn "*movdi_vfp" - [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,r,w,w, Uv") + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,!r,w,w, Uv") (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune != TARGET_CPU_cortexa8 + "TARGET_32BIT && TARGET_HARD_FLOAT && ( register_operand (operands[0], DImode) || register_operand (operands[1], DImode)) && !(TARGET_NEON && CONST_INT_P (operands[1]) @@ -339,71 +339,25 @@ } " [(set_attr "type" "multiple,multiple,multiple,multiple,load_8,load_8,store_8,f_mcrr,f_mrrc,ffarithd,f_loadd,f_stored") - (set (attr "length") (cond [(eq_attr "alternative" "1,4,5,6") (const_int 8) + (set (attr "length") (cond [(eq_attr "alternative" "1") (const_int 8) (eq_attr "alternative" "2") (const_int 12) (eq_attr "alternative" "3") (const_int 16) + (eq_attr "alternative" "4,5,6") + (symbol_ref "arm_count_output_move_double_insns (operands) * 4") (eq_attr 
"alternative" "9") (if_then_else (match_test "TARGET_VFP_SINGLE") (const_int 8) (const_int 4))] (const_int 4))) + (set_attr "predicable" "yes") (set_attr "arm_pool_range" "*,*,*,*,1020,4096,*,*,*,*,1020,*") (set_attr "thumb2_pool_range" "*,*,*,*,1018,4094,*,*,*,*,1018,*") (set_attr "neg_pool_range" "*,*,*,*,1004,0,*,*,*,*,1004,*") + (set (attr "ce_count") (symbol_ref "get_attr_length (insn) / 4")) (set_attr "arch" "t2,any,any,any,a,t2,any,any,any,any,any,any")] ) -(define_insn "*movdi_vfp_cortexa8" - [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,!r,w,w, Uv") - (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune == TARGET_CPU_cortexa8 - && ( register_operand (operands[0], DImode) - || register_operand (operands[1], DImode)) - && !(TARGET_NEON && CONST_INT_P (operands[1]) - && neon_immediate_valid_for_move (operands[1], DImode, NULL, NULL))" - "* - switch (which_alternative) - { - case 0: - case 1: - case 2: - case 3: - return \"#\"; - case 4: - case 5: - case 6: - return output_move_double (operands, true, NULL); - case 7: - return \"vmov%?\\t%P0, %Q1, %R1\\t%@ int\"; - case 8: - return \"vmov%?\\t%Q0, %R0, %P1\\t%@ int\"; - case 9: - return \"vmov%?.f64\\t%P0, %P1\\t%@ int\"; - case 10: case 11: - return output_move_vfp (operands); - default: - gcc_unreachable (); - } - " - [(set_attr "type" "multiple,multiple,multiple,multiple,load_8,load_8,store_8,f_mcrr,f_mrrc,ffarithd,f_loadd,f_stored") - (set (attr "length") (cond [(eq_attr "alternative" "1") (const_int 8) - (eq_attr "alternative" "2") (const_int 12) - (eq_attr "alternative" "3") (const_int 16) - (eq_attr "alternative" "4,5,6") - (symbol_ref - "arm_count_output_move_double_insns (operands) \ - * 4")] - (const_int 4))) - (set_attr "predicable" "yes") - (set_attr "arm_pool_range" "*,*,*,*,1018,4094,*,*,*,*,1018,*") - (set_attr "thumb2_pool_range" "*,*,*,*,1018,4094,*,*,*,*,1018,*") - (set_attr "neg_pool_range" "*,*,*,*,1004,0,*,*,*,*,1004,*") - (set (attr "ce_count") - (symbol_ref "get_attr_length (insn) / 4")) - (set_attr "arch" "t2,any,any,any,a,t2,any,any,any,any,any,any")] - ) - ;; HFmode moves (define_insn "*movhf_vfp_fp16" |