diff options
author | rus <rus@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-11-09 20:58:24 +0000 |
---|---|---|
committer | rus <rus@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-11-09 20:58:24 +0000 |
commit | 7f4db7c80779ecbc57d1146654daf0acfe18de66 (patch) | |
tree | 3af522a3b5e149c3fd498ecb1255994daae2129a /gcc/config/arm | |
parent | 611349f0ec42a37591db2cd02974a11a48d10edb (diff) | |
download | gcc-7f4db7c80779ecbc57d1146654daf0acfe18de66.tar.gz |
merge from trunkprofile-stdlib
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/profile-stdlib@154052 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/arm')
-rw-r--r-- | gcc/config/arm/arm.c | 427 | ||||
-rw-r--r-- | gcc/config/arm/arm.h | 73 | ||||
-rw-r--r-- | gcc/config/arm/arm.md | 120 | ||||
-rw-r--r-- | gcc/config/arm/arm_neon.h | 100 | ||||
-rw-r--r-- | gcc/config/arm/bpabi.h | 6 | ||||
-rw-r--r-- | gcc/config/arm/cortex-a9.md | 186 | ||||
-rw-r--r-- | gcc/config/arm/fpa.md | 4 | ||||
-rw-r--r-- | gcc/config/arm/linux-eabi.h | 2 | ||||
-rw-r--r-- | gcc/config/arm/linux-elf.h | 2 | ||||
-rw-r--r-- | gcc/config/arm/neon-gen.ml | 3 | ||||
-rw-r--r-- | gcc/config/arm/neon.md | 3 | ||||
-rw-r--r-- | gcc/config/arm/neon.ml | 3 | ||||
-rw-r--r-- | gcc/config/arm/netbsd-elf.h | 2 | ||||
-rw-r--r-- | gcc/config/arm/thumb2.md | 10 | ||||
-rw-r--r-- | gcc/config/arm/unwind-arm.c | 1 | ||||
-rw-r--r-- | gcc/config/arm/vxworks.h | 2 |
16 files changed, 596 insertions, 348 deletions
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index cd5a0ed1403..4c7fcb65854 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -133,11 +133,12 @@ static enum machine_mode arm_promote_function_mode (const_tree, const_tree, int); static bool arm_return_in_memory (const_tree, const_tree); static rtx arm_function_value (const_tree, const_tree, bool); -static rtx arm_libcall_value (enum machine_mode, rtx); +static rtx arm_libcall_value (enum machine_mode, const_rtx); static void arm_internal_label (FILE *, const char *, unsigned long); static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT, tree); +static bool arm_have_conditional_execution (void); static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool); static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *); static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); @@ -445,6 +446,9 @@ static const struct attribute_spec arm_attribute_table[] = #define TARGET_HAVE_TLS true #endif +#undef TARGET_HAVE_CONDITIONAL_EXECUTION +#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution + #undef TARGET_CANNOT_FORCE_CONST_MEM #define TARGET_CANNOT_FORCE_CONST_MEM arm_cannot_force_const_mem @@ -520,14 +524,11 @@ enum processor_type arm_tune = arm_none; /* The default processor used if not overridden by commandline. */ static enum processor_type arm_default_cpu = arm_none; -/* Which floating point model to use. */ -enum arm_fp_model arm_fp_model; - -/* Which floating point hardware is available. */ -enum fputype arm_fpu_arch; - /* Which floating point hardware to schedule for. */ -enum fputype arm_fpu_tune; +int arm_fpu_attr; + +/* Which floating popint hardware to use. */ +const struct arm_fpu_desc *arm_fpu_desc; /* Whether to use floating point hardware. */ enum float_abi_type arm_float_abi; @@ -805,46 +806,21 @@ static struct arm_cpu_select arm_select[] = char arm_arch_name[] = "__ARM_ARCH_0UNK__"; -struct fpu_desc -{ - const char * name; - enum fputype fpu; -}; - - /* Available values for -mfpu=. */ -static const struct fpu_desc all_fpus[] = -{ - {"fpa", FPUTYPE_FPA}, - {"fpe2", FPUTYPE_FPA_EMU2}, - {"fpe3", FPUTYPE_FPA_EMU2}, - {"maverick", FPUTYPE_MAVERICK}, - {"vfp", FPUTYPE_VFP}, - {"vfp3", FPUTYPE_VFP3}, - {"vfpv3", FPUTYPE_VFP3}, - {"vfpv3-d16", FPUTYPE_VFP3D16}, - {"neon", FPUTYPE_NEON}, - {"neon-fp16", FPUTYPE_NEON_FP16} -}; - - -/* Floating point models used by the different hardware. - See fputype in arm.h. */ - -static const enum arm_fp_model fp_model_for_fpu[] = -{ - /* No FP hardware. */ - ARM_FP_MODEL_UNKNOWN, /* FPUTYPE_NONE */ - ARM_FP_MODEL_FPA, /* FPUTYPE_FPA */ - ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU2 */ - ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU3 */ - ARM_FP_MODEL_MAVERICK, /* FPUTYPE_MAVERICK */ - ARM_FP_MODEL_VFP, /* FPUTYPE_VFP */ - ARM_FP_MODEL_VFP, /* FPUTYPE_VFP3D16 */ - ARM_FP_MODEL_VFP, /* FPUTYPE_VFP3 */ - ARM_FP_MODEL_VFP, /* FPUTYPE_NEON */ - ARM_FP_MODEL_VFP /* FPUTYPE_NEON_FP16 */ +static const struct arm_fpu_desc all_fpus[] = +{ + {"fpa", ARM_FP_MODEL_FPA, 0, 0, false, false}, + {"fpe2", ARM_FP_MODEL_FPA, 2, 0, false, false}, + {"fpe3", ARM_FP_MODEL_FPA, 3, 0, false, false}, + {"maverick", ARM_FP_MODEL_MAVERICK, 0, 0, false, false}, + {"vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, false, false}, + {"vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false}, + {"vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, false}, + {"neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , false}, + {"neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , true }, + /* Compatibility aliases. */ + {"vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false}, }; @@ -1298,13 +1274,6 @@ arm_override_options (void) enum processor_type target_arch_cpu = arm_none; enum processor_type selected_cpu = arm_none; - /* Ideally we would want to use CFI directives to generate - debug info. However this also creates the .eh_frame - section, so disable them until GAS can handle - this properly. See PR40521. */ - if (TARGET_AAPCS_BASED) - flag_dwarf2_cfi_asm = 0; - /* Set up the flags based on the cpu/architecture selected by the user. */ for (i = ARRAY_SIZE (arm_select); i--;) { @@ -1618,7 +1587,6 @@ arm_override_options (void) if (TARGET_IWMMXT_ABI && !TARGET_IWMMXT) error ("iwmmxt abi requires an iwmmxt capable cpu"); - arm_fp_model = ARM_FP_MODEL_UNKNOWN; if (target_fpu_name == NULL && target_fpe_name != NULL) { if (streq (target_fpe_name, "2")) @@ -1629,46 +1597,52 @@ arm_override_options (void) error ("invalid floating point emulation option: -mfpe=%s", target_fpe_name); } - if (target_fpu_name != NULL) - { - /* The user specified a FPU. */ - for (i = 0; i < ARRAY_SIZE (all_fpus); i++) - { - if (streq (all_fpus[i].name, target_fpu_name)) - { - arm_fpu_arch = all_fpus[i].fpu; - arm_fpu_tune = arm_fpu_arch; - arm_fp_model = fp_model_for_fpu[arm_fpu_arch]; - break; - } - } - if (arm_fp_model == ARM_FP_MODEL_UNKNOWN) - error ("invalid floating point option: -mfpu=%s", target_fpu_name); - } - else + + if (target_fpu_name == NULL) { #ifdef FPUTYPE_DEFAULT - /* Use the default if it is specified for this platform. */ - arm_fpu_arch = FPUTYPE_DEFAULT; - arm_fpu_tune = FPUTYPE_DEFAULT; + target_fpu_name = FPUTYPE_DEFAULT; #else - /* Pick one based on CPU type. */ - /* ??? Some targets assume FPA is the default. - if ((insn_flags & FL_VFP) != 0) - arm_fpu_arch = FPUTYPE_VFP; - else - */ if (arm_arch_cirrus) - arm_fpu_arch = FPUTYPE_MAVERICK; + target_fpu_name = "maverick"; else - arm_fpu_arch = FPUTYPE_FPA_EMU2; + target_fpu_name = "fpe2"; #endif - if (tune_flags & FL_CO_PROC && arm_fpu_arch == FPUTYPE_FPA_EMU2) - arm_fpu_tune = FPUTYPE_FPA; + } + + arm_fpu_desc = NULL; + for (i = 0; i < ARRAY_SIZE (all_fpus); i++) + { + if (streq (all_fpus[i].name, target_fpu_name)) + { + arm_fpu_desc = &all_fpus[i]; + break; + } + } + if (!arm_fpu_desc) + error ("invalid floating point option: -mfpu=%s", target_fpu_name); + + switch (arm_fpu_desc->model) + { + case ARM_FP_MODEL_FPA: + if (arm_fpu_desc->rev == 2) + arm_fpu_attr = FPU_FPE2; + else if (arm_fpu_desc->rev == 3) + arm_fpu_attr = FPU_FPE3; else - arm_fpu_tune = arm_fpu_arch; - arm_fp_model = fp_model_for_fpu[arm_fpu_arch]; - gcc_assert (arm_fp_model != ARM_FP_MODEL_UNKNOWN); + arm_fpu_attr = FPU_FPA; + break; + + case ARM_FP_MODEL_MAVERICK: + arm_fpu_attr = FPU_MAVERICK; + break; + + case ARM_FP_MODEL_VFP: + arm_fpu_attr = FPU_VFP; + break; + + default: + gcc_unreachable(); } if (target_float_abi_name != NULL) @@ -1690,7 +1664,7 @@ arm_override_options (void) arm_float_abi = TARGET_DEFAULT_FLOAT_ABI; if (TARGET_AAPCS_BASED - && (arm_fp_model == ARM_FP_MODEL_FPA)) + && (arm_fpu_desc->model == ARM_FP_MODEL_FPA)) error ("FPA is unsupported in the AAPCS"); if (TARGET_AAPCS_BASED) @@ -1718,7 +1692,7 @@ arm_override_options (void) /* If soft-float is specified then don't use FPU. */ if (TARGET_SOFT_FLOAT) - arm_fpu_arch = FPUTYPE_NONE; + arm_fpu_attr = FPU_NONE; if (TARGET_AAPCS_BASED) { @@ -1745,8 +1719,7 @@ arm_override_options (void) /* For arm2/3 there is no need to do any scheduling if there is only a floating point emulator, or we are doing software floating-point. */ if ((TARGET_SOFT_FLOAT - || arm_fpu_tune == FPUTYPE_FPA_EMU2 - || arm_fpu_tune == FPUTYPE_FPA_EMU3) + || (TARGET_FPA && arm_fpu_desc->rev)) && (tune_flags & FL_MODE32) == 0) flag_schedule_insns = flag_schedule_insns_after_reload = 0; @@ -1871,6 +1844,23 @@ arm_override_options (void) max_insns_skipped = 3; } + /* Hot/Cold partitioning is not currently supported, since we can't + handle literal pool placement in that case. */ + if (flag_reorder_blocks_and_partition) + { + inform (input_location, + "-freorder-blocks-and-partition not supported on this architecture"); + flag_reorder_blocks_and_partition = 0; + flag_reorder_blocks = 1; + } + + /* Ideally we would want to use CFI directives to generate + debug info. However this also creates the .eh_frame + section, so disable them until GAS can handle + this properly. See PR40521. */ + if (TARGET_AAPCS_BASED) + flag_dwarf2_cfi_asm = 0; + /* Register global variables with the garbage collector. */ arm_add_gc_roots (); } @@ -2393,20 +2383,24 @@ arm_split_constant (enum rtx_code code, enum machine_mode mode, rtx insn, 1); } -/* Return the number of ARM instructions required to synthesize the given - constant. */ +/* Return the number of instructions required to synthesize the given + constant, if we start emitting them from bit-position I. */ static int count_insns_for_constant (HOST_WIDE_INT remainder, int i) { HOST_WIDE_INT temp1; + int step_size = TARGET_ARM ? 2 : 1; int num_insns = 0; + + gcc_assert (TARGET_ARM || i == 0); + do { int end; if (i <= 0) i += 32; - if (remainder & (3 << (i - 2))) + if (remainder & (((1 << step_size) - 1) << (i - step_size))) { end = i - 8; if (end < 0) @@ -2415,13 +2409,77 @@ count_insns_for_constant (HOST_WIDE_INT remainder, int i) | ((i < end) ? (0xff >> (32 - end)) : 0)); remainder &= ~temp1; num_insns++; - i -= 6; + i -= 8 - step_size; } - i -= 2; + i -= step_size; } while (remainder); return num_insns; } +static int +find_best_start (unsigned HOST_WIDE_INT remainder) +{ + int best_consecutive_zeros = 0; + int i; + int best_start = 0; + + /* If we aren't targetting ARM, the best place to start is always at + the bottom. */ + if (! TARGET_ARM) + return 0; + + for (i = 0; i < 32; i += 2) + { + int consecutive_zeros = 0; + + if (!(remainder & (3 << i))) + { + while ((i < 32) && !(remainder & (3 << i))) + { + consecutive_zeros += 2; + i += 2; + } + if (consecutive_zeros > best_consecutive_zeros) + { + best_consecutive_zeros = consecutive_zeros; + best_start = i - consecutive_zeros; + } + i -= 2; + } + } + + /* So long as it won't require any more insns to do so, it's + desirable to emit a small constant (in bits 0...9) in the last + insn. This way there is more chance that it can be combined with + a later addressing insn to form a pre-indexed load or store + operation. Consider: + + *((volatile int *)0xe0000100) = 1; + *((volatile int *)0xe0000110) = 2; + + We want this to wind up as: + + mov rA, #0xe0000000 + mov rB, #1 + str rB, [rA, #0x100] + mov rB, #2 + str rB, [rA, #0x110] + + rather than having to synthesize both large constants from scratch. + + Therefore, we calculate how many insns would be required to emit + the constant starting from `best_start', and also starting from + zero (i.e. with bit 31 first to be output). If `best_start' doesn't + yield a shorter sequence, we may as well use zero. */ + if (best_start != 0 + && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder) + && (count_insns_for_constant (remainder, 0) <= + count_insns_for_constant (remainder, best_start))) + best_start = 0; + + return best_start; +} + /* Emit an instruction with the indicated PATTERN. If COND is non-NULL, conditionalize the execution of the instruction on COND being true. */ @@ -2445,6 +2503,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, { int can_invert = 0; int can_negate = 0; + int final_invert = 0; int can_negate_initial = 0; int can_shift = 0; int i; @@ -2456,6 +2515,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, int insns = 0; unsigned HOST_WIDE_INT temp1, temp2; unsigned HOST_WIDE_INT remainder = val & 0xffffffff; + int step_size = TARGET_ARM ? 2 : 1; /* Find out which operations are safe for a given CODE. Also do a quick check for degenerate cases; these can occur when DImode operations @@ -2529,14 +2589,15 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, return 1; } - /* We don't know how to handle other cases yet. */ - gcc_assert (remainder == 0xffffffff); - - if (generate) - emit_constant_insn (cond, - gen_rtx_SET (VOIDmode, target, - gen_rtx_NOT (mode, source))); - return 1; + if (remainder == 0xffffffff) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, source))); + return 1; + } + break; case MINUS: /* We treat MINUS as (val - source), since (source - val) is always @@ -2987,9 +3048,25 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, if ((code == AND) || (code != IOR && can_invert && num_bits_set > 16)) - remainder = (~remainder) & 0xffffffff; + remainder ^= 0xffffffff; else if (code == PLUS && num_bits_set > 16) remainder = (-remainder) & 0xffffffff; + + /* For XOR, if more than half the bits are set and there's a sequence + of more than 8 consecutive ones in the pattern then we can XOR by the + inverted constant and then invert the final result; this may save an + instruction and might also lead to the final mvn being merged with + some other operation. */ + else if (code == XOR && num_bits_set > 16 + && (count_insns_for_constant (remainder ^ 0xffffffff, + find_best_start + (remainder ^ 0xffffffff)) + < count_insns_for_constant (remainder, + find_best_start (remainder)))) + { + remainder ^= 0xffffffff; + final_invert = 1; + } else { can_invert = 0; @@ -3008,63 +3085,8 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, /* ??? Use thumb2 replicated constants when the high and low halfwords are the same. */ { - int best_start = 0; - if (!TARGET_THUMB2) - { - int best_consecutive_zeros = 0; - - for (i = 0; i < 32; i += 2) - { - int consecutive_zeros = 0; - - if (!(remainder & (3 << i))) - { - while ((i < 32) && !(remainder & (3 << i))) - { - consecutive_zeros += 2; - i += 2; - } - if (consecutive_zeros > best_consecutive_zeros) - { - best_consecutive_zeros = consecutive_zeros; - best_start = i - consecutive_zeros; - } - i -= 2; - } - } - - /* So long as it won't require any more insns to do so, it's - desirable to emit a small constant (in bits 0...9) in the last - insn. This way there is more chance that it can be combined with - a later addressing insn to form a pre-indexed load or store - operation. Consider: - - *((volatile int *)0xe0000100) = 1; - *((volatile int *)0xe0000110) = 2; - - We want this to wind up as: - - mov rA, #0xe0000000 - mov rB, #1 - str rB, [rA, #0x100] - mov rB, #2 - str rB, [rA, #0x110] - - rather than having to synthesize both large constants from scratch. - - Therefore, we calculate how many insns would be required to emit - the constant starting from `best_start', and also starting from - zero (i.e. with bit 31 first to be output). If `best_start' doesn't - yield a shorter sequence, we may as well use zero. */ - if (best_start != 0 - && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder) - && (count_insns_for_constant (remainder, 0) <= - count_insns_for_constant (remainder, best_start))) - best_start = 0; - } - /* Now start emitting the insns. */ - i = best_start; + i = find_best_start (remainder); do { int end; @@ -3092,7 +3114,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, } else { - if (remainder && subtargets) + if ((final_invert || remainder) && subtargets) new_src = gen_reg_rtx (mode); else new_src = target; @@ -3127,21 +3149,23 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, code = PLUS; insns++; - if (TARGET_ARM) - i -= 6; - else - i -= 7; + i -= 8 - step_size; } /* Arm allows rotates by a multiple of two. Thumb-2 allows arbitrary shifts. */ - if (TARGET_ARM) - i -= 2; - else - i--; + i -= step_size; } while (remainder); } + if (final_invert) + { + if (generate) + emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, source))); + insns++; + } + return insns; } @@ -3264,7 +3288,7 @@ add_libcall (htab_t htab, rtx libcall) } static bool -arm_libcall_uses_aapcs_base (rtx libcall) +arm_libcall_uses_aapcs_base (const_rtx libcall) { static bool init_done = false; static htab_t libcall_htab; @@ -3311,7 +3335,7 @@ arm_libcall_uses_aapcs_base (rtx libcall) } rtx -arm_libcall_value (enum machine_mode mode, rtx libcall) +arm_libcall_value (enum machine_mode mode, const_rtx libcall) { if (TARGET_AAPCS_BASED && arm_pcs_default != ARM_PCS_AAPCS && GET_MODE_CLASS (mode) == MODE_FLOAT) @@ -6201,7 +6225,7 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) else if ((outer == PLUS || outer == COMPARE) && INTVAL (x) < 256 && INTVAL (x) > -256) return 0; - else if (outer == AND + else if ((outer == IOR || outer == XOR || outer == AND) && INTVAL (x) < 256 && INTVAL (x) >= -256) return COSTS_N_INSNS (1); else if (outer == ASHIFT || outer == ASHIFTRT @@ -12269,7 +12293,7 @@ output_move_neon (rtx *operands) { /* We're only using DImode here because it's a convenient size. */ ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i); - ops[1] = adjust_address (mem, SImode, 8 * i); + ops[1] = adjust_address (mem, DImode, 8 * i); if (reg_overlap_mentioned_p (ops[0], mem)) { gcc_assert (overlap == -1); @@ -13257,7 +13281,7 @@ arm_output_epilogue (rtx sibling) /* This variable is for the Virtual Frame Pointer, not VFP regs. */ int vfp_offset = offsets->frame; - if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + if (TARGET_FPA_EMU2) { for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) @@ -13480,7 +13504,7 @@ arm_output_epilogue (rtx sibling) SP_REGNUM, HARD_FRAME_POINTER_REGNUM); } - if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + if (TARGET_FPA_EMU2) { for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) @@ -14206,7 +14230,7 @@ arm_save_coproc_regs(void) /* Save any floating point call-saved registers used by this function. */ - if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + if (TARGET_FPA_EMU2) { for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) @@ -19688,45 +19712,8 @@ arm_file_start (void) } else { - int set_float_abi_attributes = 0; - switch (arm_fpu_arch) - { - case FPUTYPE_FPA: - fpu_name = "fpa"; - break; - case FPUTYPE_FPA_EMU2: - fpu_name = "fpe2"; - break; - case FPUTYPE_FPA_EMU3: - fpu_name = "fpe3"; - break; - case FPUTYPE_MAVERICK: - fpu_name = "maverick"; - break; - case FPUTYPE_VFP: - fpu_name = "vfp"; - set_float_abi_attributes = 1; - break; - case FPUTYPE_VFP3D16: - fpu_name = "vfpv3-d16"; - set_float_abi_attributes = 1; - break; - case FPUTYPE_VFP3: - fpu_name = "vfpv3"; - set_float_abi_attributes = 1; - break; - case FPUTYPE_NEON: - fpu_name = "neon"; - set_float_abi_attributes = 1; - break; - case FPUTYPE_NEON_FP16: - fpu_name = "neon-fp16"; - set_float_abi_attributes = 1; - break; - default: - abort(); - } - if (set_float_abi_attributes) + fpu_name = arm_fpu_desc->name; + if (arm_fpu_desc->model == ARM_FP_MODEL_VFP) { if (TARGET_HARD_FLOAT) asm_fprintf (asm_out_file, "\t.eabi_attribute 27, 3\n"); @@ -21173,4 +21160,12 @@ arm_frame_pointer_required (void) || (TARGET_ARM && TARGET_APCS_FRAME && ! leaf_function_p ())); } +/* Only thumb1 can't support conditional execution, so return true if + the target is not thumb1. */ +static bool +arm_have_conditional_execution (void) +{ + return !TARGET_THUMB1; +} + #include "gt-arm.h" diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 9272ca51cba..2dfd22df45c 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -190,9 +190,9 @@ extern void (*arm_lang_output_object_attributes_hook)(void); #define TARGET_HARD_FLOAT (arm_float_abi != ARM_FLOAT_ABI_SOFT) /* Use hardware floating point calling convention. */ #define TARGET_HARD_FLOAT_ABI (arm_float_abi == ARM_FLOAT_ABI_HARD) -#define TARGET_FPA (arm_fp_model == ARM_FP_MODEL_FPA) -#define TARGET_MAVERICK (arm_fp_model == ARM_FP_MODEL_MAVERICK) -#define TARGET_VFP (arm_fp_model == ARM_FP_MODEL_VFP) +#define TARGET_FPA (arm_fpu_desc->model == ARM_FP_MODEL_FPA) +#define TARGET_MAVERICK (arm_fpu_desc->model == ARM_FP_MODEL_MAVERICK) +#define TARGET_VFP (arm_fpu_desc->model == ARM_FP_MODEL_VFP) #define TARGET_IWMMXT (arm_arch_iwmmxt) #define TARGET_REALLY_IWMMXT (TARGET_IWMMXT && TARGET_32BIT) #define TARGET_IWMMXT_ABI (TARGET_32BIT && arm_abi == ARM_ABI_IWMMXT) @@ -216,6 +216,8 @@ extern void (*arm_lang_output_object_attributes_hook)(void); #define TARGET_THUMB2 (TARGET_THUMB && arm_arch_thumb2) /* Thumb-1 only. */ #define TARGET_THUMB1_ONLY (TARGET_THUMB1 && !arm_arch_notm) +/* FPA emulator without LFM. */ +#define TARGET_FPA_EMU2 (TARGET_FPA && arm_fpu_desc->rev == 2) /* The following two macros concern the ability to execute coprocessor instructions for VFPv3 or NEON. TARGET_VFP3/TARGET_VFPD32 are currently @@ -223,27 +225,21 @@ extern void (*arm_lang_output_object_attributes_hook)(void); to be more careful with TARGET_NEON as noted below. */ /* FPU is has the full VFPv3/NEON register file of 32 D registers. */ -#define TARGET_VFPD32 (arm_fp_model == ARM_FP_MODEL_VFP \ - && (arm_fpu_arch == FPUTYPE_VFP3 \ - || arm_fpu_arch == FPUTYPE_NEON \ - || arm_fpu_arch == FPUTYPE_NEON_FP16)) +#define TARGET_VFPD32 (TARGET_VFP && arm_fpu_desc->regs == VFP_REG_D32) /* FPU supports VFPv3 instructions. */ -#define TARGET_VFP3 (arm_fp_model == ARM_FP_MODEL_VFP \ - && (arm_fpu_arch == FPUTYPE_VFP3D16 \ - || TARGET_VFPD32)) +#define TARGET_VFP3 (TARGET_VFP && arm_fpu_desc->rev >= 3) /* FPU supports NEON/VFP half-precision floating-point. */ -#define TARGET_NEON_FP16 (arm_fpu_arch == FPUTYPE_NEON_FP16) +#define TARGET_NEON_FP16 \ + (TARGET_VFP && arm_fpu_desc->neon && arm_fpu_desc->fp16) /* FPU supports Neon instructions. The setting of this macro gets revealed via __ARM_NEON__ so we add extra guards upon TARGET_32BIT and TARGET_HARD_FLOAT to ensure that NEON instructions are available. */ #define TARGET_NEON (TARGET_32BIT && TARGET_HARD_FLOAT \ - && arm_fp_model == ARM_FP_MODEL_VFP \ - && (arm_fpu_arch == FPUTYPE_NEON \ - || arm_fpu_arch == FPUTYPE_NEON_FP16)) + && TARGET_VFP && arm_fpu_desc->neon) /* "DSP" multiply instructions, eg. SMULxy. */ #define TARGET_DSP_MULTIPLY \ @@ -300,42 +296,25 @@ enum arm_fp_model ARM_FP_MODEL_VFP }; -extern enum arm_fp_model arm_fp_model; - -/* Which floating point hardware is available. Also update - fp_model_for_fpu in arm.c when adding entries to this list. */ -enum fputype +enum vfp_reg_type { - /* No FP hardware. */ - FPUTYPE_NONE, - /* Full FPA support. */ - FPUTYPE_FPA, - /* Emulated FPA hardware, Issue 2 emulator (no LFM/SFM). */ - FPUTYPE_FPA_EMU2, - /* Emulated FPA hardware, Issue 3 emulator. */ - FPUTYPE_FPA_EMU3, - /* Cirrus Maverick floating point co-processor. */ - FPUTYPE_MAVERICK, - /* VFP. */ - FPUTYPE_VFP, - /* VFPv3-D16. */ - FPUTYPE_VFP3D16, - /* VFPv3. */ - FPUTYPE_VFP3, - /* Neon. */ - FPUTYPE_NEON, - /* Neon with half-precision float extensions. */ - FPUTYPE_NEON_FP16 + VFP_REG_D16, + VFP_REG_D32, + VFP_REG_SINGLE }; -/* Recast the floating point class to be the floating point attribute. */ -#define arm_fpu_attr ((enum attr_fpu) arm_fpu_tune) - -/* What type of floating point to tune for */ -extern enum fputype arm_fpu_tune; - -/* What type of floating point instructions are available */ -extern enum fputype arm_fpu_arch; +extern const struct arm_fpu_desc +{ + const char *name; + enum arm_fp_model model; + int rev; + enum vfp_reg_type regs; + int neon; + int fp16; +} *arm_fpu_desc; + +/* Which floating point hardware to schedule for. */ +extern int arm_fpu_attr; enum float_abi_type { diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index e180c2f08f1..52edcbaa17b 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -160,7 +160,7 @@ ; Floating Point Unit. If we only have floating point emulation, then there ; is no point in scheduling the floating point insns. (Well, for best ; performance we should try and group them together). -(define_attr "fpu" "none,fpa,fpe2,fpe3,maverick,vfp,vfpv3d16,vfpv3,neon,neon_fp16" +(define_attr "fpu" "none,fpa,fpe2,fpe3,maverick,vfp" (const (symbol_ref "arm_fpu_attr"))) ; LENGTH of an instruction (in bytes) @@ -392,6 +392,9 @@ ; registers. (define_mode_iterator ANY64 [DI DF V8QI V4HI V2SI V2SF]) +;; The integer modes up to word size +(define_mode_iterator QHSI [QI HI SI]) + ;;--------------------------------------------------------------------------- ;; Predicates @@ -1914,7 +1917,16 @@ else /* TARGET_THUMB1 */ { if (GET_CODE (operands[2]) != CONST_INT) - operands[2] = force_reg (SImode, operands[2]); + { + rtx tmp = force_reg (SImode, operands[2]); + if (rtx_equal_p (operands[0], operands[1])) + operands[2] = tmp; + else + { + operands[2] = operands[1]; + operands[1] = tmp; + } + } else { int i; @@ -2623,7 +2635,16 @@ DONE; } else /* TARGET_THUMB1 */ - operands [2] = force_reg (SImode, operands [2]); + { + rtx tmp = force_reg (SImode, operands[2]); + if (rtx_equal_p (operands[0], operands[1])) + operands[2] = tmp; + else + { + operands[2] = operands[1]; + operands[1] = tmp; + } + } } " ) @@ -2731,12 +2752,29 @@ (define_expand "xorsi3" [(set (match_operand:SI 0 "s_register_operand" "") (xor:SI (match_operand:SI 1 "s_register_operand" "") - (match_operand:SI 2 "arm_rhs_operand" "")))] + (match_operand:SI 2 "reg_or_int_operand" "")))] "TARGET_EITHER" - "if (TARGET_THUMB1) - if (GET_CODE (operands[2]) == CONST_INT) - operands[2] = force_reg (SImode, operands[2]); - " + "if (GET_CODE (operands[2]) == CONST_INT) + { + if (TARGET_32BIT) + { + arm_split_constant (XOR, SImode, NULL_RTX, + INTVAL (operands[2]), operands[0], operands[1], + optimize && can_create_pseudo_p ()); + DONE; + } + else /* TARGET_THUMB1 */ + { + rtx tmp = force_reg (SImode, operands[2]); + if (rtx_equal_p (operands[0], operands[1])) + operands[2] = tmp; + else + { + operands[2] = operands[1]; + operands[1] = tmp; + } + } + }" ) (define_insn "*arm_xorsi3" @@ -5813,6 +5851,11 @@ { rtx reg = gen_reg_rtx (SImode); + /* For thumb we want an unsigned immediate, then we are more likely + to be able to use a movs insn. */ + if (TARGET_THUMB) + operands[1] = GEN_INT (INTVAL (operands[1]) & 255); + emit_insn (gen_movsi (reg, operands[1])); operands[1] = gen_lowpart (QImode, reg); } @@ -6727,6 +6770,7 @@ (const_int 6) (const_int 8))))] ) + (define_insn "*movsi_cbranchsi4" [(set (pc) (if_then_else @@ -6790,6 +6834,45 @@ (const_int 10)))))] ) +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (match_operand:SI 1 "low_register_operand" "")) + (set (pc) + (if_then_else (match_operator 2 "arm_comparison_operator" + [(match_dup 1) (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1" + [(parallel + [(set (pc) + (if_then_else (match_op_dup 2 [(match_dup 1) (const_int 0)]) + (label_ref (match_dup 3)) + (pc))) + (set (match_dup 0) (match_dup 1))])] + "" +) + +;; Sigh! This variant shouldn't be needed, but combine often fails to +;; merge cases like this because the op1 is a hard register in +;; CLASS_LIKELY_SPILLED_P. +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (match_operand:SI 1 "low_register_operand" "")) + (set (pc) + (if_then_else (match_operator 2 "arm_comparison_operator" + [(match_dup 0) (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1" + [(parallel + [(set (pc) + (if_then_else (match_op_dup 2 [(match_dup 1) (const_int 0)]) + (label_ref (match_dup 3)) + (pc))) + (set (match_dup 0) (match_dup 1))])] + "" +) + (define_insn "*negated_cbranchsi4" [(set (pc) (if_then_else @@ -8033,15 +8116,13 @@ if (!thumb1_cmp_operand (op3, SImode)) op3 = force_reg (SImode, op3); scratch = gen_reg_rtx (SImode); - emit_insn (gen_cstoresi_nltu_thumb1 (scratch, operands[2], op3)); - emit_insn (gen_negsi2 (operands[0], scratch)); + emit_insn (gen_cstoresi_ltu_thumb1 (operands[0], operands[2], op3)); break; case GTU: op3 = force_reg (SImode, operands[3]); scratch = gen_reg_rtx (SImode); - emit_insn (gen_cstoresi_nltu_thumb1 (scratch, op3, operands[2])); - emit_insn (gen_negsi2 (operands[0], scratch)); + emit_insn (gen_cstoresi_ltu_thumb1 (operands[0], op3, operands[2])); break; /* No good sequences for GT, LT. */ @@ -8125,6 +8206,7 @@ [(set_attr "length" "4")] ) +;; Used as part of the expansion of thumb ltu and gtu sequences (define_insn "cstoresi_nltu_thumb1" [(set (match_operand:SI 0 "s_register_operand" "=l,l") (neg:SI (ltu:SI (match_operand:SI 1 "s_register_operand" "l,*h") @@ -8134,6 +8216,20 @@ [(set_attr "length" "4")] ) +(define_insn_and_split "cstoresi_ltu_thumb1" + [(set (match_operand:SI 0 "s_register_operand" "=l,l") + (ltu:SI (match_operand:SI 1 "s_register_operand" "l,*h") + (match_operand:SI 2 "thumb1_cmp_operand" "lI*h,*r")))] + "TARGET_THUMB1" + "#" + "TARGET_THUMB1" + [(set (match_dup 3) + (neg:SI (ltu:SI (match_dup 1) (match_dup 2)))) + (set (match_dup 0) (neg:SI (match_dup 3)))] + "operands[3] = gen_reg_rtx (SImode);" + [(set_attr "length" "4")] +) + ;; Used as part of the expansion of thumb les sequence. (define_insn "thumb1_addsi3_addgeu" [(set (match_operand:SI 0 "s_register_operand" "=l") diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index faaaf7bca39..ccfc7426077 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -61,7 +61,7 @@ typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16))); typedef __builtin_neon_usi uint32x4_t __attribute__ ((__vector_size__ (16))); typedef __builtin_neon_udi uint64x2_t __attribute__ ((__vector_size__ (16))); -typedef __builtin_neon_sf float32_t; +typedef float float32_t; typedef __builtin_neon_poly8 poly8_t; typedef __builtin_neon_poly16 poly16_t; @@ -5085,7 +5085,7 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c) { - return (float32x2_t)__builtin_neon_vset_lanev2sf (__a, __b, __c); + return (float32x2_t)__builtin_neon_vset_lanev2sf ((__builtin_neon_sf) __a, __b, __c); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -5151,7 +5151,7 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c) { - return (float32x4_t)__builtin_neon_vset_lanev4sf (__a, __b, __c); + return (float32x4_t)__builtin_neon_vset_lanev4sf ((__builtin_neon_sf) __a, __b, __c); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -5283,7 +5283,7 @@ vdup_n_s32 (int32_t __a) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vdup_n_f32 (float32_t __a) { - return (float32x2_t)__builtin_neon_vdup_nv2sf (__a); + return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -5349,7 +5349,7 @@ vdupq_n_s32 (int32_t __a) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vdupq_n_f32 (float32_t __a) { - return (float32x4_t)__builtin_neon_vdup_nv4sf (__a); + return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -5415,7 +5415,7 @@ vmov_n_s32 (int32_t __a) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmov_n_f32 (float32_t __a) { - return (float32x2_t)__builtin_neon_vdup_nv2sf (__a); + return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -5481,7 +5481,7 @@ vmovq_n_s32 (int32_t __a) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vmovq_n_f32 (float32_t __a) { - return (float32x4_t)__builtin_neon_vdup_nv4sf (__a); + return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -6591,7 +6591,7 @@ vmul_n_s32 (int32x2_t __a, int32_t __b) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmul_n_f32 (float32x2_t __a, float32_t __b) { - return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, __b, 3); + return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b, 3); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) @@ -6621,7 +6621,7 @@ vmulq_n_s32 (int32x4_t __a, int32_t __b) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vmulq_n_f32 (float32x4_t __a, float32_t __b) { - return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, __b, 3); + return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b, 3); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) @@ -6735,7 +6735,7 @@ vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, __c, 3); + return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, (__builtin_neon_sf) __c, 3); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) @@ -6765,7 +6765,7 @@ vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, __c, 3); + return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, (__builtin_neon_sf) __c, 3); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) @@ -6831,7 +6831,7 @@ vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, __c, 3); + return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, (__builtin_neon_sf) __c, 3); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) @@ -6861,7 +6861,7 @@ vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, __c, 3); + return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, (__builtin_neon_sf) __c, 3); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) @@ -7851,7 +7851,7 @@ vld1_s64 (const int64_t * __a) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vld1_f32 (const float32_t * __a) { - return (float32x2_t)__builtin_neon_vld1v2sf (__a); + return (float32x2_t)__builtin_neon_vld1v2sf ((const __builtin_neon_sf *) __a); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -7917,7 +7917,7 @@ vld1q_s64 (const int64_t * __a) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vld1q_f32 (const float32_t * __a) { - return (float32x4_t)__builtin_neon_vld1v4sf (__a); + return (float32x4_t)__builtin_neon_vld1v4sf ((const __builtin_neon_sf *) __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -7977,7 +7977,7 @@ vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c) { - return (float32x2_t)__builtin_neon_vld1_lanev2sf (__a, __b, __c); + return (float32x2_t)__builtin_neon_vld1_lanev2sf ((const __builtin_neon_sf *) __a, __b, __c); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -8043,7 +8043,7 @@ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c) { - return (float32x4_t)__builtin_neon_vld1_lanev4sf (__a, __b, __c); + return (float32x4_t)__builtin_neon_vld1_lanev4sf ((const __builtin_neon_sf *) __a, __b, __c); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -8109,7 +8109,7 @@ vld1_dup_s32 (const int32_t * __a) __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vld1_dup_f32 (const float32_t * __a) { - return (float32x2_t)__builtin_neon_vld1_dupv2sf (__a); + return (float32x2_t)__builtin_neon_vld1_dupv2sf ((const __builtin_neon_sf *) __a); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -8175,7 +8175,7 @@ vld1q_dup_s32 (const int32_t * __a) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vld1q_dup_f32 (const float32_t * __a) { - return (float32x4_t)__builtin_neon_vld1_dupv4sf (__a); + return (float32x4_t)__builtin_neon_vld1_dupv4sf ((const __builtin_neon_sf *) __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -8247,7 +8247,7 @@ vst1_s64 (int64_t * __a, int64x1_t __b) __extension__ static __inline void __attribute__ ((__always_inline__)) vst1_f32 (float32_t * __a, float32x2_t __b) { - __builtin_neon_vst1v2sf (__a, __b); + __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -8313,7 +8313,7 @@ vst1q_s64 (int64_t * __a, int64x2_t __b) __extension__ static __inline void __attribute__ ((__always_inline__)) vst1q_f32 (float32_t * __a, float32x4_t __b) { - __builtin_neon_vst1v4sf (__a, __b); + __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -8373,7 +8373,7 @@ vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c) __extension__ static __inline void __attribute__ ((__always_inline__)) vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c) { - __builtin_neon_vst1_lanev2sf (__a, __b, __c); + __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -8439,7 +8439,7 @@ vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c) __extension__ static __inline void __attribute__ ((__always_inline__)) vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c) { - __builtin_neon_vst1_lanev4sf (__a, __b, __c); + __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -8512,7 +8512,7 @@ __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vld2_f32 (const float32_t * __a) { union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; - __rv.__o = __builtin_neon_vld2v2sf (__a); + __rv.__o = __builtin_neon_vld2v2sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -8600,7 +8600,7 @@ __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vld2q_f32 (const float32_t * __a) { union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_neon_vld2v4sf (__a); + __rv.__o = __builtin_neon_vld2v4sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -8676,7 +8676,7 @@ vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c) { union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; - __rv.__o = __builtin_neon_vld2_lanev2sf (__a, __bu.__o, __c); + __rv.__o = __builtin_neon_vld2_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); return __rv.__i; } @@ -8748,7 +8748,7 @@ vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c) { union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_neon_vld2_lanev4sf (__a, __bu.__o, __c); + __rv.__o = __builtin_neon_vld2_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); return __rv.__i; } @@ -8807,7 +8807,7 @@ __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vld2_dup_f32 (const float32_t * __a) { union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; - __rv.__o = __builtin_neon_vld2_dupv2sf (__a); + __rv.__o = __builtin_neon_vld2_dupv2sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -8892,7 +8892,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst2_f32 (float32_t * __a, float32x2x2_t __b) { union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; - __builtin_neon_vst2v2sf (__a, __bu.__o); + __builtin_neon_vst2v2sf ((__builtin_neon_sf *) __a, __bu.__o); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -8969,7 +8969,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst2q_f32 (float32_t * __a, float32x4x2_t __b) { union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; - __builtin_neon_vst2v4sf (__a, __bu.__o); + __builtin_neon_vst2v4sf ((__builtin_neon_sf *) __a, __bu.__o); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -9032,7 +9032,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c) { union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; - __builtin_neon_vst2_lanev2sf (__a, __bu.__o, __c); + __builtin_neon_vst2_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -9088,7 +9088,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c) { union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; - __builtin_neon_vst2_lanev4sf (__a, __bu.__o, __c); + __builtin_neon_vst2_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -9140,7 +9140,7 @@ __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) vld3_f32 (const float32_t * __a) { union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; - __rv.__o = __builtin_neon_vld3v2sf (__a); + __rv.__o = __builtin_neon_vld3v2sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -9228,7 +9228,7 @@ __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) vld3q_f32 (const float32_t * __a) { union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv; - __rv.__o = __builtin_neon_vld3v4sf (__a); + __rv.__o = __builtin_neon_vld3v4sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -9304,7 +9304,7 @@ vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c) { union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; - __rv.__o = __builtin_neon_vld3_lanev2sf (__a, __bu.__o, __c); + __rv.__o = __builtin_neon_vld3_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); return __rv.__i; } @@ -9376,7 +9376,7 @@ vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c) { union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv; - __rv.__o = __builtin_neon_vld3_lanev4sf (__a, __bu.__o, __c); + __rv.__o = __builtin_neon_vld3_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); return __rv.__i; } @@ -9435,7 +9435,7 @@ __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) vld3_dup_f32 (const float32_t * __a) { union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; - __rv.__o = __builtin_neon_vld3_dupv2sf (__a); + __rv.__o = __builtin_neon_vld3_dupv2sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -9520,7 +9520,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst3_f32 (float32_t * __a, float32x2x3_t __b) { union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; - __builtin_neon_vst3v2sf (__a, __bu.__o); + __builtin_neon_vst3v2sf ((__builtin_neon_sf *) __a, __bu.__o); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -9597,7 +9597,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst3q_f32 (float32_t * __a, float32x4x3_t __b) { union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; - __builtin_neon_vst3v4sf (__a, __bu.__o); + __builtin_neon_vst3v4sf ((__builtin_neon_sf *) __a, __bu.__o); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -9660,7 +9660,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c) { union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; - __builtin_neon_vst3_lanev2sf (__a, __bu.__o, __c); + __builtin_neon_vst3_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -9716,7 +9716,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c) { union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; - __builtin_neon_vst3_lanev4sf (__a, __bu.__o, __c); + __builtin_neon_vst3_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -9768,7 +9768,7 @@ __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) vld4_f32 (const float32_t * __a) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_neon_vld4v2sf (__a); + __rv.__o = __builtin_neon_vld4v2sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -9856,7 +9856,7 @@ __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) vld4q_f32 (const float32_t * __a) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_neon_vld4v4sf (__a); + __rv.__o = __builtin_neon_vld4v4sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -9932,7 +9932,7 @@ vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_neon_vld4_lanev2sf (__a, __bu.__o, __c); + __rv.__o = __builtin_neon_vld4_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); return __rv.__i; } @@ -10004,7 +10004,7 @@ vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; - __rv.__o = __builtin_neon_vld4_lanev4sf (__a, __bu.__o, __c); + __rv.__o = __builtin_neon_vld4_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); return __rv.__i; } @@ -10063,7 +10063,7 @@ __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) vld4_dup_f32 (const float32_t * __a) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; - __rv.__o = __builtin_neon_vld4_dupv2sf (__a); + __rv.__o = __builtin_neon_vld4_dupv2sf ((const __builtin_neon_sf *) __a); return __rv.__i; } @@ -10148,7 +10148,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst4_f32 (float32_t * __a, float32x2x4_t __b) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; - __builtin_neon_vst4v2sf (__a, __bu.__o); + __builtin_neon_vst4v2sf ((__builtin_neon_sf *) __a, __bu.__o); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -10225,7 +10225,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst4q_f32 (float32_t * __a, float32x4x4_t __b) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; - __builtin_neon_vst4v4sf (__a, __bu.__o); + __builtin_neon_vst4v4sf ((__builtin_neon_sf *) __a, __bu.__o); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -10288,7 +10288,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c) { union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; - __builtin_neon_vst4_lanev2sf (__a, __bu.__o, __c); + __builtin_neon_vst4_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) @@ -10344,7 +10344,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__)) vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c) { union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; - __builtin_neon_vst4_lanev4sf (__a, __bu.__o, __c); + __builtin_neon_vst4_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); } __extension__ static __inline void __attribute__ ((__always_inline__)) diff --git a/gcc/config/arm/bpabi.h b/gcc/config/arm/bpabi.h index bc0c62f401e..ba206022b75 100644 --- a/gcc/config/arm/bpabi.h +++ b/gcc/config/arm/bpabi.h @@ -30,7 +30,7 @@ /* Section 4.1 of the AAPCS requires the use of VFP format. */ #undef FPUTYPE_DEFAULT -#define FPUTYPE_DEFAULT FPUTYPE_VFP +#define FPUTYPE_DEFAULT "vfp" /* TARGET_BIG_ENDIAN_DEFAULT is set in config.gcc for big endian configurations. */ @@ -53,6 +53,8 @@ #define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}" +#define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a8|mcpu=cortex-a9:%{!r:--be8}}}" + /* Tell the assembler to build BPABI binaries. */ #undef SUBTARGET_EXTRA_ASM_SPEC #define SUBTARGET_EXTRA_ASM_SPEC "%{mabi=apcs-gnu|mabi=atpcs:-meabi=gnu;:-meabi=5}" TARGET_FIX_V4BX_SPEC @@ -65,7 +67,7 @@ #define BPABI_LINK_SPEC \ "%{mbig-endian:-EB} %{mlittle-endian:-EL} " \ "%{static:-Bstatic} %{shared:-shared} %{symbolic:-Bsymbolic} " \ - "-X" SUBTARGET_EXTRA_LINK_SPEC TARGET_FIX_V4BX_SPEC + "-X" SUBTARGET_EXTRA_LINK_SPEC TARGET_FIX_V4BX_SPEC BE8_LINK_SPEC #undef LINK_SPEC #define LINK_SPEC BPABI_LINK_SPEC diff --git a/gcc/config/arm/cortex-a9.md b/gcc/config/arm/cortex-a9.md index 121fd2da747..d1ad7cba767 100644 --- a/gcc/config/arm/cortex-a9.md +++ b/gcc/config/arm/cortex-a9.md @@ -1,6 +1,8 @@ -;; ARM Cortex-A9 VFP pipeline description -;; Copyright (C) 2008 Free Software Foundation, Inc. -;; Written by CodeSourcery. +;; ARM Cortex-A9 pipeline description +;; Copyright (C) 2008, 2009 Free Software Foundation, Inc. +;; Originally written by CodeSourcery for VFP. +;; +;; Integer core pipeline description contributed by ARM Ltd. ;; ;; This file is part of GCC. ;; @@ -20,9 +22,181 @@ (define_automaton "cortex_a9") -;; FIXME: We model a single pipeline for all instructions. -;; Is dual-issue possible, and do we have other pipelines? -(define_cpu_unit "cortex_a9_vfp" "cortex_a9") +;; The Cortex-A9 integer core is modelled as a dual issue pipeline that has +;; the following components. +;; 1. 1 Load Store Pipeline. +;; 2. P0 / main pipeline for data processing instructions. +;; 3. P1 / Dual pipeline for Data processing instructions. +;; 4. MAC pipeline for multiply as well as multiply +;; and accumulate instructions. +;; 5. 1 VFP / Neon pipeline. +;; The Load/Store and VFP/Neon pipeline are multiplexed. +;; The P0 / main pipeline and M1 stage of the MAC pipeline are +;; multiplexed. +;; The P1 / dual pipeline and M2 stage of the MAC pipeline are +;; multiplexed. +;; There are only 4 register read ports and hence at any point of +;; time we can't have issue down the E1 and the E2 ports unless +;; of course there are bypass paths that get exercised. +;; Both P0 and P1 have 2 stages E1 and E2. +;; Data processing instructions issue to E1 or E2 depending on +;; whether they have an early shift or not. + + +(define_cpu_unit "cortex_a9_vfp, cortex_a9_ls" "cortex_a9") +(define_cpu_unit "cortex_a9_p0_e1, cortex_a9_p0_e2" "cortex_a9") +(define_cpu_unit "cortex_a9_p1_e1, cortex_a9_p1_e2" "cortex_a9") +(define_cpu_unit "cortex_a9_p0_wb, cortex_a9_p1_wb" "cortex_a9") +(define_cpu_unit "cortex_a9_mac_m1, cortex_a9_mac_m2" "cortex_a9") +(define_cpu_unit "cortex_a9_branch, cortex_a9_issue_branch" "cortex_a9") + +(define_reservation "cortex_a9_p0_default" "cortex_a9_p0_e2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_p1_default" "cortex_a9_p1_e2, cortex_a9_p1_wb") +(define_reservation "cortex_a9_p0_shift" "cortex_a9_p0_e1, cortex_a9_p0_default") +(define_reservation "cortex_a9_p1_shift" "cortex_a9_p1_e1, cortex_a9_p1_default") + +(define_reservation "cortex_a9_multcycle1" + "cortex_a9_p0_e2 + cortex_a9_mac_m1 + cortex_a9_mac_m2 + \ +cortex_a9_p1_e2 + cortex_a9_p0_e1 + cortex_a9_p1_e1") + +(define_reservation "cortex_a9_mult16" + "cortex_a9_mac_m1, cortex_a9_mac_m2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_mac16" + "cortex_a9_multcycle1, cortex_a9_mac_m2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_mult" + "cortex_a9_mac_m1*2, cortex_a9_mac_m2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_mac" + "cortex_a9_multcycle1*2 ,cortex_a9_mac_m2, cortex_a9_p0_wb") + + +;; Issue at the same time along the load store pipeline and +;; the VFP / Neon pipeline is not possible. +;; FIXME:: At some point we need to model the issue +;; of the load store and the vfp being shared rather than anything else. + +(exclusion_set "cortex_a9_ls" "cortex_a9_vfp") + + +;; Default data processing instruction without any shift +;; The only exception to this is the mov instruction +;; which can go down E2 without any problem. +(define_insn_reservation "cortex_a9_dp" 2 + (and (eq_attr "tune" "cortexa9") + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "alu_shift_reg, alu_shift") + (eq_attr "insn" "mov")))) + "cortex_a9_p0_default|cortex_a9_p1_default") + +;; An instruction using the shifter will go down E1. +(define_insn_reservation "cortex_a9_dp_shift" 3 + (and (eq_attr "tune" "cortexa9") + (and (eq_attr "type" "alu_shift_reg, alu_shift") + (not (eq_attr "insn" "mov")))) + "cortex_a9_p0_shift | cortex_a9_p1_shift") + +;; Loads have a latency of 4 cycles. +;; We don't model autoincrement instructions. These +;; instructions use the load store pipeline and 1 of +;; the E2 units to write back the result of the increment. + +(define_insn_reservation "cortex_a9_load1_2" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "load1, load2, load_byte")) + "cortex_a9_ls") + +;; Loads multiples and store multiples can't be issued for 2 cycles in a +;; row. The description below assumes that addresses are 64 bit aligned. +;; If not, there is an extra cycle latency which is not modelled. + +;; FIXME:: This bit might need to be reworked when we get to +;; tuning for the VFP because strictly speaking the ldm +;; is sent to the LSU unit as is and there is only an +;; issue restriction between the LSU and the VFP/ Neon unit. + +(define_insn_reservation "cortex_a9_load3_4" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "load3, load4")) + "cortex_a9_ls, cortex_a9_ls") + +(define_insn_reservation "cortex_a9_store1_2" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "store1, store2")) + "cortex_a9_ls") + +;; Almost all our store multiples use an auto-increment +;; form. Don't issue back to back load and store multiples +;; because the load store unit will stall. +(define_insn_reservation "cortex_a9_store3_4" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "store3, store4")) + "cortex_a9_ls+(cortex_a9_p0_default | cortex_a9_p1_default), cortex_a9_ls") + +;; We get 16*16 multiply / mac results in 3 cycles. +(define_insn_reservation "cortex_a9_mult16" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "smulxy")) + "cortex_a9_mult16") + +;; The 16*16 mac is slightly different that it +;; reserves M1 and M2 in the same cycle. +(define_insn_reservation "cortex_a9_mac16" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "smlaxy")) + "cortex_a9_mac16") + + +(define_insn_reservation "cortex_a9_multiply" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "mul")) + "cortex_a9_mult") + +(define_insn_reservation "cortex_a9_mac" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "mla")) + "cortex_a9_mac") + +;; An instruction with a result in E2 can be forwarded +;; to E2 or E1 or M1 or the load store unit in the next cycle. + +(define_bypass 1 "cortex_a9_dp" + "cortex_a9_dp_shift, cortex_a9_multiply, + cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2, + cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4") + +(define_bypass 2 "cortex_a9_dp_shift" + "cortex_a9_dp_shift, cortex_a9_multiply, + cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2, + cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4") + +;; An instruction in the load store pipeline can provide +;; read access to a DP instruction in the P0 default pipeline +;; before the writeback stage. + +(define_bypass 3 "cortex_a9_load1_2" "cortex_a9_dp, cortex_a9_load1_2, +cortex_a9_store3_4, cortex_a9_store1_2") + +(define_bypass 4 "cortex_a9_load3_4" "cortex_a9_dp, cortex_a9_load1_2, +cortex_a9_store3_4, cortex_a9_store1_2, cortex_a9_load3_4") + +;; Calls and branches. + +;; Branch instructions + +(define_insn_reservation "cortex_a9_branch" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "branch")) + "cortex_a9_branch") + +;; Call latencies are essentially 0 but make sure +;; dual issue doesn't happen i.e the next instruction +;; starts at the next cycle. +(define_insn_reservation "cortex_a9_call" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "call")) + "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + cortex_a9_vfp") + + +;; Pipelining for VFP instructions. (define_insn_reservation "cortex_a9_ffarith" 1 (and (eq_attr "tune" "cortexa9") diff --git a/gcc/config/arm/fpa.md b/gcc/config/arm/fpa.md index fcd92b002d7..515de43d28b 100644 --- a/gcc/config/arm/fpa.md +++ b/gcc/config/arm/fpa.md @@ -599,10 +599,10 @@ { default: case 0: return \"mvf%?e\\t%0, %1\"; - case 1: if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + case 1: if (TARGET_FPA_EMU2) return \"ldf%?e\\t%0, %1\"; return \"lfm%?\\t%0, 1, %1\"; - case 2: if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + case 2: if (TARGET_FPA_EMU2) return \"stf%?e\\t%1, %0\"; return \"sfm%?\\t%1, 1, %0\"; } diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h index 780a504add2..fce1ed165d3 100644 --- a/gcc/config/arm/linux-eabi.h +++ b/gcc/config/arm/linux-eabi.h @@ -66,7 +66,7 @@ /* At this point, bpabi.h will have clobbered LINK_SPEC. We want to use the GNU/Linux version, not the generic BPABI version. */ #undef LINK_SPEC -#define LINK_SPEC LINUX_TARGET_LINK_SPEC +#define LINK_SPEC LINUX_TARGET_LINK_SPEC BE8_LINK_SPEC /* Use the default LIBGCC_SPEC, not the version in linux-elf.h, as we do not use -lfloat. */ diff --git a/gcc/config/arm/linux-elf.h b/gcc/config/arm/linux-elf.h index 07455ee87fd..9fdca414e8e 100644 --- a/gcc/config/arm/linux-elf.h +++ b/gcc/config/arm/linux-elf.h @@ -98,7 +98,7 @@ /* NWFPE always understands FPA instructions. */ #undef FPUTYPE_DEFAULT -#define FPUTYPE_DEFAULT FPUTYPE_FPA_EMU3 +#define FPUTYPE_DEFAULT "fpe3" /* Call the function profiler with a given profile label. */ #undef ARM_FUNCTION_PROFILER diff --git a/gcc/config/arm/neon-gen.ml b/gcc/config/arm/neon-gen.ml index 9c8e2a89b86..112c8be6e3b 100644 --- a/gcc/config/arm/neon-gen.ml +++ b/gcc/config/arm/neon-gen.ml @@ -122,6 +122,7 @@ let rec signed_ctype = function | T_uint16 | T_int16 -> T_intHI | T_uint32 | T_int32 -> T_intSI | T_uint64 | T_int64 -> T_intDI + | T_float32 -> T_floatSF | T_poly8 -> T_intQI | T_poly16 -> T_intHI | T_arrayof (n, elt) -> T_arrayof (n, signed_ctype elt) @@ -320,7 +321,7 @@ let deftypes () = typeinfo; Format.print_newline (); (* Extra types not in <stdint.h>. *) - Format.printf "typedef __builtin_neon_sf float32_t;\n"; + Format.printf "typedef float float32_t;\n"; Format.printf "typedef __builtin_neon_poly8 poly8_t;\n"; Format.printf "typedef __builtin_neon_poly16 poly16_t;\n" diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 85bc3eed100..7d1ef111339 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3655,7 +3655,8 @@ UNSPEC_VSHLL_N))] "TARGET_NEON" { - neon_const_bounds (operands[2], 0, neon_element_bits (<MODE>mode)); + /* The boundaries are: 0 < imm <= size. */ + neon_const_bounds (operands[2], 0, neon_element_bits (<MODE>mode) + 1); return "vshll.%T3%#<V_sz_elem>\t%q0, %P1, %2"; } [(set_attr "neon_type" "neon_shift_1")] diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml index 10393b33ebc..114097d22a7 100644 --- a/gcc/config/arm/neon.ml +++ b/gcc/config/arm/neon.ml @@ -50,7 +50,7 @@ type vectype = T_int8x8 | T_int8x16 | T_ptrto of vectype | T_const of vectype | T_void | T_intQI | T_intHI | T_intSI - | T_intDI + | T_intDI | T_floatSF (* The meanings of the following are: TImode : "Tetra", two registers (four words). @@ -1693,6 +1693,7 @@ let string_of_vectype vt = | T_intHI -> "__builtin_neon_hi" | T_intSI -> "__builtin_neon_si" | T_intDI -> "__builtin_neon_di" + | T_floatSF -> "__builtin_neon_sf" | T_arrayof (num, base) -> let basename = name (fun x -> x) base in affix (Printf.sprintf "%sx%d" basename num) diff --git a/gcc/config/arm/netbsd-elf.h b/gcc/config/arm/netbsd-elf.h index 4c06fa1cb3b..9cf186b338d 100644 --- a/gcc/config/arm/netbsd-elf.h +++ b/gcc/config/arm/netbsd-elf.h @@ -153,5 +153,5 @@ do \ while (0) #undef FPUTYPE_DEFAULT -#define FPUTYPE_DEFAULT FPUTYPE_VFP +#define FPUTYPE_DEFAULT "vfp" diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 884d58c7677..82f75f9b733 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1156,8 +1156,8 @@ ;; 16-bit load immediate (define_peephole2 - [(set (match_operand:SI 0 "low_register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))] + [(set (match_operand:QHSI 0 "low_register_operand" "") + (match_operand:QHSI 1 "const_int_operand" ""))] "TARGET_THUMB2 && peep2_regno_dead_p(0, CC_REGNUM) && (unsigned HOST_WIDE_INT) INTVAL(operands[1]) < 256" @@ -1168,9 +1168,9 @@ "" ) -(define_insn "*thumb2_movsi_shortim" - [(set (match_operand:SI 0 "low_register_operand" "=l") - (match_operand:SI 1 "const_int_operand" "I")) +(define_insn "*thumb2_mov<mode>_shortim" + [(set (match_operand:QHSI 0 "low_register_operand" "=l") + (match_operand:QHSI 1 "const_int_operand" "I")) (clobber (reg:CC CC_REGNUM))] "TARGET_THUMB2 && reload_completed" "mov%!\t%0, %1" diff --git a/gcc/config/arm/unwind-arm.c b/gcc/config/arm/unwind-arm.c index 4eb18215f17..2c6e004890e 100644 --- a/gcc/config/arm/unwind-arm.c +++ b/gcc/config/arm/unwind-arm.c @@ -1000,7 +1000,6 @@ __gnu_Unwind_Backtrace(_Unwind_Trace_Fn trace, void * trace_argument, while (code != _URC_END_OF_STACK && code != _URC_FAILURE); - finish: restore_non_core_regs (&saved_vrs); return code; } diff --git a/gcc/config/arm/vxworks.h b/gcc/config/arm/vxworks.h index 8879fedb7d7..aa7e197bc5d 100644 --- a/gcc/config/arm/vxworks.h +++ b/gcc/config/arm/vxworks.h @@ -97,7 +97,7 @@ along with GCC; see the file COPYING3. If not see /* There is no default multilib. */ #undef MULTILIB_DEFAULTS -#define FPUTYPE_DEFAULT FPUTYPE_VFP +#define FPUTYPE_DEFAULT "vfp" #undef FUNCTION_PROFILER #define FUNCTION_PROFILER VXWORKS_FUNCTION_PROFILER |