Diffstat (limited to 'gcc/config/arm')
 gcc/config/arm/arm.c        | 427
 gcc/config/arm/arm.h        |  73
 gcc/config/arm/arm.md       | 120
 gcc/config/arm/arm_neon.h   | 100
 gcc/config/arm/bpabi.h      |   6
 gcc/config/arm/cortex-a9.md | 186
 gcc/config/arm/fpa.md       |   4
 gcc/config/arm/linux-eabi.h |   2
 gcc/config/arm/linux-elf.h  |   2
 gcc/config/arm/neon-gen.ml  |   3
 gcc/config/arm/neon.md      |   3
 gcc/config/arm/neon.ml      |   3
 gcc/config/arm/netbsd-elf.h |   2
 gcc/config/arm/thumb2.md    |  10
 gcc/config/arm/unwind-arm.c |   1
 gcc/config/arm/vxworks.h    |   2
 16 files changed, 596 insertions(+), 348 deletions(-)
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index cd5a0ed1403..4c7fcb65854 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -133,11 +133,12 @@ static enum machine_mode arm_promote_function_mode (const_tree,
const_tree, int);
static bool arm_return_in_memory (const_tree, const_tree);
static rtx arm_function_value (const_tree, const_tree, bool);
-static rtx arm_libcall_value (enum machine_mode, rtx);
+static rtx arm_libcall_value (enum machine_mode, const_rtx);
static void arm_internal_label (FILE *, const char *, unsigned long);
static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT,
tree);
+static bool arm_have_conditional_execution (void);
static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool);
static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *);
static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
@@ -445,6 +446,9 @@ static const struct attribute_spec arm_attribute_table[] =
#define TARGET_HAVE_TLS true
#endif
+#undef TARGET_HAVE_CONDITIONAL_EXECUTION
+#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
+
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM arm_cannot_force_const_mem
@@ -520,14 +524,11 @@ enum processor_type arm_tune = arm_none;
/* The default processor used if not overridden by commandline. */
static enum processor_type arm_default_cpu = arm_none;
-/* Which floating point model to use. */
-enum arm_fp_model arm_fp_model;
-
-/* Which floating point hardware is available. */
-enum fputype arm_fpu_arch;
-
/* Which floating point hardware to schedule for. */
-enum fputype arm_fpu_tune;
+int arm_fpu_attr;
+
+/* Which floating point hardware to use. */
+const struct arm_fpu_desc *arm_fpu_desc;
/* Whether to use floating point hardware. */
enum float_abi_type arm_float_abi;
@@ -805,46 +806,21 @@ static struct arm_cpu_select arm_select[] =
char arm_arch_name[] = "__ARM_ARCH_0UNK__";
-struct fpu_desc
-{
- const char * name;
- enum fputype fpu;
-};
-
-
/* Available values for -mfpu=. */
-static const struct fpu_desc all_fpus[] =
-{
- {"fpa", FPUTYPE_FPA},
- {"fpe2", FPUTYPE_FPA_EMU2},
- {"fpe3", FPUTYPE_FPA_EMU2},
- {"maverick", FPUTYPE_MAVERICK},
- {"vfp", FPUTYPE_VFP},
- {"vfp3", FPUTYPE_VFP3},
- {"vfpv3", FPUTYPE_VFP3},
- {"vfpv3-d16", FPUTYPE_VFP3D16},
- {"neon", FPUTYPE_NEON},
- {"neon-fp16", FPUTYPE_NEON_FP16}
-};
-
-
-/* Floating point models used by the different hardware.
- See fputype in arm.h. */
-
-static const enum arm_fp_model fp_model_for_fpu[] =
-{
- /* No FP hardware. */
- ARM_FP_MODEL_UNKNOWN, /* FPUTYPE_NONE */
- ARM_FP_MODEL_FPA, /* FPUTYPE_FPA */
- ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU2 */
- ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU3 */
- ARM_FP_MODEL_MAVERICK, /* FPUTYPE_MAVERICK */
- ARM_FP_MODEL_VFP, /* FPUTYPE_VFP */
- ARM_FP_MODEL_VFP, /* FPUTYPE_VFP3D16 */
- ARM_FP_MODEL_VFP, /* FPUTYPE_VFP3 */
- ARM_FP_MODEL_VFP, /* FPUTYPE_NEON */
- ARM_FP_MODEL_VFP /* FPUTYPE_NEON_FP16 */
+static const struct arm_fpu_desc all_fpus[] =
+{
+ {"fpa", ARM_FP_MODEL_FPA, 0, 0, false, false},
+ {"fpe2", ARM_FP_MODEL_FPA, 2, 0, false, false},
+ {"fpe3", ARM_FP_MODEL_FPA, 3, 0, false, false},
+ {"maverick", ARM_FP_MODEL_MAVERICK, 0, 0, false, false},
+ {"vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, false, false},
+ {"vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false},
+ {"vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, false},
+ {"neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , false},
+ {"neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , true },
+ /* Compatibility aliases. */
+ {"vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false},
};
@@ -1298,13 +1274,6 @@ arm_override_options (void)
enum processor_type target_arch_cpu = arm_none;
enum processor_type selected_cpu = arm_none;
- /* Ideally we would want to use CFI directives to generate
- debug info. However this also creates the .eh_frame
- section, so disable them until GAS can handle
- this properly. See PR40521. */
- if (TARGET_AAPCS_BASED)
- flag_dwarf2_cfi_asm = 0;
-
/* Set up the flags based on the cpu/architecture selected by the user. */
for (i = ARRAY_SIZE (arm_select); i--;)
{
@@ -1618,7 +1587,6 @@ arm_override_options (void)
if (TARGET_IWMMXT_ABI && !TARGET_IWMMXT)
error ("iwmmxt abi requires an iwmmxt capable cpu");
- arm_fp_model = ARM_FP_MODEL_UNKNOWN;
if (target_fpu_name == NULL && target_fpe_name != NULL)
{
if (streq (target_fpe_name, "2"))
@@ -1629,46 +1597,52 @@ arm_override_options (void)
error ("invalid floating point emulation option: -mfpe=%s",
target_fpe_name);
}
- if (target_fpu_name != NULL)
- {
- /* The user specified a FPU. */
- for (i = 0; i < ARRAY_SIZE (all_fpus); i++)
- {
- if (streq (all_fpus[i].name, target_fpu_name))
- {
- arm_fpu_arch = all_fpus[i].fpu;
- arm_fpu_tune = arm_fpu_arch;
- arm_fp_model = fp_model_for_fpu[arm_fpu_arch];
- break;
- }
- }
- if (arm_fp_model == ARM_FP_MODEL_UNKNOWN)
- error ("invalid floating point option: -mfpu=%s", target_fpu_name);
- }
- else
+
+ if (target_fpu_name == NULL)
{
#ifdef FPUTYPE_DEFAULT
- /* Use the default if it is specified for this platform. */
- arm_fpu_arch = FPUTYPE_DEFAULT;
- arm_fpu_tune = FPUTYPE_DEFAULT;
+ target_fpu_name = FPUTYPE_DEFAULT;
#else
- /* Pick one based on CPU type. */
- /* ??? Some targets assume FPA is the default.
- if ((insn_flags & FL_VFP) != 0)
- arm_fpu_arch = FPUTYPE_VFP;
- else
- */
if (arm_arch_cirrus)
- arm_fpu_arch = FPUTYPE_MAVERICK;
+ target_fpu_name = "maverick";
else
- arm_fpu_arch = FPUTYPE_FPA_EMU2;
+ target_fpu_name = "fpe2";
#endif
- if (tune_flags & FL_CO_PROC && arm_fpu_arch == FPUTYPE_FPA_EMU2)
- arm_fpu_tune = FPUTYPE_FPA;
+ }
+
+ arm_fpu_desc = NULL;
+ for (i = 0; i < ARRAY_SIZE (all_fpus); i++)
+ {
+ if (streq (all_fpus[i].name, target_fpu_name))
+ {
+ arm_fpu_desc = &all_fpus[i];
+ break;
+ }
+ }
+ if (!arm_fpu_desc)
+ error ("invalid floating point option: -mfpu=%s", target_fpu_name);
+
+ switch (arm_fpu_desc->model)
+ {
+ case ARM_FP_MODEL_FPA:
+ if (arm_fpu_desc->rev == 2)
+ arm_fpu_attr = FPU_FPE2;
+ else if (arm_fpu_desc->rev == 3)
+ arm_fpu_attr = FPU_FPE3;
else
- arm_fpu_tune = arm_fpu_arch;
- arm_fp_model = fp_model_for_fpu[arm_fpu_arch];
- gcc_assert (arm_fp_model != ARM_FP_MODEL_UNKNOWN);
+ arm_fpu_attr = FPU_FPA;
+ break;
+
+ case ARM_FP_MODEL_MAVERICK:
+ arm_fpu_attr = FPU_MAVERICK;
+ break;
+
+ case ARM_FP_MODEL_VFP:
+ arm_fpu_attr = FPU_VFP;
+ break;
+
+ default:
+ gcc_unreachable();
}
if (target_float_abi_name != NULL)
@@ -1690,7 +1664,7 @@ arm_override_options (void)
arm_float_abi = TARGET_DEFAULT_FLOAT_ABI;
if (TARGET_AAPCS_BASED
- && (arm_fp_model == ARM_FP_MODEL_FPA))
+ && (arm_fpu_desc->model == ARM_FP_MODEL_FPA))
error ("FPA is unsupported in the AAPCS");
if (TARGET_AAPCS_BASED)
@@ -1718,7 +1692,7 @@ arm_override_options (void)
/* If soft-float is specified then don't use FPU. */
if (TARGET_SOFT_FLOAT)
- arm_fpu_arch = FPUTYPE_NONE;
+ arm_fpu_attr = FPU_NONE;
if (TARGET_AAPCS_BASED)
{
@@ -1745,8 +1719,7 @@ arm_override_options (void)
/* For arm2/3 there is no need to do any scheduling if there is only
a floating point emulator, or we are doing software floating-point. */
if ((TARGET_SOFT_FLOAT
- || arm_fpu_tune == FPUTYPE_FPA_EMU2
- || arm_fpu_tune == FPUTYPE_FPA_EMU3)
+ || (TARGET_FPA && arm_fpu_desc->rev))
&& (tune_flags & FL_MODE32) == 0)
flag_schedule_insns = flag_schedule_insns_after_reload = 0;
@@ -1871,6 +1844,23 @@ arm_override_options (void)
max_insns_skipped = 3;
}
+ /* Hot/Cold partitioning is not currently supported, since we can't
+ handle literal pool placement in that case. */
+ if (flag_reorder_blocks_and_partition)
+ {
+ inform (input_location,
+ "-freorder-blocks-and-partition not supported on this architecture");
+ flag_reorder_blocks_and_partition = 0;
+ flag_reorder_blocks = 1;
+ }
+
+ /* Ideally we would want to use CFI directives to generate
+ debug info. However this also creates the .eh_frame
+ section, so disable them until GAS can handle
+ this properly. See PR40521. */
+ if (TARGET_AAPCS_BASED)
+ flag_dwarf2_cfi_asm = 0;
+
/* Register global variables with the garbage collector. */
arm_add_gc_roots ();
}
@@ -2393,20 +2383,24 @@ arm_split_constant (enum rtx_code code, enum machine_mode mode, rtx insn,
1);
}
-/* Return the number of ARM instructions required to synthesize the given
- constant. */
+/* Return the number of instructions required to synthesize the given
+ constant, if we start emitting them from bit-position I. */
static int
count_insns_for_constant (HOST_WIDE_INT remainder, int i)
{
HOST_WIDE_INT temp1;
+ int step_size = TARGET_ARM ? 2 : 1;
int num_insns = 0;
+
+ gcc_assert (TARGET_ARM || i == 0);
+
do
{
int end;
if (i <= 0)
i += 32;
- if (remainder & (3 << (i - 2)))
+ if (remainder & (((1 << step_size) - 1) << (i - step_size)))
{
end = i - 8;
if (end < 0)
@@ -2415,13 +2409,77 @@ count_insns_for_constant (HOST_WIDE_INT remainder, int i)
| ((i < end) ? (0xff >> (32 - end)) : 0));
remainder &= ~temp1;
num_insns++;
- i -= 6;
+ i -= 8 - step_size;
}
- i -= 2;
+ i -= step_size;
} while (remainder);
return num_insns;
}
+static int
+find_best_start (unsigned HOST_WIDE_INT remainder)
+{
+ int best_consecutive_zeros = 0;
+ int i;
+ int best_start = 0;
+
+ /* If we aren't targeting ARM, the best place to start is always at
+ the bottom. */
+ if (! TARGET_ARM)
+ return 0;
+
+ for (i = 0; i < 32; i += 2)
+ {
+ int consecutive_zeros = 0;
+
+ if (!(remainder & (3 << i)))
+ {
+ while ((i < 32) && !(remainder & (3 << i)))
+ {
+ consecutive_zeros += 2;
+ i += 2;
+ }
+ if (consecutive_zeros > best_consecutive_zeros)
+ {
+ best_consecutive_zeros = consecutive_zeros;
+ best_start = i - consecutive_zeros;
+ }
+ i -= 2;
+ }
+ }
+
+ /* So long as it won't require any more insns to do so, it's
+ desirable to emit a small constant (in bits 0...9) in the last
+ insn. This way there is more chance that it can be combined with
+ a later addressing insn to form a pre-indexed load or store
+ operation. Consider:
+
+ *((volatile int *)0xe0000100) = 1;
+ *((volatile int *)0xe0000110) = 2;
+
+ We want this to wind up as:
+
+ mov rA, #0xe0000000
+ mov rB, #1
+ str rB, [rA, #0x100]
+ mov rB, #2
+ str rB, [rA, #0x110]
+
+ rather than having to synthesize both large constants from scratch.
+
+ Therefore, we calculate how many insns would be required to emit
+ the constant starting from `best_start', and also starting from
+ zero (i.e. with bit 31 first to be output). If `best_start' doesn't
+ yield a shorter sequence, we may as well use zero. */
+ if (best_start != 0
+ && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder)
+ && (count_insns_for_constant (remainder, 0) <=
+ count_insns_for_constant (remainder, best_start)))
+ best_start = 0;
+
+ return best_start;
+}
+
/* Emit an instruction with the indicated PATTERN. If COND is
non-NULL, conditionalize the execution of the instruction on COND
being true. */
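
The hunk above factors the start-position search out into find_best_start and parameterizes the chunk counting on step_size. As a rough, standalone illustration of what count_insns_for_constant computes for the ARM case (step_size of 2, rotated 8-bit immediates), here is a minimal C sketch; it is not GCC code, and the constants are chosen only for illustration:

#include <stdio.h>

/* Count how many rotated 8-bit chunks are needed to cover REMAINDER,
   scanning downwards from bit position I -- a simplified mirror of
   count_insns_for_constant with step_size == 2 (TARGET_ARM).  */
static int
count_chunks (unsigned int remainder, int i)
{
  int num = 0;

  do
    {
      int end;

      if (i <= 0)
        i += 32;
      if (remainder & (3u << (i - 2)))
        {
          end = i - 8;
          if (end < 0)
            end += 32;
          /* Clear the 8-bit window ending just below I (it may wrap).  */
          remainder &= ~((0xffu << end)
                         | (i < end ? (0xffu >> (32 - end)) : 0u));
          num++;
          i -= 6;              /* 8 - step_size */
        }
      i -= 2;                  /* step_size */
    }
  while (remainder);

  return num;
}

int
main (void)
{
  /* Prints 2 (e.g. MOV + ORR) and 4 respectively.  */
  printf ("%d\n", count_chunks (0xe0000100u, 0));
  printf ("%d\n", count_chunks (0x12345678u, 0));
  return 0;
}
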
@@ -2445,6 +2503,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
{
int can_invert = 0;
int can_negate = 0;
+ int final_invert = 0;
int can_negate_initial = 0;
int can_shift = 0;
int i;
@@ -2456,6 +2515,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
int insns = 0;
unsigned HOST_WIDE_INT temp1, temp2;
unsigned HOST_WIDE_INT remainder = val & 0xffffffff;
+ int step_size = TARGET_ARM ? 2 : 1;
/* Find out which operations are safe for a given CODE. Also do a quick
check for degenerate cases; these can occur when DImode operations
@@ -2529,14 +2589,15 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
return 1;
}
- /* We don't know how to handle other cases yet. */
- gcc_assert (remainder == 0xffffffff);
-
- if (generate)
- emit_constant_insn (cond,
- gen_rtx_SET (VOIDmode, target,
- gen_rtx_NOT (mode, source)));
- return 1;
+ if (remainder == 0xffffffff)
+ {
+ if (generate)
+ emit_constant_insn (cond,
+ gen_rtx_SET (VOIDmode, target,
+ gen_rtx_NOT (mode, source)));
+ return 1;
+ }
+ break;
case MINUS:
/* We treat MINUS as (val - source), since (source - val) is always
@@ -2987,9 +3048,25 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
if ((code == AND)
|| (code != IOR && can_invert && num_bits_set > 16))
- remainder = (~remainder) & 0xffffffff;
+ remainder ^= 0xffffffff;
else if (code == PLUS && num_bits_set > 16)
remainder = (-remainder) & 0xffffffff;
+
+ /* For XOR, if more than half the bits are set and there's a sequence
+ of more than 8 consecutive ones in the pattern then we can XOR by the
+ inverted constant and then invert the final result; this may save an
+ instruction and might also lead to the final mvn being merged with
+ some other operation. */
+ else if (code == XOR && num_bits_set > 16
+ && (count_insns_for_constant (remainder ^ 0xffffffff,
+ find_best_start
+ (remainder ^ 0xffffffff))
+ < count_insns_for_constant (remainder,
+ find_best_start (remainder))))
+ {
+ remainder ^= 0xffffffff;
+ final_invert = 1;
+ }
else
{
can_invert = 0;
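
The new final_invert path above applies to XOR the same idea already used for AND and PLUS: when more than half the bits are set and the complement is cheaper to synthesize, XOR by the complement and invert the result. A hedged, hand-worked example follows (rough instruction counts, not verified against a particular compiler build):

unsigned int
flip_most_bits (unsigned int x)
{
  /* 0xffeffffe has 30 bits set; its complement 0x00100001 splits into two
     rotated 8-bit immediates.  Using x ^ c == ~(x ^ ~c), the patched code
     can emit roughly:
         eor     r0, r0, #1048576        @ 0x00100000
         eor     r0, r0, #1
         mvn     r0, r0
     instead of four EORs for 0xffeffffe synthesized directly.  */
  return x ^ 0xffeffffeu;
}
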
@@ -3008,63 +3085,8 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
/* ??? Use thumb2 replicated constants when the high and low halfwords are
the same. */
{
- int best_start = 0;
- if (!TARGET_THUMB2)
- {
- int best_consecutive_zeros = 0;
-
- for (i = 0; i < 32; i += 2)
- {
- int consecutive_zeros = 0;
-
- if (!(remainder & (3 << i)))
- {
- while ((i < 32) && !(remainder & (3 << i)))
- {
- consecutive_zeros += 2;
- i += 2;
- }
- if (consecutive_zeros > best_consecutive_zeros)
- {
- best_consecutive_zeros = consecutive_zeros;
- best_start = i - consecutive_zeros;
- }
- i -= 2;
- }
- }
-
- /* So long as it won't require any more insns to do so, it's
- desirable to emit a small constant (in bits 0...9) in the last
- insn. This way there is more chance that it can be combined with
- a later addressing insn to form a pre-indexed load or store
- operation. Consider:
-
- *((volatile int *)0xe0000100) = 1;
- *((volatile int *)0xe0000110) = 2;
-
- We want this to wind up as:
-
- mov rA, #0xe0000000
- mov rB, #1
- str rB, [rA, #0x100]
- mov rB, #2
- str rB, [rA, #0x110]
-
- rather than having to synthesize both large constants from scratch.
-
- Therefore, we calculate how many insns would be required to emit
- the constant starting from `best_start', and also starting from
- zero (i.e. with bit 31 first to be output). If `best_start' doesn't
- yield a shorter sequence, we may as well use zero. */
- if (best_start != 0
- && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder)
- && (count_insns_for_constant (remainder, 0) <=
- count_insns_for_constant (remainder, best_start)))
- best_start = 0;
- }
-
/* Now start emitting the insns. */
- i = best_start;
+ i = find_best_start (remainder);
do
{
int end;
@@ -3092,7 +3114,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
}
else
{
- if (remainder && subtargets)
+ if ((final_invert || remainder) && subtargets)
new_src = gen_reg_rtx (mode);
else
new_src = target;
@@ -3127,21 +3149,23 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
code = PLUS;
insns++;
- if (TARGET_ARM)
- i -= 6;
- else
- i -= 7;
+ i -= 8 - step_size;
}
/* Arm allows rotates by a multiple of two. Thumb-2 allows arbitrary
shifts. */
- if (TARGET_ARM)
- i -= 2;
- else
- i--;
+ i -= step_size;
}
while (remainder);
}
+ if (final_invert)
+ {
+ if (generate)
+ emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target,
+ gen_rtx_NOT (mode, source)));
+ insns++;
+ }
+
return insns;
}
@@ -3264,7 +3288,7 @@ add_libcall (htab_t htab, rtx libcall)
}
static bool
-arm_libcall_uses_aapcs_base (rtx libcall)
+arm_libcall_uses_aapcs_base (const_rtx libcall)
{
static bool init_done = false;
static htab_t libcall_htab;
@@ -3311,7 +3335,7 @@ arm_libcall_uses_aapcs_base (rtx libcall)
}
rtx
-arm_libcall_value (enum machine_mode mode, rtx libcall)
+arm_libcall_value (enum machine_mode mode, const_rtx libcall)
{
if (TARGET_AAPCS_BASED && arm_pcs_default != ARM_PCS_AAPCS
&& GET_MODE_CLASS (mode) == MODE_FLOAT)
@@ -6201,7 +6225,7 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
else if ((outer == PLUS || outer == COMPARE)
&& INTVAL (x) < 256 && INTVAL (x) > -256)
return 0;
- else if (outer == AND
+ else if ((outer == IOR || outer == XOR || outer == AND)
&& INTVAL (x) < 256 && INTVAL (x) >= -256)
return COSTS_N_INSNS (1);
else if (outer == ASHIFT || outer == ASHIFTRT
@@ -12269,7 +12293,7 @@ output_move_neon (rtx *operands)
{
/* We're only using DImode here because it's a convenient size. */
ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
- ops[1] = adjust_address (mem, SImode, 8 * i);
+ ops[1] = adjust_address (mem, DImode, 8 * i);
if (reg_overlap_mentioned_p (ops[0], mem))
{
gcc_assert (overlap == -1);
@@ -13257,7 +13281,7 @@ arm_output_epilogue (rtx sibling)
/* This variable is for the Virtual Frame Pointer, not VFP regs. */
int vfp_offset = offsets->frame;
- if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ if (TARGET_FPA_EMU2)
{
for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--)
if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
@@ -13480,7 +13504,7 @@ arm_output_epilogue (rtx sibling)
SP_REGNUM, HARD_FRAME_POINTER_REGNUM);
}
- if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ if (TARGET_FPA_EMU2)
{
for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++)
if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
@@ -14206,7 +14230,7 @@ arm_save_coproc_regs(void)
/* Save any floating point call-saved registers used by this
function. */
- if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ if (TARGET_FPA_EMU2)
{
for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--)
if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
@@ -19688,45 +19712,8 @@ arm_file_start (void)
}
else
{
- int set_float_abi_attributes = 0;
- switch (arm_fpu_arch)
- {
- case FPUTYPE_FPA:
- fpu_name = "fpa";
- break;
- case FPUTYPE_FPA_EMU2:
- fpu_name = "fpe2";
- break;
- case FPUTYPE_FPA_EMU3:
- fpu_name = "fpe3";
- break;
- case FPUTYPE_MAVERICK:
- fpu_name = "maverick";
- break;
- case FPUTYPE_VFP:
- fpu_name = "vfp";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_VFP3D16:
- fpu_name = "vfpv3-d16";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_VFP3:
- fpu_name = "vfpv3";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_NEON:
- fpu_name = "neon";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_NEON_FP16:
- fpu_name = "neon-fp16";
- set_float_abi_attributes = 1;
- break;
- default:
- abort();
- }
- if (set_float_abi_attributes)
+ fpu_name = arm_fpu_desc->name;
+ if (arm_fpu_desc->model == ARM_FP_MODEL_VFP)
{
if (TARGET_HARD_FLOAT)
asm_fprintf (asm_out_file, "\t.eabi_attribute 27, 3\n");
@@ -21173,4 +21160,12 @@ arm_frame_pointer_required (void)
|| (TARGET_ARM && TARGET_APCS_FRAME && ! leaf_function_p ()));
}
+/* Only thumb1 can't support conditional execution, so return true if
+ the target is not thumb1. */
+static bool
+arm_have_conditional_execution (void)
+{
+ return !TARGET_THUMB1;
+}
+
#include "gt-arm.h"
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 9272ca51cba..2dfd22df45c 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -190,9 +190,9 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
#define TARGET_HARD_FLOAT (arm_float_abi != ARM_FLOAT_ABI_SOFT)
/* Use hardware floating point calling convention. */
#define TARGET_HARD_FLOAT_ABI (arm_float_abi == ARM_FLOAT_ABI_HARD)
-#define TARGET_FPA (arm_fp_model == ARM_FP_MODEL_FPA)
-#define TARGET_MAVERICK (arm_fp_model == ARM_FP_MODEL_MAVERICK)
-#define TARGET_VFP (arm_fp_model == ARM_FP_MODEL_VFP)
+#define TARGET_FPA (arm_fpu_desc->model == ARM_FP_MODEL_FPA)
+#define TARGET_MAVERICK (arm_fpu_desc->model == ARM_FP_MODEL_MAVERICK)
+#define TARGET_VFP (arm_fpu_desc->model == ARM_FP_MODEL_VFP)
#define TARGET_IWMMXT (arm_arch_iwmmxt)
#define TARGET_REALLY_IWMMXT (TARGET_IWMMXT && TARGET_32BIT)
#define TARGET_IWMMXT_ABI (TARGET_32BIT && arm_abi == ARM_ABI_IWMMXT)
@@ -216,6 +216,8 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
#define TARGET_THUMB2 (TARGET_THUMB && arm_arch_thumb2)
/* Thumb-1 only. */
#define TARGET_THUMB1_ONLY (TARGET_THUMB1 && !arm_arch_notm)
+/* FPA emulator without LFM. */
+#define TARGET_FPA_EMU2 (TARGET_FPA && arm_fpu_desc->rev == 2)
/* The following two macros concern the ability to execute coprocessor
instructions for VFPv3 or NEON. TARGET_VFP3/TARGET_VFPD32 are currently
@@ -223,27 +225,21 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
to be more careful with TARGET_NEON as noted below. */
/* FPU is has the full VFPv3/NEON register file of 32 D registers. */
-#define TARGET_VFPD32 (arm_fp_model == ARM_FP_MODEL_VFP \
- && (arm_fpu_arch == FPUTYPE_VFP3 \
- || arm_fpu_arch == FPUTYPE_NEON \
- || arm_fpu_arch == FPUTYPE_NEON_FP16))
+#define TARGET_VFPD32 (TARGET_VFP && arm_fpu_desc->regs == VFP_REG_D32)
/* FPU supports VFPv3 instructions. */
-#define TARGET_VFP3 (arm_fp_model == ARM_FP_MODEL_VFP \
- && (arm_fpu_arch == FPUTYPE_VFP3D16 \
- || TARGET_VFPD32))
+#define TARGET_VFP3 (TARGET_VFP && arm_fpu_desc->rev >= 3)
/* FPU supports NEON/VFP half-precision floating-point. */
-#define TARGET_NEON_FP16 (arm_fpu_arch == FPUTYPE_NEON_FP16)
+#define TARGET_NEON_FP16 \
+ (TARGET_VFP && arm_fpu_desc->neon && arm_fpu_desc->fp16)
/* FPU supports Neon instructions. The setting of this macro gets
revealed via __ARM_NEON__ so we add extra guards upon TARGET_32BIT
and TARGET_HARD_FLOAT to ensure that NEON instructions are
available. */
#define TARGET_NEON (TARGET_32BIT && TARGET_HARD_FLOAT \
- && arm_fp_model == ARM_FP_MODEL_VFP \
- && (arm_fpu_arch == FPUTYPE_NEON \
- || arm_fpu_arch == FPUTYPE_NEON_FP16))
+ && TARGET_VFP && arm_fpu_desc->neon)
/* "DSP" multiply instructions, eg. SMULxy. */
#define TARGET_DSP_MULTIPLY \
@@ -300,42 +296,25 @@ enum arm_fp_model
ARM_FP_MODEL_VFP
};
-extern enum arm_fp_model arm_fp_model;
-
-/* Which floating point hardware is available. Also update
- fp_model_for_fpu in arm.c when adding entries to this list. */
-enum fputype
+enum vfp_reg_type
{
- /* No FP hardware. */
- FPUTYPE_NONE,
- /* Full FPA support. */
- FPUTYPE_FPA,
- /* Emulated FPA hardware, Issue 2 emulator (no LFM/SFM). */
- FPUTYPE_FPA_EMU2,
- /* Emulated FPA hardware, Issue 3 emulator. */
- FPUTYPE_FPA_EMU3,
- /* Cirrus Maverick floating point co-processor. */
- FPUTYPE_MAVERICK,
- /* VFP. */
- FPUTYPE_VFP,
- /* VFPv3-D16. */
- FPUTYPE_VFP3D16,
- /* VFPv3. */
- FPUTYPE_VFP3,
- /* Neon. */
- FPUTYPE_NEON,
- /* Neon with half-precision float extensions. */
- FPUTYPE_NEON_FP16
+ VFP_REG_D16,
+ VFP_REG_D32,
+ VFP_REG_SINGLE
};
-/* Recast the floating point class to be the floating point attribute. */
-#define arm_fpu_attr ((enum attr_fpu) arm_fpu_tune)
-
-/* What type of floating point to tune for */
-extern enum fputype arm_fpu_tune;
-
-/* What type of floating point instructions are available */
-extern enum fputype arm_fpu_arch;
+extern const struct arm_fpu_desc
+{
+ const char *name;
+ enum arm_fp_model model;
+ int rev;
+ enum vfp_reg_type regs;
+ int neon;
+ int fp16;
+} *arm_fpu_desc;
+
+/* Which floating point hardware to schedule for. */
+extern int arm_fpu_attr;
enum float_abi_type
{
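
In arm.h the scattered FPUTYPE_* enumeration is replaced by a single descriptor record, so each feature macro becomes a field test on the entry selected by -mfpu=. A standalone sketch of the idea (not GCC code; the entries and names are illustrative only):

#include <stdio.h>
#include <string.h>

enum fp_model { MODEL_FPA, MODEL_MAVERICK, MODEL_VFP };
enum reg_bank { REG_D16, REG_D32 };

struct fpu_desc
{
  const char *name;
  enum fp_model model;
  int rev;
  enum reg_bank regs;
  int neon, fp16;
};

static const struct fpu_desc fpus[] = {
  { "vfp",       MODEL_VFP, 2, REG_D16, 0, 0 },
  { "vfpv3-d16", MODEL_VFP, 3, REG_D16, 0, 0 },
  { "neon",      MODEL_VFP, 3, REG_D32, 1, 0 },
  { "neon-fp16", MODEL_VFP, 3, REG_D32, 1, 1 },
};

int
main (void)
{
  const char *opt = "neon";   /* stands in for -mfpu=neon */
  const struct fpu_desc *desc = NULL;
  unsigned int i;

  /* Look the name up once, as arm_override_options now does ...  */
  for (i = 0; i < sizeof fpus / sizeof fpus[0]; i++)
    if (strcmp (fpus[i].name, opt) == 0)
      desc = &fpus[i];

  if (!desc)
    return 1;

  /* ... then every later query is a field test, which is all that the
     TARGET_VFP3 / TARGET_VFPD32 / TARGET_NEON macros above amount to.  */
  printf ("vfpv3: %d  d32: %d  neon: %d\n",
          desc->rev >= 3, desc->regs == REG_D32, desc->neon);
  return 0;
}
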
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index e180c2f08f1..52edcbaa17b 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -160,7 +160,7 @@
; Floating Point Unit. If we only have floating point emulation, then there
; is no point in scheduling the floating point insns. (Well, for best
; performance we should try and group them together).
-(define_attr "fpu" "none,fpa,fpe2,fpe3,maverick,vfp,vfpv3d16,vfpv3,neon,neon_fp16"
+(define_attr "fpu" "none,fpa,fpe2,fpe3,maverick,vfp"
(const (symbol_ref "arm_fpu_attr")))
; LENGTH of an instruction (in bytes)
@@ -392,6 +392,9 @@
; registers.
(define_mode_iterator ANY64 [DI DF V8QI V4HI V2SI V2SF])
+;; The integer modes up to word size
+(define_mode_iterator QHSI [QI HI SI])
+
;;---------------------------------------------------------------------------
;; Predicates
@@ -1914,7 +1917,16 @@
else /* TARGET_THUMB1 */
{
if (GET_CODE (operands[2]) != CONST_INT)
- operands[2] = force_reg (SImode, operands[2]);
+ {
+ rtx tmp = force_reg (SImode, operands[2]);
+ if (rtx_equal_p (operands[0], operands[1]))
+ operands[2] = tmp;
+ else
+ {
+ operands[2] = operands[1];
+ operands[1] = tmp;
+ }
+ }
else
{
int i;
@@ -2623,7 +2635,16 @@
DONE;
}
else /* TARGET_THUMB1 */
- operands [2] = force_reg (SImode, operands [2]);
+ {
+ rtx tmp = force_reg (SImode, operands[2]);
+ if (rtx_equal_p (operands[0], operands[1]))
+ operands[2] = tmp;
+ else
+ {
+ operands[2] = operands[1];
+ operands[1] = tmp;
+ }
+ }
}
"
)
@@ -2731,12 +2752,29 @@
(define_expand "xorsi3"
[(set (match_operand:SI 0 "s_register_operand" "")
(xor:SI (match_operand:SI 1 "s_register_operand" "")
- (match_operand:SI 2 "arm_rhs_operand" "")))]
+ (match_operand:SI 2 "reg_or_int_operand" "")))]
"TARGET_EITHER"
- "if (TARGET_THUMB1)
- if (GET_CODE (operands[2]) == CONST_INT)
- operands[2] = force_reg (SImode, operands[2]);
- "
+ "if (GET_CODE (operands[2]) == CONST_INT)
+ {
+ if (TARGET_32BIT)
+ {
+ arm_split_constant (XOR, SImode, NULL_RTX,
+ INTVAL (operands[2]), operands[0], operands[1],
+ optimize && can_create_pseudo_p ());
+ DONE;
+ }
+ else /* TARGET_THUMB1 */
+ {
+ rtx tmp = force_reg (SImode, operands[2]);
+ if (rtx_equal_p (operands[0], operands[1]))
+ operands[2] = tmp;
+ else
+ {
+ operands[2] = operands[1];
+ operands[1] = tmp;
+ }
+ }
+ }"
)
(define_insn "*arm_xorsi3"
@@ -5813,6 +5851,11 @@
{
rtx reg = gen_reg_rtx (SImode);
+ /* For thumb we want an unsigned immediate, then we are more likely
+ to be able to use a movs insn. */
+ if (TARGET_THUMB)
+ operands[1] = GEN_INT (INTVAL (operands[1]) & 255);
+
emit_insn (gen_movsi (reg, operands[1]));
operands[1] = gen_lowpart (QImode, reg);
}
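
A hedged illustration of the movqi tweak above: masking the QImode constant to an unsigned 8-bit value makes the flag-setting 16-bit encoding available on Thumb cores (the exact assembly depends on options and compiler build):

signed char
minus_one (void)
{
  /* The QImode constant -1 is rewritten as 255 before the SImode move, so
     a Thumb target can use "movs rN, #255" instead of a wider constant
     load; the low byte is 0xff either way.  */
  return -1;
}
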
@@ -6727,6 +6770,7 @@
(const_int 6)
(const_int 8))))]
)
+
(define_insn "*movsi_cbranchsi4"
[(set (pc)
(if_then_else
@@ -6790,6 +6834,45 @@
(const_int 10)))))]
)
+(define_peephole2
+ [(set (match_operand:SI 0 "low_register_operand" "")
+ (match_operand:SI 1 "low_register_operand" ""))
+ (set (pc)
+ (if_then_else (match_operator 2 "arm_comparison_operator"
+ [(match_dup 1) (const_int 0)])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_THUMB1"
+ [(parallel
+ [(set (pc)
+ (if_then_else (match_op_dup 2 [(match_dup 1) (const_int 0)])
+ (label_ref (match_dup 3))
+ (pc)))
+ (set (match_dup 0) (match_dup 1))])]
+ ""
+)
+
+;; Sigh! This variant shouldn't be needed, but combine often fails to
+;; merge cases like this because the op1 is a hard register in
+;; CLASS_LIKELY_SPILLED_P.
+(define_peephole2
+ [(set (match_operand:SI 0 "low_register_operand" "")
+ (match_operand:SI 1 "low_register_operand" ""))
+ (set (pc)
+ (if_then_else (match_operator 2 "arm_comparison_operator"
+ [(match_dup 0) (const_int 0)])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_THUMB1"
+ [(parallel
+ [(set (pc)
+ (if_then_else (match_op_dup 2 [(match_dup 1) (const_int 0)])
+ (label_ref (match_dup 3))
+ (pc)))
+ (set (match_dup 0) (match_dup 1))])]
+ ""
+)
+
(define_insn "*negated_cbranchsi4"
[(set (pc)
(if_then_else
@@ -8033,15 +8116,13 @@
if (!thumb1_cmp_operand (op3, SImode))
op3 = force_reg (SImode, op3);
scratch = gen_reg_rtx (SImode);
- emit_insn (gen_cstoresi_nltu_thumb1 (scratch, operands[2], op3));
- emit_insn (gen_negsi2 (operands[0], scratch));
+ emit_insn (gen_cstoresi_ltu_thumb1 (operands[0], operands[2], op3));
break;
case GTU:
op3 = force_reg (SImode, operands[3]);
scratch = gen_reg_rtx (SImode);
- emit_insn (gen_cstoresi_nltu_thumb1 (scratch, op3, operands[2]));
- emit_insn (gen_negsi2 (operands[0], scratch));
+ emit_insn (gen_cstoresi_ltu_thumb1 (operands[0], op3, operands[2]));
break;
/* No good sequences for GT, LT. */
@@ -8125,6 +8206,7 @@
[(set_attr "length" "4")]
)
+;; Used as part of the expansion of thumb ltu and gtu sequences
(define_insn "cstoresi_nltu_thumb1"
[(set (match_operand:SI 0 "s_register_operand" "=l,l")
(neg:SI (ltu:SI (match_operand:SI 1 "s_register_operand" "l,*h")
@@ -8134,6 +8216,20 @@
[(set_attr "length" "4")]
)
+(define_insn_and_split "cstoresi_ltu_thumb1"
+ [(set (match_operand:SI 0 "s_register_operand" "=l,l")
+ (ltu:SI (match_operand:SI 1 "s_register_operand" "l,*h")
+ (match_operand:SI 2 "thumb1_cmp_operand" "lI*h,*r")))]
+ "TARGET_THUMB1"
+ "#"
+ "TARGET_THUMB1"
+ [(set (match_dup 3)
+ (neg:SI (ltu:SI (match_dup 1) (match_dup 2))))
+ (set (match_dup 0) (neg:SI (match_dup 3)))]
+ "operands[3] = gen_reg_rtx (SImode);"
+ [(set_attr "length" "4")]
+)
+
;; Used as part of the expansion of thumb les sequence.
(define_insn "thumb1_addsi3_addgeu"
[(set (match_operand:SI 0 "s_register_operand" "=l")
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index faaaf7bca39..ccfc7426077 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -61,7 +61,7 @@ typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_usi uint32x4_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_udi uint64x2_t __attribute__ ((__vector_size__ (16)));
-typedef __builtin_neon_sf float32_t;
+typedef float float32_t;
typedef __builtin_neon_poly8 poly8_t;
typedef __builtin_neon_poly16 poly16_t;
@@ -5085,7 +5085,7 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
{
- return (float32x2_t)__builtin_neon_vset_lanev2sf (__a, __b, __c);
+ return (float32x2_t)__builtin_neon_vset_lanev2sf ((__builtin_neon_sf) __a, __b, __c);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -5151,7 +5151,7 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
{
- return (float32x4_t)__builtin_neon_vset_lanev4sf (__a, __b, __c);
+ return (float32x4_t)__builtin_neon_vset_lanev4sf ((__builtin_neon_sf) __a, __b, __c);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -5283,7 +5283,7 @@ vdup_n_s32 (int32_t __a)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vdup_n_f32 (float32_t __a)
{
- return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
+ return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -5349,7 +5349,7 @@ vdupq_n_s32 (int32_t __a)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vdupq_n_f32 (float32_t __a)
{
- return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
+ return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -5415,7 +5415,7 @@ vmov_n_s32 (int32_t __a)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmov_n_f32 (float32_t __a)
{
- return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
+ return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -5481,7 +5481,7 @@ vmovq_n_s32 (int32_t __a)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmovq_n_f32 (float32_t __a)
{
- return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
+ return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -6591,7 +6591,7 @@ vmul_n_s32 (int32x2_t __a, int32_t __b)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmul_n_f32 (float32x2_t __a, float32_t __b)
{
- return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, __b, 3);
+ return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b, 3);
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
@@ -6621,7 +6621,7 @@ vmulq_n_s32 (int32x4_t __a, int32_t __b)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmulq_n_f32 (float32x4_t __a, float32_t __b)
{
- return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, __b, 3);
+ return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b, 3);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
@@ -6735,7 +6735,7 @@ vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
{
- return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, __c, 3);
+ return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, (__builtin_neon_sf) __c, 3);
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
@@ -6765,7 +6765,7 @@ vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
{
- return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, __c, 3);
+ return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, (__builtin_neon_sf) __c, 3);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
@@ -6831,7 +6831,7 @@ vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
{
- return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, __c, 3);
+ return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, (__builtin_neon_sf) __c, 3);
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
@@ -6861,7 +6861,7 @@ vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
{
- return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, __c, 3);
+ return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, (__builtin_neon_sf) __c, 3);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
@@ -7851,7 +7851,7 @@ vld1_s64 (const int64_t * __a)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vld1_f32 (const float32_t * __a)
{
- return (float32x2_t)__builtin_neon_vld1v2sf (__a);
+ return (float32x2_t)__builtin_neon_vld1v2sf ((const __builtin_neon_sf *) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -7917,7 +7917,7 @@ vld1q_s64 (const int64_t * __a)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vld1q_f32 (const float32_t * __a)
{
- return (float32x4_t)__builtin_neon_vld1v4sf (__a);
+ return (float32x4_t)__builtin_neon_vld1v4sf ((const __builtin_neon_sf *) __a);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -7977,7 +7977,7 @@ vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
{
- return (float32x2_t)__builtin_neon_vld1_lanev2sf (__a, __b, __c);
+ return (float32x2_t)__builtin_neon_vld1_lanev2sf ((const __builtin_neon_sf *) __a, __b, __c);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -8043,7 +8043,7 @@ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
{
- return (float32x4_t)__builtin_neon_vld1_lanev4sf (__a, __b, __c);
+ return (float32x4_t)__builtin_neon_vld1_lanev4sf ((const __builtin_neon_sf *) __a, __b, __c);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -8109,7 +8109,7 @@ vld1_dup_s32 (const int32_t * __a)
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vld1_dup_f32 (const float32_t * __a)
{
- return (float32x2_t)__builtin_neon_vld1_dupv2sf (__a);
+ return (float32x2_t)__builtin_neon_vld1_dupv2sf ((const __builtin_neon_sf *) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -8175,7 +8175,7 @@ vld1q_dup_s32 (const int32_t * __a)
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vld1q_dup_f32 (const float32_t * __a)
{
- return (float32x4_t)__builtin_neon_vld1_dupv4sf (__a);
+ return (float32x4_t)__builtin_neon_vld1_dupv4sf ((const __builtin_neon_sf *) __a);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -8247,7 +8247,7 @@ vst1_s64 (int64_t * __a, int64x1_t __b)
__extension__ static __inline void __attribute__ ((__always_inline__))
vst1_f32 (float32_t * __a, float32x2_t __b)
{
- __builtin_neon_vst1v2sf (__a, __b);
+ __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -8313,7 +8313,7 @@ vst1q_s64 (int64_t * __a, int64x2_t __b)
__extension__ static __inline void __attribute__ ((__always_inline__))
vst1q_f32 (float32_t * __a, float32x4_t __b)
{
- __builtin_neon_vst1v4sf (__a, __b);
+ __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -8373,7 +8373,7 @@ vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
__extension__ static __inline void __attribute__ ((__always_inline__))
vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
{
- __builtin_neon_vst1_lanev2sf (__a, __b, __c);
+ __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -8439,7 +8439,7 @@ vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
__extension__ static __inline void __attribute__ ((__always_inline__))
vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
{
- __builtin_neon_vst1_lanev4sf (__a, __b, __c);
+ __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -8512,7 +8512,7 @@ __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vld2_f32 (const float32_t * __a)
{
union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
- __rv.__o = __builtin_neon_vld2v2sf (__a);
+ __rv.__o = __builtin_neon_vld2v2sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -8600,7 +8600,7 @@ __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vld2q_f32 (const float32_t * __a)
{
union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_neon_vld2v4sf (__a);
+ __rv.__o = __builtin_neon_vld2v4sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -8676,7 +8676,7 @@ vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
{
union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
- __rv.__o = __builtin_neon_vld2_lanev2sf (__a, __bu.__o, __c);
+ __rv.__o = __builtin_neon_vld2_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c);
return __rv.__i;
}
@@ -8748,7 +8748,7 @@ vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
{
union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_neon_vld2_lanev4sf (__a, __bu.__o, __c);
+ __rv.__o = __builtin_neon_vld2_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c);
return __rv.__i;
}
@@ -8807,7 +8807,7 @@ __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vld2_dup_f32 (const float32_t * __a)
{
union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
- __rv.__o = __builtin_neon_vld2_dupv2sf (__a);
+ __rv.__o = __builtin_neon_vld2_dupv2sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -8892,7 +8892,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst2_f32 (float32_t * __a, float32x2x2_t __b)
{
union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v2sf (__a, __bu.__o);
+ __builtin_neon_vst2v2sf ((__builtin_neon_sf *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -8969,7 +8969,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_f32 (float32_t * __a, float32x4x2_t __b)
{
union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v4sf (__a, __bu.__o);
+ __builtin_neon_vst2v4sf ((__builtin_neon_sf *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -9032,7 +9032,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
{
union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev2sf (__a, __bu.__o, __c);
+ __builtin_neon_vst2_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -9088,7 +9088,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
{
union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2_lanev4sf (__a, __bu.__o, __c);
+ __builtin_neon_vst2_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -9140,7 +9140,7 @@ __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
vld3_f32 (const float32_t * __a)
{
union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
- __rv.__o = __builtin_neon_vld3v2sf (__a);
+ __rv.__o = __builtin_neon_vld3v2sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -9228,7 +9228,7 @@ __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
vld3q_f32 (const float32_t * __a)
{
union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
- __rv.__o = __builtin_neon_vld3v4sf (__a);
+ __rv.__o = __builtin_neon_vld3v4sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -9304,7 +9304,7 @@ vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
{
union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
- __rv.__o = __builtin_neon_vld3_lanev2sf (__a, __bu.__o, __c);
+ __rv.__o = __builtin_neon_vld3_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c);
return __rv.__i;
}
@@ -9376,7 +9376,7 @@ vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
{
union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
- __rv.__o = __builtin_neon_vld3_lanev4sf (__a, __bu.__o, __c);
+ __rv.__o = __builtin_neon_vld3_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c);
return __rv.__i;
}
@@ -9435,7 +9435,7 @@ __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
vld3_dup_f32 (const float32_t * __a)
{
union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
- __rv.__o = __builtin_neon_vld3_dupv2sf (__a);
+ __rv.__o = __builtin_neon_vld3_dupv2sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -9520,7 +9520,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst3_f32 (float32_t * __a, float32x2x3_t __b)
{
union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v2sf (__a, __bu.__o);
+ __builtin_neon_vst3v2sf ((__builtin_neon_sf *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -9597,7 +9597,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_f32 (float32_t * __a, float32x4x3_t __b)
{
union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v4sf (__a, __bu.__o);
+ __builtin_neon_vst3v4sf ((__builtin_neon_sf *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -9660,7 +9660,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
{
union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev2sf (__a, __bu.__o, __c);
+ __builtin_neon_vst3_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -9716,7 +9716,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
{
union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3_lanev4sf (__a, __bu.__o, __c);
+ __builtin_neon_vst3_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -9768,7 +9768,7 @@ __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
vld4_f32 (const float32_t * __a)
{
union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_neon_vld4v2sf (__a);
+ __rv.__o = __builtin_neon_vld4v2sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -9856,7 +9856,7 @@ __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
vld4q_f32 (const float32_t * __a)
{
union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_neon_vld4v4sf (__a);
+ __rv.__o = __builtin_neon_vld4v4sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -9932,7 +9932,7 @@ vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
{
union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_neon_vld4_lanev2sf (__a, __bu.__o, __c);
+ __rv.__o = __builtin_neon_vld4_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c);
return __rv.__i;
}
@@ -10004,7 +10004,7 @@ vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
{
union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_neon_vld4_lanev4sf (__a, __bu.__o, __c);
+ __rv.__o = __builtin_neon_vld4_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c);
return __rv.__i;
}
@@ -10063,7 +10063,7 @@ __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
vld4_dup_f32 (const float32_t * __a)
{
union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_neon_vld4_dupv2sf (__a);
+ __rv.__o = __builtin_neon_vld4_dupv2sf ((const __builtin_neon_sf *) __a);
return __rv.__i;
}
@@ -10148,7 +10148,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst4_f32 (float32_t * __a, float32x2x4_t __b)
{
union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v2sf (__a, __bu.__o);
+ __builtin_neon_vst4v2sf ((__builtin_neon_sf *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -10225,7 +10225,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_f32 (float32_t * __a, float32x4x4_t __b)
{
union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v4sf (__a, __bu.__o);
+ __builtin_neon_vst4v4sf ((__builtin_neon_sf *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -10288,7 +10288,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
{
union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev2sf (__a, __bu.__o, __c);
+ __builtin_neon_vst4_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
@@ -10344,7 +10344,7 @@ __extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
{
union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev4sf (__a, __bu.__o, __c);
+ __builtin_neon_vst4_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
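
Across arm_neon.h, float32_t changes from __builtin_neon_sf to plain float, and each float-taking intrinsic gains an explicit cast to the builtin type. User code is unaffected; a hedged usage sketch follows (assumes a NEON-capable target, e.g. compiled with -mfpu=neon -mfloat-abi=softfp):

#include <arm_neon.h>

float
scaled_sum4 (const float *p)
{
  /* Ordinary float pointers and scalars flow straight into the intrinsics;
     the casts to __builtin_neon_sf shown in the hunks above stay inside
     arm_neon.h.  */
  float32x4_t v = vld1q_f32 (p);        /* load four floats */
  v = vmulq_n_f32 (v, 2.0f);            /* scale by a plain float scalar */
  float32x2_t lo = vget_low_f32 (v);
  float32x2_t hi = vget_high_f32 (v);
  float32x2_t s = vadd_f32 (lo, hi);
  s = vpadd_f32 (s, s);                 /* horizontal add */
  return vget_lane_f32 (s, 0);
}
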
diff --git a/gcc/config/arm/bpabi.h b/gcc/config/arm/bpabi.h
index bc0c62f401e..ba206022b75 100644
--- a/gcc/config/arm/bpabi.h
+++ b/gcc/config/arm/bpabi.h
@@ -30,7 +30,7 @@
/* Section 4.1 of the AAPCS requires the use of VFP format. */
#undef FPUTYPE_DEFAULT
-#define FPUTYPE_DEFAULT FPUTYPE_VFP
+#define FPUTYPE_DEFAULT "vfp"
/* TARGET_BIG_ENDIAN_DEFAULT is set in
config.gcc for big endian configurations. */
@@ -53,6 +53,8 @@
#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
+#define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a8|mcpu=cortex-a9:%{!r:--be8}}}"
+
/* Tell the assembler to build BPABI binaries. */
#undef SUBTARGET_EXTRA_ASM_SPEC
#define SUBTARGET_EXTRA_ASM_SPEC "%{mabi=apcs-gnu|mabi=atpcs:-meabi=gnu;:-meabi=5}" TARGET_FIX_V4BX_SPEC
@@ -65,7 +67,7 @@
#define BPABI_LINK_SPEC \
"%{mbig-endian:-EB} %{mlittle-endian:-EL} " \
"%{static:-Bstatic} %{shared:-shared} %{symbolic:-Bsymbolic} " \
- "-X" SUBTARGET_EXTRA_LINK_SPEC TARGET_FIX_V4BX_SPEC
+ "-X" SUBTARGET_EXTRA_LINK_SPEC TARGET_FIX_V4BX_SPEC BE8_LINK_SPEC
#undef LINK_SPEC
#define LINK_SPEC BPABI_LINK_SPEC
diff --git a/gcc/config/arm/cortex-a9.md b/gcc/config/arm/cortex-a9.md
index 121fd2da747..d1ad7cba767 100644
--- a/gcc/config/arm/cortex-a9.md
+++ b/gcc/config/arm/cortex-a9.md
@@ -1,6 +1,8 @@
-;; ARM Cortex-A9 VFP pipeline description
-;; Copyright (C) 2008 Free Software Foundation, Inc.
-;; Written by CodeSourcery.
+;; ARM Cortex-A9 pipeline description
+;; Copyright (C) 2008, 2009 Free Software Foundation, Inc.
+;; Originally written by CodeSourcery for VFP.
+;;
+;; Integer core pipeline description contributed by ARM Ltd.
;;
;; This file is part of GCC.
;;
@@ -20,9 +22,181 @@
(define_automaton "cortex_a9")
-;; FIXME: We model a single pipeline for all instructions.
-;; Is dual-issue possible, and do we have other pipelines?
-(define_cpu_unit "cortex_a9_vfp" "cortex_a9")
+;; The Cortex-A9 integer core is modelled as a dual issue pipeline that has
+;; the following components.
+;; 1. 1 Load Store Pipeline.
+;; 2. P0 / main pipeline for data processing instructions.
+;; 3. P1 / Dual pipeline for Data processing instructions.
+;; 4. MAC pipeline for multiply as well as multiply
+;; and accumulate instructions.
+;; 5. 1 VFP / Neon pipeline.
+;; The Load/Store and VFP/Neon pipelines are multiplexed.
+;; The P0 / main pipeline and M1 stage of the MAC pipeline are
+;; multiplexed.
+;; The P1 / dual pipeline and M2 stage of the MAC pipeline are
+;; multiplexed.
+;; There are only 4 register read ports and hence at any point of
+;; time we can't have issue down the E1 and the E2 ports unless
+;; of course there are bypass paths that get exercised.
+;; Both P0 and P1 have 2 stages E1 and E2.
+;; Data processing instructions issue to E1 or E2 depending on
+;; whether they have an early shift or not.
+
+
+(define_cpu_unit "cortex_a9_vfp, cortex_a9_ls" "cortex_a9")
+(define_cpu_unit "cortex_a9_p0_e1, cortex_a9_p0_e2" "cortex_a9")
+(define_cpu_unit "cortex_a9_p1_e1, cortex_a9_p1_e2" "cortex_a9")
+(define_cpu_unit "cortex_a9_p0_wb, cortex_a9_p1_wb" "cortex_a9")
+(define_cpu_unit "cortex_a9_mac_m1, cortex_a9_mac_m2" "cortex_a9")
+(define_cpu_unit "cortex_a9_branch, cortex_a9_issue_branch" "cortex_a9")
+
+(define_reservation "cortex_a9_p0_default" "cortex_a9_p0_e2, cortex_a9_p0_wb")
+(define_reservation "cortex_a9_p1_default" "cortex_a9_p1_e2, cortex_a9_p1_wb")
+(define_reservation "cortex_a9_p0_shift" "cortex_a9_p0_e1, cortex_a9_p0_default")
+(define_reservation "cortex_a9_p1_shift" "cortex_a9_p1_e1, cortex_a9_p1_default")
+
+(define_reservation "cortex_a9_multcycle1"
+ "cortex_a9_p0_e2 + cortex_a9_mac_m1 + cortex_a9_mac_m2 + \
+cortex_a9_p1_e2 + cortex_a9_p0_e1 + cortex_a9_p1_e1")
+
+(define_reservation "cortex_a9_mult16"
+ "cortex_a9_mac_m1, cortex_a9_mac_m2, cortex_a9_p0_wb")
+(define_reservation "cortex_a9_mac16"
+ "cortex_a9_multcycle1, cortex_a9_mac_m2, cortex_a9_p0_wb")
+(define_reservation "cortex_a9_mult"
+ "cortex_a9_mac_m1*2, cortex_a9_mac_m2, cortex_a9_p0_wb")
+(define_reservation "cortex_a9_mac"
+ "cortex_a9_multcycle1*2 ,cortex_a9_mac_m2, cortex_a9_p0_wb")
+
+
+;; Issue at the same time along the load store pipeline and
+;; the VFP / Neon pipeline is not possible.
+;; FIXME:: At some point we need to model the issue
+;; of the load store and the vfp being shared rather than anything else.
+
+(exclusion_set "cortex_a9_ls" "cortex_a9_vfp")
+
+
+;; Default data processing instruction without any shift
+;; The only exception to this is the mov instruction
+;; which can go down E2 without any problem.
+(define_insn_reservation "cortex_a9_dp" 2
+ (and (eq_attr "tune" "cortexa9")
+ (ior (eq_attr "type" "alu")
+ (and (eq_attr "type" "alu_shift_reg, alu_shift")
+ (eq_attr "insn" "mov"))))
+ "cortex_a9_p0_default|cortex_a9_p1_default")
+
+;; An instruction using the shifter will go down E1.
+(define_insn_reservation "cortex_a9_dp_shift" 3
+ (and (eq_attr "tune" "cortexa9")
+ (and (eq_attr "type" "alu_shift_reg, alu_shift")
+ (not (eq_attr "insn" "mov"))))
+ "cortex_a9_p0_shift | cortex_a9_p1_shift")
+
+;; Loads have a latency of 4 cycles.
+;; We don't model autoincrement instructions. These
+;; instructions use the load store pipeline and 1 of
+;; the E2 units to write back the result of the increment.
+
+(define_insn_reservation "cortex_a9_load1_2" 4
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "type" "load1, load2, load_byte"))
+ "cortex_a9_ls")
+
+;; Load multiples and store multiples can't be issued for 2 cycles in a
+;; row. The description below assumes that addresses are 64 bit aligned.
+;; If not, there is an extra cycle latency which is not modelled.
+
+;; FIXME:: This bit might need to be reworked when we get to
+;; tuning for the VFP because strictly speaking the ldm
+;; is sent to the LSU unit as is and there is only an
+;; issue restriction between the LSU and the VFP/ Neon unit.
+
+(define_insn_reservation "cortex_a9_load3_4" 5
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "type" "load3, load4"))
+ "cortex_a9_ls, cortex_a9_ls")
+
+(define_insn_reservation "cortex_a9_store1_2" 0
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "type" "store1, store2"))
+ "cortex_a9_ls")
+
+;; Almost all our store multiples use an auto-increment
+;; form. Don't issue back to back load and store multiples
+;; because the load store unit will stall.
+(define_insn_reservation "cortex_a9_store3_4" 0
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "type" "store3, store4"))
+ "cortex_a9_ls+(cortex_a9_p0_default | cortex_a9_p1_default), cortex_a9_ls")
+
+;; We get 16*16 multiply / mac results in 3 cycles.
+(define_insn_reservation "cortex_a9_mult16" 3
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "insn" "smulxy"))
+ "cortex_a9_mult16")
+
+;; The 16*16 mac is slightly different in that it
+;; reserves M1 and M2 in the same cycle.
+(define_insn_reservation "cortex_a9_mac16" 3
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "insn" "smlaxy"))
+ "cortex_a9_mac16")
+
+
+(define_insn_reservation "cortex_a9_multiply" 4
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "insn" "mul"))
+ "cortex_a9_mult")
+
+(define_insn_reservation "cortex_a9_mac" 4
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "insn" "mla"))
+ "cortex_a9_mac")
+
+;; An instruction with a result in E2 can be forwarded
+;; to E2 or E1 or M1 or the load store unit in the next cycle.
+
+(define_bypass 1 "cortex_a9_dp"
+ "cortex_a9_dp_shift, cortex_a9_multiply,
+ cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2,
+ cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4")
+
+(define_bypass 2 "cortex_a9_dp_shift"
+ "cortex_a9_dp_shift, cortex_a9_multiply,
+ cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2,
+ cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4")
+
+;; An instruction in the load store pipeline can provide
+;; read access to a DP instruction in the P0 default pipeline
+;; before the writeback stage.
+
+(define_bypass 3 "cortex_a9_load1_2" "cortex_a9_dp, cortex_a9_load1_2,
+cortex_a9_store3_4, cortex_a9_store1_2")
+
+(define_bypass 4 "cortex_a9_load3_4" "cortex_a9_dp, cortex_a9_load1_2,
+cortex_a9_store3_4, cortex_a9_store1_2, cortex_a9_load3_4")
+
+;; Calls and branches.
+
+;; Branch instructions
+
+(define_insn_reservation "cortex_a9_branch" 0
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "type" "branch"))
+ "cortex_a9_branch")
+
+;; Call latencies are essentially 0 but make sure
+;; dual issue doesn't happen, i.e. the next instruction
+;; starts at the next cycle.
+(define_insn_reservation "cortex_a9_call" 0
+ (and (eq_attr "tune" "cortexa9")
+ (eq_attr "type" "call"))
+ "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + cortex_a9_vfp")
+
+
+;; Pipelining for VFP instructions.
(define_insn_reservation "cortex_a9_ffarith" 1
(and (eq_attr "tune" "cortexa9")
diff --git a/gcc/config/arm/fpa.md b/gcc/config/arm/fpa.md
index fcd92b002d7..515de43d28b 100644
--- a/gcc/config/arm/fpa.md
+++ b/gcc/config/arm/fpa.md
@@ -599,10 +599,10 @@
{
default:
case 0: return \"mvf%?e\\t%0, %1\";
- case 1: if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ case 1: if (TARGET_FPA_EMU2)
return \"ldf%?e\\t%0, %1\";
return \"lfm%?\\t%0, 1, %1\";
- case 2: if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ case 2: if (TARGET_FPA_EMU2)
return \"stf%?e\\t%1, %0\";
return \"sfm%?\\t%1, 1, %0\";
}
diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
index 780a504add2..fce1ed165d3 100644
--- a/gcc/config/arm/linux-eabi.h
+++ b/gcc/config/arm/linux-eabi.h
@@ -66,7 +66,7 @@
/* At this point, bpabi.h will have clobbered LINK_SPEC. We want to
use the GNU/Linux version, not the generic BPABI version. */
#undef LINK_SPEC
-#define LINK_SPEC LINUX_TARGET_LINK_SPEC
+#define LINK_SPEC LINUX_TARGET_LINK_SPEC BE8_LINK_SPEC
/* Use the default LIBGCC_SPEC, not the version in linux-elf.h, as we
do not use -lfloat. */
diff --git a/gcc/config/arm/linux-elf.h b/gcc/config/arm/linux-elf.h
index 07455ee87fd..9fdca414e8e 100644
--- a/gcc/config/arm/linux-elf.h
+++ b/gcc/config/arm/linux-elf.h
@@ -98,7 +98,7 @@
/* NWFPE always understands FPA instructions. */
#undef FPUTYPE_DEFAULT
-#define FPUTYPE_DEFAULT FPUTYPE_FPA_EMU3
+#define FPUTYPE_DEFAULT "fpe3"
/* Call the function profiler with a given profile label. */
#undef ARM_FUNCTION_PROFILER
diff --git a/gcc/config/arm/neon-gen.ml b/gcc/config/arm/neon-gen.ml
index 9c8e2a89b86..112c8be6e3b 100644
--- a/gcc/config/arm/neon-gen.ml
+++ b/gcc/config/arm/neon-gen.ml
@@ -122,6 +122,7 @@ let rec signed_ctype = function
| T_uint16 | T_int16 -> T_intHI
| T_uint32 | T_int32 -> T_intSI
| T_uint64 | T_int64 -> T_intDI
+ | T_float32 -> T_floatSF
| T_poly8 -> T_intQI
| T_poly16 -> T_intHI
| T_arrayof (n, elt) -> T_arrayof (n, signed_ctype elt)
@@ -320,7 +321,7 @@ let deftypes () =
typeinfo;
Format.print_newline ();
(* Extra types not in <stdint.h>. *)
- Format.printf "typedef __builtin_neon_sf float32_t;\n";
+ Format.printf "typedef float float32_t;\n";
Format.printf "typedef __builtin_neon_poly8 poly8_t;\n";
Format.printf "typedef __builtin_neon_poly16 poly16_t;\n"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 85bc3eed100..7d1ef111339 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3655,7 +3655,8 @@
UNSPEC_VSHLL_N))]
"TARGET_NEON"
{
- neon_const_bounds (operands[2], 0, neon_element_bits (<MODE>mode));
+ /* The boundaries are: 0 < imm <= size. */
+ neon_const_bounds (operands[2], 0, neon_element_bits (<MODE>mode) + 1);
return "vshll.%T3%#<V_sz_elem>\t%q0, %P1, %2";
}
[(set_attr "neon_type" "neon_shift_1")]
diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml
index 10393b33ebc..114097d22a7 100644
--- a/gcc/config/arm/neon.ml
+++ b/gcc/config/arm/neon.ml
@@ -50,7 +50,7 @@ type vectype = T_int8x8 | T_int8x16
| T_ptrto of vectype | T_const of vectype
| T_void | T_intQI
| T_intHI | T_intSI
- | T_intDI
+ | T_intDI | T_floatSF
(* The meanings of the following are:
TImode : "Tetra", two registers (four words).
@@ -1693,6 +1693,7 @@ let string_of_vectype vt =
| T_intHI -> "__builtin_neon_hi"
| T_intSI -> "__builtin_neon_si"
| T_intDI -> "__builtin_neon_di"
+ | T_floatSF -> "__builtin_neon_sf"
| T_arrayof (num, base) ->
let basename = name (fun x -> x) base in
affix (Printf.sprintf "%sx%d" basename num)
diff --git a/gcc/config/arm/netbsd-elf.h b/gcc/config/arm/netbsd-elf.h
index 4c06fa1cb3b..9cf186b338d 100644
--- a/gcc/config/arm/netbsd-elf.h
+++ b/gcc/config/arm/netbsd-elf.h
@@ -153,5 +153,5 @@ do \
while (0)
#undef FPUTYPE_DEFAULT
-#define FPUTYPE_DEFAULT FPUTYPE_VFP
+#define FPUTYPE_DEFAULT "vfp"
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 884d58c7677..82f75f9b733 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1156,8 +1156,8 @@
;; 16-bit load immediate
(define_peephole2
- [(set (match_operand:SI 0 "low_register_operand" "")
- (match_operand:SI 1 "const_int_operand" ""))]
+ [(set (match_operand:QHSI 0 "low_register_operand" "")
+ (match_operand:QHSI 1 "const_int_operand" ""))]
"TARGET_THUMB2
&& peep2_regno_dead_p(0, CC_REGNUM)
&& (unsigned HOST_WIDE_INT) INTVAL(operands[1]) < 256"
@@ -1168,9 +1168,9 @@
""
)
-(define_insn "*thumb2_movsi_shortim"
- [(set (match_operand:SI 0 "low_register_operand" "=l")
- (match_operand:SI 1 "const_int_operand" "I"))
+(define_insn "*thumb2_mov<mode>_shortim"
+ [(set (match_operand:QHSI 0 "low_register_operand" "=l")
+ (match_operand:QHSI 1 "const_int_operand" "I"))
(clobber (reg:CC CC_REGNUM))]
"TARGET_THUMB2 && reload_completed"
"mov%!\t%0, %1"
diff --git a/gcc/config/arm/unwind-arm.c b/gcc/config/arm/unwind-arm.c
index 4eb18215f17..2c6e004890e 100644
--- a/gcc/config/arm/unwind-arm.c
+++ b/gcc/config/arm/unwind-arm.c
@@ -1000,7 +1000,6 @@ __gnu_Unwind_Backtrace(_Unwind_Trace_Fn trace, void * trace_argument,
while (code != _URC_END_OF_STACK
&& code != _URC_FAILURE);
- finish:
restore_non_core_regs (&saved_vrs);
return code;
}
diff --git a/gcc/config/arm/vxworks.h b/gcc/config/arm/vxworks.h
index 8879fedb7d7..aa7e197bc5d 100644
--- a/gcc/config/arm/vxworks.h
+++ b/gcc/config/arm/vxworks.h
@@ -97,7 +97,7 @@ along with GCC; see the file COPYING3. If not see
/* There is no default multilib. */
#undef MULTILIB_DEFAULTS
-#define FPUTYPE_DEFAULT FPUTYPE_VFP
+#define FPUTYPE_DEFAULT "vfp"
#undef FUNCTION_PROFILER
#define FUNCTION_PROFILER VXWORKS_FUNCTION_PROFILER