Diffstat (limited to 'gcc/config/i386/i386.c')
-rw-r--r--  gcc/config/i386/i386.c  709
1 files changed, 547 insertions, 162 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 42f6d93d3c3..2eaa1c54875 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21,6 +21,8 @@ Boston, MA 02111-1307, USA. */
#include "config.h"
#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
#include "rtl.h"
#include "tree.h"
#include "tm_p.h"
@@ -55,9 +57,9 @@ struct processor_costs size_cost = { /* costs for tunning for size */
3, /* cost of a lea instruction */
2, /* variable shift costs */
3, /* constant shift costs */
- 3, /* cost of starting a multiply */
+ {3, 3, 3, 3, 5}, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
- 3, /* cost of a divide/mod */
+ {3, 3, 3, 3, 5}, /* cost of a divide/mod */
3, /* cost of movsx */
3, /* cost of movzx */
0, /* "large" insn */
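
The multiply and divide entries of struct processor_costs change in these hunks from single integers to five-element arrays, one cost per operand size, so that tables such as athlon_cost and k8_cost below can charge DImode division (74) more than SImode division (42). A minimal sketch of the assumed field layout; the field names and the QImode/HImode/SImode/DImode/TImode indexing are inferred from the initializers, not copied from i386.h:

    struct processor_costs_sketch
    {
      const int add;            /* cost of an add instruction */
      const int lea;            /* cost of a lea instruction */
      const int shift_var;      /* variable shift costs */
      const int shift_const;    /* constant shift costs */
      const int mult_init[5];   /* multiply start cost, indexed by operand
                                   size: QI, HI, SI, DI and (assumed) TI */
      const int mult_bit;       /* cost of multiply per each bit set */
      const int divide[5];      /* divide/mod cost, same indexing */
      /* ... remaining members unchanged ... */
    };
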
@@ -84,6 +86,7 @@ struct processor_costs size_cost = { /* costs for tunning for size */
3, /* MMX or SSE register to integer */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
+ 1, /* Branch cost */
2, /* cost of FADD and FSUB insns. */
2, /* cost of FMUL instruction. */
2, /* cost of FDIV instruction. */
@@ -99,9 +102,9 @@ struct processor_costs i386_cost = { /* 386 specific costs */
1, /* cost of a lea instruction */
3, /* variable shift costs */
2, /* constant shift costs */
- 6, /* cost of starting a multiply */
+ {6, 6, 6, 6, 6}, /* cost of starting a multiply */
1, /* cost of multiply per each bit set */
- 23, /* cost of a divide/mod */
+ {23, 23, 23, 23, 23}, /* cost of a divide/mod */
3, /* cost of movsx */
2, /* cost of movzx */
15, /* "large" insn */
@@ -128,6 +131,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
3, /* MMX or SSE register to integer */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
+ 1, /* Branch cost */
23, /* cost of FADD and FSUB insns. */
27, /* cost of FMUL instruction. */
88, /* cost of FDIV instruction. */
@@ -142,9 +146,9 @@ struct processor_costs i486_cost = { /* 486 specific costs */
1, /* cost of a lea instruction */
3, /* variable shift costs */
2, /* constant shift costs */
- 12, /* cost of starting a multiply */
+ {12, 12, 12, 12, 12}, /* cost of starting a multiply */
1, /* cost of multiply per each bit set */
- 40, /* cost of a divide/mod */
+ {40, 40, 40, 40, 40}, /* cost of a divide/mod */
3, /* cost of movsx */
2, /* cost of movzx */
15, /* "large" insn */
@@ -171,6 +175,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
3, /* MMX or SSE register to integer */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
+ 1, /* Branch cost */
8, /* cost of FADD and FSUB insns. */
16, /* cost of FMUL instruction. */
73, /* cost of FDIV instruction. */
@@ -185,9 +190,9 @@ struct processor_costs pentium_cost = {
1, /* cost of a lea instruction */
4, /* variable shift costs */
1, /* constant shift costs */
- 11, /* cost of starting a multiply */
+ {11, 11, 11, 11, 11}, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
- 25, /* cost of a divide/mod */
+ {25, 25, 25, 25, 25}, /* cost of a divide/mod */
3, /* cost of movsx */
2, /* cost of movzx */
8, /* "large" insn */
@@ -214,6 +219,7 @@ struct processor_costs pentium_cost = {
3, /* MMX or SSE register to integer */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
+ 2, /* Branch cost */
3, /* cost of FADD and FSUB insns. */
3, /* cost of FMUL instruction. */
39, /* cost of FDIV instruction. */
@@ -228,9 +234,9 @@ struct processor_costs pentiumpro_cost = {
1, /* cost of a lea instruction */
1, /* variable shift costs */
1, /* constant shift costs */
- 4, /* cost of starting a multiply */
+ {4, 4, 4, 4, 4}, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
- 17, /* cost of a divide/mod */
+ {17, 17, 17, 17, 17}, /* cost of a divide/mod */
1, /* cost of movsx */
1, /* cost of movzx */
8, /* "large" insn */
@@ -257,6 +263,7 @@ struct processor_costs pentiumpro_cost = {
3, /* MMX or SSE register to integer */
32, /* size of prefetch block */
6, /* number of parallel prefetches */
+ 2, /* Branch cost */
3, /* cost of FADD and FSUB insns. */
5, /* cost of FMUL instruction. */
56, /* cost of FDIV instruction. */
@@ -271,9 +278,9 @@ struct processor_costs k6_cost = {
2, /* cost of a lea instruction */
1, /* variable shift costs */
1, /* constant shift costs */
- 3, /* cost of starting a multiply */
+ {3, 3, 3, 3, 3}, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
- 18, /* cost of a divide/mod */
+ {18, 18, 18, 18, 18}, /* cost of a divide/mod */
2, /* cost of movsx */
2, /* cost of movzx */
8, /* "large" insn */
@@ -300,6 +307,7 @@ struct processor_costs k6_cost = {
6, /* MMX or SSE register to integer */
32, /* size of prefetch block */
1, /* number of parallel prefetches */
+ 1, /* Branch cost */
2, /* cost of FADD and FSUB insns. */
2, /* cost of FMUL instruction. */
56, /* cost of FDIV instruction. */
@@ -314,9 +322,9 @@ struct processor_costs athlon_cost = {
2, /* cost of a lea instruction */
1, /* variable shift costs */
1, /* constant shift costs */
- 5, /* cost of starting a multiply */
+ {5, 5, 5, 5, 5}, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
- 42, /* cost of a divide/mod */
+ {18, 26, 42, 74, 74}, /* cost of a divide/mod */
1, /* cost of movsx */
1, /* cost of movzx */
8, /* "large" insn */
@@ -343,6 +351,7 @@ struct processor_costs athlon_cost = {
5, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
+ 2, /* Branch cost */
4, /* cost of FADD and FSUB insns. */
4, /* cost of FMUL instruction. */
24, /* cost of FDIV instruction. */
@@ -352,14 +361,58 @@ struct processor_costs athlon_cost = {
};
static const
+struct processor_costs k8_cost = {
+ 1, /* cost of an add instruction */
+ 2, /* cost of a lea instruction */
+ 1, /* variable shift costs */
+ 1, /* constant shift costs */
+ {3, 4, 3, 4, 5}, /* cost of starting a multiply */
+ 0, /* cost of multiply per each bit set */
+ {18, 26, 42, 74, 74}, /* cost of a divide/mod */
+ 1, /* cost of movsx */
+ 1, /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of loading integer registers */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 3, 6}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ 4, /* cost of FADD and FSUB insns. */
+ 4, /* cost of FMUL instruction. */
+ 19, /* cost of FDIV instruction. */
+ 2, /* cost of FABS instruction. */
+ 2, /* cost of FCHS instruction. */
+ 35, /* cost of FSQRT instruction. */
+};
+
+static const
struct processor_costs pentium4_cost = {
1, /* cost of an add instruction */
1, /* cost of a lea instruction */
- 8, /* variable shift costs */
- 8, /* constant shift costs */
- 30, /* cost of starting a multiply */
+ 4, /* variable shift costs */
+ 4, /* constant shift costs */
+ {15, 15, 15, 15, 15}, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
- 112, /* cost of a divide/mod */
+ {56, 56, 56, 56, 56}, /* cost of a divide/mod */
1, /* cost of movsx */
1, /* cost of movzx */
16, /* "large" insn */
@@ -386,6 +439,7 @@ struct processor_costs pentium4_cost = {
10, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
+ 2, /* Branch cost */
5, /* cost of FADD and FSUB insns. */
7, /* cost of FMUL instruction. */
43, /* cost of FDIV instruction. */
@@ -404,52 +458,66 @@ const struct processor_costs *ix86_cost = &pentium_cost;
#define m_K6 (1<<PROCESSOR_K6)
#define m_ATHLON (1<<PROCESSOR_ATHLON)
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
+#define m_K8 (1<<PROCESSOR_K8)
+#define m_ATHLON_K8 (m_K8 | m_ATHLON)
-const int x86_use_leave = m_386 | m_K6 | m_ATHLON;
-const int x86_push_memory = m_386 | m_K6 | m_ATHLON | m_PENT4;
+const int x86_use_leave = m_386 | m_K6 | m_ATHLON_K8;
+const int x86_push_memory = m_386 | m_K6 | m_ATHLON_K8 | m_PENT4;
const int x86_zero_extend_with_and = m_486 | m_PENT;
-const int x86_movx = m_ATHLON | m_PPRO | m_PENT4 /* m_386 | m_K6 */;
+const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 /* m_386 | m_K6 */;
const int x86_double_with_add = ~m_386;
const int x86_use_bit_test = m_386;
-const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
-const int x86_cmove = m_PPRO | m_ATHLON | m_PENT4;
-const int x86_3dnow_a = m_ATHLON;
-const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON | m_PENT4;
+const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6;
+const int x86_cmove = m_PPRO | m_ATHLON_K8 | m_PENT4;
+const int x86_3dnow_a = m_ATHLON_K8;
+const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON_K8 | m_PENT4;
const int x86_branch_hints = m_PENT4;
const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4;
const int x86_partial_reg_stall = m_PPRO;
const int x86_use_loop = m_K6;
-const int x86_use_fiop = ~(m_PPRO | m_ATHLON | m_PENT);
+const int x86_use_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT);
const int x86_use_mov0 = m_K6;
const int x86_use_cltd = ~(m_PENT | m_K6);
const int x86_read_modify_write = ~m_PENT;
const int x86_read_modify = ~(m_PENT | m_PPRO);
const int x86_split_long_moves = m_PPRO;
-const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON;
+const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON_K8;
const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
const int x86_single_stringop = m_386 | m_PENT4;
const int x86_qimode_math = ~(0);
const int x86_promote_qi_regs = 0;
const int x86_himode_math = ~(m_PPRO);
const int x86_promote_hi_regs = m_PPRO;
-const int x86_sub_esp_4 = m_ATHLON | m_PPRO | m_PENT4;
-const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486 | m_PENT4;
-const int x86_add_esp_4 = m_ATHLON | m_K6 | m_PENT4;
-const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4;
-const int x86_integer_DFmode_moves = ~(m_ATHLON | m_PENT4 | m_PPRO);
-const int x86_partial_reg_dependency = m_ATHLON | m_PENT4;
-const int x86_memory_mismatch_stall = m_ATHLON | m_PENT4;
-const int x86_accumulate_outgoing_args = m_ATHLON | m_PENT4 | m_PPRO;
-const int x86_prologue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
-const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
+const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4;
+const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4;
+const int x86_add_esp_4 = m_ATHLON_K8 | m_K6 | m_PENT4;
+const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4;
+const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_PPRO);
+const int x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4;
+const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4;
+const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_PPRO;
+const int x86_prologue_using_move = m_ATHLON_K8 | m_PENT4 | m_PPRO;
+const int x86_epilogue_using_move = m_ATHLON_K8 | m_PENT4 | m_PPRO;
const int x86_decompose_lea = m_PENT4;
const int x86_shift1 = ~m_486;
-const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON | m_PENT4;
+const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4;
+const int x86_sse_partial_reg_dependency = m_PENT4 | m_PPRO;
+/* Set for machines where types and dependencies are resolved on SSE register
+   parts instead of whole registers, so we may maintain just the lower part of
+   scalar values in proper format, leaving the upper part undefined.  */
+const int x86_sse_partial_regs = m_ATHLON_K8;
+/* Athlon optimizes the partial-register FPS special case, thus avoiding the
+   need for extra instructions beforehand.  */
+const int x86_sse_partial_regs_for_cvtsd2ss = 0;
+const int x86_sse_typeless_stores = m_ATHLON_K8;
+const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4;
+const int x86_use_ffreep = m_ATHLON_K8;
+const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6;
/* In case the average insn count for a single function invocation is
   lower than this constant, emit fast (but longer) prologue and
   epilogue code. */
-#define FAST_PROLOGUE_INSN_COUNT 30
+#define FAST_PROLOGUE_INSN_COUNT 20
/* Set by prologue expander and used by epilogue expander to determine
the style used. */
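
Each of these x86_* masks is consumed by testing the bit of the CPU being tuned for, so defining m_ATHLON_K8 as m_K8 | m_ATHLON and substituting it throughout extends every existing Athlon tuning decision to the K8. A sketch of how such a flag is typically read; the CPUMASK and TARGET_USE_LEAVE names are assumptions modeled on i386.h, not part of this hunk:

    #define CPUMASK (1 << ix86_cpu)
    #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)

    /* With x86_use_leave = m_386 | m_K6 | m_ATHLON_K8, this test is now
       true when ix86_cpu is PROCESSOR_K8 as well as PROCESSOR_ATHLON.  */
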
@@ -755,6 +823,7 @@ static void x86_output_mi_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT,
HOST_WIDE_INT, tree));
static bool x86_can_output_mi_thunk PARAMS ((tree, HOST_WIDE_INT,
HOST_WIDE_INT, tree));
+bool ix86_expand_carry_flag_compare PARAMS ((enum rtx_code, rtx, rtx, rtx*));
struct ix86_address
{
@@ -796,9 +865,12 @@ static void ix86_compute_frame_layout PARAMS ((struct ix86_frame *));
static int ix86_comp_type_attributes PARAMS ((tree, tree));
static int ix86_fntype_regparm PARAMS ((tree));
const struct attribute_spec ix86_attribute_table[];
+static bool ix86_function_ok_for_sibcall PARAMS ((tree, tree));
static tree ix86_handle_cdecl_attribute PARAMS ((tree *, tree, tree, int, bool *));
static tree ix86_handle_regparm_attribute PARAMS ((tree *, tree, tree, int, bool *));
static int ix86_value_regno PARAMS ((enum machine_mode));
+static bool ix86_ms_bitfield_layout_p PARAMS ((tree));
+static int extended_reg_mentioned_1 PARAMS ((rtx *, void *));
#if defined (DO_GLOBAL_CTORS_BODY) && defined (HAS_INIT_SECTION)
static void ix86_svr3_asm_out_constructor PARAMS ((rtx, int));
@@ -897,6 +969,9 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class,
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
ia32_multipass_dfa_lookahead
+#undef TARGET_FUNCTION_OK_FOR_SIBCALL
+#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
+
#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
@@ -904,6 +979,9 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class,
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
+#undef TARGET_MS_BITFIELD_LAYOUT_P
+#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
+
#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
@@ -937,17 +1015,17 @@ override_options ()
const int align_jump;
const int align_jump_max_skip;
const int align_func;
- const int branch_cost;
}
const processor_target_table[PROCESSOR_max] =
{
- {&i386_cost, 0, 0, 4, 3, 4, 3, 4, 1},
- {&i486_cost, 0, 0, 16, 15, 16, 15, 16, 1},
- {&pentium_cost, 0, 0, 16, 7, 16, 7, 16, 1},
- {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16, 1},
- {&k6_cost, 0, 0, 32, 7, 32, 7, 32, 1},
- {&athlon_cost, 0, 0, 16, 7, 64, 7, 16, 1},
- {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0, 1}
+ {&i386_cost, 0, 0, 4, 3, 4, 3, 4},
+ {&i486_cost, 0, 0, 16, 15, 16, 15, 16},
+ {&pentium_cost, 0, 0, 16, 7, 16, 7, 16},
+ {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16},
+ {&k6_cost, 0, 0, 32, 7, 32, 7, 32},
+ {&athlon_cost, 0, 0, 16, 7, 16, 7, 16},
+ {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0},
+ {&k8_cost, 0, 0, 16, 7, 16, 7, 16}
};
static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
@@ -962,7 +1040,8 @@ override_options ()
PTA_MMX = 4,
PTA_PREFETCH_SSE = 8,
PTA_3DNOW = 16,
- PTA_3DNOW_A = 64
+ PTA_3DNOW_A = 64,
+ PTA_64BIT = 128
} flags;
}
const processor_alias_table[] =
@@ -994,6 +1073,8 @@ override_options ()
| PTA_3DNOW_A | PTA_SSE},
{"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
| PTA_3DNOW_A | PTA_SSE},
+ {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
+ | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
};
int const pta_size = ARRAY_SIZE (processor_alias_table);
@@ -1033,7 +1114,7 @@ override_options ()
if (!ix86_cpu_string)
ix86_cpu_string = cpu_names [TARGET_CPU_DEFAULT];
if (!ix86_arch_string)
- ix86_arch_string = TARGET_64BIT ? "athlon-4" : "i386";
+ ix86_arch_string = TARGET_64BIT ? "k8" : "i386";
if (ix86_cmodel_string != 0)
{
@@ -1099,6 +1180,8 @@ override_options ()
target_flags |= MASK_SSE2;
if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)
x86_prefetch_sse = true;
+ if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
+ error ("CPU you selected does not support x86-64 instruction set");
break;
}
@@ -1109,6 +1192,8 @@ override_options ()
if (! strcmp (ix86_cpu_string, processor_alias_table[i].name))
{
ix86_cpu = processor_alias_table[i].processor;
+ if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
+ error ("CPU you selected does not support x86-64 instruction set");
break;
}
if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)
@@ -1215,7 +1300,7 @@ override_options ()
}
/* Validate -mbranch-cost= value, or provide default. */
- ix86_branch_cost = processor_target_table[ix86_cpu].branch_cost;
+ ix86_branch_cost = processor_target_table[ix86_cpu].cost->branch_cost;
if (ix86_branch_cost_string)
{
i = atoi (ix86_branch_cost_string);
@@ -1371,6 +1456,60 @@ const struct attribute_spec ix86_attribute_table[] =
{ NULL, 0, 0, false, false, false, NULL }
};
+/* If PIC, we cannot make sibling calls to global functions
+ because the PLT requires %ebx live.
+ If we are returning floats on the register stack, we cannot make
+ sibling calls to functions that return floats. (The stack adjust
+ instruction will wind up after the sibcall jump, and not be executed.) */
+
+static bool
+ix86_function_ok_for_sibcall (decl, exp)
+ tree decl;
+ tree exp;
+{
+ /* If we are generating position-independent code, we cannot sibcall
+ optimize any indirect call, or a direct call to a global function,
+ as the PLT requires %ebx be live. */
+ if (!TARGET_64BIT && flag_pic && (!decl || TREE_PUBLIC (decl)))
+ return false;
+
+ /* If we are returning floats on the 80387 register stack, we cannot
+ make a sibcall from a function that doesn't return a float to a
+ function that does; the necessary stack adjustment will not be
+ executed. */
+ if (STACK_REG_P (ix86_function_value (TREE_TYPE (exp)))
+ && ! STACK_REG_P (ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)))))
+ return false;
+
+ /* If this call is indirect, we'll need to be able to use a call-clobbered
+ register for the address of the target function. Make sure that all
+ such registers are not used for passing parameters. */
+ if (!decl && !TARGET_64BIT)
+ {
+ int regparm = ix86_regparm;
+ tree attr, type;
+
+ /* We're looking at the CALL_EXPR, we need the type of the function. */
+ type = TREE_OPERAND (exp, 0); /* pointer expression */
+ type = TREE_TYPE (type); /* pointer type */
+ type = TREE_TYPE (type); /* function type */
+
+ attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
+ if (attr)
+ regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
+
+ if (regparm >= 3)
+ {
+ /* ??? Need to count the actual number of registers to be used,
+ not the possible number of registers. Fix later. */
+ return false;
+ }
+ }
+
+ /* Otherwise okay. That also includes certain types of indirect calls. */
+ return true;
+}
+
/* Handle a "cdecl" or "stdcall" attribute;
arguments as in struct attribute_spec.handler. */
static tree
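
The regparm check in ix86_function_ok_for_sibcall above is conservative: with three register arguments, all of %eax, %edx and %ecx may be occupied, leaving no call-clobbered register to hold the target of an indirect tail call. A hypothetical example of the kind of call it therefore rejects:

    typedef int (*fptr) (int, int, int) __attribute__ ((regparm (3)));

    int
    dispatch (fptr f, int a, int b, int c)
    {
      /* All three argument registers are in use, so there is nowhere
         to keep F across the tail jump; a normal call is emitted.  */
      return f (a, b, c);
    }
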
@@ -3209,6 +3348,32 @@ call_insn_operand (op, mode)
return general_operand (op, Pmode);
}
+/* Test for a valid operand for a call instruction. Don't allow the
+ arg pointer register or virtual regs since they may decay into
+ reg + const, which the patterns can't handle. */
+
+int
+sibcall_insn_operand (op, mode)
+ rtx op;
+ enum machine_mode mode ATTRIBUTE_UNUSED;
+{
+ /* Disallow indirect through a virtual register. This leads to
+ compiler aborts when trying to eliminate them. */
+ if (GET_CODE (op) == REG
+ && (op == arg_pointer_rtx
+ || op == frame_pointer_rtx
+ || (REGNO (op) >= FIRST_PSEUDO_REGISTER
+ && REGNO (op) <= LAST_VIRTUAL_REGISTER)))
+ return 0;
+
+ /* Explicitly allow SYMBOL_REF even if pic. */
+ if (GET_CODE (op) == SYMBOL_REF)
+ return 1;
+
+ /* Otherwise we can only allow register operands. */
+ return register_operand (op, Pmode);
+}
+
int
constant_call_address_operand (op, mode)
rtx op;
@@ -3387,6 +3552,18 @@ q_regs_operand (op, mode)
return ANY_QI_REG_P (op);
}
+/* Return true if op is a flags register.  */
+
+int
+flags_reg_operand (op, mode)
+ register rtx op;
+ enum machine_mode mode;
+{
+ if (mode != VOIDmode && GET_MODE (op) != mode)
+ return 0;
+ return REG_P (op) && REGNO (op) == FLAGS_REG && GET_MODE (op) != VOIDmode;
+}
+
/* Return true if op is a NON_Q_REGS class register. */
int
@@ -3401,6 +3578,31 @@ non_q_regs_operand (op, mode)
return NON_QI_REG_P (op);
}
+int
+zero_extended_scalar_load_operand (op, mode)
+ rtx op;
+ enum machine_mode mode ATTRIBUTE_UNUSED;
+{
+ unsigned n_elts;
+ if (GET_CODE (op) != MEM)
+ return 0;
+ op = maybe_get_pool_constant (op);
+ if (!op)
+ return 0;
+ if (GET_CODE (op) != CONST_VECTOR)
+ return 0;
+ n_elts =
+ (GET_MODE_SIZE (GET_MODE (op)) /
+ GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op))));
+ for (n_elts--; n_elts > 0; n_elts--)
+ {
+ rtx elt = CONST_VECTOR_ELT (op, n_elts);
+ if (elt != CONST0_RTX (GET_MODE_INNER (GET_MODE (op))))
+ return 0;
+ }
+ return 1;
+}
+
/* Return 1 if OP is a comparison that can be used in the CMPSS/CMPPS
insns. */
int
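
zero_extended_scalar_load_operand above accepts a constant-pool vector only when every element above the lowest is zero; a scalar load that implicitly clears the upper lanes can then stand in for the full vector load. Illustrated with GCC's vector extensions (hypothetical values, not from the patch):

    typedef double v2df __attribute__ ((vector_size (16)));

    static const v2df accepted = { 3.14, 0.0 };  /* upper element zero */
    static const v2df rejected = { 3.14, 1.0 };  /* needs a full vector load */
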
@@ -4160,7 +4362,7 @@ output_set_got (dest)
is what will be referred to by the Mach-O PIC subsystem. */
ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
#endif
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, "L",
+ (*targetm.asm_out.internal_label) (asm_out_file, "L",
CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
if (flag_pic)
@@ -4456,14 +4658,32 @@ ix86_expand_prologue ()
int use_mov = 0;
HOST_WIDE_INT allocate;
+ ix86_compute_frame_layout (&frame);
if (!optimize_size)
{
- use_fast_prologue_epilogue
- = !expensive_function_p (FAST_PROLOGUE_INSN_COUNT);
+ int count = frame.nregs;
+
+ /* The fast prologue uses move instead of push to save registers. This
+ is significantly longer, but also executes faster as modern hardware
+ can execute the moves in parallel, but can't do that for push/pop.
+
+	 Be careful about choosing which prologue to emit: when the function
+	 takes many instructions to execute, we may as well use the slow
+	 version, and likewise when the function is known to be outside a
+	 hot spot (which is known only with profile feedback).  Weight the
+	 size of the function by the number of registers to save, as it is
+	 cheap to use one or two push instructions but very slow to use many of them.  */
+ if (count)
+ count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
+ if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
+ || (flag_branch_probabilities
+ && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
+ use_fast_prologue_epilogue = 0;
+ else
+ use_fast_prologue_epilogue = !expensive_function_p (count);
if (TARGET_PROLOGUE_USING_MOVE)
use_mov = use_fast_prologue_epilogue;
}
- ix86_compute_frame_layout (&frame);
/* Note: AT&T enter does NOT have reversed args. Enter is probably
slower on all targets. Also sdb doesn't like it. */
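
A worked example of the new weighting, using the FAST_PROLOGUE_INSN_COUNT of 20 defined earlier in this patch: saving four registers yields count = (4 - 1) * 20 = 60, so the longer move-based prologue is used only when expensive_function_p (60) reports the body cheap relative to that threshold, while saving a single register yields count = 0, where pushes cost almost nothing anyway.

    /* Sketch of the threshold computation above, assuming the constant
       is FAST_PROLOGUE_INSN_COUNT == 20.  */
    static int
    fast_prologue_threshold (int nregs)
    {
      return nregs ? (nregs - 1) * 20 : 0;
    }
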
@@ -8960,6 +9180,84 @@ ix86_expand_setcc (code, dest)
return 1; /* DONE */
}
+/* Expand a comparison setting or clearing the carry flag.  Return true
+   when successful, and set *POP to the comparison operation.  */
+bool
+ix86_expand_carry_flag_compare (code, op0, op1, pop)
+ rtx op0, op1, *pop;
+ enum rtx_code code;
+{
+ enum machine_mode mode =
+ GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
+
+  /* Do not handle DImode compares, which go through a special path.  Also
+     we can't deal with FP compares yet; it would be possible to add them.  */
+ if ((mode == DImode && !TARGET_64BIT) || !INTEGRAL_MODE_P (mode))
+ return false;
+ switch (code)
+ {
+ case LTU:
+ case GEU:
+ break;
+
+ /* Convert a==0 into (unsigned)a<1. */
+ case EQ:
+ case NE:
+ if (op1 != const0_rtx)
+ return false;
+ op1 = const1_rtx;
+ code = (code == EQ ? LTU : GEU);
+ break;
+
+    /* Convert a>b into b<a or a>=b+1.  */
+ case GTU:
+ case LEU:
+ if (GET_CODE (op1) == CONST_INT)
+ {
+ op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
+	  /* Bail out on overflow.  We could still swap the operands, but
+	     that would force loading the constant into a register.  */
+ if (op1 == const0_rtx
+ || !x86_64_immediate_operand (op1, GET_MODE (op1)))
+ return false;
+ code = (code == GTU ? GEU : LTU);
+ }
+ else
+ {
+ rtx tmp = op1;
+ op1 = op0;
+ op0 = tmp;
+ code = (code == GTU ? LTU : GEU);
+ }
+ break;
+
+    /* Convert a>=0 into (unsigned)a<0x80000000.  */
+    case LT:
+    case GE:
+      if (mode == DImode || op1 != const0_rtx)
+	return false;
+      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
+      code = (code == LT ? GEU : LTU);
+      break;
+    case LE:
+    case GT:
+      if (mode == DImode || op1 != constm1_rtx)
+	return false;
+      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
+      code = (code == LE ? GEU : LTU);
+ break;
+
+ default:
+ return false;
+ }
+ ix86_compare_op0 = op0;
+ ix86_compare_op1 = op1;
+ *pop = ix86_expand_compare (code, NULL, NULL);
+ if (GET_CODE (*pop) != LTU && GET_CODE (*pop) != GEU)
+ abort ();
+ return true;
+}
+
int
ix86_expand_int_movcc (operands)
rtx operands[];
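
The reason everything above is canonicalized to LTU or GEU is the carry-flag idiom: after a cmp, a single sbb of a register with itself materializes -1 or 0 without any branch. A minimal illustration of the pattern this helper enables; the assembly shown is approximate:

    unsigned int
    mask_if_below (unsigned int a, unsigned int b)
    {
      /* Roughly (Intel operand order):
             cmp  a, b   ; CF := (a < b), unsigned
             sbb  r, r   ; r := r - r - CF = -CF, all ones or zero
         with no conditional jump anywhere.  */
      return a < b ? -1u : 0u;
    }
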
@@ -8968,30 +9266,7 @@ ix86_expand_int_movcc (operands)
rtx compare_seq, compare_op;
rtx second_test, bypass_test;
enum machine_mode mode = GET_MODE (operands[0]);
-
- /* When the compare code is not LTU or GEU, we can not use sbbl case.
- In case comparsion is done with immediate, we can convert it to LTU or
- GEU by altering the integer. */
-
- if ((code == LEU || code == GTU)
- && GET_CODE (ix86_compare_op1) == CONST_INT
- && mode != HImode
- && INTVAL (ix86_compare_op1) != -1
- /* For x86-64, the immediate field in the instruction is 32-bit
- signed, so we can't increment a DImode value above 0x7fffffff. */
- && (!TARGET_64BIT
- || GET_MODE (ix86_compare_op0) != DImode
- || INTVAL (ix86_compare_op1) != 0x7fffffff)
- && GET_CODE (operands[2]) == CONST_INT
- && GET_CODE (operands[3]) == CONST_INT)
- {
- if (code == LEU)
- code = LTU;
- else
- code = GEU;
- ix86_compare_op1 = gen_int_mode (INTVAL (ix86_compare_op1) + 1,
- GET_MODE (ix86_compare_op0));
- }
+  bool sign_bit_compare_p = false;
start_sequence ();
compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
@@ -9000,10 +9275,14 @@ ix86_expand_int_movcc (operands)
compare_code = GET_CODE (compare_op);
+ if ((ix86_compare_op1 == const0_rtx && (code == GE || code == LT))
+ || (ix86_compare_op1 == constm1_rtx && (code == GT || code == LE)))
+ sign_bit_compare_p = true;
+
/* Don't attempt mode expansion here -- if we had to expand 5 or 6
HImode insns, we'd be swallowed in word prefix ops. */
- if (mode != HImode
+ if ((mode != HImode || TARGET_FAST_PREFIX)
&& (mode != DImode || TARGET_64BIT)
&& GET_CODE (operands[2]) == CONST_INT
&& GET_CODE (operands[3]) == CONST_INT)
@@ -9013,32 +9292,53 @@ ix86_expand_int_movcc (operands)
HOST_WIDE_INT cf = INTVAL (operands[3]);
HOST_WIDE_INT diff;
- if ((compare_code == LTU || compare_code == GEU)
- && !second_test && !bypass_test)
+ diff = ct - cf;
+      /* Sign bit compares are better done using shifts than by using
+	 sbb.  */
+ if (sign_bit_compare_p
+ || ix86_expand_carry_flag_compare (code, ix86_compare_op0,
+ ix86_compare_op1, &compare_op))
{
/* Detect overlap between destination and compare sources. */
rtx tmp = out;
- /* To simplify rest of code, restrict to the GEU case. */
- if (compare_code == LTU)
+ if (!sign_bit_compare_p)
{
- HOST_WIDE_INT tmp = ct;
- ct = cf;
- cf = tmp;
- compare_code = reverse_condition (compare_code);
- code = reverse_condition (code);
- }
- diff = ct - cf;
+ compare_code = GET_CODE (compare_op);
+
+	  /* To simplify the rest of the code, restrict to the GEU case.  */
+ if (compare_code == LTU)
+ {
+ HOST_WIDE_INT tmp = ct;
+ ct = cf;
+ cf = tmp;
+ compare_code = reverse_condition (compare_code);
+ code = reverse_condition (code);
+ }
+ diff = ct - cf;
- if (reg_overlap_mentioned_p (out, ix86_compare_op0)
- || reg_overlap_mentioned_p (out, ix86_compare_op1))
- tmp = gen_reg_rtx (mode);
+ if (reg_overlap_mentioned_p (out, ix86_compare_op0)
+ || reg_overlap_mentioned_p (out, ix86_compare_op1))
+ tmp = gen_reg_rtx (mode);
- emit_insn (compare_seq);
- if (mode == DImode)
- emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp));
+ if (mode == DImode)
+ emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp));
+ else
+ emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp)));
+ }
else
- emit_insn (gen_x86_movsicc_0_m1 (tmp));
+ {
+ if (code == GT || code == GE)
+ code = reverse_condition (code);
+ else
+ {
+ HOST_WIDE_INT tmp = ct;
+ ct = cf;
+ cf = tmp;
+ }
+ tmp = emit_store_flag (tmp, code, ix86_compare_op0,
+ ix86_compare_op1, VOIDmode, 0, -1);
+ }
if (diff == 1)
{
@@ -9052,7 +9352,7 @@ ix86_expand_int_movcc (operands)
if (ct)
tmp = expand_simple_binop (mode, PLUS,
tmp, GEN_INT (ct),
- tmp, 1, OPTAB_DIRECT);
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
}
else if (cf == -1)
{
@@ -9065,7 +9365,7 @@ ix86_expand_int_movcc (operands)
*/
tmp = expand_simple_binop (mode, IOR,
tmp, GEN_INT (ct),
- tmp, 1, OPTAB_DIRECT);
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
}
else if (diff == -1 && ct)
{
@@ -9077,11 +9377,11 @@ ix86_expand_int_movcc (operands)
*
* Size 8 - 11.
*/
- tmp = expand_simple_unop (mode, NOT, tmp, tmp, 1);
+ tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
if (cf)
tmp = expand_simple_binop (mode, PLUS,
- tmp, GEN_INT (cf),
- tmp, 1, OPTAB_DIRECT);
+ copy_rtx (tmp), GEN_INT (cf),
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
}
else
{
@@ -9099,26 +9399,25 @@ ix86_expand_int_movcc (operands)
{
cf = ct;
ct = 0;
- tmp = expand_simple_unop (mode, NOT, tmp, tmp, 1);
+ tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
}
tmp = expand_simple_binop (mode, AND,
- tmp,
+ copy_rtx (tmp),
gen_int_mode (cf - ct, mode),
- tmp, 1, OPTAB_DIRECT);
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
if (ct)
tmp = expand_simple_binop (mode, PLUS,
- tmp, GEN_INT (ct),
- tmp, 1, OPTAB_DIRECT);
+ copy_rtx (tmp), GEN_INT (ct),
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
}
- if (tmp != out)
- emit_move_insn (out, tmp);
+ if (!rtx_equal_p (tmp, out))
+ emit_move_insn (copy_rtx (out), copy_rtx (tmp));
return 1; /* DONE */
}
- diff = ct - cf;
if (diff < 0)
{
HOST_WIDE_INT tmp;
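
In the general two-constant case the sequence built above computes out = (mask & (cf - ct)) + ct from the -1/0 mask: the all-ones mask selects cf, the zero mask selects ct. A worked instance with hypothetical constants ct = 5 and cf = 2:

    unsigned int
    cond_select (unsigned int a, unsigned int b)
    {
      unsigned int mask = (a >= b) ? -1u : 0u;  /* cmp + sbb (+ not) */
      /* (mask & (2 - 5)) + 5 yields 2 when mask is all ones and 5 when
         mask is zero.  */
      return (mask & (2u - 5u)) + 5u;
    }
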
@@ -9194,8 +9493,10 @@ ix86_expand_int_movcc (operands)
}
}
+
if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
|| diff == 3 || diff == 5 || diff == 9)
+ && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
&& (mode != DImode || x86_64_sign_extended_value (GEN_INT (cf))))
{
/*
@@ -9237,15 +9538,14 @@ ix86_expand_int_movcc (operands)
tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
nops++;
}
- if (tmp != out
- && (GET_CODE (tmp) != SUBREG || SUBREG_REG (tmp) != out))
+ if (!rtx_equal_p (tmp, out))
{
if (nops == 1)
out = force_operand (tmp, out);
else
- emit_insn (gen_rtx_SET (VOIDmode, out, tmp));
+ emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
}
- if (out != operands[0])
+ if (!rtx_equal_p (out, operands[0]))
emit_move_insn (operands[0], copy_rtx (out));
return 1; /* DONE */
@@ -9265,12 +9565,10 @@ ix86_expand_int_movcc (operands)
* This is reasonably steep, but branch mispredict costs are
* high on modern cpus, so consider failing only if optimizing
* for space.
- *
- * %%% Parameterize branch_cost on the tuning architecture, then
- * use that. The 80386 couldn't care less about mispredicts.
*/
- if (!optimize_size && !TARGET_CMOVE)
+ if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
+ && BRANCH_COST >= 2)
{
if (cf == 0)
{
@@ -9324,31 +9622,31 @@ ix86_expand_int_movcc (operands)
out = emit_store_flag (out, code, ix86_compare_op0,
ix86_compare_op1, VOIDmode, 0, 1);
- out = expand_simple_binop (mode, PLUS, out, constm1_rtx,
- out, 1, OPTAB_DIRECT);
+ out = expand_simple_binop (mode, PLUS, copy_rtx (out), constm1_rtx,
+ copy_rtx (out), 1, OPTAB_DIRECT);
}
- out = expand_simple_binop (mode, AND, out,
+ out = expand_simple_binop (mode, AND, copy_rtx (out),
gen_int_mode (cf - ct, mode),
- out, 1, OPTAB_DIRECT);
+ copy_rtx (out), 1, OPTAB_DIRECT);
if (ct)
- out = expand_simple_binop (mode, PLUS, out, GEN_INT (ct),
- out, 1, OPTAB_DIRECT);
- if (out != operands[0])
- emit_move_insn (operands[0], out);
+ out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
+ copy_rtx (out), 1, OPTAB_DIRECT);
+ if (!rtx_equal_p (out, operands[0]))
+ emit_move_insn (operands[0], copy_rtx (out));
return 1; /* DONE */
}
}
- if (!TARGET_CMOVE)
+ if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
{
/* Try a few things more with specific constants and a variable. */
optab op;
rtx var, orig_out, out, tmp;
- if (optimize_size)
+ if (BRANCH_COST <= 2)
return 0; /* FAIL */
/* If one of the two operands is an interesting constant, load a
@@ -9357,9 +9655,9 @@ ix86_expand_int_movcc (operands)
if (GET_CODE (operands[2]) == CONST_INT)
{
var = operands[3];
- if (INTVAL (operands[2]) == 0)
+ if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
operands[3] = constm1_rtx, op = and_optab;
- else if (INTVAL (operands[2]) == -1)
+ else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
operands[3] = const0_rtx, op = ior_optab;
else
return 0; /* FAIL */
@@ -9367,9 +9665,9 @@ ix86_expand_int_movcc (operands)
else if (GET_CODE (operands[3]) == CONST_INT)
{
var = operands[2];
- if (INTVAL (operands[3]) == 0)
+ if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
operands[2] = constm1_rtx, op = and_optab;
- else if (INTVAL (operands[3]) == -1)
+	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
operands[2] = const0_rtx, op = ior_optab;
else
return 0; /* FAIL */
@@ -9388,8 +9686,8 @@ ix86_expand_int_movcc (operands)
/* Mask in the interesting variable. */
out = expand_binop (mode, op, var, tmp, orig_out, 0,
OPTAB_WIDEN);
- if (out != orig_out)
- emit_move_insn (orig_out, out);
+ if (!rtx_equal_p (out, orig_out))
+ emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
return 1; /* DONE */
}
@@ -9422,27 +9720,33 @@ ix86_expand_int_movcc (operands)
emit_move_insn (tmp, operands[2]);
operands[2] = tmp;
}
+
if (! register_operand (operands[2], VOIDmode)
- && ! register_operand (operands[3], VOIDmode))
+ && (mode == QImode
+ || ! register_operand (operands[3], VOIDmode)))
operands[2] = force_reg (mode, operands[2]);
+ if (mode == QImode
+ && ! register_operand (operands[3], VOIDmode))
+ operands[3] = force_reg (mode, operands[3]);
+
emit_insn (compare_seq);
emit_insn (gen_rtx_SET (VOIDmode, operands[0],
gen_rtx_IF_THEN_ELSE (mode,
compare_op, operands[2],
operands[3])));
if (bypass_test)
- emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+ emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
gen_rtx_IF_THEN_ELSE (mode,
bypass_test,
- operands[3],
- operands[0])));
+ copy_rtx (operands[3]),
+ copy_rtx (operands[0]))));
if (second_test)
- emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+ emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
gen_rtx_IF_THEN_ELSE (mode,
second_test,
- operands[2],
- operands[0])));
+ copy_rtx (operands[2]),
+ copy_rtx (operands[0]))));
return 1; /* DONE */
}
@@ -9483,8 +9787,14 @@ ix86_expand_fp_movcc (operands)
if (rtx_equal_p (operands[2], op0) && rtx_equal_p (operands[3], op1))
{
/* Check for min operation. */
- if (code == LT)
+ if (code == LT || code == UNLE)
{
+ if (code == UNLE)
+ {
+ rtx tmp = op0;
+ op0 = op1;
+ op1 = tmp;
+ }
operands[0] = force_reg (GET_MODE (operands[0]), operands[0]);
if (memory_operand (op0, VOIDmode))
op0 = force_reg (GET_MODE (operands[0]), op0);
@@ -9495,8 +9805,14 @@ ix86_expand_fp_movcc (operands)
return 1;
}
/* Check for max operation. */
- if (code == GT)
+ if (code == GT || code == UNGE)
{
+ if (code == UNGE)
+ {
+ rtx tmp = op0;
+ op0 = op1;
+ op1 = tmp;
+ }
operands[0] = force_reg (GET_MODE (operands[0]), operands[0]);
if (memory_operand (op0, VOIDmode))
op0 = force_reg (GET_MODE (operands[0]), op0);
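
Extending the min/max recognition to UNLE and UNGE works because SSE minss/maxss are asymmetric in exactly the way the conditional expression is: when the comparison is unordered they return their second operand, and swapping op0/op1 lines the unordered codes up with that behavior. The scalar shape being matched, as a sketch assuming SSE code generation:

    float
    fmin_like (float a, float b)
    {
      /* Can become a single minss: like this expression, minss yields
         its second operand when a or b is a NaN.  */
      return a < b ? a : b;
    }
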
@@ -10242,8 +10558,12 @@ ix86_expand_movstr (dst, src, count_exp, align_exp)
/* In case we don't know anything about the alignment, default to
library version, since it is usually equally fast and result in
- shorter code. */
- if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD)
+ shorter code.
+
+     Also emit the library call when we know that the count is large and
+     the call overhead will not be important.  */
+ if (!TARGET_INLINE_ALL_STRINGOPS
+ && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL))
{
end_sequence ();
return 0;
@@ -10457,8 +10777,12 @@ ix86_expand_clrstr (src, count_exp, align_exp)
/* In case we don't know anything about the alignment, default to
library version, since it is usually equally fast and result in
- shorter code. */
- if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD)
+ shorter code.
+
+     Also emit the library call when we know that the count is large and
+     the call overhead will not be important.  */
+ if (!TARGET_INLINE_ALL_STRINGOPS
+ && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL))
return 0;
if (TARGET_SINGLE_STRINGOP)
@@ -10828,8 +11152,9 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx)
}
void
-ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop)
+ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop, sibcall)
rtx retval, fnaddr, callarg1, callarg2, pop;
+ int sibcall;
{
rtx use = NULL, call;
@@ -10861,6 +11186,15 @@ ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop)
fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
fnaddr = gen_rtx_MEM (QImode, fnaddr);
}
+ if (sibcall && TARGET_64BIT
+ && !constant_call_address_operand (XEXP (fnaddr, 0), Pmode))
+ {
+ rtx addr;
+ addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
+ fnaddr = gen_rtx_REG (Pmode, 40);
+ emit_move_insn (fnaddr, addr);
+ fnaddr = gen_rtx_MEM (QImode, fnaddr);
+ }
call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
if (retval)
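
In the 64-bit indirect sibcall path above, the target address is first forced into hard register 40. Judging by the register numbering in i386.h (an inference, not stated in this patch), that is %r11: call-clobbered and never used for argument passing in the x86-64 ABI, so it remains free at the point of the tail jump.
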
@@ -11060,6 +11394,7 @@ ix86_issue_rate ()
case PROCESSOR_PENTIUMPRO:
case PROCESSOR_PENTIUM4:
case PROCESSOR_ATHLON:
+ case PROCESSOR_K8:
return 3;
default:
@@ -11271,16 +11606,10 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
break;
case PROCESSOR_ATHLON:
+ case PROCESSOR_K8:
memory = get_attr_memory (insn);
dep_memory = get_attr_memory (dep_insn);
- if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH)
- {
- if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
- cost += 2;
- else
- cost += 3;
- }
/* Show ability of reorder buffer to hide latency of load by executing
in parallel with previous instruction in case
previous instruction is not needed to compute the address. */
@@ -11554,7 +11883,7 @@ ix86_variable_issue (dump, sched_verbose, insn, can_issue_more)
static int
ia32_use_dfa_pipeline_interface ()
{
- if (ix86_cpu == PROCESSOR_PENTIUM)
+ if (TARGET_PENTIUM || TARGET_ATHLON_K8)
return 1;
return 0;
}
@@ -12764,7 +13093,8 @@ safe_vector_operand (x, mode)
: gen_rtx_SUBREG (DImode, x, 0)));
else
emit_insn (gen_sse_clrv4sf (mode == V4SFmode ? x
- : gen_rtx_SUBREG (V4SFmode, x, 0)));
+ : gen_rtx_SUBREG (V4SFmode, x, 0),
+ CONST0_RTX (V4SFmode)));
return x;
}
@@ -13434,7 +13764,7 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore)
case IX86_BUILTIN_SSE_ZERO:
target = gen_reg_rtx (V4SFmode);
- emit_insn (gen_sse_clrv4sf (target));
+ emit_insn (gen_sse_clrv4sf (target, CONST0_RTX (V4SFmode)));
return target;
case IX86_BUILTIN_MMX_ZERO:
@@ -14058,6 +14388,17 @@ x86_order_regs_for_local_alloc ()
reg_alloc_order [pos++] = 0;
}
+#ifndef TARGET_USE_MS_BITFIELD_LAYOUT
+#define TARGET_USE_MS_BITFIELD_LAYOUT 0
+#endif
+
+static bool
+ix86_ms_bitfield_layout_p (record_type)
+ tree record_type ATTRIBUTE_UNUSED;
+{
+ return TARGET_USE_MS_BITFIELD_LAYOUT;
+}
+
/* Returns an expression indicating where the this parameter is
located on entry to the FUNCTION. */
@@ -14317,7 +14658,7 @@ x86_machine_dependent_reorg (first)
{
edge e;
- if (!TARGET_ATHLON || !optimize || optimize_size)
+ if (!TARGET_ATHLON_K8 || !optimize || optimize_size)
return;
for (e = EXIT_BLOCK_PTR->pred; e; e = e->pred_next)
{
@@ -14328,25 +14669,69 @@ x86_machine_dependent_reorg (first)
if (!returnjump_p (ret) || !maybe_hot_bb_p (bb))
continue;
- prev = prev_nonnote_insn (ret);
+ for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
+ if (active_insn_p (prev) || GET_CODE (prev) == CODE_LABEL)
+ break;
if (prev && GET_CODE (prev) == CODE_LABEL)
{
edge e;
for (e = bb->pred; e; e = e->pred_next)
- if (EDGE_FREQUENCY (e) && e->src->index > 0
+ if (EDGE_FREQUENCY (e) && e->src->index >= 0
&& !(e->flags & EDGE_FALLTHRU))
insert = 1;
}
if (!insert)
{
- prev = prev_real_insn (ret);
+ prev = prev_active_insn (ret);
if (prev && GET_CODE (prev) == JUMP_INSN
&& any_condjump_p (prev))
insert = 1;
+	  /* Empty functions get a branch mispredict even when the jump
+	     destination is not visible to us.  */
+ if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED)
+ insert = 1;
}
if (insert)
emit_insn_before (gen_nop (), ret);
}
}
+/* Return nonzero when a QImode register that must be represented via a
+   REX prefix is used.  */
+bool
+x86_extended_QIreg_mentioned_p (insn)
+ rtx insn;
+{
+ int i;
+ extract_insn_cached (insn);
+ for (i = 0; i < recog_data.n_operands; i++)
+ if (REG_P (recog_data.operand[i])
+ && REGNO (recog_data.operand[i]) >= 4)
+ return true;
+ return false;
+}
+
+/* Return nonzero when P points to a register encoded via a REX prefix.
+   Called via for_each_rtx.  */
+static int
+extended_reg_mentioned_1 (p, data)
+ rtx *p;
+ void *data ATTRIBUTE_UNUSED;
+{
+ unsigned int regno;
+ if (!REG_P (*p))
+ return 0;
+ regno = REGNO (*p);
+ return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
+}
+
+/* Return true when INSN mentions a register that must be encoded using a
+   REX prefix.  */
+bool
+x86_extended_reg_mentioned_p (insn)
+ rtx insn;
+{
+ return for_each_rtx (&PATTERN (insn), extended_reg_mentioned_1, NULL);
+}
+
#include "gt-i386.h"
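
The two REX predicates added at the end of the file support x86-64 encoding decisions: %r8-%r15 and %xmm8-%xmm15 are encodable only with a REX prefix, and in QImode a REX prefix is also what makes the low bytes of %rsp, %rbp, %rsi and %rdi addressable (while ruling out %ah-%dh). The REGNO >= 4 test relies on GCC ordering the byte registers so that al, dl, cl and bl occupy the first four hard register numbers. A sketch of the range check assumed behind REX_INT_REGNO_P; the bounds are taken to come from i386.h and the helper name is illustrative:

    static int
    rex_int_regno_p (unsigned int regno)
    {
      /* FIRST_REX_INT_REG and LAST_REX_INT_REG delimit r8-r15 in the
         hard register numbering.  */
      return regno >= FIRST_REX_INT_REG && regno <= LAST_REX_INT_REG;
    }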