Diffstat (limited to 'gcc/config/i386/i386.c')
-rw-r--r-- | gcc/config/i386/i386.c | 709
1 file changed, 547 insertions, 162 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 42f6d93d3c3..2eaa1c54875 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21,6 +21,8 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include "system.h" +#include "coretypes.h" +#include "tm.h" #include "rtl.h" #include "tree.h" #include "tm_p.h" @@ -55,9 +57,9 @@ struct processor_costs size_cost = { /* costs for tunning for size */ 3, /* cost of a lea instruction */ 2, /* variable shift costs */ 3, /* constant shift costs */ - 3, /* cost of starting a multiply */ + {3, 3, 3, 3, 5}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 3, /* cost of a divide/mod */ + {3, 3, 3, 3, 5}, /* cost of a divide/mod */ 3, /* cost of movsx */ 3, /* cost of movzx */ 0, /* "large" insn */ @@ -84,6 +86,7 @@ struct processor_costs size_cost = { /* costs for tunning for size */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ 2, /* cost of FADD and FSUB insns. */ 2, /* cost of FMUL instruction. */ 2, /* cost of FDIV instruction. */ @@ -99,9 +102,9 @@ struct processor_costs i386_cost = { /* 386 specific costs */ 1, /* cost of a lea instruction */ 3, /* variable shift costs */ 2, /* constant shift costs */ - 6, /* cost of starting a multiply */ + {6, 6, 6, 6, 6}, /* cost of starting a multiply */ 1, /* cost of multiply per each bit set */ - 23, /* cost of a divide/mod */ + {23, 23, 23, 23, 23}, /* cost of a divide/mod */ 3, /* cost of movsx */ 2, /* cost of movzx */ 15, /* "large" insn */ @@ -128,6 +131,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ 23, /* cost of FADD and FSUB insns. */ 27, /* cost of FMUL instruction. */ 88, /* cost of FDIV instruction. */ @@ -142,9 +146,9 @@ struct processor_costs i486_cost = { /* 486 specific costs */ 1, /* cost of a lea instruction */ 3, /* variable shift costs */ 2, /* constant shift costs */ - 12, /* cost of starting a multiply */ + {12, 12, 12, 12, 12}, /* cost of starting a multiply */ 1, /* cost of multiply per each bit set */ - 40, /* cost of a divide/mod */ + {40, 40, 40, 40, 40}, /* cost of a divide/mod */ 3, /* cost of movsx */ 2, /* cost of movzx */ 15, /* "large" insn */ @@ -171,6 +175,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ 8, /* cost of FADD and FSUB insns. */ 16, /* cost of FMUL instruction. */ 73, /* cost of FDIV instruction. */ @@ -185,9 +190,9 @@ struct processor_costs pentium_cost = { 1, /* cost of a lea instruction */ 4, /* variable shift costs */ 1, /* constant shift costs */ - 11, /* cost of starting a multiply */ + {11, 11, 11, 11, 11}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 25, /* cost of a divide/mod */ + {25, 25, 25, 25, 25}, /* cost of a divide/mod */ 3, /* cost of movsx */ 2, /* cost of movzx */ 8, /* "large" insn */ @@ -214,6 +219,7 @@ struct processor_costs pentium_cost = { 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 2, /* Branch cost */ 3, /* cost of FADD and FSUB insns. */ 3, /* cost of FMUL instruction. */ 39, /* cost of FDIV instruction. 
*/ @@ -228,9 +234,9 @@ struct processor_costs pentiumpro_cost = { 1, /* cost of a lea instruction */ 1, /* variable shift costs */ 1, /* constant shift costs */ - 4, /* cost of starting a multiply */ + {4, 4, 4, 4, 4}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 17, /* cost of a divide/mod */ + {17, 17, 17, 17, 17}, /* cost of a divide/mod */ 1, /* cost of movsx */ 1, /* cost of movzx */ 8, /* "large" insn */ @@ -257,6 +263,7 @@ struct processor_costs pentiumpro_cost = { 3, /* MMX or SSE register to integer */ 32, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ 3, /* cost of FADD and FSUB insns. */ 5, /* cost of FMUL instruction. */ 56, /* cost of FDIV instruction. */ @@ -271,9 +278,9 @@ struct processor_costs k6_cost = { 2, /* cost of a lea instruction */ 1, /* variable shift costs */ 1, /* constant shift costs */ - 3, /* cost of starting a multiply */ + {3, 3, 3, 3, 3}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 18, /* cost of a divide/mod */ + {18, 18, 18, 18, 18}, /* cost of a divide/mod */ 2, /* cost of movsx */ 2, /* cost of movzx */ 8, /* "large" insn */ @@ -300,6 +307,7 @@ struct processor_costs k6_cost = { 6, /* MMX or SSE register to integer */ 32, /* size of prefetch block */ 1, /* number of parallel prefetches */ + 1, /* Branch cost */ 2, /* cost of FADD and FSUB insns. */ 2, /* cost of FMUL instruction. */ 56, /* cost of FDIV instruction. */ @@ -314,9 +322,9 @@ struct processor_costs athlon_cost = { 2, /* cost of a lea instruction */ 1, /* variable shift costs */ 1, /* constant shift costs */ - 5, /* cost of starting a multiply */ + {5, 5, 5, 5, 5}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 42, /* cost of a divide/mod */ + {18, 26, 42, 74, 74}, /* cost of a divide/mod */ 1, /* cost of movsx */ 1, /* cost of movzx */ 8, /* "large" insn */ @@ -343,6 +351,7 @@ struct processor_costs athlon_cost = { 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ 4, /* cost of FADD and FSUB insns. */ 4, /* cost of FMUL instruction. */ 24, /* cost of FDIV instruction. */ @@ -352,14 +361,58 @@ struct processor_costs athlon_cost = { }; static const +struct processor_costs k8_cost = { + 1, /* cost of an add instruction */ + 2, /* cost of a lea instruction */ + 1, /* variable shift costs */ + 1, /* constant shift costs */ + {3, 4, 3, 4, 5}, /* cost of starting a multiply */ + 0, /* cost of multiply per each bit set */ + {18, 26, 42, 74, 74}, /* cost of a divide/mod */ + 1, /* cost of movsx */ + 1, /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of loading integer registers */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 3, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + 4, /* cost of FADD and FSUB insns. */ + 4, /* cost of FMUL instruction. */ + 19, /* cost of FDIV instruction. */ + 2, /* cost of FABS instruction. */ + 2, /* cost of FCHS instruction. */ + 35, /* cost of FSQRT instruction. */ +}; + +static const struct processor_costs pentium4_cost = { 1, /* cost of an add instruction */ 1, /* cost of a lea instruction */ - 8, /* variable shift costs */ - 8, /* constant shift costs */ - 30, /* cost of starting a multiply */ + 4, /* variable shift costs */ + 4, /* constant shift costs */ + {15, 15, 15, 15, 15}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 112, /* cost of a divide/mod */ + {56, 56, 56, 56, 56}, /* cost of a divide/mod */ 1, /* cost of movsx */ 1, /* cost of movzx */ 16, /* "large" insn */ @@ -386,6 +439,7 @@ struct processor_costs pentium4_cost = { 10, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ 5, /* cost of FADD and FSUB insns. */ 7, /* cost of FMUL instruction. */ 43, /* cost of FDIV instruction. 
*/ @@ -404,52 +458,66 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_K6 (1<<PROCESSOR_K6) #define m_ATHLON (1<<PROCESSOR_ATHLON) #define m_PENT4 (1<<PROCESSOR_PENTIUM4) +#define m_K8 (1<<PROCESSOR_K8) +#define m_ATHLON_K8 (m_K8 | m_ATHLON) -const int x86_use_leave = m_386 | m_K6 | m_ATHLON; -const int x86_push_memory = m_386 | m_K6 | m_ATHLON | m_PENT4; +const int x86_use_leave = m_386 | m_K6 | m_ATHLON_K8; +const int x86_push_memory = m_386 | m_K6 | m_ATHLON_K8 | m_PENT4; const int x86_zero_extend_with_and = m_486 | m_PENT; -const int x86_movx = m_ATHLON | m_PPRO | m_PENT4 /* m_386 | m_K6 */; +const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 /* m_386 | m_K6 */; const int x86_double_with_add = ~m_386; const int x86_use_bit_test = m_386; -const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6; -const int x86_cmove = m_PPRO | m_ATHLON | m_PENT4; -const int x86_3dnow_a = m_ATHLON; -const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON | m_PENT4; +const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6; +const int x86_cmove = m_PPRO | m_ATHLON_K8 | m_PENT4; +const int x86_3dnow_a = m_ATHLON_K8; +const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON_K8 | m_PENT4; const int x86_branch_hints = m_PENT4; const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4; const int x86_partial_reg_stall = m_PPRO; const int x86_use_loop = m_K6; -const int x86_use_fiop = ~(m_PPRO | m_ATHLON | m_PENT); +const int x86_use_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT); const int x86_use_mov0 = m_K6; const int x86_use_cltd = ~(m_PENT | m_K6); const int x86_read_modify_write = ~m_PENT; const int x86_read_modify = ~(m_PENT | m_PPRO); const int x86_split_long_moves = m_PPRO; -const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON; +const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON_K8; const int x86_fast_prefix = ~(m_PENT | m_486 | m_386); const int x86_single_stringop = m_386 | m_PENT4; const int x86_qimode_math = ~(0); const int x86_promote_qi_regs = 0; const int x86_himode_math = ~(m_PPRO); const int x86_promote_hi_regs = m_PPRO; -const int x86_sub_esp_4 = m_ATHLON | m_PPRO | m_PENT4; -const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486 | m_PENT4; -const int x86_add_esp_4 = m_ATHLON | m_K6 | m_PENT4; -const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4; -const int x86_integer_DFmode_moves = ~(m_ATHLON | m_PENT4 | m_PPRO); -const int x86_partial_reg_dependency = m_ATHLON | m_PENT4; -const int x86_memory_mismatch_stall = m_ATHLON | m_PENT4; -const int x86_accumulate_outgoing_args = m_ATHLON | m_PENT4 | m_PPRO; -const int x86_prologue_using_move = m_ATHLON | m_PENT4 | m_PPRO; -const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO; +const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4; +const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4; +const int x86_add_esp_4 = m_ATHLON_K8 | m_K6 | m_PENT4; +const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4; +const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_PPRO); +const int x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4; +const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4; +const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_PPRO; +const int x86_prologue_using_move = m_ATHLON_K8 | m_PENT4 | m_PPRO; +const int x86_epilogue_using_move = m_ATHLON_K8 | m_PENT4 | m_PPRO; const int x86_decompose_lea = m_PENT4; const int x86_shift1 = ~m_486; -const int 
x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON | m_PENT4; +const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4; +const int x86_sse_partial_reg_dependency = m_PENT4 | m_PPRO; +/* Set for machines where the type and dependencies are resolved on SSE register + parts insetad of whole registers, so we may maintain just lower part of + scalar values in proper format leaving the upper part undefined. */ +const int x86_sse_partial_regs = m_ATHLON_K8; +/* Athlon optimizes partial-register FPS special case, thus avoiding the + need for extra instructions beforehand */ +const int x86_sse_partial_regs_for_cvtsd2ss = 0; +const int x86_sse_typeless_stores = m_ATHLON_K8; +const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4; +const int x86_use_ffreep = m_ATHLON_K8; +const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6; /* In case the avreage insn count for single function invocation is lower than this constant, emit fast (but longer) prologue and epilogue code. */ -#define FAST_PROLOGUE_INSN_COUNT 30 +#define FAST_PROLOGUE_INSN_COUNT 20 /* Set by prologue expander and used by epilogue expander to determine the style used. */ @@ -755,6 +823,7 @@ static void x86_output_mi_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT, tree)); static bool x86_can_output_mi_thunk PARAMS ((tree, HOST_WIDE_INT, HOST_WIDE_INT, tree)); +bool ix86_expand_carry_flag_compare PARAMS ((enum rtx_code, rtx, rtx, rtx*)); struct ix86_address { @@ -796,9 +865,12 @@ static void ix86_compute_frame_layout PARAMS ((struct ix86_frame *)); static int ix86_comp_type_attributes PARAMS ((tree, tree)); static int ix86_fntype_regparm PARAMS ((tree)); const struct attribute_spec ix86_attribute_table[]; +static bool ix86_function_ok_for_sibcall PARAMS ((tree, tree)); static tree ix86_handle_cdecl_attribute PARAMS ((tree *, tree, tree, int, bool *)); static tree ix86_handle_regparm_attribute PARAMS ((tree *, tree, tree, int, bool *)); static int ix86_value_regno PARAMS ((enum machine_mode)); +static bool ix86_ms_bitfield_layout_p PARAMS ((tree)); +static int extended_reg_mentioned_1 PARAMS ((rtx *, void *)); #if defined (DO_GLOBAL_CTORS_BODY) && defined (HAS_INIT_SECTION) static void ix86_svr3_asm_out_constructor PARAMS ((rtx, int)); @@ -897,6 +969,9 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class, #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ ia32_multipass_dfa_lookahead +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall + #ifdef HAVE_AS_TLS #undef TARGET_HAVE_TLS #define TARGET_HAVE_TLS true @@ -904,6 +979,9 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class, #undef TARGET_CANNOT_FORCE_CONST_MEM #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem +#undef TARGET_MS_BITFIELD_LAYOUT_P +#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p + #undef TARGET_ASM_OUTPUT_MI_THUNK #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK @@ -937,17 +1015,17 @@ override_options () const int align_jump; const int align_jump_max_skip; const int align_func; - const int branch_cost; } const processor_target_table[PROCESSOR_max] = { - {&i386_cost, 0, 0, 4, 3, 4, 3, 4, 1}, - {&i486_cost, 0, 0, 16, 15, 16, 15, 16, 1}, - {&pentium_cost, 0, 0, 16, 7, 16, 7, 16, 1}, - {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16, 1}, - {&k6_cost, 0, 0, 32, 7, 32, 7, 32, 1}, - {&athlon_cost, 0, 0, 16, 7, 64, 7, 16, 1}, - {&pentium4_cost, 0, 0, 0, 
0, 0, 0, 0, 1} + {&i386_cost, 0, 0, 4, 3, 4, 3, 4}, + {&i486_cost, 0, 0, 16, 15, 16, 15, 16}, + {&pentium_cost, 0, 0, 16, 7, 16, 7, 16}, + {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16}, + {&k6_cost, 0, 0, 32, 7, 32, 7, 32}, + {&athlon_cost, 0, 0, 16, 7, 16, 7, 16}, + {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0}, + {&k8_cost, 0, 0, 16, 7, 16, 7, 16} }; static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES; @@ -962,7 +1040,8 @@ override_options () PTA_MMX = 4, PTA_PREFETCH_SSE = 8, PTA_3DNOW = 16, - PTA_3DNOW_A = 64 + PTA_3DNOW_A = 64, + PTA_64BIT = 128 } flags; } const processor_alias_table[] = @@ -994,6 +1073,8 @@ override_options () | PTA_3DNOW_A | PTA_SSE}, {"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE}, + {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT + | PTA_3DNOW_A | PTA_SSE | PTA_SSE2}, }; int const pta_size = ARRAY_SIZE (processor_alias_table); @@ -1033,7 +1114,7 @@ override_options () if (!ix86_cpu_string) ix86_cpu_string = cpu_names [TARGET_CPU_DEFAULT]; if (!ix86_arch_string) - ix86_arch_string = TARGET_64BIT ? "athlon-4" : "i386"; + ix86_arch_string = TARGET_64BIT ? "k8" : "i386"; if (ix86_cmodel_string != 0) { @@ -1099,6 +1180,8 @@ override_options () target_flags |= MASK_SSE2; if (processor_alias_table[i].flags & PTA_PREFETCH_SSE) x86_prefetch_sse = true; + if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT)) + error ("CPU you selected does not support x86-64 instruction set"); break; } @@ -1109,6 +1192,8 @@ override_options () if (! strcmp (ix86_cpu_string, processor_alias_table[i].name)) { ix86_cpu = processor_alias_table[i].processor; + if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT)) + error ("CPU you selected does not support x86-64 instruction set"); break; } if (processor_alias_table[i].flags & PTA_PREFETCH_SSE) @@ -1215,7 +1300,7 @@ override_options () } /* Validate -mbranch-cost= value, or provide default. */ - ix86_branch_cost = processor_target_table[ix86_cpu].branch_cost; + ix86_branch_cost = processor_target_table[ix86_cpu].cost->branch_cost; if (ix86_branch_cost_string) { i = atoi (ix86_branch_cost_string); @@ -1371,6 +1456,60 @@ const struct attribute_spec ix86_attribute_table[] = { NULL, 0, 0, false, false, false, NULL } }; +/* If PIC, we cannot make sibling calls to global functions + because the PLT requires %ebx live. + If we are returning floats on the register stack, we cannot make + sibling calls to functions that return floats. (The stack adjust + instruction will wind up after the sibcall jump, and not be executed.) */ + +static bool +ix86_function_ok_for_sibcall (decl, exp) + tree decl; + tree exp; +{ + /* If we are generating position-independent code, we cannot sibcall + optimize any indirect call, or a direct call to a global function, + as the PLT requires %ebx be live. */ + if (!TARGET_64BIT && flag_pic && (!decl || TREE_PUBLIC (decl))) + return false; + + /* If we are returning floats on the 80387 register stack, we cannot + make a sibcall from a function that doesn't return a float to a + function that does; the necessary stack adjustment will not be + executed. */ + if (STACK_REG_P (ix86_function_value (TREE_TYPE (exp))) + && ! STACK_REG_P (ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl))))) + return false; + + /* If this call is indirect, we'll need to be able to use a call-clobbered + register for the address of the target function. Make sure that all + such registers are not used for passing parameters. 
*/ + if (!decl && !TARGET_64BIT) + { + int regparm = ix86_regparm; + tree attr, type; + + /* We're looking at the CALL_EXPR, we need the type of the function. */ + type = TREE_OPERAND (exp, 0); /* pointer expression */ + type = TREE_TYPE (type); /* pointer type */ + type = TREE_TYPE (type); /* function type */ + + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + + if (regparm >= 3) + { + /* ??? Need to count the actual number of registers to be used, + not the possible number of registers. Fix later. */ + return false; + } + } + + /* Otherwise okay. That also includes certain types of indirect calls. */ + return true; +} + /* Handle a "cdecl" or "stdcall" attribute; arguments as in struct attribute_spec.handler. */ static tree @@ -3209,6 +3348,32 @@ call_insn_operand (op, mode) return general_operand (op, Pmode); } +/* Test for a valid operand for a call instruction. Don't allow the + arg pointer register or virtual regs since they may decay into + reg + const, which the patterns can't handle. */ + +int +sibcall_insn_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + /* Disallow indirect through a virtual register. This leads to + compiler aborts when trying to eliminate them. */ + if (GET_CODE (op) == REG + && (op == arg_pointer_rtx + || op == frame_pointer_rtx + || (REGNO (op) >= FIRST_PSEUDO_REGISTER + && REGNO (op) <= LAST_VIRTUAL_REGISTER))) + return 0; + + /* Explicitly allow SYMBOL_REF even if pic. */ + if (GET_CODE (op) == SYMBOL_REF) + return 1; + + /* Otherwise we can only allow register operands. */ + return register_operand (op, Pmode); +} + int constant_call_address_operand (op, mode) rtx op; @@ -3387,6 +3552,18 @@ q_regs_operand (op, mode) return ANY_QI_REG_P (op); } +/* Return true if op is an flags register. */ + +int +flags_reg_operand (op, mode) + register rtx op; + enum machine_mode mode; +{ + if (mode != VOIDmode && GET_MODE (op) != mode) + return 0; + return REG_P (op) && REGNO (op) == FLAGS_REG && GET_MODE (op) != VOIDmode; +} + /* Return true if op is a NON_Q_REGS class register. */ int @@ -3401,6 +3578,31 @@ non_q_regs_operand (op, mode) return NON_QI_REG_P (op); } +int +zero_extended_scalar_load_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + unsigned n_elts; + if (GET_CODE (op) != MEM) + return 0; + op = maybe_get_pool_constant (op); + if (!op) + return 0; + if (GET_CODE (op) != CONST_VECTOR) + return 0; + n_elts = + (GET_MODE_SIZE (GET_MODE (op)) / + GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op)))); + for (n_elts--; n_elts > 0; n_elts--) + { + rtx elt = CONST_VECTOR_ELT (op, n_elts); + if (elt != CONST0_RTX (GET_MODE_INNER (GET_MODE (op)))) + return 0; + } + return 1; +} + /* Return 1 if OP is a comparison that can be used in the CMPSS/CMPPS insns. */ int @@ -4160,7 +4362,7 @@ output_set_got (dest) is what will be referred to by the Mach-O PIC subsystem. */ ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ()); #endif - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, "L", + (*targetm.asm_out.internal_label) (asm_out_file, "L", CODE_LABEL_NUMBER (XEXP (xops[2], 0))); if (flag_pic) @@ -4456,14 +4658,32 @@ ix86_expand_prologue () int use_mov = 0; HOST_WIDE_INT allocate; + ix86_compute_frame_layout (&frame); if (!optimize_size) { - use_fast_prologue_epilogue - = !expensive_function_p (FAST_PROLOGUE_INSN_COUNT); + int count = frame.nregs; + + /* The fast prologue uses move instead of push to save registers. 
This + is significantly longer, but also executes faster as modern hardware + can execute the moves in parallel, but can't do that for push/pop. + + Be curefull about choosing what prologue to emit: When function takes + many instructions to execute we may use slow version as well as in + case function is known to be outside hot spot (this is known with + feedback only). Weight the size of function by number of registers + to save as it is cheap to use one or two push instructions but very + slow to use many of them. */ + if (count) + count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; + if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL + || (flag_branch_probabilities + && cfun->function_frequency < FUNCTION_FREQUENCY_HOT)) + use_fast_prologue_epilogue = 0; + else + use_fast_prologue_epilogue = !expensive_function_p (count); if (TARGET_PROLOGUE_USING_MOVE) use_mov = use_fast_prologue_epilogue; } - ix86_compute_frame_layout (&frame); /* Note: AT&T enter does NOT have reversed args. Enter is probably slower on all targets. Also sdb doesn't like it. */ @@ -8960,6 +9180,84 @@ ix86_expand_setcc (code, dest) return 1; /* DONE */ } +/* Expand comparison setting or clearing carry flag. Return true when sucesfull + and set pop for the operation. */ +bool +ix86_expand_carry_flag_compare (code, op0, op1, pop) + rtx op0, op1, *pop; + enum rtx_code code; +{ + enum machine_mode mode = + GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); + + /* Do not handle DImode compares that go trought special path. Also we can't + deal with FP compares yet. This is possible to add. */ + if ((mode == DImode && !TARGET_64BIT) || !INTEGRAL_MODE_P (mode)) + return false; + switch (code) + { + case LTU: + case GEU: + break; + + /* Convert a==0 into (unsigned)a<1. */ + case EQ: + case NE: + if (op1 != const0_rtx) + return false; + op1 = const1_rtx; + code = (code == EQ ? LTU : GEU); + break; + + /* Convert a>b into b<a or a>=b-1. */ + case GTU: + case LEU: + if (GET_CODE (op1) == CONST_INT) + { + op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); + /* Bail out on overflow. We still can swap operands but that + would force loading of the constant into register. */ + if (op1 == const0_rtx + || !x86_64_immediate_operand (op1, GET_MODE (op1))) + return false; + code = (code == GTU ? GEU : LTU); + } + else + { + rtx tmp = op1; + op1 = op0; + op0 = tmp; + code = (code == GTU ? LTU : GEU); + } + break; + + /* Convert a>0 into (unsigned)a<0x7fffffff. */ + case LT: + case GE: + if (mode == DImode || op1 != const0_rtx) + return false; + op1 = gen_int_mode (~(1 << (GET_MODE_BITSIZE (mode) - 1)), mode); + code = (code == LT ? GEU : LTU); + break; + case LE: + case GT: + if (mode == DImode || op1 != constm1_rtx) + return false; + op1 = gen_int_mode (~(1 << (GET_MODE_BITSIZE (mode) - 1)), mode); + code = (code == LE ? GEU : LTU); + break; + + default: + return false; + } + ix86_compare_op0 = op0; + ix86_compare_op1 = op1; + *pop = ix86_expand_compare (code, NULL, NULL); + if (GET_CODE (*pop) != LTU && GET_CODE (*pop) != GEU) + abort (); + return true; +} + int ix86_expand_int_movcc (operands) rtx operands[]; @@ -8968,30 +9266,7 @@ ix86_expand_int_movcc (operands) rtx compare_seq, compare_op; rtx second_test, bypass_test; enum machine_mode mode = GET_MODE (operands[0]); - - /* When the compare code is not LTU or GEU, we can not use sbbl case. - In case comparsion is done with immediate, we can convert it to LTU or - GEU by altering the integer. 
*/ - - if ((code == LEU || code == GTU) - && GET_CODE (ix86_compare_op1) == CONST_INT - && mode != HImode - && INTVAL (ix86_compare_op1) != -1 - /* For x86-64, the immediate field in the instruction is 32-bit - signed, so we can't increment a DImode value above 0x7fffffff. */ - && (!TARGET_64BIT - || GET_MODE (ix86_compare_op0) != DImode - || INTVAL (ix86_compare_op1) != 0x7fffffff) - && GET_CODE (operands[2]) == CONST_INT - && GET_CODE (operands[3]) == CONST_INT) - { - if (code == LEU) - code = LTU; - else - code = GEU; - ix86_compare_op1 = gen_int_mode (INTVAL (ix86_compare_op1) + 1, - GET_MODE (ix86_compare_op0)); - } + bool sign_bit_compare_p = false;; start_sequence (); compare_op = ix86_expand_compare (code, &second_test, &bypass_test); @@ -9000,10 +9275,14 @@ ix86_expand_int_movcc (operands) compare_code = GET_CODE (compare_op); + if ((ix86_compare_op1 == const0_rtx && (code == GE || code == LT)) + || (ix86_compare_op1 == constm1_rtx && (code == GT || code == LE))) + sign_bit_compare_p = true; + /* Don't attempt mode expansion here -- if we had to expand 5 or 6 HImode insns, we'd be swallowed in word prefix ops. */ - if (mode != HImode + if ((mode != HImode || TARGET_FAST_PREFIX) && (mode != DImode || TARGET_64BIT) && GET_CODE (operands[2]) == CONST_INT && GET_CODE (operands[3]) == CONST_INT) @@ -9013,32 +9292,53 @@ ix86_expand_int_movcc (operands) HOST_WIDE_INT cf = INTVAL (operands[3]); HOST_WIDE_INT diff; - if ((compare_code == LTU || compare_code == GEU) - && !second_test && !bypass_test) + diff = ct - cf; + /* Sign bit compares are better done using shifts than we do by using + sbb. */ + if (sign_bit_compare_p + || ix86_expand_carry_flag_compare (code, ix86_compare_op0, + ix86_compare_op1, &compare_op)) { /* Detect overlap between destination and compare sources. */ rtx tmp = out; - /* To simplify rest of code, restrict to the GEU case. */ - if (compare_code == LTU) + if (!sign_bit_compare_p) { - HOST_WIDE_INT tmp = ct; - ct = cf; - cf = tmp; - compare_code = reverse_condition (compare_code); - code = reverse_condition (code); - } - diff = ct - cf; + compare_code = GET_CODE (compare_op); + + /* To simplify rest of code, restrict to the GEU case. 
*/ + if (compare_code == LTU) + { + HOST_WIDE_INT tmp = ct; + ct = cf; + cf = tmp; + compare_code = reverse_condition (compare_code); + code = reverse_condition (code); + } + diff = ct - cf; - if (reg_overlap_mentioned_p (out, ix86_compare_op0) - || reg_overlap_mentioned_p (out, ix86_compare_op1)) - tmp = gen_reg_rtx (mode); + if (reg_overlap_mentioned_p (out, ix86_compare_op0) + || reg_overlap_mentioned_p (out, ix86_compare_op1)) + tmp = gen_reg_rtx (mode); - emit_insn (compare_seq); - if (mode == DImode) - emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp)); + if (mode == DImode) + emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp)); + else + emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp))); + } else - emit_insn (gen_x86_movsicc_0_m1 (tmp)); + { + if (code == GT || code == GE) + code = reverse_condition (code); + else + { + HOST_WIDE_INT tmp = ct; + ct = cf; + cf = tmp; + } + tmp = emit_store_flag (tmp, code, ix86_compare_op0, + ix86_compare_op1, VOIDmode, 0, -1); + } if (diff == 1) { @@ -9052,7 +9352,7 @@ ix86_expand_int_movcc (operands) if (ct) tmp = expand_simple_binop (mode, PLUS, tmp, GEN_INT (ct), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), 1, OPTAB_DIRECT); } else if (cf == -1) { @@ -9065,7 +9365,7 @@ ix86_expand_int_movcc (operands) */ tmp = expand_simple_binop (mode, IOR, tmp, GEN_INT (ct), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), 1, OPTAB_DIRECT); } else if (diff == -1 && ct) { @@ -9077,11 +9377,11 @@ ix86_expand_int_movcc (operands) * * Size 8 - 11. */ - tmp = expand_simple_unop (mode, NOT, tmp, tmp, 1); + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); if (cf) tmp = expand_simple_binop (mode, PLUS, - tmp, GEN_INT (cf), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), GEN_INT (cf), + copy_rtx (tmp), 1, OPTAB_DIRECT); } else { @@ -9099,26 +9399,25 @@ ix86_expand_int_movcc (operands) { cf = ct; ct = 0; - tmp = expand_simple_unop (mode, NOT, tmp, tmp, 1); + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); } tmp = expand_simple_binop (mode, AND, - tmp, + copy_rtx (tmp), gen_int_mode (cf - ct, mode), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), 1, OPTAB_DIRECT); if (ct) tmp = expand_simple_binop (mode, PLUS, - tmp, GEN_INT (ct), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); } - if (tmp != out) - emit_move_insn (out, tmp); + if (!rtx_equal_p (tmp, out)) + emit_move_insn (copy_rtx (out), copy_rtx (tmp)); return 1; /* DONE */ } - diff = ct - cf; if (diff < 0) { HOST_WIDE_INT tmp; @@ -9194,8 +9493,10 @@ ix86_expand_int_movcc (operands) } } + if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 || diff == 3 || diff == 5 || diff == 9) + && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) && (mode != DImode || x86_64_sign_extended_value (GEN_INT (cf)))) { /* @@ -9237,15 +9538,14 @@ ix86_expand_int_movcc (operands) tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); nops++; } - if (tmp != out - && (GET_CODE (tmp) != SUBREG || SUBREG_REG (tmp) != out)) + if (!rtx_equal_p (tmp, out)) { if (nops == 1) out = force_operand (tmp, out); else - emit_insn (gen_rtx_SET (VOIDmode, out, tmp)); + emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp))); } - if (out != operands[0]) + if (!rtx_equal_p (out, operands[0])) emit_move_insn (operands[0], copy_rtx (out)); return 1; /* DONE */ @@ -9265,12 +9565,10 @@ ix86_expand_int_movcc (operands) * This is reasonably steep, but branch mispredict costs are * high on modern cpus, so consider failing only if optimizing * for space. 
- * - * %%% Parameterize branch_cost on the tuning architecture, then - * use that. The 80386 couldn't care less about mispredicts. */ - if (!optimize_size && !TARGET_CMOVE) + if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + && BRANCH_COST >= 2) { if (cf == 0) { @@ -9324,31 +9622,31 @@ ix86_expand_int_movcc (operands) out = emit_store_flag (out, code, ix86_compare_op0, ix86_compare_op1, VOIDmode, 0, 1); - out = expand_simple_binop (mode, PLUS, out, constm1_rtx, - out, 1, OPTAB_DIRECT); + out = expand_simple_binop (mode, PLUS, copy_rtx (out), constm1_rtx, + copy_rtx (out), 1, OPTAB_DIRECT); } - out = expand_simple_binop (mode, AND, out, + out = expand_simple_binop (mode, AND, copy_rtx (out), gen_int_mode (cf - ct, mode), - out, 1, OPTAB_DIRECT); + copy_rtx (out), 1, OPTAB_DIRECT); if (ct) - out = expand_simple_binop (mode, PLUS, out, GEN_INT (ct), - out, 1, OPTAB_DIRECT); - if (out != operands[0]) - emit_move_insn (operands[0], out); + out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), + copy_rtx (out), 1, OPTAB_DIRECT); + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); return 1; /* DONE */ } } - if (!TARGET_CMOVE) + if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) { /* Try a few things more with specific constants and a variable. */ optab op; rtx var, orig_out, out, tmp; - if (optimize_size) + if (BRANCH_COST <= 2) return 0; /* FAIL */ /* If one of the two operands is an interesting constant, load a @@ -9357,9 +9655,9 @@ ix86_expand_int_movcc (operands) if (GET_CODE (operands[2]) == CONST_INT) { var = operands[3]; - if (INTVAL (operands[2]) == 0) + if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) operands[3] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[2]) == -1) + else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) operands[3] = const0_rtx, op = ior_optab; else return 0; /* FAIL */ @@ -9367,9 +9665,9 @@ ix86_expand_int_movcc (operands) else if (GET_CODE (operands[3]) == CONST_INT) { var = operands[2]; - if (INTVAL (operands[3]) == 0) + if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) operands[2] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[3]) == -1) + else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) operands[2] = const0_rtx, op = ior_optab; else return 0; /* FAIL */ @@ -9388,8 +9686,8 @@ ix86_expand_int_movcc (operands) /* Mask in the interesting variable. */ out = expand_binop (mode, op, var, tmp, orig_out, 0, OPTAB_WIDEN); - if (out != orig_out) - emit_move_insn (orig_out, out); + if (!rtx_equal_p (out, orig_out)) + emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); return 1; /* DONE */ } @@ -9422,27 +9720,33 @@ ix86_expand_int_movcc (operands) emit_move_insn (tmp, operands[2]); operands[2] = tmp; } + if (! register_operand (operands[2], VOIDmode) - && ! register_operand (operands[3], VOIDmode)) + && (mode == QImode + || ! register_operand (operands[3], VOIDmode))) operands[2] = force_reg (mode, operands[2]); + if (mode == QImode + && ! 
register_operand (operands[3], VOIDmode)) + operands[3] = force_reg (mode, operands[3]); + emit_insn (compare_seq); emit_insn (gen_rtx_SET (VOIDmode, operands[0], gen_rtx_IF_THEN_ELSE (mode, compare_op, operands[2], operands[3]))); if (bypass_test) - emit_insn (gen_rtx_SET (VOIDmode, operands[0], + emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]), gen_rtx_IF_THEN_ELSE (mode, bypass_test, - operands[3], - operands[0]))); + copy_rtx (operands[3]), + copy_rtx (operands[0])))); if (second_test) - emit_insn (gen_rtx_SET (VOIDmode, operands[0], + emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]), gen_rtx_IF_THEN_ELSE (mode, second_test, - operands[2], - operands[0]))); + copy_rtx (operands[2]), + copy_rtx (operands[0])))); return 1; /* DONE */ } @@ -9483,8 +9787,14 @@ ix86_expand_fp_movcc (operands) if (rtx_equal_p (operands[2], op0) && rtx_equal_p (operands[3], op1)) { /* Check for min operation. */ - if (code == LT) + if (code == LT || code == UNLE) { + if (code == UNLE) + { + rtx tmp = op0; + op0 = op1; + op1 = tmp; + } operands[0] = force_reg (GET_MODE (operands[0]), operands[0]); if (memory_operand (op0, VOIDmode)) op0 = force_reg (GET_MODE (operands[0]), op0); @@ -9495,8 +9805,14 @@ ix86_expand_fp_movcc (operands) return 1; } /* Check for max operation. */ - if (code == GT) + if (code == GT || code == UNGE) { + if (code == UNGE) + { + rtx tmp = op0; + op0 = op1; + op1 = tmp; + } operands[0] = force_reg (GET_MODE (operands[0]), operands[0]); if (memory_operand (op0, VOIDmode)) op0 = force_reg (GET_MODE (operands[0]), op0); @@ -10242,8 +10558,12 @@ ix86_expand_movstr (dst, src, count_exp, align_exp) /* In case we don't know anything about the alignment, default to library version, since it is usually equally fast and result in - shorter code. */ - if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD) + shorter code. + + Also emit call when we know that the count is large and call overhead + will not be important. */ + if (!TARGET_INLINE_ALL_STRINGOPS + && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) { end_sequence (); return 0; @@ -10457,8 +10777,12 @@ ix86_expand_clrstr (src, count_exp, align_exp) /* In case we don't know anything about the alignment, default to library version, since it is usually equally fast and result in - shorter code. */ - if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD) + shorter code. + + Also emit call when we know that the count is large and call overhead + will not be important. 
*/ + if (!TARGET_INLINE_ALL_STRINGOPS + && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) return 0; if (TARGET_SINGLE_STRINGOP) @@ -10828,8 +11152,9 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx) } void -ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop) +ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop, sibcall) rtx retval, fnaddr, callarg1, callarg2, pop; + int sibcall; { rtx use = NULL, call; @@ -10861,6 +11186,15 @@ ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop) fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); fnaddr = gen_rtx_MEM (QImode, fnaddr); } + if (sibcall && TARGET_64BIT + && !constant_call_address_operand (XEXP (fnaddr, 0), Pmode)) + { + rtx addr; + addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); + fnaddr = gen_rtx_REG (Pmode, 40); + emit_move_insn (fnaddr, addr); + fnaddr = gen_rtx_MEM (QImode, fnaddr); + } call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); if (retval) @@ -11060,6 +11394,7 @@ ix86_issue_rate () case PROCESSOR_PENTIUMPRO: case PROCESSOR_PENTIUM4: case PROCESSOR_ATHLON: + case PROCESSOR_K8: return 3; default: @@ -11271,16 +11606,10 @@ ix86_adjust_cost (insn, link, dep_insn, cost) break; case PROCESSOR_ATHLON: + case PROCESSOR_K8: memory = get_attr_memory (insn); dep_memory = get_attr_memory (dep_insn); - if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH) - { - if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV) - cost += 2; - else - cost += 3; - } /* Show ability of reorder buffer to hide latency of load by executing in parallel with previous instruction in case previous instruction is not needed to compute the address. */ @@ -11554,7 +11883,7 @@ ix86_variable_issue (dump, sched_verbose, insn, can_issue_more) static int ia32_use_dfa_pipeline_interface () { - if (ix86_cpu == PROCESSOR_PENTIUM) + if (TARGET_PENTIUM || TARGET_ATHLON_K8) return 1; return 0; } @@ -12764,7 +13093,8 @@ safe_vector_operand (x, mode) : gen_rtx_SUBREG (DImode, x, 0))); else emit_insn (gen_sse_clrv4sf (mode == V4SFmode ? x - : gen_rtx_SUBREG (V4SFmode, x, 0))); + : gen_rtx_SUBREG (V4SFmode, x, 0), + CONST0_RTX (V4SFmode))); return x; } @@ -13434,7 +13764,7 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_SSE_ZERO: target = gen_reg_rtx (V4SFmode); - emit_insn (gen_sse_clrv4sf (target)); + emit_insn (gen_sse_clrv4sf (target, CONST0_RTX (V4SFmode))); return target; case IX86_BUILTIN_MMX_ZERO: @@ -14058,6 +14388,17 @@ x86_order_regs_for_local_alloc () reg_alloc_order [pos++] = 0; } +#ifndef TARGET_USE_MS_BITFIELD_LAYOUT +#define TARGET_USE_MS_BITFIELD_LAYOUT 0 +#endif + +static bool +ix86_ms_bitfield_layout_p (record_type) + tree record_type ATTRIBUTE_UNUSED; +{ + return TARGET_USE_MS_BITFIELD_LAYOUT; +} + /* Returns an expression indicating where the this parameter is located on entry to the FUNCTION. 
*/ @@ -14317,7 +14658,7 @@ x86_machine_dependent_reorg (first) { edge e; - if (!TARGET_ATHLON || !optimize || optimize_size) + if (!TARGET_ATHLON_K8 || !optimize || optimize_size) return; for (e = EXIT_BLOCK_PTR->pred; e; e = e->pred_next) { @@ -14328,25 +14669,69 @@ x86_machine_dependent_reorg (first) if (!returnjump_p (ret) || !maybe_hot_bb_p (bb)) continue; - prev = prev_nonnote_insn (ret); + for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) + if (active_insn_p (prev) || GET_CODE (prev) == CODE_LABEL) + break; if (prev && GET_CODE (prev) == CODE_LABEL) { edge e; for (e = bb->pred; e; e = e->pred_next) - if (EDGE_FREQUENCY (e) && e->src->index > 0 + if (EDGE_FREQUENCY (e) && e->src->index >= 0 && !(e->flags & EDGE_FALLTHRU)) insert = 1; } if (!insert) { - prev = prev_real_insn (ret); + prev = prev_active_insn (ret); if (prev && GET_CODE (prev) == JUMP_INSN && any_condjump_p (prev)) insert = 1; + /* Empty functions get branch misspredict even when the jump destination + is not visible to us. */ + if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED) + insert = 1; } if (insert) emit_insn_before (gen_nop (), ret); } } +/* Return nonzero when QImode register that must be represented via REX prefix + is used. */ +bool +x86_extended_QIreg_mentioned_p (insn) + rtx insn; +{ + int i; + extract_insn_cached (insn); + for (i = 0; i < recog_data.n_operands; i++) + if (REG_P (recog_data.operand[i]) + && REGNO (recog_data.operand[i]) >= 4) + return true; + return false; +} + +/* Return nonzero when P points to register encoded via REX prefix. + Called via for_each_rtx. */ +static int +extended_reg_mentioned_1 (p, data) + rtx *p; + void *data ATTRIBUTE_UNUSED; +{ + unsigned int regno; + if (!REG_P (*p)) + return 0; + regno = REGNO (*p); + return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno); +} + +/* Return true when INSN mentions register that must be encoded using REX + prefix. */ +bool +x86_extended_reg_mentioned_p (insn) + rtx insn; +{ + return for_each_rtx (&PATTERN (insn), extended_reg_mentioned_1, NULL); +} + #include "gt-i386.h" |
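
A note on the cost-table hunks above: the scalar "cost of starting a multiply" and "cost of a divide/mod" fields become five-element initializers, presumably one entry per integer operand size (QImode through TImode), which is what lets the Athlon/K8 tables express a size-dependent divide cost such as {18, 26, 42, 74, 74}. A per-processor branch cost also moves out of processor_target_table into the cost structure, which is why override_options now reads processor_target_table[ix86_cpu].cost->branch_cost. A minimal sketch of the shape this implies; the struct and field names below are illustrative assumptions, not the declarations from i386.h:

/* Illustrative sketch only; the real declarations live in i386.h and
   may use different names.  The five array entries are assumed to
   correspond to QImode, HImode, SImode, DImode and TImode operands,
   matching initializers such as {3, 4, 3, 4, 5} in k8_cost above.  */
struct example_processor_costs
{
  int mult_init[5];   /* cost of starting a multiply, per operand size */
  int divide[5];      /* cost of a divide/mod, per operand size */
  int branch_cost;    /* now kept with the other per-CPU tuning costs */
};

/* Hypothetical helper: pick the divide cost for one of the five sizes.  */
static int
example_divide_cost (const struct example_processor_costs *costs,
                     int mode_index)
{
  return costs->divide[mode_index];
}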
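
The reworked ix86_expand_prologue above chooses between the short push-based prologue and the faster move-based one by scaling FAST_PROLOGUE_INSN_COUNT (now 20) with the number of call-saved registers, and by refusing the fast form for functions known to be cold. The threshold arithmetic, restated as a stand-alone helper under that reading; the real decision additionally consults expensive_function_p and cfun->function_frequency:

/* Budget used when deciding whether the move-based prologue pays off.
   Saving one register is cheap, so only registers beyond the first
   count: three saved registers give a budget of (3 - 1) * 20 = 40.  */
#define FAST_PROLOGUE_INSN_COUNT 20

static int
example_fast_prologue_budget (int saved_regs)
{
  return saved_regs ? (saved_regs - 1) * FAST_PROLOGUE_INSN_COUNT : 0;
}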
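
ix86_expand_carry_flag_compare above rewrites comparisons into forms that need only the carry flag (LTU or GEU). The rewrites rest on plain integer identities: a == 0 is the same as (unsigned) a < 1, an unsigned a > c is the same as a >= c + 1 as long as c + 1 does not wrap to zero (the expander bails out in that case), and a > b can always be turned into b < a by swapping operands. A small self-contained check of those identities in ordinary C, independent of the RTL expander:

#include <assert.h>
#include <limits.h>

/* Exercise the rewrites used above on plain unsigned ints:
   EQ 0   ->  LTU 1
   GTU c  ->  GEU (c + 1), valid only when c + 1 does not wrap
   GTU b  ->  LTU with the operands swapped.  */
static void
check (unsigned int a, unsigned int b, unsigned int c)
{
  assert ((a == 0) == (a < 1));
  if (c + 1 != 0)                   /* the expander gives up on wrap */
    assert ((a > c) == (a >= c + 1));
  assert ((a > b) == (b < a));
}

int
main (void)
{
  unsigned int samples[] = { 0u, 1u, 2u, 7u, 0x7fffffffu, 0x80000000u, UINT_MAX };
  unsigned int n = sizeof samples / sizeof samples[0];
  unsigned int i, j, k;

  for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
      for (k = 0; k < n; k++)
        check (samples[i], samples[j], samples[k]);
  return 0;
}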
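
The constant/constant paths of ix86_expand_int_movcc above turn a conditional move into straight-line sbb, and, add arithmetic: a mask of all ones or all zeros is ANDed with the difference of the two constants and the remaining constant is added back. The essential identity, restated in portable C with unsigned wraparound standing in for two's-complement arithmetic (a checker, not the expander itself):

#include <assert.h>

/* cond ? a : b built the way the sbb/and/add sequence does.  */
static unsigned int
branchless_select (int cond, unsigned int a, unsigned int b)
{
  /* mask is all ones when cond is true, all zeros otherwise,
     mirroring the register produced by sbb.  */
  unsigned int mask = 0u - (unsigned int) (cond != 0);
  return (mask & (a - b)) + b;
}

int
main (void)
{
  unsigned int vals[] = { 0u, 1u, 5u, 0x80000000u, 0xffffffffu };
  unsigned int n = sizeof vals / sizeof vals[0];
  unsigned int i, j;

  for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
      {
        assert (branchless_select (1, vals[i], vals[j]) == vals[i]);
        assert (branchless_select (0, vals[i], vals[j]) == vals[j]);
      }
  return 0;
}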