diff options
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/i386/i386.c | 38 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 6 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 233 |
3 files changed, 211 insertions, 66 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 516c2744934..b2e81a47e2a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -163,12 +163,12 @@ struct processor_costs k6_cost = { struct processor_costs athlon_cost = { 1, /* cost of an add instruction */ - 1, /* cost of a lea instruction */ + 2, /* cost of a lea instruction */ 1, /* variable shift costs */ 1, /* constant shift costs */ 5, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 19, /* cost of a divide/mod */ + 42, /* cost of a divide/mod */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ 4, /* cost for loading QImode using movzbl */ @@ -177,9 +177,9 @@ struct processor_costs athlon_cost = { Relative to reg-reg move (2). */ {2, 3, 2}, /* cost of storing integer registers */ 4, /* cost of reg,reg fld/fst */ - {6, 6, 6}, /* cost of loading fp registers + {6, 6, 20}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 4} /* cost of loading integer registers */ + {4, 4, 16} /* cost of loading integer registers */ }; struct processor_costs *ix86_cost = &pentium_cost; @@ -222,6 +222,9 @@ const int x86_sub_esp_4 = m_ATHLON | m_PPRO; const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486; const int x86_add_esp_4 = m_ATHLON | m_K6; const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486; +const int x86_integer_DFmode_moves = ~m_ATHLON; +const int x86_partial_reg_dependency = m_ATHLON; +const int x86_memory_mismatch_stall = m_ATHLON; #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx)) @@ -6287,6 +6290,7 @@ ix86_adjust_cost (insn, link, dep_insn, cost) int cost; { enum attr_type insn_type, dep_insn_type; + enum attr_memory memory; rtx set, set2; int dep_insn_code_number; @@ -6334,7 +6338,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost) increase the cost here for non-imov insns. 
*/ if (dep_insn_type != TYPE_IMOV && dep_insn_type != TYPE_FMOV - && get_attr_memory (dep_insn) == MEMORY_LOAD) + && ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD + || memory == MEMORY_BOTH)) cost += 1; /* INT->FP conversion is expensive. */ @@ -6359,7 +6364,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost) /* Since we can't represent delayed latencies of load+operation, increase the cost here for non-imov insns. */ - if (get_attr_memory (dep_insn) == MEMORY_LOAD) + if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD + || memory == MEMORY_BOTH) cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1; /* INT->FP conversion is expensive. */ @@ -6368,19 +6374,15 @@ ix86_adjust_cost (insn, link, dep_insn, cost) break; case PROCESSOR_ATHLON: - /* Address Generation Interlock cause problems on the Athlon CPU because - the loads and stores are done in order so once one load or store has - to wait, others must too, so penalize the AGIs slightly by one cycle. - We might experiment with this value later. */ - if (ix86_agi_dependant (insn, dep_insn, insn_type)) - cost += 1; + if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD + || memory == MEMORY_BOTH) + { + if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV) + cost += 2; + else + cost += 3; + } - /* Since we can't represent delayed latencies of load+operation, increase the cost here for non-imov insns. 
*/ - if (dep_insn_type != TYPE_IMOV - && dep_insn_type != TYPE_FMOV - && get_attr_memory (dep_insn) == MEMORY_LOAD) - cost += 2; default: break; } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 50f08251420..8ae7be8d444 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -173,8 +173,9 @@ extern const int x86_use_cltd, x86_read_modify_write; extern const int x86_read_modify, x86_split_long_moves; extern const int x86_promote_QImode, x86_single_stringop; extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs; -extern const int x86_promote_hi_regs; +extern const int x86_promote_hi_regs, x86_integer_DFmode_moves; extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8; +extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall; #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK) #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK) @@ -206,6 +207,9 @@ extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8; #define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK) #define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK) #define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK) +#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK) +#define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK) +#define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK) #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 090d0ebd444..8b9b2530e78 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -738,7 +738,7 @@ ;; communicates with all the execution units seperately instead. 
(define_attr "athlon_decode" "direct,vector" - (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str") + (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov") (const_string "vector") (and (eq_attr "type" "push") (match_operand 1 "memory_operand" "")) @@ -766,7 +766,7 @@ (define_function_unit "athlon_ieu" 3 0 (and (eq_attr "cpu" "athlon") - (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld")) + (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,ibr,call,callv,icmov,cld,pop,setcc,push,pop")) 1 1) (define_function_unit "athlon_ieu" 3 0 @@ -777,12 +777,12 @@ (define_function_unit "athlon_ieu" 3 0 (and (eq_attr "cpu" "athlon") (eq_attr "type" "imul")) - 4 0) + 5 0) (define_function_unit "athlon_ieu" 3 0 (and (eq_attr "cpu" "athlon") (eq_attr "type" "idiv")) - 27 0) + 42 0) (define_function_unit "athlon_muldiv" 1 0 (and (eq_attr "cpu" "athlon") @@ -792,56 +792,118 @@ (define_function_unit "athlon_muldiv" 1 0 (and (eq_attr "cpu" "athlon") (eq_attr "type" "idiv")) - 27 27) + 42 42) -(define_attr "athlon_fpunits" "none,store,mul,add,muladd,all" +(define_attr "athlon_fpunits" "none,store,mul,add,muladd,any" (cond [(eq_attr "type" "fop,fop1,fcmp") (const_string "add") - (eq_attr "type" "fmul,fdiv,fpspc,fsgn") + (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov") (const_string "mul") - (and (eq_attr "type" "fmov") (eq_attr "memory" "!none")) + (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both")) (const_string "store") + (and (eq_attr "type" "fmov") (eq_attr "memory" "load")) + (const_string "any") (and (eq_attr "type" "fmov") (ior (match_operand:SI 1 "register_operand" "") (match_operand 1 "immediate_operand" ""))) (const_string "store") (eq_attr "type" "fmov") - (const_string "muladd") - (eq_attr "type" "fcmov") - (const_string "all")] + (const_string "muladd")] (const_string "none"))) -(define_function_unit "athlon_fp_mul" 1 0 +;; We use 
latencies 1 for definitions. This is OK to model colisions +;; in execution units. The real latencies are modeled in the "fp" pipeline. + +;; fsin, fcos: 96-192 +;; fsincos: 107-211 +;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode. +(define_function_unit "athlon_fp" 3 0 (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "mul,all")) - 4 1) + (eq_attr "type" "fpspc")) + 100 1) -(define_function_unit "athlon_fp_add" 1 0 +;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode. +(define_function_unit "athlon_fp" 3 0 (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "add,all")) + (eq_attr "type" "fdiv")) + 24 1) + +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fop,fop1,fmul")) 4 1) -(define_function_unit "athlon_fp_muladd" 2 0 +;; XFmode loads are slow. +;; XFmode store is slow too (8 cycles), but we don't need to model it, because +;; there are no dependent instructions. + +(define_function_unit "athlon_fp" 3 0 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "fmov") - (eq_attr "athlon_fpunits" "muladd,mul,add,all"))) + (match_operand:XF 1 "memory_operand" ""))) + 10 1) + +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fmov,fsgn")) 2 1) +;; fcmp and ftst instructions +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmp") + (eq_attr "athlon_decode" "direct"))) + 3 1) + +;; fcmpi instructions. 
+(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmp") + (eq_attr "athlon_decode" "vector"))) + 3 1) + +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fcmov")) + 7 1) + +(define_function_unit "athlon_fp_mul" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_fpunits" "mul")) + 1 1) + +(define_function_unit "athlon_fp_add" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_fpunits" "add")) + 1 1) + (define_function_unit "athlon_fp_muladd" 2 0 (and (eq_attr "cpu" "athlon") - (and (eq_attr "type" "!fmov") - (eq_attr "athlon_fpunits" "muladd,mul,add,all"))) - 4 1) + (eq_attr "athlon_fpunits" "muladd,mul,add")) + 1 1) (define_function_unit "athlon_fp_store" 1 0 (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "store,all")) + (eq_attr "athlon_fpunits" "store")) 1 1) -(define_function_unit "athlon_agu" 3 0 +;; We don't need to model the Adress Generation Unit, since we don't model +;; the re-order buffer yet and thus we never schedule more than three operations +;; at time. Later we may want to experiment with MD_SCHED macros modeling the +;; decoders independently on the functional units. + +;(define_function_unit "athlon_agu" 3 0 +; (and (eq_attr "cpu" "athlon") +; (and (eq_attr "memory" "!none") +; (eq_attr "athlon_fpunits" "none"))) +; 1 1) + +;; Model load unit to avoid too long sequences of loads. We don't need to +;; model store queue, since it is hardly going to be bottleneck. + +(define_function_unit "athlon_load" 2 0 (and (eq_attr "cpu" "athlon") - (and (eq_attr "memory" "!none") - (eq_attr "athlon_fpunits" "none"))) + (eq_attr "memory" "load,both")) 1 1) @@ -1255,6 +1317,7 @@ "" "sahf" [(set_attr "length" "1") + (set_attr "athlon_decode" "vector") (set_attr "ppro_uops" "one")]) ;; Pentium Pro can do steps 1 through 3 in one go. 
@@ -1390,6 +1453,7 @@ "xchg{l}\\t%1, %0" [(set_attr "type" "imov") (set_attr "pent_pair" "np") + (set_attr "athlon_decode" "vector") (set_attr "ppro_uops" "few")]) (define_expand "movhi" @@ -1437,8 +1501,10 @@ }" [(set (attr "type") (cond [(and (eq_attr "alternative" "0") - (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") - (const_int 0))) + (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_HIMODE_MATH") + (const_int 0)))) (const_string "imov") (and (eq_attr "alternative" "1,2") (match_operand:HI 1 "aligned_operand" "")) @@ -1456,8 +1522,10 @@ (match_operand:HI 1 "aligned_operand" "")) (const_string "0") (and (eq_attr "alternative" "0") - (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") - (const_int 0))) + (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_HIMODE_MATH") + (const_int 0)))) (const_string "0") ] (const_string "1"))) @@ -1547,9 +1615,19 @@ [(set_attr "type" "pop") (set_attr "length_prefix" "1")]) +;; Situation is quite tricky about when to choose full sized (SImode) move +;; over QImode moves. For Q_REG -> Q_REG move we use full size only for +;; partial register dependency machines (such as AMD Athlon), where QImode +;; moves issue extra dependency and for partial register stalls machines +;; that don't use QImode patterns (and QImode move cause stall on the next +;; instruction). +;; +;; For loads of Q_REG to NONQ_REG we use full sized moves except for partial +;; register stall machines with, where we use QImode instructions, since +;; partial register stall can be caused there. Then we use movzx. 
(define_insn "*movqi_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q,r,?r,m") - (match_operand:QI 1 "general_operand" "qn,qm,rn,qm,qn"))] + [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m") + (match_operand:QI 1 "general_operand" " q,qn,qm,q,rn,qm,qn"))] "GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM" "* { @@ -1560,26 +1638,50 @@ abort (); return \"movz{bl|x}\\t{%1, %k0|%k0, %1}\"; default: - if (which_alternative == 2) + if (which_alternative == 4 || which_alternative == 3 + || (which_alternative == 1 && get_attr_length (insn) == 5) + || (which_alternative == 0 + && ((TARGET_PARTIAL_REG_STALL && !TARGET_QIMODE_MATH) + || TARGET_PARTIAL_REG_DEPENDENCY))) return \"mov{l}\\t{%k1, %k0|%k0, %k1}\"; else return \"mov{b}\\t{%1, %0|%0, %1}\"; } }" [(set (attr "type") - (cond [(eq_attr "alternative" "3") + (cond [(and (eq_attr "alternative" "3") + (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_QIMODE_MATH") + (const_int 0)))) + (const_string "imov") + (eq_attr "alternative" "3,5") (const_string "imovx") (and (ne (symbol_ref "TARGET_MOVX") (const_int 0)) - (eq_attr "alternative" "1")) + (eq_attr "alternative" "2")) (const_string "imovx") ] (const_string "imov"))) ; There's no place to override just the immediate length (set (attr "length") (cond [(and (eq_attr "type" "imov") - (and (eq_attr "alternative" "2") - (match_operand:HI 1 "immediate_operand" ""))) + (and (match_operand:HI 1 "immediate_operand" "") + (eq_attr "alternative" "4"))) + (const_string "5") + ;; Avoid extra dependency on partial register. 
+ (and (eq_attr "type" "imov") + (and (eq_attr "alternative" "1") + (ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY") + (const_int 0)))) + (const_string "5") + ;; Avoid partial register stalls when not using QImode arithmetic + (and (eq_attr "type" "imov") + (and (eq_attr "alternative" "1") + (and (ne (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_QIMODE_MATH") + (const_int 0))))) (const_string "5") ] (const_string "*")))]) @@ -1904,10 +2006,38 @@ ;; On the average, pushdf using integers can be still shorter. Allow this ;; pattern for optimize_size too. -(define_insn "*pushdf" +(define_insn "*pushdf_nointeger" + [(set (match_operand:DF 0 "push_operand" "=<,<,<") + (match_operand:DF 1 "general_no_elim_operand" "f,Fo#f,*r#f"))] + "!TARGET_INTEGER_DFMODE_MOVES" + "* +{ + switch (which_alternative) + { + case 0: + /* %%% We loose REG_DEAD notes for controling pops if we split late. */ + operands[0] = gen_rtx_MEM (DFmode, stack_pointer_rtx); + operands[2] = stack_pointer_rtx; + operands[3] = GEN_INT (8); + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + return \"sub{l}\\t{%3, %2|%2, %3}\;fstp%z0\\t%y0\"; + else + return \"sub{l}\\t{%3, %2|%2, %3}\;fst%z0\\t%y0\"; + + case 1: + case 2: + return \"#\"; + + default: + abort (); + } +}" + [(set_attr "type" "multi")]) + +(define_insn "*pushdf_integer" [(set (match_operand:DF 0 "push_operand" "=<,<") (match_operand:DF 1 "general_no_elim_operand" "f#r,rFo#f"))] - "" + "TARGET_INTEGER_DFMODE_MOVES" "* { switch (which_alternative) @@ -1955,7 +2085,7 @@ [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,f,*r,o") (match_operand:DF 1 "general_operand" "fm,f,G,*roF,F*r"))] "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM) - && optimize_size + && (optimize_size || !TARGET_INTEGER_DFMODE_MOVES) && (reload_in_progress || reload_completed || GET_CODE (operands[1]) != CONST_DOUBLE || memory_operand (operands[0], DFmode))" @@ -2002,7 +2132,7 @@ [(set (match_operand:DF 0 
"nonimmediate_operand" "=f#r,m,f#r,r#f,o") (match_operand:DF 1 "general_operand" "fm#r,f#r,G,roF#f,Fr#f"))] "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM) - && !optimize_size + && !optimize_size && TARGET_INTEGER_DFMODE_MOVES && (reload_in_progress || reload_completed || GET_CODE (operands[1]) != CONST_DOUBLE || memory_operand (operands[0], DFmode))" @@ -2304,7 +2434,8 @@ else return \"fxch\\t%0\"; }" - [(set_attr "type" "fxch")]) + [(set_attr "type" "fxch") + (set_attr "athlon_decode" "vector")]) ;; Zero extension instructions @@ -3202,6 +3333,7 @@ "TARGET_80387" "fldcw\\t%0" [(set_attr "length_opcode" "2") + (set_attr "athlon_decode" "vector") (set_attr "ppro_uops" "few")]) ;; Conversion between fixed point and floating point. @@ -7691,6 +7823,7 @@ "" "leave" [(set_attr "length" "1") + (set_attr "athlon_decode" "vector") (set_attr "ppro_uops" "few")]) (define_expand "ffssi2" @@ -8123,7 +8256,8 @@ (sqrt:SF (match_operand:SF 1 "register_operand" "0")))] "! TARGET_NO_FANCY_MATH_387 && TARGET_80387" "fsqrt" - [(set_attr "type" "fpspc")]) + [(set_attr "type" "fpspc") + (set_attr "athlon_decode" "direct")]) (define_insn "sqrtdf2" [(set (match_operand:DF 0 "register_operand" "=f") @@ -8131,7 +8265,8 @@ "! TARGET_NO_FANCY_MATH_387 && TARGET_80387 && (TARGET_IEEE_FP || flag_fast_math) " "fsqrt" - [(set_attr "type" "fpspc")]) + [(set_attr "type" "fpspc") + (set_attr "athlon_decode" "direct")]) (define_insn "*sqrtextendsfdf2" [(set (match_operand:DF 0 "register_operand" "=f") @@ -8139,7 +8274,8 @@ (match_operand:SF 1 "register_operand" "0"))))] "! TARGET_NO_FANCY_MATH_387 && TARGET_80387" "fsqrt" - [(set_attr "type" "fpspc")]) + [(set_attr "type" "fpspc") + (set_attr "athlon_decode" "direct")]) (define_insn "sqrtxf2" [(set (match_operand:XF 0 "register_operand" "=f") @@ -8147,7 +8283,8 @@ "! 
TARGET_NO_FANCY_MATH_387 && TARGET_80387 && (TARGET_IEEE_FP || flag_fast_math) " "fsqrt" - [(set_attr "type" "fpspc")]) + [(set_attr "type" "fpspc") + (set_attr "athlon_decode" "direct")]) (define_insn "*sqrtextenddfxf2" [(set (match_operand:XF 0 "register_operand" "=f") @@ -8155,7 +8292,8 @@ (match_operand:DF 1 "register_operand" "0"))))] "! TARGET_NO_FANCY_MATH_387 && TARGET_80387" "fsqrt" - [(set_attr "type" "fpspc")]) + [(set_attr "type" "fpspc") + (set_attr "athlon_decode" "direct")]) (define_insn "*sqrtextendsfxf2" [(set (match_operand:XF 0 "register_operand" "=f") @@ -8163,7 +8301,8 @@ (match_operand:SF 1 "register_operand" "0"))))] "! TARGET_NO_FANCY_MATH_387 && TARGET_80387" "fsqrt" - [(set_attr "type" "fpspc")]) + [(set_attr "type" "fpspc") + (set_attr "athlon_decode" "direct")]) (define_insn "sindf2" [(set (match_operand:DF 0 "register_operand" "=f") |