diff options
-rw-r--r-- | gcc/ChangeLog | 30 | ||||
-rw-r--r-- | gcc/config/rs6000/cell.md | 401 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.c | 131 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.h | 4 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.md | 20 |
5 files changed, 578 insertions, 8 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index dbb72830af2..206bd435858 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,33 @@ +2006-11-13 Andrew Pinski <andrew_pinski@playstation.sony.com> + + * config/rs6000/cell.md: New file. + * config/rs6000/rs6000.c (rs6000_cell_dont_microcode): New + variable. + (ppccell_cost): New cost matrix. + (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define. + (rs6000_override_options): Set rs6000_always_hint to false + for cell. Also align functions/lables/loops to 8byte + for the Cell. Use PROCESSOR_CELL. + (rs6000_emit_epilogue): Rename using_mfcr_multiple to + using_mtcr_multiple. + (rs6000_variable_issue): If the insn is a nonpipelined instruction + on the Cell, return 0. + (rs6000_adjust_cost): Add Cell cost adjustments. + (is_microcoded_insn): Return true for Cell microcoded + instructions. + (is_nonpipeline_insn): New function. + (rs6000_issue_rate): Add PROCESSOR_CELL. + (rs6000_use_sched_lookahead): If Cell, then we should look ahead 8 + instructions. + (rs6000_use_sched_lookahead_guard): New function. + (rs6000_sched_reorder): Reorder the ready list, if the second + to last ready insn is a nonepipeline insn on the Cell. + * config/rs6000/rs6000.h (processor_type): Add PROCESSOR_CELL. + (ASM_CPU_SPEC): Add Cell. + * config/rs6000/rs6000.md (cpu): Add Cell. + (cell_micro): New Attr. + Include cell.md + 2006-11-13 Jakub Jelinek <jakub@redhat.com> * configure.ac (ld_vers): Parse GNU ld version 2.17.50.0.3-6 20060715 diff --git a/gcc/config/rs6000/cell.md b/gcc/config/rs6000/cell.md new file mode 100644 index 00000000000..f12d2a66cc8 --- /dev/null +++ b/gcc/config/rs6000/cell.md @@ -0,0 +1,401 @@ +;; Scheduling description for cell processor. +;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006 +;; Free Software Foundation, Inc. +;; Contributed by Sony Computer Entertainment, Inc., + + +;; This file is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 2 of the License, or (at your option) +;; any later version. + +;; This file is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. + +;; You should have received a copy of the GNU General Public License +;; along with this file; see the file COPYING. If not, write to the Free +;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +;; 02110-1301, USA. + +;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf) + +;; BE Architechture *DD3.0 and DD3.1* +;; This file simulate PPU processor unit backend of pipeline, maualP24. +;; manual P27, stall and flush points +;; IU, XU, VSU, dipatcher decodes and dispatch 2 insns per cycle in program +;; order, the grouped adress are aligned by 8 +;; This file only simulate one thread situation +;; XU executes all fixed point insns(3 units, a simple alu, a complex unit, +;; and load/store unit) +;; VSU executes all scalar floating points insn(a float unit), +;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point) + +;; Dual issue combination + +;; FXU LSU BR VMX VMX +;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls) +;;FXU X +;;LSU X X X +;;BR X +;;VMX(sx,cx,vsu_fp,fp_arth) X +;;VMX(perm,vsu_ls, fp_ls) X +;; X are illegal combination. + +;; Dual issue exceptons: +;;(1) nop-pipelined FXU instr in slot 0 +;;(2) non-pipelined FPU inst in slot 0 +;; CSI instr(contex-synchronizing insn) +;; Microcode insn + +;; BRU unit: bru(none register stall), bru_cr(cr register stall) +;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex), +;; vuf(vmx float), fpu(floats). fpu_div is hypthetical, it is for +;; nonpipelined simulation +;; micr insns will stall at least 7 cycles to get the first instr from ROM, +;; micro instructions are not dual issued. + +;; slot0 is older than slot1 +;; non-pipelined insn need to be in slot1 to avoid 1cycle stall + +;; There different stall point +;; IB2, only stall one thread if stall here, so try to stall here as much as +;; we can +;; condition(1) insert nop, OR and ORI instruction form +;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or +;; CR0-access while stdcx, or stwcx +;; IS2 stall ;; Page91 for details +;; VQ8 stall +;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to +;; the vsu issue queue + +;;(define_automaton "cellxu") + +;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu") + +;; ndfa +(define_automaton "cellxu,cellvsu,cellbru,cell_mis") + +(define_cpu_unit "fxu_cell,lsu_cell" "cellxu") +(define_cpu_unit "bru_cell" "cellbru") +(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu") + +(define_cpu_unit "slot0,slot1" "cell_mis") + +(absence_set "slot0" "slot1") + +(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell") +(define_reservation "slot01" "slot0|slot1") + + +;; Load/store +;; lmw, lswi, lswx are only generated for optimize for space, MC, +;; these instr are not simulated +(define_insn_reservation "cell-load" 2 + (and (eq_attr "type" "load") + (eq_attr "cpu" "cell")) + "slot01,lsu_cell") + +;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs, +;; if with 32bytes alignment, CMC +(define_insn_reservation "cell-load-ux" 2 + (and (eq_attr "type" "load_ux,load_u") + (eq_attr "cpu" "cell")) + "slot01,fxu_cell+lsu_cell") + +;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown +;; 11/7, 11/8, 11/12 +(define_insn_reservation "cell-load-ext" 2 + (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux") + (eq_attr "cpu" "cell")) + "slot01,fxu_cell+lsu_cell") + +;;lfs,lfsx,lfd,lfdx, 1 cycle +(define_insn_reservation "cell-fpload" 1 + (and (eq_attr "type" "fpload") + (eq_attr "cpu" "cell")) + "vsu2_cell+lsu_cell+slot01") + +;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr) +(define_insn_reservation "cell-fpload-update" 1 + (and (eq_attr "type" "fpload,fpload_u,fpload_ux") + (eq_attr "cpu" "cell")) + "fxu_cell+vsu2_cell+lsu_cell+slot01") + +(define_insn_reservation "cell-vecload" 2 + (and (eq_attr "type" "vecload") + (eq_attr "cpu" "cell")) + "slot01,vsu2_cell+lsu_cell") + +;;st? stw(MC) +(define_insn_reservation "cell-store" 1 + (and (eq_attr "type" "store") + (eq_attr "cpu" "cell")) + "lsu_cell+slot01") + +;;stdux, stdu, (hardware breaks into store and add) 2 for update reg +(define_insn_reservation "cell-store-update" 1 + (and (eq_attr "type" "store_ux,store_u") + (eq_attr "cpu" "cell")) + "fxu_cell+lsu_cell+slot01") + +(define_insn_reservation "cell-fpstore" 1 + (and (eq_attr "type" "fpstore") + (eq_attr "cpu" "cell")) + "vsu2_cell+lsu_cell+slot01") + +(define_insn_reservation "cell-fpstore-update" 1 + (and (eq_attr "type" "fpstore_ux,fpstore_u") + (eq_attr "cpu" "cell")) + "vsu2_cell+fxu_cell+lsu_cell+slot01") + +(define_insn_reservation "cell-vecstore" 1 + (and (eq_attr "type" "vecstore") + (eq_attr "cpu" "cell")) + "vsu2_cell+lsu_cell+slot01") + +;; Integer latency is 2 cycles +(define_insn_reservation "cell-integer" 2 + (and (eq_attr "type" "integer,insert_dword,shift,trap,\ + var_shift_rotate,cntlz,exts") + (eq_attr "cpu" "cell")) + "slot01,fxu_cell") + +;; Two integer latency is 4 cycles +(define_insn_reservation "cell-two" 4 + (and (eq_attr "type" "two") + (eq_attr "cpu" "cell")) + "slot01,fxu_cell,fxu_cell*2") + +;; Three integer latency is 6 cycles +(define_insn_reservation "cell-three" 6 + (and (eq_attr "type" "three") + (eq_attr "cpu" "cell")) + "slot01,fxu_cell,fxu_cell*4") + +;; rlwimi, alter cr0 +(define_insn_reservation "cell-insert" 2 + (and (eq_attr "type" "insert_word") + (eq_attr "cpu" "cell")) + "slot01,fxu_cell") + +;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0 +(define_insn_reservation "cell-cmp" 1 + (and (eq_attr "type" "cmp") + (eq_attr "cpu" "cell")) + "fxu_cell+slot01") + +;; add, addo, sub, subo, alter cr0, rldcli, rlwinm +(define_insn_reservation "cell-fast-cmp" 2 + (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\ + var_delayed_compare") + (eq_attr "cpu" "cell")) + (eq_attr "cell_micro" "not")) + "slot01,fxu_cell") + +(define_insn_reservation "cell-cmp-microcoded" 9 + (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\ + var_delayed_compare") + (eq_attr "cpu" "cell")) + (eq_attr "cell_micro" "always")) + "slot0+slot1,fxu_cell,fxu_cell*7") + +;; mulld +(define_insn_reservation "cell-lmul" 15 + (and (eq_attr "type" "lmul") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*13") + +;; mulld. is microcoded +(define_insn_reservation "cell-lmul-cmp" 22 + (and (eq_attr "type" "lmul_compare") + (eq_attr "cpu" "cell")) + "slot0+slot1,nonpipeline,nonpipeline*20") + +;; mulli, 6 cycles +(define_insn_reservation "cell-imul23" 6 + (and (eq_attr "type" "imul2,imul3") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*4") + +;; mullw, 9 +(define_insn_reservation "cell-imul" 9 + (and (eq_attr "type" "imul") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*7") + +;; divide +(define_insn_reservation "cell-idiv" 32 + (and (eq_attr "type" "idiv") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*30") + +(define_insn_reservation "cell-ldiv" 64 + (and (eq_attr "type" "ldiv") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*62") + +;;mflr and mfctr are pipelined +(define_insn_reservation "cell-mfjmpr" 1 + (and (eq_attr "type" "mfjmpr") + (eq_attr "cpu" "cell")) + "slot01+bru_cell") + +;;mtlr and mtctr, +;;mtspr fully pipelined +(define_insn_reservation "cell-mtjmpr" 1 + (and (eq_attr "type" "mtjmpr") + (eq_attr "cpu" "cell")) + "bru_cell+slot01") + +;; Branches +;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency +;; bcctr, bcctrl, latency 2, actually adjust by be to 4 +(define_insn_reservation "cell-branch" 1 + (and (eq_attr "type" "branch") + (eq_attr "cpu" "cell")) + "bru_cell+slot1") + +(define_insn_reservation "cell-branchreg" 1 + (and (eq_attr "type" "jmpreg") + (eq_attr "cpu" "cell")) + "bru_cell+slot1") + +;; cr hazard +;; page 90, special cases for CR hazard, only one instr can access cr per cycle +;; if insn reads CR following a stwcx, pipeline stall till stwcx finish +(define_insn_reservation "cell-crlogical" 1 + (and (eq_attr "type" "cr_logical,delayed_cr") + (eq_attr "cpu" "cell")) + "bru_cell+slot01") + +;; mfcrf and mfcr is about 34 cycles and nonpipelined +(define_insn_reservation "cell-mfcr" 34 + (and (eq_attr "type" "mfcrf,mfcr") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*32") + +;; mtcrf (1 field) +(define_insn_reservation "cell-mtcrf" 1 + (and (eq_attr "type" "mtcr") + (eq_attr "cpu" "cell")) + "fxu_cell+slot01") + +; Basic FP latency is 10 cycles, thoughput is 1/cycle +(define_insn_reservation "cell-fp" 10 + (and (eq_attr "type" "fp,dmul") + (eq_attr "cpu" "cell")) + "slot01,vsu1_cell,vsu1_cell*8") + +(define_insn_reservation "cell-fpcompare" 1 + (and (eq_attr "type" "fpcompare") + (eq_attr "cpu" "cell")) + "vsu1_cell+slot01") + +;; sdiv thoughput 1/74, not pipelined but only in the FPU +(define_insn_reservation "cell-sdiv" 74 + (and (eq_attr "type" "sdiv,ddiv") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*72") + +;; fsqrt thoughput 1/84, not pipelined but only in the FPU +(define_insn_reservation "cell-sqrt" 84 + (and (eq_attr "type" "ssqrt,dsqrt") + (eq_attr "cpu" "cell")) + "slot1,nonpipeline,nonpipeline*82") + +; VMX +(define_insn_reservation "cell-vecsimple" 4 + (and (eq_attr "type" "vecsimple") + (eq_attr "cpu" "cell")) + "slot01,vsu1_cell,vsu1_cell*2") + +;; mult, div, madd +(define_insn_reservation "cell-veccomplex" 10 + (and (eq_attr "type" "veccomplex") + (eq_attr "cpu" "cell")) + "slot01,vsu1_cell,vsu1_cell*8") + +;; TODO: add support for recording instructions +(define_insn_reservation "cell-veccmp" 4 + (and (eq_attr "type" "veccmp") + (eq_attr "cpu" "cell")) + "slot01,vsu1_cell,vsu1_cell*2") + +(define_insn_reservation "cell-vecfloat" 12 + (and (eq_attr "type" "vecfloat") + (eq_attr "cpu" "cell")) + "slot01,vsu1_cell,vsu1_cell*10") + +(define_insn_reservation "cell-vecperm" 4 + (and (eq_attr "type" "vecperm") + (eq_attr "cpu" "cell")) + "slot01,vsu2_cell,vsu2_cell*2") + +;; New for 4.2, syncs + +(define_insn_reservation "cell-sync" 11 + (and (eq_attr "type" "sync") + (eq_attr "cpu" "cell")) + "slot01,lsu_cell,lsu_cell*9") + +(define_insn_reservation "cell-isync" 11 + (and (eq_attr "type" "isync") + (eq_attr "cpu" "cell")) + "slot01,lsu_cell,lsu_cell*9") + +(define_insn_reservation "cell-load_l" 11 + (and (eq_attr "type" "load_l") + (eq_attr "cpu" "cell")) + "slot01,lsu_cell,lsu_cell*9") + +(define_insn_reservation "cell-store_c" 11 + (and (eq_attr "type" "store_c") + (eq_attr "cpu" "cell")) + "slot01,lsu_cell,lsu_cell*9") + +;; RAW register dependency + +;; addi r3, r3, 1 +;; lw r4,offset(r3) +;; there are 5 cycle deplay for r3 bypassing +;; there are 5 cycle delay for a dependent load after a load +(define_bypass 5 "cell-integer" "cell-load") +(define_bypass 5 "cell-integer" "cell-load-ext") +(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext") + +;; there is a 6 cycle delay after a fp compare until you can use the cr. +(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical") + +;; VXU float RAW +(define_bypass 11 "cell-vecfloat" "cell-vecfloat") + +;; VXU and FPU +(define_bypass 6 "cell-veccomplex" "cell-vecsimple") +;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg") +(define_bypass 3 "cell-vecfloat" "cell-veccomplex") +; this is not correct, +;; this is a stall in general and not dependent on result +(define_bypass 13 "cell-vecstore" "cell-fpstore") +; this is not correct, this can never be true, not depent on result +(define_bypass 7 "cell-fp" "cell-fpload") +;; vsu1 should avoid writing to the same target register as vsu2 insn +;; within 12 cycles. + +;; WAW hazard + +;; the target of VSU estimate should not be reused within 10 dispatch groups +;; the target of VSU float should not be reused within 8 dispatch groups +;; the target of VSU complex should not be reused within 5 dispatch groups +;; FP LOAD should not reuse an FPU Arithmetic target with 6 dispatch gropus + +;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at +;; ex4 stage(10 cycles) +(define_bypass 10 "cell-mtjmpr" "cell-branchreg") + +;;Things are not simulated: +;; update instruction, update address gpr are not simulated +;; vrefp, vrsqrtefp have latency(14), currently simluated as 12 cycle float +;; insns + diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index eec8a49093a..dc80f9ff3da 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -139,6 +139,8 @@ struct rs6000_cpu_select rs6000_select[3] = { (const char *)0, "-mtune=", 1, 0 }, }; +static GTY(()) bool rs6000_cell_dont_microcode; + /* Always emit branch hint bits. */ static GTY(()) bool rs6000_always_hint; @@ -519,6 +521,22 @@ struct processor_costs ppc630_cost = { COSTS_N_INSNS (21), /* ddiv */ }; +/* Instruction costs on Cell processor. */ +/* COSTS_N_INSNS (1) ~ one add. */ +static const +struct processor_costs ppccell_cost = { + COSTS_N_INSNS (9/2)+2, /* mulsi */ + COSTS_N_INSNS (6/2), /* mulsi_const */ + COSTS_N_INSNS (6/2), /* mulsi_const9 */ + COSTS_N_INSNS (15/2)+2, /* muldi */ + COSTS_N_INSNS (38/2), /* divsi */ + COSTS_N_INSNS (70/2), /* divdi */ + COSTS_N_INSNS (10/2), /* fp */ + COSTS_N_INSNS (10/2), /* dmul */ + COSTS_N_INSNS (74/2), /* sdiv */ + COSTS_N_INSNS (74/2), /* ddiv */ +}; + /* Instruction costs on PPC750 and PPC7400 processors. */ static const struct processor_costs ppc750_cost = { @@ -671,6 +689,7 @@ static bool rs6000_rtx_costs (rtx, int, int, int *); static int rs6000_adjust_cost (rtx, rtx, rtx, int); static void rs6000_sched_init (FILE *, int, int); static bool is_microcoded_insn (rtx); +static bool is_nonpipeline_insn (rtx); static bool is_cracked_insn (rtx); static bool is_branch_slot_insn (rtx); static bool is_load_insn (rtx); @@ -692,6 +711,7 @@ static void rs6000_sched_finish (FILE *, int); static int rs6000_sched_reorder (FILE *, int, rtx *, int *, int); static int rs6000_sched_reorder2 (FILE *, int, rtx *, int *, int); static int rs6000_use_sched_lookahead (void); +static int rs6000_use_sched_lookahead_guard (rtx); static tree rs6000_builtin_mask_for_load (void); static tree rs6000_builtin_mul_widen_even (tree); static tree rs6000_builtin_mul_widen_odd (tree); @@ -952,6 +972,9 @@ static const char alt_reg_names[][8] = #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD rs6000_use_sched_lookahead +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD rs6000_use_sched_lookahead_guard + #undef TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD #define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD rs6000_builtin_mask_for_load #undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN @@ -1217,6 +1240,8 @@ rs6000_override_options (const char *default_cpu) {"860", PROCESSOR_MPCCORE, POWERPC_BASE_MASK | MASK_SOFT_FLOAT}, {"970", PROCESSOR_POWER4, POWERPC_7400_MASK | MASK_PPC_GPOPT | MASK_MFCRF | MASK_POWERPC64}, + {"cell", PROCESSOR_CELL, + POWERPC_7400_MASK | MASK_PPC_GPOPT | MASK_MFCRF | MASK_POWERPC64}, {"common", PROCESSOR_COMMON, MASK_NEW_MNEMONICS}, {"ec603e", PROCESSOR_PPC603, POWERPC_BASE_MASK | MASK_SOFT_FLOAT}, {"G3", PROCESSOR_PPC750, POWERPC_BASE_MASK | MASK_PPC_GFXOPT}, @@ -1445,7 +1470,8 @@ rs6000_override_options (const char *default_cpu) rs6000_always_hint = (rs6000_cpu != PROCESSOR_POWER4 && rs6000_cpu != PROCESSOR_POWER5 - && rs6000_cpu != PROCESSOR_POWER6); + && rs6000_cpu != PROCESSOR_POWER6 + && rs6000_cpu != PROCESSOR_CELL); rs6000_sched_groups = (rs6000_cpu == PROCESSOR_POWER4 || rs6000_cpu == PROCESSOR_POWER5); rs6000_align_branch_targets = (rs6000_cpu == PROCESSOR_POWER4 @@ -1519,6 +1545,16 @@ rs6000_override_options (const char *default_cpu) /* Set branch target alignment, if not optimizing for size. */ if (!optimize_size) { + /* Cell wants to be aligned 8byte for dual issue. */ + if (rs6000_cpu == PROCESSOR_CELL) + { + if (align_functions <= 0) + align_functions = 8; + if (align_jumps <= 0) + align_jumps = 8; + if (align_loops <= 0) + align_loops = 8; + } if (rs6000_align_branch_targets) { if (align_functions <= 0) @@ -1600,6 +1636,10 @@ rs6000_override_options (const char *default_cpu) rs6000_cost = &ppc630_cost; break; + case PROCESSOR_CELL: + rs6000_cost = &ppccell_cost; + break; + case PROCESSOR_PPC750: case PROCESSOR_PPC7400: rs6000_cost = &ppc750_cost; @@ -14940,7 +14980,7 @@ rs6000_emit_epilogue (int sibcall) rs6000_stack_t *info; int restoring_FPRs_inline; int using_load_multiple; - int using_mfcr_multiple; + int using_mtcr_multiple; int use_backchain_to_restore_sp; int sp_offset = 0; rtx sp_reg_rtx = gen_rtx_REG (Pmode, 1); @@ -14969,7 +15009,7 @@ rs6000_emit_epilogue (int sibcall) use_backchain_to_restore_sp = (frame_pointer_needed || current_function_calls_alloca || info->total_size > 32767); - using_mfcr_multiple = (rs6000_cpu == PROCESSOR_PPC601 + using_mtcr_multiple = (rs6000_cpu == PROCESSOR_PPC601 || rs6000_cpu == PROCESSOR_PPC603 || rs6000_cpu == PROCESSOR_PPC750 || optimize_size); @@ -15269,7 +15309,7 @@ rs6000_emit_epilogue (int sibcall) rtx r12_rtx = gen_rtx_REG (SImode, 12); int count = 0; - if (using_mfcr_multiple) + if (using_mtcr_multiple) { for (i = 0; i < 8; i++) if (regs_ever_live[CR0_REGNO+i] && ! call_used_regs[CR0_REGNO+i]) @@ -15277,7 +15317,7 @@ rs6000_emit_epilogue (int sibcall) gcc_assert (count); } - if (using_mfcr_multiple && count > 1) + if (using_mtcr_multiple && count > 1) { rtvec p; int ndx; @@ -16595,6 +16635,10 @@ rs6000_variable_issue (FILE *stream ATTRIBUTE_UNUSED, return cached_can_issue_more; } + /* If no reservation, but reach here */ + if (recog_memoized (insn) < 0) + return more; + if (rs6000_sched_groups) { if (is_microcoded_insn (insn)) @@ -16607,6 +16651,9 @@ rs6000_variable_issue (FILE *stream ATTRIBUTE_UNUSED, return cached_can_issue_more; } + if (rs6000_cpu_attr == CPU_CELL && is_nonpipeline_insn (insn)) + return 0; + cached_can_issue_more = more - 1; return cached_can_issue_more; } @@ -16662,7 +16709,8 @@ rs6000_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) || rs6000_cpu_attr == CPU_PPC7400 || rs6000_cpu_attr == CPU_PPC7450 || rs6000_cpu_attr == CPU_POWER4 - || rs6000_cpu_attr == CPU_POWER5) + || rs6000_cpu_attr == CPU_POWER5 + || rs6000_cpu_attr == CPU_CELL) && recog_memoized (dep_insn) && (INSN_CODE (dep_insn) >= 0)) @@ -16912,6 +16960,9 @@ is_microcoded_insn (rtx insn) || GET_CODE (PATTERN (insn)) == CLOBBER) return false; + if (rs6000_cpu_attr == CPU_CELL) + return get_attr_cell_micro (insn) == CELL_MICRO_ALWAYS; + if (rs6000_sched_groups) { enum attr_type type = get_attr_type (insn); @@ -17115,6 +17166,37 @@ rs6000_adjust_priority (rtx insn ATTRIBUTE_UNUSED, int priority) return priority; } +/* Return true if the instruction is nonpipelined on the Cell. */ +static bool +is_nonpipeline_insn (rtx insn) +{ + enum attr_type type; + if (!insn || !INSN_P (insn) + || GET_CODE (PATTERN (insn)) == USE + || GET_CODE (PATTERN (insn)) == CLOBBER) + return false; + + type = get_attr_type (insn); + if (type == TYPE_IMUL + || type == TYPE_IMUL2 + || type == TYPE_IMUL3 + || type == TYPE_LMUL + || type == TYPE_IDIV + || type == TYPE_LDIV + || type == TYPE_SDIV + || type == TYPE_DDIV + || type == TYPE_SSQRT + || type == TYPE_DSQRT + || type == TYPE_MFCR + || type == TYPE_MFCRF + || type == TYPE_MFJMPR) + { + return true; + } + return false; +} + + /* Return how many instructions the machine can issue per cycle. */ static int @@ -17135,6 +17217,7 @@ rs6000_issue_rate (void) case CPU_PPC750: case CPU_PPC7400: case CPU_PPC8540: + case CPU_CELL: return 2; case CPU_RIOS2: case CPU_PPC604: @@ -17159,9 +17242,29 @@ rs6000_use_sched_lookahead (void) { if (rs6000_cpu_attr == CPU_PPC8540) return 4; + if (rs6000_cpu_attr == CPU_CELL) + return (reload_completed ? 8 : 0); return 0; } +/* We are choosing insn from the ready queue. Return nonzero if INSN can be chosen. */ +static int +rs6000_use_sched_lookahead_guard (rtx insn) +{ + if (rs6000_cpu_attr != CPU_CELL) + return 1; + + if (insn == NULL_RTX || !INSN_P (insn)) + abort (); + + if (!reload_completed + || is_nonpipeline_insn (insn) + || is_microcoded_insn (insn)) + return 0; + + return 1; +} + /* Determine is PAT refers to memory. */ static bool @@ -17337,9 +17440,25 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose, int *pn_ready ATTRIBUTE_UNUSED, int clock_var ATTRIBUTE_UNUSED) { + int n_ready = *pn_ready; + if (sched_verbose) fprintf (dump, "// rs6000_sched_reorder :\n"); + /* Reorder the ready list, if the second to last ready insn + is a nonepipeline insn. */ + if (rs6000_cpu_attr == CPU_CELL && n_ready > 1) + { + if (is_nonpipeline_insn (ready[n_ready - 1]) + && (recog_memoized (ready[n_ready - 2]) > 0)) + /* Simply swap first two insns. */ + { + rtx tmp = ready[n_ready - 1]; + ready[n_ready - 1] = ready[n_ready - 2]; + ready[n_ready - 2] = tmp; + } + } + if (rs6000_cpu == PROCESSOR_POWER6) load_store_pendulum = 0; diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index c9856d12aca..6ccc3c01ad4 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -68,6 +68,7 @@ %{mno-power: %{!mpowerpc*: -mcom}} \ %{!mno-power: %{!mpower*: %(asm_default)}}} \ %{mcpu=common: -mcom} \ +%{mcpu=cell: -mcell} \ %{mcpu=power: -mpwr} \ %{mcpu=power2: -mpwrx} \ %{mcpu=power3: -mppc64} \ @@ -222,7 +223,8 @@ enum processor_type PROCESSOR_PPC8540, PROCESSOR_POWER4, PROCESSOR_POWER5, - PROCESSOR_POWER6 + PROCESSOR_POWER6, + PROCESSOR_CELL }; extern enum processor_type rs6000_cpu; diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 8ddf26d924d..f40e78dfafa 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -106,9 +106,26 @@ ;; Processor type -- this attribute must exactly match the processor_type ;; enumeration in rs6000.h. -(define_attr "cpu" "rios1,rios2,rs64a,mpccore,ppc403,ppc405,ppc440,ppc601,ppc603,ppc604,ppc604e,ppc620,ppc630,ppc750,ppc7400,ppc7450,ppc8540,power4,power5,power6" +(define_attr "cpu" "rios1,rios2,rs64a,mpccore,ppc403,ppc405,ppc440,ppc601,ppc603,ppc604,ppc604e,ppc620,ppc630,ppc750,ppc7400,ppc7450,ppc8540,power4,power5,power6,cell" (const (symbol_ref "rs6000_cpu_attr"))) + +;; If this instruction is microcoded on the CELL processor +; The default for load and stores is conditional +; The default for load extended and the recorded instructions is always microcoded +(define_attr "cell_micro" "not,conditional,always" + (if_then_else (ior (ior (eq_attr "type" "load") + (eq_attr "type" "store")) + (ior (eq_attr "type" "fpload") + (eq_attr "type" "fpstore"))) + (const_string "conditional") + (if_then_else (ior (eq_attr "type" "load_ext") + (ior (eq_attr "type" "compare") + (eq_attr "type" "delayed_compare"))) + (const_string "always") + (const_string "not")))) + + (automata_option "ndfa") (include "rios1.md") @@ -125,6 +142,7 @@ (include "power4.md") (include "power5.md") (include "power6.md") +(include "cell.md") (include "predicates.md") (include "constraints.md") |