summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/ChangeLog30
-rw-r--r--gcc/config/rs6000/cell.md401
-rw-r--r--gcc/config/rs6000/rs6000.c131
-rw-r--r--gcc/config/rs6000/rs6000.h4
-rw-r--r--gcc/config/rs6000/rs6000.md20
5 files changed, 578 insertions, 8 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index dbb72830af2..206bd435858 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,33 @@
+2006-11-13 Andrew Pinski <andrew_pinski@playstation.sony.com>
+
+ * config/rs6000/cell.md: New file.
+ * config/rs6000/rs6000.c (rs6000_cell_dont_microcode): New
+ variable.
+ (ppccell_cost): New cost matrix.
+ (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define.
+ (rs6000_override_options): Set rs6000_always_hint to false
+ for cell. Also align functions/lables/loops to 8byte
+ for the Cell. Use PROCESSOR_CELL.
+ (rs6000_emit_epilogue): Rename using_mfcr_multiple to
+ using_mtcr_multiple.
+ (rs6000_variable_issue): If the insn is a nonpipelined instruction
+ on the Cell, return 0.
+ (rs6000_adjust_cost): Add Cell cost adjustments.
+ (is_microcoded_insn): Return true for Cell microcoded
+ instructions.
+ (is_nonpipeline_insn): New function.
+ (rs6000_issue_rate): Add PROCESSOR_CELL.
+ (rs6000_use_sched_lookahead): If Cell, then we should look ahead 8
+ instructions.
+ (rs6000_use_sched_lookahead_guard): New function.
+ (rs6000_sched_reorder): Reorder the ready list, if the second
+ to last ready insn is a non-pipelined insn on the Cell.
+ * config/rs6000/rs6000.h (processor_type): Add PROCESSOR_CELL.
+ (ASM_CPU_SPEC): Add Cell.
+ * config/rs6000/rs6000.md (cpu): Add Cell.
+ (cell_micro): New Attr.
+ Include cell.md
+
2006-11-13 Jakub Jelinek <jakub@redhat.com>
* configure.ac (ld_vers): Parse GNU ld version 2.17.50.0.3-6 20060715
diff --git a/gcc/config/rs6000/cell.md b/gcc/config/rs6000/cell.md
new file mode 100644
index 00000000000..f12d2a66cc8
--- /dev/null
+++ b/gcc/config/rs6000/cell.md
@@ -0,0 +1,401 @@
+;; Scheduling description for cell processor.
+;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
+;; Free Software Foundation, Inc.
+;; Contributed by Sony Computer Entertainment, Inc.,
+
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 2 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this file; see the file COPYING. If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.
+
+;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
+
+;; BE Architecture *DD3.0 and DD3.1*
+;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
+;; manual P27, stall and flush points
+;; IU, XU, VSU, the dispatcher decodes and dispatches 2 insns per cycle in
+;; program order, the grouped addresses are aligned by 8
+;; This file only simulates the single-thread situation
+;; XU executes all fixed point insns (3 units: a simple alu, a complex unit,
+;; and a load/store unit)
+;; VSU executes all scalar floating point insns (a float unit), and
+;; VMX insns (VMX unit, 4 sub units: simple, permute, complex, floating point)
+
+;; Dual issue combination
+
+;; FXU LSU BR VMX VMX
+;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
+;;FXU X
+;;LSU X X X
+;;BR X
+;;VMX(sx,cx,vsu_fp,fp_arth) X
+;;VMX(perm,vsu_ls, fp_ls) X
+;; X are illegal combination.
+
+;; Dual issue exceptions:
+;;(1) non-pipelined FXU instr in slot 0
+;;(2) non-pipelined FPU inst in slot 0
+;; CSI instr (context-synchronizing insn)
+;; Microcode insn
+
+;; BRU unit: bru (no register stall), bru_cr (cr register stall)
+;; VSU unit: vus (vmx simple), vup (vmx permute), vuc (vmx complex),
+;; vuf (vmx float), fpu (floats). fpu_div is hypothetical, it is for
+;; the non-pipelined simulation
+;; microcoded insns will stall at least 7 cycles to get the first instr from
+;; ROM; microcoded instructions are not dual issued.
+
+;; slot0 is older than slot1
+;; non-pipelined insns need to be in slot1 to avoid a 1 cycle stall
+
+;; There are different stall points
+;; IB2, only stalls one thread if we stall here, so try to stall here as much
+;; as we can
+;; condition(1) insert nop, OR and ORI instruction form
+;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
+;; CR0-access while stdcx, or stwcx
+;; IS2 stall ;; Page91 for details
+;; VQ8 stall
+;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
+;; the vsu issue queue
+
+;;(define_automaton "cellxu")
+
+;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
+
+;; ndfa
+(define_automaton "cellxu,cellvsu,cellbru,cell_mis")
+
+(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
+(define_cpu_unit "bru_cell" "cellbru")
+(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
+
+(define_cpu_unit "slot0,slot1" "cell_mis")
+
+(absence_set "slot0" "slot1")
+
+(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
+(define_reservation "slot01" "slot0|slot1")
+
+
+;; Load/store
+;; lmw, lswi, lswx are only generated for optimize for space, MC,
+;; these instr are not simulated
+(define_insn_reservation "cell-load" 2
+ (and (eq_attr "type" "load")
+ (eq_attr "cpu" "cell"))
+ "slot01,lsu_cell")
+
+;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
+;; if with 32bytes alignment, CMC
+(define_insn_reservation "cell-load-ux" 2
+ (and (eq_attr "type" "load_ux,load_u")
+ (eq_attr "cpu" "cell"))
+ "slot01,fxu_cell+lsu_cell")
+
+;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
+;; 11/7, 11/8, 11/12
+(define_insn_reservation "cell-load-ext" 2
+ (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
+ (eq_attr "cpu" "cell"))
+ "slot01,fxu_cell+lsu_cell")
+
+;;lfs,lfsx,lfd,lfdx, 1 cycle
+(define_insn_reservation "cell-fpload" 1
+ (and (eq_attr "type" "fpload")
+ (eq_attr "cpu" "cell"))
+ "vsu2_cell+lsu_cell+slot01")
+
+;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
+(define_insn_reservation "cell-fpload-update" 1
+ (and (eq_attr "type" "fpload,fpload_u,fpload_ux")
+ (eq_attr "cpu" "cell"))
+ "fxu_cell+vsu2_cell+lsu_cell+slot01")
+
+(define_insn_reservation "cell-vecload" 2
+ (and (eq_attr "type" "vecload")
+ (eq_attr "cpu" "cell"))
+ "slot01,vsu2_cell+lsu_cell")
+
+;;st? stw(MC)
+(define_insn_reservation "cell-store" 1
+ (and (eq_attr "type" "store")
+ (eq_attr "cpu" "cell"))
+ "lsu_cell+slot01")
+
+;;stdux, stdu, (hardware breaks into store and add) 2 for update reg
+(define_insn_reservation "cell-store-update" 1
+ (and (eq_attr "type" "store_ux,store_u")
+ (eq_attr "cpu" "cell"))
+ "fxu_cell+lsu_cell+slot01")
+
+(define_insn_reservation "cell-fpstore" 1
+ (and (eq_attr "type" "fpstore")
+ (eq_attr "cpu" "cell"))
+ "vsu2_cell+lsu_cell+slot01")
+
+(define_insn_reservation "cell-fpstore-update" 1
+ (and (eq_attr "type" "fpstore_ux,fpstore_u")
+ (eq_attr "cpu" "cell"))
+ "vsu2_cell+fxu_cell+lsu_cell+slot01")
+
+(define_insn_reservation "cell-vecstore" 1
+ (and (eq_attr "type" "vecstore")
+ (eq_attr "cpu" "cell"))
+ "vsu2_cell+lsu_cell+slot01")
+
+;; Integer latency is 2 cycles
+(define_insn_reservation "cell-integer" 2
+ (and (eq_attr "type" "integer,insert_dword,shift,trap,\
+ var_shift_rotate,cntlz,exts")
+ (eq_attr "cpu" "cell"))
+ "slot01,fxu_cell")
+
+;; Two integer latency is 4 cycles
+(define_insn_reservation "cell-two" 4
+ (and (eq_attr "type" "two")
+ (eq_attr "cpu" "cell"))
+ "slot01,fxu_cell,fxu_cell*2")
+
+;; Three integer latency is 6 cycles
+(define_insn_reservation "cell-three" 6
+ (and (eq_attr "type" "three")
+ (eq_attr "cpu" "cell"))
+ "slot01,fxu_cell,fxu_cell*4")
+
+;; rlwimi, alter cr0
+(define_insn_reservation "cell-insert" 2
+ (and (eq_attr "type" "insert_word")
+ (eq_attr "cpu" "cell"))
+ "slot01,fxu_cell")
+
+;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
+(define_insn_reservation "cell-cmp" 1
+ (and (eq_attr "type" "cmp")
+ (eq_attr "cpu" "cell"))
+ "fxu_cell+slot01")
+
+;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
+(define_insn_reservation "cell-fast-cmp" 2
+ (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
+ var_delayed_compare")
+ (eq_attr "cpu" "cell"))
+ (eq_attr "cell_micro" "not"))
+ "slot01,fxu_cell")
+
+(define_insn_reservation "cell-cmp-microcoded" 9
+ (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
+ var_delayed_compare")
+ (eq_attr "cpu" "cell"))
+ (eq_attr "cell_micro" "always"))
+ "slot0+slot1,fxu_cell,fxu_cell*7")
+
+;; mulld
+(define_insn_reservation "cell-lmul" 15
+ (and (eq_attr "type" "lmul")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*13")
+
+;; mulld. is microcoded
+(define_insn_reservation "cell-lmul-cmp" 22
+ (and (eq_attr "type" "lmul_compare")
+ (eq_attr "cpu" "cell"))
+ "slot0+slot1,nonpipeline,nonpipeline*20")
+
+;; mulli, 6 cycles
+(define_insn_reservation "cell-imul23" 6
+ (and (eq_attr "type" "imul2,imul3")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*4")
+
+;; mullw, 9
+(define_insn_reservation "cell-imul" 9
+ (and (eq_attr "type" "imul")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*7")
+
+;; divide
+(define_insn_reservation "cell-idiv" 32
+ (and (eq_attr "type" "idiv")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*30")
+
+(define_insn_reservation "cell-ldiv" 64
+ (and (eq_attr "type" "ldiv")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*62")
+
+;;mflr and mfctr are pipelined
+(define_insn_reservation "cell-mfjmpr" 1
+ (and (eq_attr "type" "mfjmpr")
+ (eq_attr "cpu" "cell"))
+ "slot01+bru_cell")
+
+;;mtlr and mtctr,
+;;mtspr fully pipelined
+(define_insn_reservation "cell-mtjmpr" 1
+ (and (eq_attr "type" "mtjmpr")
+ (eq_attr "cpu" "cell"))
+ "bru_cell+slot01")
+
+;; Branches
+;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
+;; bcctr, bcctrl, latency 2, actually adjust by be to 4
+(define_insn_reservation "cell-branch" 1
+ (and (eq_attr "type" "branch")
+ (eq_attr "cpu" "cell"))
+ "bru_cell+slot1")
+
+(define_insn_reservation "cell-branchreg" 1
+ (and (eq_attr "type" "jmpreg")
+ (eq_attr "cpu" "cell"))
+ "bru_cell+slot1")
+
+;; cr hazard
+;; page 90, special cases for CR hazard, only one instr can access cr per cycle
+;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
+(define_insn_reservation "cell-crlogical" 1
+ (and (eq_attr "type" "cr_logical,delayed_cr")
+ (eq_attr "cpu" "cell"))
+ "bru_cell+slot01")
+
+;; mfcrf and mfcr is about 34 cycles and nonpipelined
+(define_insn_reservation "cell-mfcr" 34
+ (and (eq_attr "type" "mfcrf,mfcr")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*32")
+
+;; mtcrf (1 field)
+(define_insn_reservation "cell-mtcrf" 1
+ (and (eq_attr "type" "mtcr")
+ (eq_attr "cpu" "cell"))
+ "fxu_cell+slot01")
+
+; Basic FP latency is 10 cycles, throughput is 1/cycle
+(define_insn_reservation "cell-fp" 10
+ (and (eq_attr "type" "fp,dmul")
+ (eq_attr "cpu" "cell"))
+ "slot01,vsu1_cell,vsu1_cell*8")
+
+(define_insn_reservation "cell-fpcompare" 1
+ (and (eq_attr "type" "fpcompare")
+ (eq_attr "cpu" "cell"))
+ "vsu1_cell+slot01")
+
+;; sdiv throughput 1/74, not pipelined but only in the FPU
+(define_insn_reservation "cell-sdiv" 74
+ (and (eq_attr "type" "sdiv,ddiv")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*72")
+
+;; fsqrt throughput 1/84, not pipelined but only in the FPU
+(define_insn_reservation "cell-sqrt" 84
+ (and (eq_attr "type" "ssqrt,dsqrt")
+ (eq_attr "cpu" "cell"))
+ "slot1,nonpipeline,nonpipeline*82")
+
+; VMX
+(define_insn_reservation "cell-vecsimple" 4
+ (and (eq_attr "type" "vecsimple")
+ (eq_attr "cpu" "cell"))
+ "slot01,vsu1_cell,vsu1_cell*2")
+
+;; mult, div, madd
+(define_insn_reservation "cell-veccomplex" 10
+ (and (eq_attr "type" "veccomplex")
+ (eq_attr "cpu" "cell"))
+ "slot01,vsu1_cell,vsu1_cell*8")
+
+;; TODO: add support for recording instructions
+(define_insn_reservation "cell-veccmp" 4
+ (and (eq_attr "type" "veccmp")
+ (eq_attr "cpu" "cell"))
+ "slot01,vsu1_cell,vsu1_cell*2")
+
+(define_insn_reservation "cell-vecfloat" 12
+ (and (eq_attr "type" "vecfloat")
+ (eq_attr "cpu" "cell"))
+ "slot01,vsu1_cell,vsu1_cell*10")
+
+(define_insn_reservation "cell-vecperm" 4
+ (and (eq_attr "type" "vecperm")
+ (eq_attr "cpu" "cell"))
+ "slot01,vsu2_cell,vsu2_cell*2")
+
+;; New for 4.2, syncs
+
+(define_insn_reservation "cell-sync" 11
+ (and (eq_attr "type" "sync")
+ (eq_attr "cpu" "cell"))
+ "slot01,lsu_cell,lsu_cell*9")
+
+(define_insn_reservation "cell-isync" 11
+ (and (eq_attr "type" "isync")
+ (eq_attr "cpu" "cell"))
+ "slot01,lsu_cell,lsu_cell*9")
+
+(define_insn_reservation "cell-load_l" 11
+ (and (eq_attr "type" "load_l")
+ (eq_attr "cpu" "cell"))
+ "slot01,lsu_cell,lsu_cell*9")
+
+(define_insn_reservation "cell-store_c" 11
+ (and (eq_attr "type" "store_c")
+ (eq_attr "cpu" "cell"))
+ "slot01,lsu_cell,lsu_cell*9")
+
+;; RAW register dependency
+
+;; addi r3, r3, 1
+;; lw r4,offset(r3)
+;; there is a 5 cycle delay for r3 bypassing
+;; there is a 5 cycle delay for a dependent load after a load
+(define_bypass 5 "cell-integer" "cell-load")
+(define_bypass 5 "cell-integer" "cell-load-ext")
+(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
+
+;; there is a 6 cycle delay after a fp compare until you can use the cr.
+(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
+
+;; VXU float RAW
+(define_bypass 11 "cell-vecfloat" "cell-vecfloat")
+
+;; VXU and FPU
+(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
+;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
+(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
+; this is not correct,
+;; this is a stall in general and not dependent on result
+(define_bypass 13 "cell-vecstore" "cell-fpstore")
+; this is not correct, this can never be true, not dependent on result
+(define_bypass 7 "cell-fp" "cell-fpload")
+;; vsu1 should avoid writing to the same target register as vsu2 insn
+;; within 12 cycles.
+
+;; WAW hazard
+
+;; the target of VSU estimate should not be reused within 10 dispatch groups
+;; the target of VSU float should not be reused within 8 dispatch groups
+;; the target of VSU complex should not be reused within 5 dispatch groups
+;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
+
+;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
+;; ex4 stage(10 cycles)
+(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
+
+;;Things are not simulated:
+;; update instruction, update address gpr are not simulated
+;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
+;; insns
+
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index eec8a49093a..dc80f9ff3da 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -139,6 +139,8 @@ struct rs6000_cpu_select rs6000_select[3] =
{ (const char *)0, "-mtune=", 1, 0 },
};
+static GTY(()) bool rs6000_cell_dont_microcode;
+
/* Always emit branch hint bits. */
static GTY(()) bool rs6000_always_hint;
@@ -519,6 +521,22 @@ struct processor_costs ppc630_cost = {
COSTS_N_INSNS (21), /* ddiv */
};
+/* Instruction costs on Cell processor. */
+/* COSTS_N_INSNS (1) ~ one add. */
+static const
+struct processor_costs ppccell_cost = {
+ COSTS_N_INSNS (9/2)+2, /* mulsi */
+ COSTS_N_INSNS (6/2), /* mulsi_const */
+ COSTS_N_INSNS (6/2), /* mulsi_const9 */
+ COSTS_N_INSNS (15/2)+2, /* muldi */
+ COSTS_N_INSNS (38/2), /* divsi */
+ COSTS_N_INSNS (70/2), /* divdi */
+ COSTS_N_INSNS (10/2), /* fp */
+ COSTS_N_INSNS (10/2), /* dmul */
+ COSTS_N_INSNS (74/2), /* sdiv */
+ COSTS_N_INSNS (74/2), /* ddiv */
+};
+
/* Instruction costs on PPC750 and PPC7400 processors. */
static const
struct processor_costs ppc750_cost = {
@@ -671,6 +689,7 @@ static bool rs6000_rtx_costs (rtx, int, int, int *);
static int rs6000_adjust_cost (rtx, rtx, rtx, int);
static void rs6000_sched_init (FILE *, int, int);
static bool is_microcoded_insn (rtx);
+static bool is_nonpipeline_insn (rtx);
static bool is_cracked_insn (rtx);
static bool is_branch_slot_insn (rtx);
static bool is_load_insn (rtx);
@@ -692,6 +711,7 @@ static void rs6000_sched_finish (FILE *, int);
static int rs6000_sched_reorder (FILE *, int, rtx *, int *, int);
static int rs6000_sched_reorder2 (FILE *, int, rtx *, int *, int);
static int rs6000_use_sched_lookahead (void);
+static int rs6000_use_sched_lookahead_guard (rtx);
static tree rs6000_builtin_mask_for_load (void);
static tree rs6000_builtin_mul_widen_even (tree);
static tree rs6000_builtin_mul_widen_odd (tree);
@@ -952,6 +972,9 @@ static const char alt_reg_names[][8] =
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD rs6000_use_sched_lookahead
+#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
+#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD rs6000_use_sched_lookahead_guard
+
#undef TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD
#define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD rs6000_builtin_mask_for_load
#undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN
@@ -1217,6 +1240,8 @@ rs6000_override_options (const char *default_cpu)
{"860", PROCESSOR_MPCCORE, POWERPC_BASE_MASK | MASK_SOFT_FLOAT},
{"970", PROCESSOR_POWER4,
POWERPC_7400_MASK | MASK_PPC_GPOPT | MASK_MFCRF | MASK_POWERPC64},
+ {"cell", PROCESSOR_CELL,
+ POWERPC_7400_MASK | MASK_PPC_GPOPT | MASK_MFCRF | MASK_POWERPC64},
{"common", PROCESSOR_COMMON, MASK_NEW_MNEMONICS},
{"ec603e", PROCESSOR_PPC603, POWERPC_BASE_MASK | MASK_SOFT_FLOAT},
{"G3", PROCESSOR_PPC750, POWERPC_BASE_MASK | MASK_PPC_GFXOPT},
@@ -1445,7 +1470,8 @@ rs6000_override_options (const char *default_cpu)
rs6000_always_hint = (rs6000_cpu != PROCESSOR_POWER4
&& rs6000_cpu != PROCESSOR_POWER5
- && rs6000_cpu != PROCESSOR_POWER6);
+ && rs6000_cpu != PROCESSOR_POWER6
+ && rs6000_cpu != PROCESSOR_CELL);
rs6000_sched_groups = (rs6000_cpu == PROCESSOR_POWER4
|| rs6000_cpu == PROCESSOR_POWER5);
rs6000_align_branch_targets = (rs6000_cpu == PROCESSOR_POWER4
@@ -1519,6 +1545,16 @@ rs6000_override_options (const char *default_cpu)
/* Set branch target alignment, if not optimizing for size. */
if (!optimize_size)
{
+ /* Cell wants 8-byte alignment for dual issue.  */
+ if (rs6000_cpu == PROCESSOR_CELL)
+ {
+ if (align_functions <= 0)
+ align_functions = 8;
+ if (align_jumps <= 0)
+ align_jumps = 8;
+ if (align_loops <= 0)
+ align_loops = 8;
+ }
if (rs6000_align_branch_targets)
{
if (align_functions <= 0)
@@ -1600,6 +1636,10 @@ rs6000_override_options (const char *default_cpu)
rs6000_cost = &ppc630_cost;
break;
+ case PROCESSOR_CELL:
+ rs6000_cost = &ppccell_cost;
+ break;
+
case PROCESSOR_PPC750:
case PROCESSOR_PPC7400:
rs6000_cost = &ppc750_cost;
@@ -14940,7 +14980,7 @@ rs6000_emit_epilogue (int sibcall)
rs6000_stack_t *info;
int restoring_FPRs_inline;
int using_load_multiple;
- int using_mfcr_multiple;
+ int using_mtcr_multiple;
int use_backchain_to_restore_sp;
int sp_offset = 0;
rtx sp_reg_rtx = gen_rtx_REG (Pmode, 1);
@@ -14969,7 +15009,7 @@ rs6000_emit_epilogue (int sibcall)
use_backchain_to_restore_sp = (frame_pointer_needed
|| current_function_calls_alloca
|| info->total_size > 32767);
- using_mfcr_multiple = (rs6000_cpu == PROCESSOR_PPC601
+ using_mtcr_multiple = (rs6000_cpu == PROCESSOR_PPC601
|| rs6000_cpu == PROCESSOR_PPC603
|| rs6000_cpu == PROCESSOR_PPC750
|| optimize_size);
@@ -15269,7 +15309,7 @@ rs6000_emit_epilogue (int sibcall)
rtx r12_rtx = gen_rtx_REG (SImode, 12);
int count = 0;
- if (using_mfcr_multiple)
+ if (using_mtcr_multiple)
{
for (i = 0; i < 8; i++)
if (regs_ever_live[CR0_REGNO+i] && ! call_used_regs[CR0_REGNO+i])
@@ -15277,7 +15317,7 @@ rs6000_emit_epilogue (int sibcall)
gcc_assert (count);
}
- if (using_mfcr_multiple && count > 1)
+ if (using_mtcr_multiple && count > 1)
{
rtvec p;
int ndx;
@@ -16595,6 +16635,10 @@ rs6000_variable_issue (FILE *stream ATTRIBUTE_UNUSED,
return cached_can_issue_more;
}
+ /* If the insn has no DFA reservation but we still reach here, leave
+ the remaining issue count unchanged.  */
+ if (recog_memoized (insn) < 0)
+ return more;
+
if (rs6000_sched_groups)
{
if (is_microcoded_insn (insn))
@@ -16607,6 +16651,9 @@ rs6000_variable_issue (FILE *stream ATTRIBUTE_UNUSED,
return cached_can_issue_more;
}
+ if (rs6000_cpu_attr == CPU_CELL && is_nonpipeline_insn (insn))
+ return 0;
+
cached_can_issue_more = more - 1;
return cached_can_issue_more;
}
@@ -16662,7 +16709,8 @@ rs6000_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
|| rs6000_cpu_attr == CPU_PPC7400
|| rs6000_cpu_attr == CPU_PPC7450
|| rs6000_cpu_attr == CPU_POWER4
- || rs6000_cpu_attr == CPU_POWER5)
+ || rs6000_cpu_attr == CPU_POWER5
+ || rs6000_cpu_attr == CPU_CELL)
&& recog_memoized (dep_insn)
&& (INSN_CODE (dep_insn) >= 0))
@@ -16912,6 +16960,9 @@ is_microcoded_insn (rtx insn)
|| GET_CODE (PATTERN (insn)) == CLOBBER)
return false;
+ if (rs6000_cpu_attr == CPU_CELL)
+ return get_attr_cell_micro (insn) == CELL_MICRO_ALWAYS;
+
if (rs6000_sched_groups)
{
enum attr_type type = get_attr_type (insn);
@@ -17115,6 +17166,37 @@ rs6000_adjust_priority (rtx insn ATTRIBUTE_UNUSED, int priority)
return priority;
}
+/* Return true if the instruction is nonpipelined on the Cell. */
+static bool
+is_nonpipeline_insn (rtx insn)
+{
+ enum attr_type type;
+ if (!insn || !INSN_P (insn)
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ return false;
+
+ type = get_attr_type (insn);
+ if (type == TYPE_IMUL
+ || type == TYPE_IMUL2
+ || type == TYPE_IMUL3
+ || type == TYPE_LMUL
+ || type == TYPE_IDIV
+ || type == TYPE_LDIV
+ || type == TYPE_SDIV
+ || type == TYPE_DDIV
+ || type == TYPE_SSQRT
+ || type == TYPE_DSQRT
+ || type == TYPE_MFCR
+ || type == TYPE_MFCRF
+ || type == TYPE_MFJMPR)
+ {
+ return true;
+ }
+ return false;
+}
+
+
/* Return how many instructions the machine can issue per cycle. */
static int
@@ -17135,6 +17217,7 @@ rs6000_issue_rate (void)
case CPU_PPC750:
case CPU_PPC7400:
case CPU_PPC8540:
+ case CPU_CELL:
return 2;
case CPU_RIOS2:
case CPU_PPC604:
@@ -17159,9 +17242,29 @@ rs6000_use_sched_lookahead (void)
{
if (rs6000_cpu_attr == CPU_PPC8540)
return 4;
+ if (rs6000_cpu_attr == CPU_CELL)
+ return (reload_completed ? 8 : 0);
return 0;
}
+/* We are choosing insn from the ready queue. Return nonzero if INSN can be chosen. */
+static int
+rs6000_use_sched_lookahead_guard (rtx insn)
+{
+ if (rs6000_cpu_attr != CPU_CELL)
+ return 1;
+
+ if (insn == NULL_RTX || !INSN_P (insn))
+ abort ();
+
+ if (!reload_completed
+ || is_nonpipeline_insn (insn)
+ || is_microcoded_insn (insn))
+ return 0;
+
+ return 1;
+}
+
/* Determine is PAT refers to memory. */
static bool
@@ -17337,9 +17440,25 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose,
int *pn_ready ATTRIBUTE_UNUSED,
int clock_var ATTRIBUTE_UNUSED)
{
+ int n_ready = *pn_ready;
+
if (sched_verbose)
fprintf (dump, "// rs6000_sched_reorder :\n");
+ /* Reorder the ready list, if the second to last ready insn
+ is a non-pipelined insn.  */
+ if (rs6000_cpu_attr == CPU_CELL && n_ready > 1)
+ {
+ if (is_nonpipeline_insn (ready[n_ready - 1])
+ && (recog_memoized (ready[n_ready - 2]) > 0))
+ /* Simply swap first two insns. */
+ {
+ rtx tmp = ready[n_ready - 1];
+ ready[n_ready - 1] = ready[n_ready - 2];
+ ready[n_ready - 2] = tmp;
+ }
+ }
+
if (rs6000_cpu == PROCESSOR_POWER6)
load_store_pendulum = 0;
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index c9856d12aca..6ccc3c01ad4 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -68,6 +68,7 @@
%{mno-power: %{!mpowerpc*: -mcom}} \
%{!mno-power: %{!mpower*: %(asm_default)}}} \
%{mcpu=common: -mcom} \
+%{mcpu=cell: -mcell} \
%{mcpu=power: -mpwr} \
%{mcpu=power2: -mpwrx} \
%{mcpu=power3: -mppc64} \
@@ -222,7 +223,8 @@ enum processor_type
PROCESSOR_PPC8540,
PROCESSOR_POWER4,
PROCESSOR_POWER5,
- PROCESSOR_POWER6
+ PROCESSOR_POWER6,
+ PROCESSOR_CELL
};
extern enum processor_type rs6000_cpu;
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 8ddf26d924d..f40e78dfafa 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -106,9 +106,26 @@
;; Processor type -- this attribute must exactly match the processor_type
;; enumeration in rs6000.h.
-(define_attr "cpu" "rios1,rios2,rs64a,mpccore,ppc403,ppc405,ppc440,ppc601,ppc603,ppc604,ppc604e,ppc620,ppc630,ppc750,ppc7400,ppc7450,ppc8540,power4,power5,power6"
+(define_attr "cpu" "rios1,rios2,rs64a,mpccore,ppc403,ppc405,ppc440,ppc601,ppc603,ppc604,ppc604e,ppc620,ppc630,ppc750,ppc7400,ppc7450,ppc8540,power4,power5,power6,cell"
(const (symbol_ref "rs6000_cpu_attr")))
+
+;; If this instruction is microcoded on the CELL processor
+; The default for load and stores is conditional
+; The default for load extended and the recorded instructions is always microcoded
+(define_attr "cell_micro" "not,conditional,always"
+ (if_then_else (ior (ior (eq_attr "type" "load")
+ (eq_attr "type" "store"))
+ (ior (eq_attr "type" "fpload")
+ (eq_attr "type" "fpstore")))
+ (const_string "conditional")
+ (if_then_else (ior (eq_attr "type" "load_ext")
+ (ior (eq_attr "type" "compare")
+ (eq_attr "type" "delayed_compare")))
+ (const_string "always")
+ (const_string "not"))))
+
+
(automata_option "ndfa")
(include "rios1.md")
@@ -125,6 +142,7 @@
(include "power4.md")
(include "power5.md")
(include "power6.md")
+(include "cell.md")
(include "predicates.md")
(include "constraints.md")