16 files changed, 329 insertions, 218 deletions
diff --git a/compiler/cmm/CLabel.hs b/compiler/cmm/CLabel.hs
index 4d9596197e..3451c7d5a9 100644
--- a/compiler/cmm/CLabel.hs
+++ b/compiler/cmm/CLabel.hs
@@ -51,9 +51,7 @@ module CLabel (
 
 	mkAsmTempLabel,
 
-	mkModuleInitLabel,
-	mkPlainModuleInitLabel,
-	mkModuleInitTableLabel,
+        mkPlainModuleInitLabel,
 
 	mkSplitMarkerLabel,
 	mkDirty_MUT_VAR_Label,
@@ -70,10 +68,7 @@ module CLabel (
 	mkRtsPrimOpLabel,
 	mkRtsSlowTickyCtrLabel,
 
-	moduleRegdLabel,
-	moduleRegTableLabel,
-
-	mkSelectorInfoLabel,
+        mkSelectorInfoLabel,
 	mkSelectorEntryLabel,
 
 	mkCmmInfoLabel,
@@ -102,7 +97,6 @@ module CLabel (
         mkDeadStripPreventer,
 
         mkHpcTicksLabel,
-        mkHpcModuleNameLabel,
 
         hasCAF,
 	infoLblToEntryLbl, entryLblToInfoLbl, cvtToClosureLbl, cvtToSRTLbl,
@@ -202,23 +196,9 @@ data CLabel
   | StringLitLabel
 	{-# UNPACK #-} !Unique
 
-  | ModuleInitLabel 
-	Module			-- the module name
-	String			-- its "way"
-	-- at some point we might want some kind of version number in
-	-- the module init label, to guard against compiling modules in
-	-- the wrong order.  We can't use the interface file version however,
-	-- because we don't always recompile modules which depend on a module
-	-- whose version has changed.
-
-  | PlainModuleInitLabel	-- without the version & way info
+  | PlainModuleInitLabel        -- without the version & way info
 	Module
 
-  | ModuleInitTableLabel	-- table of imported modules to init
-	Module
-
-  | ModuleRegdLabel
-
   | CC_Label  CostCentre
   | CCS_Label CostCentreStack
 
@@ -242,9 +222,6 @@ data CLabel
   -- | Per-module table of tick locations
   | HpcTicksLabel Module
 
-  -- | Per-module name of the module for Hpc
-  | HpcModuleNameLabel
-
   -- | Label of an StgLargeSRT
   | LargeSRTLabel
         {-# UNPACK #-} !Unique
@@ -490,7 +467,6 @@ mkRtsSlowTickyCtrLabel pat = RtsLabel (RtsSlowTickyCtr pat)
 
 -- Constructing Code Coverage Labels
 mkHpcTicksLabel                = HpcTicksLabel
-mkHpcModuleNameLabel           = HpcModuleNameLabel
 
 
 -- Constructing labels used for dynamic linking
@@ -515,19 +491,9 @@ mkStringLitLabel		= StringLitLabel
 mkAsmTempLabel :: Uniquable a => a -> CLabel
 mkAsmTempLabel a		= AsmTempLabel (getUnique a)
 
-mkModuleInitLabel :: Module -> String -> CLabel
-mkModuleInitLabel mod way	= ModuleInitLabel mod way
-
 mkPlainModuleInitLabel :: Module -> CLabel
 mkPlainModuleInitLabel mod	= PlainModuleInitLabel mod
 
-mkModuleInitTableLabel :: Module -> CLabel
-mkModuleInitTableLabel mod	= ModuleInitTableLabel mod
-
-moduleRegdLabel			= ModuleRegdLabel
-moduleRegTableLabel		= ModuleInitTableLabel	
-
-
 -- -----------------------------------------------------------------------------
 -- Converting between info labels and entry/ret labels.
 
@@ -591,10 +557,7 @@ needsCDecl (LargeSRTLabel _)		= False
 needsCDecl (LargeBitmapLabel _)		= False
 needsCDecl (IdLabel _ _ _)		= True
 needsCDecl (CaseLabel _ _)	        = True
-needsCDecl (ModuleInitLabel _ _)	= True
-needsCDecl (PlainModuleInitLabel _)	= True
-needsCDecl (ModuleInitTableLabel _)	= True
-needsCDecl ModuleRegdLabel		= False
+needsCDecl (PlainModuleInitLabel _)     = True
 
 needsCDecl (StringLitLabel _)		= False
 needsCDecl (AsmTempLabel _)		= False
@@ -612,7 +575,6 @@ needsCDecl l@(ForeignLabel{})		= not (isMathFun l)
 needsCDecl (CC_Label _)			= True
 needsCDecl (CCS_Label _)		= True
 needsCDecl (HpcTicksLabel _)            = True
-needsCDecl HpcModuleNameLabel           = False
 
 
 -- | Check whether a label is a local temporary for native code generation
@@ -630,7 +592,7 @@ maybeAsmTemp _ 	    	       		= Nothing
 
 -- | Check whether a label corresponds to a C function that has 
 --      a prototype in a system header somehere, or is built-in
---      to the C compiler. For these labels we abovoid generating our
+--      to the C compiler. For these labels we avoid generating our
 --      own C prototypes.
 isMathFun :: CLabel -> Bool
 isMathFun (ForeignLabel fs _ _ _) 	= fs `elementOfUniqSet` math_funs
@@ -725,11 +687,8 @@ externallyVisibleCLabel :: CLabel -> Bool -- not C "static"
 externallyVisibleCLabel (CaseLabel _ _)		= False
 externallyVisibleCLabel (StringLitLabel _)	= False
 externallyVisibleCLabel (AsmTempLabel _)	= False
-externallyVisibleCLabel (ModuleInitLabel _ _)	= True
 externallyVisibleCLabel (PlainModuleInitLabel _)= True
-externallyVisibleCLabel (ModuleInitTableLabel _)= False
-externallyVisibleCLabel ModuleRegdLabel		= False
-externallyVisibleCLabel (RtsLabel _)		= True
+externallyVisibleCLabel (RtsLabel _)            = True
 externallyVisibleCLabel (CmmLabel _ _ _)	= True
 externallyVisibleCLabel (ForeignLabel{})	= True
 externallyVisibleCLabel (IdLabel name _ _)	= isExternalName name
@@ -737,8 +696,7 @@ externallyVisibleCLabel (CC_Label _)		= True
 externallyVisibleCLabel (CCS_Label _)		= True
 externallyVisibleCLabel (DynamicLinkerLabel _ _)  = False
 externallyVisibleCLabel (HpcTicksLabel _)	= True
-externallyVisibleCLabel HpcModuleNameLabel	= False
-externallyVisibleCLabel (LargeBitmapLabel _)	= False
+externallyVisibleCLabel (LargeBitmapLabel _)    = False
 externallyVisibleCLabel (LargeSRTLabel _)	= False
 
 -- -----------------------------------------------------------------------------
@@ -777,9 +735,7 @@ labelType (RtsLabel (RtsApInfoTable _ _))       = DataLabel
 labelType (RtsLabel (RtsApFast _))              = CodeLabel
 labelType (CaseLabel _ CaseReturnInfo)          = DataLabel
 labelType (CaseLabel _ _)	                = CodeLabel
-labelType (ModuleInitLabel _ _)                 = CodeLabel
 labelType (PlainModuleInitLabel _)              = CodeLabel
-labelType (ModuleInitTableLabel _)              = DataLabel
 labelType (LargeSRTLabel _)                     = DataLabel
 labelType (LargeBitmapLabel _)                  = DataLabel
 labelType (ForeignLabel _ _ _ IsFunction)	= CodeLabel
@@ -837,10 +793,8 @@ labelDynamic this_pkg lbl =
    CmmLabel pkg _ _     -> True 
 
 #endif
-   ModuleInitLabel m _    -> not opt_Static && this_pkg /= (modulePackageId m)
    PlainModuleInitLabel m -> not opt_Static && this_pkg /= (modulePackageId m)
-   ModuleInitTableLabel m -> not opt_Static && this_pkg /= (modulePackageId m)
-   
+
    -- Note that DynamicLinkerLabels do NOT require dynamic linking themselves.
    _ 		     -> False
 
@@ -896,8 +850,8 @@ instance Outputable CLabel where
 
 pprCLabel :: CLabel -> SDoc
 
-#if ! OMIT_NATIVE_CODEGEN
 pprCLabel (AsmTempLabel u)
+ | cGhcWithNativeCodeGen == "YES"
   =  getPprStyle $ \ sty ->
      if asmStyle sty then 
 	ptext asmTempLabelPrefix <> pprUnique u
@@ -905,23 +859,22 @@ pprCLabel (AsmTempLabel u)
 	char '_' <> pprUnique u
 
 pprCLabel (DynamicLinkerLabel info lbl)
+ | cGhcWithNativeCodeGen == "YES"
    = pprDynamicLinkerAsmLabel info lbl
    
 pprCLabel PicBaseLabel
+ | cGhcWithNativeCodeGen == "YES"
    = ptext (sLit "1b")
    
 pprCLabel (DeadStripPreventer lbl)
+ | cGhcWithNativeCodeGen == "YES"
    = pprCLabel lbl <> ptext (sLit "_dsp")
-#endif
 
-pprCLabel lbl = 
-#if ! OMIT_NATIVE_CODEGEN
-    getPprStyle $ \ sty ->
-    if asmStyle sty then 
-	maybe_underscore (pprAsmCLbl lbl)
-    else
-#endif
-       pprCLbl lbl
+pprCLabel lbl
+   = getPprStyle $ \ sty ->
+     if cGhcWithNativeCodeGen == "YES" && asmStyle sty
+     then maybe_underscore (pprAsmCLbl lbl)
+     else pprCLbl lbl
 
 maybe_underscore doc
   | underscorePrefix = pp_cSEP <> doc
@@ -1008,9 +961,6 @@ pprCLbl (RtsLabel (RtsPrimOp primop))
 pprCLbl (RtsLabel (RtsSlowTickyCtr pat)) 
   = ptext (sLit "SLOW_CALL_") <> text pat <> ptext (sLit "_ctr")
 
-pprCLbl ModuleRegdLabel
-  = ptext (sLit "_module_registered")
-
 pprCLbl (ForeignLabel str _ _ _)
   = ftext str
 
@@ -1019,22 +969,12 @@ pprCLbl (IdLabel name cafs flavor) = ppr name <> ppIdFlavor flavor
 pprCLbl (CC_Label cc) 		= ppr cc
 pprCLbl (CCS_Label ccs) 	= ppr ccs
 
-pprCLbl (ModuleInitLabel mod way)
-   = ptext (sLit "__stginit_") <> ppr mod
-	<> char '_' <> text way
-
 pprCLbl (PlainModuleInitLabel mod)
    = ptext (sLit "__stginit_") <> ppr mod
 
-pprCLbl (ModuleInitTableLabel mod)
-   = ptext (sLit "__stginittable_") <> ppr mod
-
 pprCLbl (HpcTicksLabel mod)
   = ptext (sLit "_hpc_tickboxes_")  <> ppr mod <> ptext (sLit "_hpc")
 
-pprCLbl HpcModuleNameLabel
-  = ptext (sLit "_hpc_module_name_str")
-
 ppIdFlavor :: IdLabelInfo -> SDoc
 ppIdFlavor x = pp_cSEP <>
 	       (case x of
diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs
index 2e9f952f7b..54b4b11662 100644
--- a/compiler/cmm/Cmm.hs
+++ b/compiler/cmm/Cmm.hs
@@ -9,10 +9,11 @@
 #endif
 
 module Cmm
-  ( CmmGraph(..), CmmBlock
+  ( CmmGraph, GenCmmGraph(..), CmmBlock
   , CmmStackInfo(..), CmmTopInfo(..), Cmm, CmmTop
   , CmmReplGraph, CmmFwdRewrite, CmmBwdRewrite
 
+  , modifyGraph
   , lastNode, replaceLastNode, insertBetween
   , ofBlockMap, toBlockMap, insertBlock
   , ofBlockList, toBlockList, bodyToBlockList
@@ -41,7 +42,8 @@ import Panic
 -------------------------------------------------
 -- CmmBlock, CmmGraph and Cmm
 
-data CmmGraph = CmmGraph { g_entry :: BlockId, g_graph :: Graph CmmNode C C }
+type CmmGraph = GenCmmGraph CmmNode
+data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C }
 type CmmBlock = Block CmmNode C C
 
 type CmmReplGraph e x = FuelUniqSM (Maybe (Graph CmmNode e x))
@@ -56,6 +58,9 @@ type CmmTop       = GenCmmTop CmmStatic CmmTopInfo CmmGraph
 -------------------------------------------------
 -- Manipulating CmmGraphs
 
+modifyGraph :: (Graph n C C -> Graph n' C C) -> GenCmmGraph n -> GenCmmGraph n'
+modifyGraph f g = CmmGraph {g_entry=g_entry g, g_graph=f (g_graph g)}
+
 toBlockMap :: CmmGraph -> LabelMap CmmBlock
 toBlockMap (CmmGraph {g_graph=GMany NothingO body NothingO}) = body
 
@@ -150,26 +155,26 @@ insertBetween b ms succId = insert $ lastNode b
 -- Running dataflow analysis and/or rewrites
 
 -- Constructing forward and backward analysis-only pass
-analFwd    :: Monad m => DataflowLattice f -> FwdTransfer CmmNode f -> FwdPass m CmmNode f
-analBwd    :: Monad m => DataflowLattice f -> BwdTransfer CmmNode f -> BwdPass m CmmNode f
+analFwd    :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdPass m n f
+analBwd    :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdPass m n f
 
 analFwd lat xfer = analRewFwd lat xfer noFwdRewrite
 analBwd lat xfer = analRewBwd lat xfer noBwdRewrite
 
 -- Constructing forward and backward analysis + rewrite pass
-analRewFwd :: Monad m => DataflowLattice f -> FwdTransfer CmmNode f -> FwdRewrite m CmmNode f -> FwdPass m CmmNode f
-analRewBwd :: Monad m => DataflowLattice f -> BwdTransfer CmmNode f -> BwdRewrite m CmmNode f -> BwdPass m CmmNode f
+analRewFwd :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdRewrite m n f -> FwdPass m n f
+analRewBwd :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdRewrite m n f -> BwdPass m n f
 
 analRewFwd lat xfer rew = FwdPass {fp_lattice = lat, fp_transfer = xfer, fp_rewrite = rew}
 analRewBwd lat xfer rew = BwdPass {bp_lattice = lat, bp_transfer = xfer, bp_rewrite = rew}
 
 -- Running forward and backward dataflow analysis + optional rewrite
-dataflowPassFwd :: CmmGraph -> [(BlockId, f)] -> FwdPass FuelUniqSM CmmNode f -> FuelUniqSM (CmmGraph, BlockEnv f)
+dataflowPassFwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> FwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
 dataflowPassFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do
   (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
   return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
 
-dataflowPassBwd :: CmmGraph -> [(BlockId, f)] -> BwdPass FuelUniqSM CmmNode f -> FuelUniqSM (CmmGraph, BlockEnv f)
+dataflowPassBwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> BwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
 dataflowPassBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd = do
   (graph, facts, NothingO) <- analyzeAndRewriteBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts)
   return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
diff --git a/compiler/cmm/CmmCPS.hs b/compiler/cmm/CmmCPS.hs
index 372562cfca..aad00371a1 100644
--- a/compiler/cmm/CmmCPS.hs
+++ b/compiler/cmm/CmmCPS.hs
@@ -71,10 +71,10 @@ cpsTop _ p@(CmmData {}) = return ([], [(Map.empty, p)])
 cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}}) l g) =
     do
        -- Why bother doing it this early?
-       -- g <- dual_rewrite Opt_D_dump_cmmz "spills and reloads"
+       -- g <- dual_rewrite run Opt_D_dump_cmmz "spills and reloads"
        --                       (dualLivenessWithInsertion callPPs) g
        -- g <- run $ insertLateReloads g -- Duplicate reloads just before uses
-       -- g <- dual_rewrite Opt_D_dump_cmmz "Dead Assignment Elimination"
+       -- g <- dual_rewrite runOptimization Opt_D_dump_cmmz "Dead Assignment Elimination"
        --                   (removeDeadAssignmentsAndReloads callPPs) g
        dump Opt_D_dump_cmmz "Pre common block elimination" g
        g <- return $ elimCommonBlocks g
@@ -91,16 +91,16 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
        ----------- Spills and reloads -------------------
        g     <- 
               -- pprTrace "pre Spills" (ppr g) $
-                dual_rewrite Opt_D_dump_cmmz "spills and reloads"
+                dual_rewrite run Opt_D_dump_cmmz "spills and reloads"
                              (dualLivenessWithInsertion procPoints) g
                     -- Insert spills at defns; reloads at return points
        g     <-
               -- pprTrace "pre insertLateReloads" (ppr g) $
-                run $ insertLateReloads g -- Duplicate reloads just before uses
+                runOptimization $ insertLateReloads g -- Duplicate reloads just before uses
        dump Opt_D_dump_cmmz "Post late reloads" g
        g     <-
                -- pprTrace "post insertLateReloads" (ppr g) $
-                dual_rewrite Opt_D_dump_cmmz "Dead Assignment Elimination"
+                dual_rewrite runOptimization Opt_D_dump_cmmz "Dead Assignment Elimination"
                                         (removeDeadAssignmentsAndReloads procPoints) g
                     -- Remove redundant reloads (and any other redundant asst)
 
@@ -112,12 +112,13 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
 
        --------------- Stack layout ----------------
        slotEnv <- run $ liveSlotAnal g
+       let spEntryMap = getSpEntryMap entry_off g
        mbpprTrace "live slot analysis results: " (ppr slotEnv) $ return ()
-       let areaMap = layout procPoints slotEnv entry_off g
+       let areaMap = layout procPoints spEntryMap slotEnv entry_off g
        mbpprTrace "areaMap" (ppr areaMap) $ return ()
 
        ------------  Manifest the stack pointer --------
-       g  <- run $ manifestSP areaMap entry_off g
+       g  <- run $ manifestSP spEntryMap areaMap entry_off g
        dump Opt_D_dump_cmmz "after manifestSP" g
        -- UGH... manifestSP can require updates to the procPointMap.
        -- We can probably do something quicker here for the update...
@@ -146,12 +147,16 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
   where dflags = hsc_dflags hsc_env
         mbpprTrace x y z = if dopt Opt_D_dump_cmmz dflags then pprTrace x y z else z
         dump f txt g = dumpIfSet_dyn dflags f txt (ppr g)
-
-        run = runFuelIO (hsc_OptFuel hsc_env)
-
-        dual_rewrite flag txt pass g =
+        -- Runs a required transformation/analysis
+        run = runInfiniteFuelIO (hsc_OptFuel hsc_env)
+        -- Runs an optional transformation/analysis (and should
+        -- thus be subject to optimization fuel)
+        runOptimization = runFuelIO (hsc_OptFuel hsc_env)
+
+        -- pass 'run' or 'runOptimization' for 'r'
+        dual_rewrite r flag txt pass g =
           do dump flag ("Pre " ++ txt)  g
-             g <- run $ pass g
+             g <- r $ pass g
              dump flag ("Post " ++ txt) $ g
              return g
 
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index 3ae2996213..55a5b73ac5 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -42,8 +42,8 @@ data CmmExpr
   | CmmRegOff CmmReg Int	
 	-- CmmRegOff reg i
 	--        ** is shorthand only, meaning **
-	-- CmmMachOp (MO_S_Add rep (CmmReg reg) (CmmLit (CmmInt i rep)))
-	--	where rep = cmmRegType reg
+	-- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
+	--	where rep = typeWidth (cmmRegType reg)
 
 instance Eq CmmExpr where	-- Equality ignores the types
   CmmLit l1    	    == CmmLit l2    	 = l1==l2
@@ -124,6 +124,8 @@ cmmExprType (CmmReg reg)      	= cmmRegType reg
 cmmExprType (CmmMachOp op args) = machOpResultType op (map cmmExprType args)
 cmmExprType (CmmRegOff reg _)   = cmmRegType reg
 cmmExprType (CmmStackSlot _ _)  = bWord -- an address
+-- Careful though: what is stored at the stack slot may be bigger than
+-- an address
 
 cmmLitType :: CmmLit -> CmmType
 cmmLitType (CmmInt _ width)     = cmmBits  width
diff --git a/compiler/cmm/CmmLint.hs b/compiler/cmm/CmmLint.hs
index 95b1eef6a3..32fead337e 100644
--- a/compiler/cmm/CmmLint.hs
+++ b/compiler/cmm/CmmLint.hs
@@ -24,7 +24,6 @@ import OldPprCmm()
 import Constants
 import FastString
 
-import Control.Monad
 import Data.Maybe
 
 -- -----------------------------------------------------------------------------
@@ -70,8 +69,10 @@ lintCmmBlock labels (BasicBlock id stmts)
 lintCmmExpr :: CmmExpr -> CmmLint CmmType
 lintCmmExpr (CmmLoad expr rep) = do
   _ <- lintCmmExpr expr
-  when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
-     cmmCheckWordAddress expr
+  -- Disabled, if we have the inlining phase before the lint phase,
+  -- we can have funny offsets due to pointer tagging. -- EZY
+  -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
+  --   cmmCheckWordAddress expr
   return rep
 lintCmmExpr expr@(CmmMachOp op args) = do
   tys <- mapM lintCmmExpr args
@@ -99,14 +100,14 @@ isOffsetOp _ = False
 
 -- This expression should be an address from which a word can be loaded:
 -- check for funny-looking sub-word offsets.
-cmmCheckWordAddress :: CmmExpr -> CmmLint ()
-cmmCheckWordAddress e@(CmmMachOp op [arg, CmmLit (CmmInt i _)])
+_cmmCheckWordAddress :: CmmExpr -> CmmLint ()
+_cmmCheckWordAddress e@(CmmMachOp op [arg, CmmLit (CmmInt i _)])
   | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
   = cmmLintDubiousWordOffset e
-cmmCheckWordAddress e@(CmmMachOp op [CmmLit (CmmInt i _), arg])
+_cmmCheckWordAddress e@(CmmMachOp op [CmmLit (CmmInt i _), arg])
   | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
   = cmmLintDubiousWordOffset e
-cmmCheckWordAddress _
+_cmmCheckWordAddress _
   = return ()
 
 -- No warnings for unaligned arithmetic with the node register,
@@ -152,6 +153,7 @@ lintTarget (CmmPrim {})    = return ()
 
 checkCond :: CmmExpr -> CmmLint ()
 checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return ()
+checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values
 checkCond expr = cmmLintErr (hang (text "expression is not a conditional:") 2
 				    (ppr expr))
 
diff --git a/compiler/cmm/CmmLive.hs b/compiler/cmm/CmmLive.hs
index 78867b0ce3..c87a3a9b33 100644
--- a/compiler/cmm/CmmLive.hs
+++ b/compiler/cmm/CmmLive.hs
@@ -63,12 +63,12 @@ gen  a live = foldRegsUsed    extendRegSet      live a
 kill :: DefinerOfLocalRegs a => a -> RegSet -> RegSet
 kill a live = foldRegsDefd delOneFromUniqSet live a
 
+-- Testing!
 xferLive :: BwdTransfer CmmNode CmmLive
 xferLive = mkBTransfer3 fst mid lst
   where fst _ f = f
         mid :: CmmNode O O -> CmmLive -> CmmLive
-        mid n f = gen_kill n $ case n of CmmUnsafeForeignCall {} -> emptyRegSet
-                                         _                       -> f
+        mid n f = gen_kill n f
         lst :: CmmNode O C -> FactBase CmmLive -> CmmLive
         lst n f = gen_kill n $ case n of CmmCall {}            -> emptyRegSet
                                          CmmForeignCall {}     -> emptyRegSet
diff --git a/compiler/cmm/CmmNode.hs b/compiler/cmm/CmmNode.hs
index 93564ac946..e67321c0b0 100644
--- a/compiler/cmm/CmmNode.hs
+++ b/compiler/cmm/CmmNode.hs
@@ -92,6 +92,8 @@ data CmmNode e x where
 A MidForeign call is used for *unsafe* foreign calls;
 a LastForeign call is used for *safe* foreign calls.
 Unsafe ones are easy: think of them as a "fat machine instruction".
+In particular, they do *not* kill all live registers (there was a bit
+of code in GHC that conservatively assumed otherwise.)
 
 Safe ones are trickier.  A safe foreign call 
      r = f(x)
diff --git a/compiler/cmm/CmmOpt.hs b/compiler/cmm/CmmOpt.hs
index 0dec26da6f..a2eecd5c48 100644
--- a/compiler/cmm/CmmOpt.hs
+++ b/compiler/cmm/CmmOpt.hs
@@ -14,6 +14,7 @@
 -----------------------------------------------------------------------------
 
 module CmmOpt (
+	cmmEliminateDeadBlocks,
 	cmmMiniInline,
 	cmmMachOpFold,
 	cmmLoopifyForC,
@@ -30,10 +31,69 @@ import UniqFM
 import Unique
 import FastTypes
 import Outputable
+import BlockId
 
 import Data.Bits
 import Data.Word
 import Data.Int
+import Data.Maybe
+import Data.List
+
+import Compiler.Hoopl hiding (Unique)
+
+-- -----------------------------------------------------------------------------
+-- Eliminates dead blocks
+
+{-
+We repeatedly expand the set of reachable blocks until we hit a
+fixpoint, and then prune any blocks that were not in this set.  This is
+actually a required optimization, as dead blocks can cause problems
+for invariants in the linear register allocator (and possibly other
+places.)
+-}
+
+-- Deep fold over statements could probably be abstracted out, but it
+-- might not be worth the effort since OldCmm is moribund
+cmmEliminateDeadBlocks :: [CmmBasicBlock] -> [CmmBasicBlock]
+cmmEliminateDeadBlocks [] = []
+cmmEliminateDeadBlocks blocks@(BasicBlock base_id _:_) =
+    let -- Calculate what's reachable from what block
+        reachableMap = foldl' f emptyUFM blocks -- lazy in values
+            where f m (BasicBlock block_id stmts) = addToUFM m block_id (reachableFrom stmts)
+        reachableFrom stmts = foldl stmt [] stmts
+            where
+                stmt m CmmNop = m
+                stmt m (CmmComment _) = m
+                stmt m (CmmAssign _ e) = expr m e
+                stmt m (CmmStore e1 e2) = expr (expr m e1) e2
+                stmt m (CmmCall c _ as _ _) = f (actuals m as) c
+                    where f m (CmmCallee e _) = expr m e
+                          f m (CmmPrim _) = m
+                stmt m (CmmBranch b) = b:m
+                stmt m (CmmCondBranch e b) = b:(expr m e)
+                stmt m (CmmSwitch e bs) = catMaybes bs ++ expr m e
+                stmt m (CmmJump e as) = expr (actuals m as) e
+                stmt m (CmmReturn as) = actuals m as
+                actuals m as = foldl' (\m h -> expr m (hintlessCmm h)) m as
+                -- We have to do a deep fold into CmmExpr because
+                -- there may be a BlockId in the CmmBlock literal.
+                expr m (CmmLit l) = lit m l
+                expr m (CmmLoad e _) = expr m e
+                expr m (CmmReg _) = m
+                expr m (CmmMachOp _ es) = foldl' expr m es
+                expr m (CmmStackSlot _ _) = m
+                expr m (CmmRegOff _ _) = m
+                lit m (CmmBlock b) = b:m
+                lit m _ = m
+        -- go todo done
+        reachable = go [base_id] (setEmpty :: BlockSet)
+          where go []     m = m
+                go (x:xs) m
+                    | setMember x m = go xs m
+                    | otherwise     = go (add ++ xs) (setInsert x m)
+                        where add = fromMaybe (panic "cmmEliminateDeadBlocks: unknown block")
+                                              (lookupUFM reachableMap x)
+    in filter (\(BasicBlock block_id _) -> setMember block_id reachable) blocks
 
 -- -----------------------------------------------------------------------------
 -- The mini-inliner
@@ -115,12 +175,15 @@ cmmMiniInlineStmts uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr) : stmts
 cmmMiniInlineStmts uses (stmt:stmts)
   = stmt : cmmMiniInlineStmts uses stmts
 
-lookForInline u expr (stmt : rest)
+lookForInline u expr stmts = lookForInline' u expr regset stmts
+    where regset = foldRegsUsed extendRegSet emptyRegSet expr
+
+lookForInline' u expr regset (stmt : rest)
   | Just 1 <- lookupUFM (countUses stmt) u, ok_to_inline
   = Just (inlineStmt u expr stmt : rest)
 
   | ok_to_skip
-  = case lookForInline u expr rest of
+  = case lookForInline' u expr regset rest of
            Nothing    -> Nothing
            Just stmts -> Just (stmt:stmts)
 
@@ -137,13 +200,18 @@ lookForInline u expr (stmt : rest)
 		     CmmCall{} -> hasNoGlobalRegs expr
 		     _ -> True
 
-   -- We can skip over assignments to other tempoararies, because we
-   -- know that expressions aren't side-effecting and temporaries are
-   -- single-assignment.
+   -- Expressions aren't side-effecting.  Temporaries may or may not
+   -- be single-assignment depending on the source (the old code
+   -- generator creates single-assignment code, but hand-written Cmm
+   -- and Cmm from the new code generator is not single-assignment.)
+   -- So we do an extra check to make sure that the register being
+   -- changed is not one we were relying on.  I don't know how much of a
+   -- performance hit this is (we have to create a regset for every
+   -- instruction.) -- EZY
     ok_to_skip = case stmt of
                  CmmNop -> True
                  CmmComment{} -> True
-                 CmmAssign (CmmLocal (LocalReg u' _)) rhs | u' /= u -> True
+                 CmmAssign (CmmLocal r@(LocalReg u' _)) rhs | u' /= u && not (r `elemRegSet` regset) -> True
                  CmmAssign g@(CmmGlobal _) rhs -> not (g `regUsedIn` expr)
                  _other -> False
 
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index 8c2498e5f8..4dc7e3214f 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -396,13 +396,15 @@ stmt	:: { ExtCode }
 	| NAME '(' exprs0 ')' ';'
 		{% stmtMacro $1 $3  }
 	| 'switch' maybe_range expr '{' arms default '}'
-		{ doSwitch $2 $3 $5 $6 }
+		{ do as <- sequence $5; doSwitch $2 $3 as $6 }
 	| 'goto' NAME ';'
 		{ do l <- lookupLabel $2; stmtEC (CmmBranch l) }
 	| 'jump' expr maybe_actuals ';'
 		{ do e1 <- $2; e2 <- sequence $3; stmtEC (CmmJump e1 e2) }
         | 'return' maybe_actuals ';'
 		{ do e <- sequence $2; stmtEC (CmmReturn e) }
+	| 'if' bool_expr 'goto' NAME
+		{ do l <- lookupLabel $4; cmmRawIf $2 l }
 	| 'if' bool_expr '{' body '}' else 	
 		{ cmmIfThenElse $2 $4 $6 }
 
@@ -441,12 +443,16 @@ maybe_range :: { Maybe (Int,Int) }
 	: '[' INT '..' INT ']'	{ Just (fromIntegral $2, fromIntegral $4) }
 	| {- empty -}		{ Nothing }
 
-arms	:: { [([Int],ExtCode)] }
+arms	:: { [ExtFCode ([Int],Either BlockId ExtCode)] }
 	: {- empty -}			{ [] }
 	| arm arms			{ $1 : $2 }
 
-arm	:: { ([Int],ExtCode) }
-	: 'case' ints ':' '{' body '}'	{ ($2, $5) }
+arm	:: { ExtFCode ([Int],Either BlockId ExtCode) }
+	: 'case' ints ':' arm_body	{ do b <- $4; return ($2, b) }
+
+arm_body :: { ExtFCode (Either BlockId ExtCode) }
+	: '{' body '}'			{ return (Right $2) }
+	| 'goto' NAME ';'		{ do l <- lookupLabel $2; return (Left l) }
 
 ints	:: { [Int] }
 	: INT				{ [ fromIntegral $1 ] }
@@ -458,6 +464,8 @@ default :: { Maybe ExtCode }
 	-- 'default' branches
 	| {- empty -}			{ Nothing }
 
+-- Note: OldCmm doesn't support a first class 'else' statement, though
+-- CmmNode does.
 else 	:: { ExtCode }
 	: {- empty -}			{ nopEC }
 	| 'else' '{' body '}'		{ $3 }
@@ -952,6 +960,10 @@ cmmIfThenElse cond then_part else_part = do
      -- fall through to join
      code (labelC join_id)
 
+cmmRawIf cond then_id = do
+    c <- cond
+    emitCond c then_id
+
 -- 'emitCond cond true_id'  emits code to test whether the cond is true,
 -- branching to true_id if so, and falling through otherwise.
 emitCond (BoolTest e) then_id = do
@@ -991,7 +1003,7 @@ emitCond (e1 `BoolAnd` e2) then_id = do
 -- optional range on the switch (eg. switch [0..7] {...}), or by
 -- the minimum/maximum values from the branches.
 
-doSwitch :: Maybe (Int,Int) -> ExtFCode CmmExpr -> [([Int],ExtCode)]
+doSwitch :: Maybe (Int,Int) -> ExtFCode CmmExpr -> [([Int],Either BlockId ExtCode)]
          -> Maybe ExtCode -> ExtCode
 doSwitch mb_range scrut arms deflt
    = do 
@@ -1018,12 +1030,12 @@ doSwitch mb_range scrut arms deflt
 	-- ToDo: check for out of range and jump to default if necessary
         stmtEC (CmmSwitch expr entries)
    where
-	emitArm :: ([Int],ExtCode) -> ExtFCode [(Int,BlockId)]
-	emitArm (ints,code) = do
+	emitArm :: ([Int],Either BlockId ExtCode) -> ExtFCode [(Int,BlockId)]
+	emitArm (ints,Left blockid) = return [ (i,blockid) | i <- ints ]
+	emitArm (ints,Right code) = do
 	   blockid <- forkLabelledCodeEC code
 	   return [ (i,blockid) | i <- ints ]
 
-
 -- -----------------------------------------------------------------------------
 -- Putting it all together
 
diff --git a/compiler/cmm/CmmProcPoint.hs b/compiler/cmm/CmmProcPoint.hs
index d0d54d909d..fbe979b9ab 100644
--- a/compiler/cmm/CmmProcPoint.hs
+++ b/compiler/cmm/CmmProcPoint.hs
@@ -378,6 +378,8 @@ add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty)
 -- 4. build info tables for the procedures -- and update the info table for
 --    the SRTs in the entry procedure as well.
 -- Input invariant: A block should only be reachable from a single ProcPoint.
+-- ToDo: use the _ret naming convention that the old code generator
+-- used. -- EZY
 splitAtProcPoints :: CLabel -> ProcPointSet-> ProcPointSet -> BlockEnv Status ->
                      CmmTop -> FuelUniqSM [CmmTop]
 splitAtProcPoints entry_label callPPs procPoints procMap
diff --git a/compiler/cmm/CmmSpillReload.hs b/compiler/cmm/CmmSpillReload.hs
index 4e2dd38fd3..17364ad052 100644
--- a/compiler/cmm/CmmSpillReload.hs
+++ b/compiler/cmm/CmmSpillReload.hs
@@ -100,11 +100,11 @@ dualLiveTransfers entry procPoints = mkBTransfer3 first middle last
             where check live id x = if id == entry then noLiveOnEntry id (in_regs live) x else x
 
           middle :: CmmNode O O -> DualLive -> DualLive
-          middle m live = changeStack updSlots $ changeRegs (xferLiveMiddle m) (changeRegs regs_in live)
-            where xferLiveMiddle = case getBTransfer3 xferLive of (_, middle, _) -> middle
-	          regs_in :: RegSet -> RegSet
-                  regs_in live = case m of CmmUnsafeForeignCall {} -> emptyRegSet
-                                           _ -> live
+          middle m = changeStack updSlots
+                   . changeRegs  updRegs
+            where -- Reuse middle of liveness analysis from CmmLive
+                  updRegs = case getBTransfer3 xferLive of (_, middle, _) -> middle m
+
                   updSlots live = foldSlotsUsed reload (foldSlotsDefd spill live m) m
                   spill  live s@(RegSlot r, _, _) = check s $ deleteFromRegSet live r
                   spill  live _ = live
diff --git a/compiler/cmm/CmmStackLayout.hs b/compiler/cmm/CmmStackLayout.hs
index 01543c444e..c0fb6af037 100644
--- a/compiler/cmm/CmmStackLayout.hs
+++ b/compiler/cmm/CmmStackLayout.hs
@@ -13,7 +13,7 @@
 
 module CmmStackLayout
     ( SlotEnv, liveSlotAnal, liveSlotTransfers, removeLiveSlotDefs
-    , layout, manifestSP, igraph, areaBuilder
+    , getSpEntryMap, layout, manifestSP, igraph, areaBuilder
     , stubSlotsOnDeath ) -- to help crash early during debugging
 where
 
@@ -195,7 +195,7 @@ liveLastOut env l =
 type Set x = Map x ()
 data IGraphBuilder n =
   Builder { foldNodes     :: forall z. SubArea -> (n -> z -> z) -> z -> z
-          , _wordsOccupied :: AreaMap -> AreaMap -> n -> [Int]
+          , _wordsOccupied :: AreaSizeMap -> AreaMap -> n -> [Int]
           }
 
 areaBuilder :: IGraphBuilder Area
@@ -242,10 +242,13 @@ igraph builder env g = foldr interfere Map.empty (postorderDfs g)
 -- what's the highest offset (in bytes) used in each Area?
 -- We'll need to allocate that much space for each Area.
 
+-- Mapping of areas to area sizes (not offsets!)
+type AreaSizeMap = AreaMap
+
 -- JD: WHY CAN'T THIS COME FROM THE slot-liveness info?
-getAreaSize :: ByteOff -> CmmGraph -> AreaMap
+getAreaSize :: ByteOff -> CmmGraph -> AreaSizeMap
   -- The domain of the returned mapping consists only of Areas
-  -- used for (a) variable spill slots, and (b) parameter passing ares for calls
+  -- used for (a) variable spill slots, and (b) parameter passing areas for calls
 getAreaSize entry_off g =
   foldGraphBlocks (foldBlockNodesF3 (first, add_regslots, last))
               (Map.singleton (CallArea Old) entry_off) g
@@ -266,10 +269,11 @@ getAreaSize entry_off g =
 	-- The 'max' is important.  Two calls, to f and g, might share a common
 	-- continuation (and hence a common CallArea), but their number of overflow
 	-- parameters might differ.
+        -- EZY: Ought to use insert with combining function...
 
 
 -- Find the Stack slots occupied by the subarea's conflicts
-conflictSlots :: Ord x => IGPair x -> AreaMap -> AreaMap -> SubArea -> Set Int
+conflictSlots :: Ord x => IGPair x -> AreaSizeMap -> AreaMap -> SubArea -> Set Int
 conflictSlots (ig, Builder foldNodes wordsOccupied) areaSize areaMap subarea =
   foldNodes subarea foldNode Map.empty
   where foldNode n set = Map.foldRightWithKey conflict set $ Map.findWithDefault Map.empty n ig
@@ -278,10 +282,10 @@ conflictSlots (ig, Builder foldNodes wordsOccupied) areaSize areaMap subarea =
         liveInSlots areaMap n set = foldr setAdd set (wordsOccupied areaSize areaMap n)
         setAdd w s = Map.insert w () s
 
--- Find any open space on the stack, starting from the offset.
--- If the area is a CallArea or a spill slot for a pointer, then it must
--- be word-aligned.
-freeSlotFrom :: Ord x => IGPair x -> AreaMap -> Int -> AreaMap -> Area -> Int
+-- Find any open space for 'area' on the stack, starting from the
+-- 'offset'.  If the area is a CallArea or a spill slot for a pointer,
+-- then it must be word-aligned.
+freeSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> Int
 freeSlotFrom ig areaSize offset areaMap area =
   let size = Map.lookup area areaSize `orElse` 0
       conflicts = conflictSlots ig areaSize areaMap (area, size, size)
@@ -299,11 +303,24 @@ freeSlotFrom ig areaSize offset areaMap area =
   in findSpace (align (offset + size)) size
 
 -- Find an open space on the stack, and assign it to the area.
-allocSlotFrom :: Ord x => IGPair x -> AreaMap -> Int -> AreaMap -> Area -> AreaMap
+allocSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> AreaMap
 allocSlotFrom ig areaSize from areaMap area =
   if Map.member area areaMap then areaMap
   else Map.insert area (freeSlotFrom ig areaSize from areaMap area) areaMap
 
+-- Figure out all of the offsets from the slot location; this will be
+-- non-zero for procpoints.
+type SpEntryMap = BlockEnv Int
+getSpEntryMap :: Int -> CmmGraph -> SpEntryMap
+getSpEntryMap entry_off g@(CmmGraph {g_entry = entry})
+    = foldGraphBlocks add_sp_off (mapInsert entry entry_off emptyBlockMap) g
+  where add_sp_off :: CmmBlock -> BlockEnv Int -> BlockEnv Int
+        add_sp_off b env =
+          case lastNode b of
+            CmmCall {cml_cont=Just succ, cml_ret_args=off} -> mapInsert succ off env
+            CmmForeignCall {succ=succ}                     -> mapInsert succ wORD_SIZE env
+            _                                              -> env
+
 -- | Greedy stack layout.
 -- Compute liveness, build the interference graph, and allocate slots for the areas.
 -- We visit each basic block in a (generally) forward order.
@@ -326,12 +343,16 @@ allocSlotFrom ig areaSize from areaMap area =
 -- Note: The stack pointer only has to be younger than the youngest live stack slot
 -- at proc points. Otherwise, the stack pointer can point anywhere.
 
-layout :: ProcPointSet -> SlotEnv -> ByteOff -> CmmGraph -> AreaMap
+layout :: ProcPointSet -> SpEntryMap -> SlotEnv -> ByteOff -> CmmGraph -> AreaMap
 -- The domain of the returned map includes an Area for EVERY block
 -- including each block that is not the successor of a call (ie is not a proc-point)
--- That's how we return the info of what the SP should be at the entry of every block
+-- That's how we return the info of what the SP should be at the entry of every non
+-- procpoint block.  However, note that procpoint blocks have their
+-- /slot/ stored, which is not necessarily the value of the SP on entry
+-- to the block (in fact, it probably isn't, due to argument passing).
+-- See [Procpoint Sp offset]
 
-layout procPoints env entry_off g =
+layout procPoints spEntryMap env entry_off g =
   let ig = (igraph areaBuilder env g, areaBuilder)
       env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
       areaSize = getAreaSize entry_off g
@@ -370,21 +391,87 @@ layout procPoints env entry_off g =
       allocMid m areaMap = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap m) m
       allocLast bid l areaMap =
         foldr (setSuccSPs inSp) areaMap' (successors l)
-        where inSp = expectJust "sp in" $ Map.lookup (CallArea (Young bid)) areaMap
+        where inSp = slot + spOffset -- [Procpoint Sp offset]
+              -- If it's not in the map, we should use our previous
+              -- calculation unchanged.
+              spOffset = mapLookup bid spEntryMap `orElse` 0
+              slot = expectJust "slot in" $ Map.lookup (CallArea (Young bid)) areaMap
               areaMap' = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap l) l
       alloc' areaMap (a@(RegSlot _), _, _) = allocVarSlot areaMap a
       alloc' areaMap _ = areaMap
 
-      initMap = Map.insert (CallArea (Young (g_entry g))) 0 $
-                  Map.insert (CallArea Old) 0 Map.empty
-                        
+      initMap = Map.insert (CallArea (Young (g_entry g))) 0
+              . Map.insert (CallArea Old)                 0
+              $ Map.empty
+
       areaMap = foldl layoutAreas initMap (postorderDfs g)
   in -- pprTrace "ProcPoints" (ppr procPoints) $
-        -- pprTrace "Area SizeMap" (ppr areaSize) $
-         -- pprTrace "Entry SP" (ppr entrySp) $
-           -- pprTrace "Area Map" (ppr areaMap) $
+     -- pprTrace "Area SizeMap" (ppr areaSize) $
+     -- pprTrace "Entry offset" (ppr entry_off) $
+     -- pprTrace "Area Map" (ppr areaMap) $
      areaMap
 
+{- Note [Procpoint Sp offset]
+
+The calculation of inSp is a little tricky.  (Un)fortunately, if you get
+it wrong, you will get inefficient but correct code.  You know you've
+got it wrong if the generated stack pointer bounces up and down for no
+good reason.
+
+Why can't we just set inSp to the location of the slot?  (This is what
+the code used to do.)  The trouble is when we actually hit the proc
+point the start of the slot will not be the same as the actual Sp due
+to argument passing:
+
+  a:
+      I32[(young<b> + 4)] = cde;
+      // Stack pointer is moved to young end (bottom) of young<b> for call
+      // +-------+
+      // | arg 1 |
+      // +-------+ <- Sp
+      call (I32[foobar::I32])(...) returns to Just b (4) (4) with update frame 4;
+  b:
+      // After call, stack pointer is above the old end (top) of
+      // young<b> (the difference is spOffset)
+      // +-------+ <- Sp
+      // | arg 1 |
+      // +-------+
+
+If we blithely set the Sp to be the same as the slot (the young end of
+young<b>), an adjustment will be necessary when we go to the next block.
+This is wasteful.  So, instead, for the next block after a procpoint,
+the actual Sp should be set to the same as the true Sp when we just
+entered the procpoint.  Then manifestSP will automatically do the right
+thing.
+
+Questions you may ask:
+
+1. Why don't we need to change the mapping for the procpoint itself?
+   Because manifestSP does its own calculation of the true stack value,
+   manifestSP will notice the discrepancy between the actual stack
+   pointer and the slot start, and adjust all of its memory accesses
+   accordingly.  So the only problem is when we adjust the Sp in
+   preparation for the successor block; that's why this code is here and
+   not in setSuccSPs.
+
+2. Why don't we make the procpoint call area and the true offset match
+   up?  If we did that, we would never use memory above the true value
+   of the stack pointer, thus wasting all of the stack we used to store
+   arguments.  You might think that some clever changes to the slot
+   offsets, using negative offsets, might fix it, but this does not make
+   semantic sense.
+
+3. If manifestSP is already calculating the true stack value, why we can't
+   do this trick inside manifestSP itself?  The reason is that if two
+   branches join with inconsistent SPs, one of them has to be fixed: we
+   can't know what the fix should be without already knowing what the
+   chosen location of SP is on the next successor.  (This is
+   the "succ already knows incoming SP" case), This calculation cannot
+   be easily done in manifestSP, since it processes the nodes
+   /backwards/.  So we need to have figured this out before we hit
+   manifestSP.
+-}
+
 -- After determining the stack layout, we can:
 -- 1. Replace references to stack Areas with addresses relative to the stack
 --    pointer.
@@ -394,8 +481,8 @@ layout procPoints env entry_off g =
 --    stack pointer to be younger than the live values on the stack at proc points.
 -- 3. Compute the maximum stack offset used in the procedure and replace
 --    the stack high-water mark with that offset.
-manifestSP :: AreaMap -> ByteOff -> CmmGraph -> FuelUniqSM CmmGraph
-manifestSP areaMap entry_off g@(CmmGraph {g_entry=entry}) =
+manifestSP :: SpEntryMap -> AreaMap -> ByteOff -> CmmGraph -> FuelUniqSM CmmGraph
+manifestSP spEntryMap areaMap entry_off g@(CmmGraph {g_entry=entry}) =
   ofBlockMap entry `liftM` foldl replB (return mapEmpty) (postorderDfs g)
   where slot a = -- pprTrace "slot" (ppr a) $
                    Map.lookup a areaMap `orElse` panic "unallocated Area"
@@ -404,13 +491,6 @@ manifestSP areaMap entry_off g@(CmmGraph {g_entry=entry}) =
         sp_high = maxSlot slot g
         proc_entry_sp = slot (CallArea Old) + entry_off
 
-        add_sp_off :: CmmBlock -> BlockEnv Int -> BlockEnv Int
-        add_sp_off b env =
-          case lastNode b of
-            CmmCall {cml_cont=Just succ, cml_ret_args=off} -> mapInsert succ off env
-            CmmForeignCall {succ=succ}                     -> mapInsert succ wORD_SIZE env
-            _                                              -> env
-        spEntryMap = foldGraphBlocks add_sp_off (mapInsert entry entry_off emptyBlockMap) g
         spOffset id = mapLookup id spEntryMap `orElse` 0
 
         sp_on_entry id | id == entry = proc_entry_sp
@@ -427,10 +507,26 @@ manifestSP areaMap entry_off g@(CmmGraph {g_entry=entry}) =
           where spIn = sp_on_entry (entryLabel block)
 
                 middle spOff m = mapExpDeep (replSlot spOff) m
+                -- XXX there shouldn't be any global registers in the
+                -- CmmCall, so there shouldn't be any slots in
+                -- CmmCall... check that...
                 last   spOff l = mapExpDeep (replSlot spOff) l
                 replSlot spOff (CmmStackSlot a i) = CmmRegOff (CmmGlobal Sp) (spOff - (slot a + i))
                 replSlot _ (CmmLit CmmHighStackMark) = -- replacing the high water mark
                   CmmLit (CmmInt (toInteger (max 0 (sp_high - proc_entry_sp))) (typeWidth bWord))
+                -- Invariant: Sp is always greater than SpLim.  Thus, if
+                -- the high water mark is zero, we can optimize away the
+                -- conditional branch.  Relies on dead code elimination
+                -- to get rid of the dead GC blocks.
+                -- EZY: Maybe turn this into a guard that checks if a
+                -- statement is stack-check ish?  Maybe we should make
+                -- an actual mach-op for it, so there's no chance of
+                -- mixing this up with something else...
+                replSlot _ (CmmMachOp (MO_U_Lt _)
+                              [CmmMachOp (MO_Sub _)
+                                         [ CmmReg (CmmGlobal Sp)
+                                         , CmmLit (CmmInt 0 _)],
+                               CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
                 replSlot _ e = e
 
                 replLast :: MaybeC C (CmmNode C O) -> [CmmNode O O] -> CmmNode O C -> FuelUniqSM [CmmBlock]
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index 69b481b501..c9e422fb4e 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -24,7 +24,7 @@ module MkGraph
          , copyInOflow, copyInSlot, copyOutOflow, copyOutSlot
   -- Reexport of needed Cmm stuff
   , Convention(..), ForeignConvention(..), ForeignTarget(..)
-  , CmmStackInfo(..), CmmTopInfo(..), CmmGraph(..)
+  , CmmStackInfo(..), CmmTopInfo(..), CmmGraph, GenCmmGraph(..)
   , Cmm, CmmTop
   )
 where
diff --git a/compiler/cmm/OptimizationFuel.hs b/compiler/cmm/OptimizationFuel.hs
index 057a96521f..f624c1c7b6 100644
--- a/compiler/cmm/OptimizationFuel.hs
+++ b/compiler/cmm/OptimizationFuel.hs
@@ -6,12 +6,12 @@
 -- the optimiser with varying amount of fuel to find out the exact number of
 -- steps where a bug is introduced in the output.
 module OptimizationFuel
-    ( OptimizationFuel, amountOfFuel, tankFilledTo, anyFuelLeft, oneLessFuel
+    ( OptimizationFuel, amountOfFuel, tankFilledTo, unlimitedFuel, anyFuelLeft, oneLessFuel
     , OptFuelState, initOptFuelState
     , FuelConsumer, FuelUsingMonad, FuelState
     , fuelGet, fuelSet, lastFuelPass, setFuelPass
     , fuelExhausted, fuelDec1, tryWithFuel
-    , runFuelIO, fuelConsumingPass
+    , runFuelIO, runInfiniteFuelIO, fuelConsumingPass
     , FuelUniqSM
     , liftUniq
     )
@@ -21,9 +21,7 @@ import Data.IORef
 import Control.Monad
 import StaticFlags (opt_Fuel)
 import UniqSupply
-#ifdef DEBUG
 import Panic
-#endif
 
 import Compiler.Hoopl
 import Compiler.Hoopl.GHC (getFuel, setFuel)
@@ -51,8 +49,8 @@ amountOfFuel :: OptimizationFuel -> Int
 
 anyFuelLeft :: OptimizationFuel -> Bool
 oneLessFuel :: OptimizationFuel -> OptimizationFuel
+unlimitedFuel :: OptimizationFuel
 
-#ifdef DEBUG
 newtype OptimizationFuel = OptimizationFuel Int
   deriving Show
 
@@ -61,16 +59,7 @@ amountOfFuel (OptimizationFuel f) = f
 
 anyFuelLeft (OptimizationFuel f) = f > 0
 oneLessFuel (OptimizationFuel f) = ASSERT (f > 0) (OptimizationFuel (f - 1))
-#else
--- type OptimizationFuel = State# () -- would like this, but it won't work
-data OptimizationFuel = OptimizationFuel
-  deriving Show
-tankFilledTo _ = OptimizationFuel
-amountOfFuel _ = maxBound
-
-anyFuelLeft _ = True
-oneLessFuel _ = OptimizationFuel
-#endif
+unlimitedFuel = OptimizationFuel infiniteFuel
 
 data FuelState = FuelState { fs_fuel :: OptimizationFuel, fs_lastpass :: String }
 newtype FuelUniqSM a = FUSM { unFUSM :: FuelState -> UniqSM (a, FuelState) }
@@ -92,6 +81,16 @@ runFuelIO fs (FUSM f) =
        writeIORef (fuel_ref fs) fuel'
        return a
 
+-- ToDo: Do we need the pass_ref when we are doing infinite fueld
+-- transformations?
+runInfiniteFuelIO :: OptFuelState -> FuelUniqSM a -> IO a
+runInfiniteFuelIO fs (FUSM f) =
+    do pass <- readIORef (pass_ref fs)
+       u <- mkSplitUniqSupply 'u'
+       let (a, FuelState _ pass') = initUs_ u $ f (FuelState unlimitedFuel pass)
+       writeIORef (pass_ref fs) pass'
+       return a
+
 instance Monad FuelUniqSM where
   FUSM f >>= k = FUSM (\s -> f s >>= \(a, s') -> unFUSM (k a) s')
   return a     = FUSM (\s -> return (a, s))
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 10c9f18310..aa7d914253 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -50,6 +50,7 @@ import Outputable
 import Constants
 import BasicTypes
 import CLabel
+import Util
 
 -- The rest
 import Data.List
@@ -63,10 +64,6 @@ import Data.Word
 import Data.Array.ST
 import Control.Monad.ST
 
-#if x86_64_TARGET_ARCH
-import StaticFlags	( opt_Unregisterised )
-#endif
-
 #if defined(alpha_TARGET_ARCH) || defined(mips_TARGET_ARCH) || defined(mipsel_TARGET_ARCH) || defined(arm_TARGET_ARCH)
 #define BEWARE_LOAD_STORE_ALIGNMENT
 #endif
@@ -104,18 +101,19 @@ pprTop (CmmProc info clbl (ListGraph blocks)) =
         then pprDataExterns info $$
              pprWordArray (entryLblToInfoLbl clbl) info
         else empty) $$
-    (case blocks of
-        [] -> empty
-         -- the first block doesn't get a label:
-        (BasicBlock _ stmts : rest) -> vcat [
+    (vcat [
 	   blankLine,
 	   extern_decls,
            (if (externallyVisibleCLabel clbl)
                     then mkFN_ else mkIF_) (pprCLabel clbl) <+> lbrace,
            nest 8 temp_decls,
            nest 8 mkFB_,
-           nest 8 (vcat (map pprStmt stmts)) $$
-              vcat (map pprBBlock rest),
+           case blocks of
+               [] -> empty
+               -- the first block doesn't get a label:
+               (BasicBlock _ stmts : rest) ->
+                    nest 8 (vcat (map pprStmt stmts)) $$
+                       vcat (map pprBBlock rest),
            nest 8 mkFE_,
            rbrace ]
     )
@@ -818,17 +816,6 @@ pprCall ppr_fn cconv results args _
 
   | otherwise
   =
-#if x86_64_TARGET_ARCH
-	-- HACK around gcc optimisations.
-	-- x86_64 needs a __DISCARD__() here, to create a barrier between
-	-- putting the arguments into temporaries and passing the arguments
-	-- to the callee, because the argument expressions may refer to
-	-- machine registers that are also used for passing arguments in the
-	-- C calling convention.
-    (if (not opt_Unregisterised) 
-	then ptext (sLit "__DISCARD__();") 
-	else empty) $$
-#endif
     ppr_assign results (ppr_fn <> parens (commafy (map pprArg args))) <> semi
   where 
      ppr_assign []           rhs = rhs
@@ -1022,18 +1009,6 @@ machRep_S_CType _   = panic "machRep_S_CType"
 pprStringInCStyle :: [Word8] -> SDoc
 pprStringInCStyle s = doubleQuotes (text (concatMap charToC s))
 
-charToC :: Word8 -> String
-charToC w = 
-  case chr (fromIntegral w) of
-	'\"' -> "\\\""
-	'\'' -> "\\\'"
-	'\\' -> "\\\\"
-	c | c >= ' ' && c <= '~' -> [c]
-          | otherwise -> ['\\',
-                         chr (ord '0' + ord c `div` 64),
-                         chr (ord '0' + ord c `div` 8 `mod` 8),
-                         chr (ord '0' + ord c         `mod` 8)]
-
 -- ---------------------------------------------------------------------------
 -- Initialising static objects with floating-point numbers.  We can't
 -- just emit the floating point number, because C will cast it to an int
diff --git a/compiler/cmm/cmm-notes b/compiler/cmm/cmm-notes
index 0852711f96..e787f18b17 100644
--- a/compiler/cmm/cmm-notes
+++ b/compiler/cmm/cmm-notes
@@ -15,14 +15,11 @@ Things to do:
 	This will fix the spill before stack check problem but only really as a side
 	effect. A 'real fix' probably requires making the spiller know about sp checks.
 
- - There is some silly stuff happening with the Sp. We end up with code like:
-   Sp = Sp + 8; R1 = _vwf::I64; Sp = Sp -8
-	Seems to be perhaps caused by the issue above but also maybe a optimisation
-	pass needed?
+   EZY: I don't understand this comment. David Terei, can you clarify?
 
- - Proc pass all arguments on the stack, adding more code and slowing down things
-   a lot. We either need to fix this or even better would be to get rid of
-	proc points.
+ - Proc points pass all arguments on the stack, adding more code and
+   slowing down things a lot. We either need to fix this or even better
+   would be to get rid of proc points.
 
  - CmmInfo.cmmToRawCmm uses Old.Cmm, so it is called after converting Cmm.Cmm to
    Old.Cmm. We should abstract it to work on both representations, it needs only to
@@ -32,7 +29,7 @@ Things to do:
    we could convert codeGen/StgCmm* clients to the Hoopl's semantics?
    It's all deeply unsatisfactory.
 
- - Improve preformance of Hoopl.
+ - Improve performance of Hoopl.
 
    A nofib comparison of -fasm vs -fnewcodegen nofib compilation parameters
    (using the same ghc-cmm branch +libraries compiled by the old codegenerator)
@@ -50,6 +47,9 @@ Things to do:
 
    So we generate a bit better code, but it takes us longer!
 
+   EZY: Also importantly, Hoopl uses dramatically more memory than the
+   old code generator.
+
  - Are all blockToNodeList and blockOfNodeList really needed? Maybe we could
    splice blocks instead?
 
@@ -57,7 +57,7 @@ Things to do:
    a block catenation function would be probably nicer than blockToNodeList
    / blockOfNodeList combo.
 
- - loweSafeForeignCall seems too lowlevel. Just use Dataflow. After that
+ - lowerSafeForeignCall seems too lowlevel. Just use Dataflow. After that
    delete splitEntrySeq from HooplUtils.
 
  - manifestSP seems to touch a lot of the graph representation. It is
@@ -76,6 +76,9 @@ Things to do:
    calling convention, and the code for calling foreign calls is generated
 
  - AsmCodeGen has a generic Cmm optimiser; move this into new pipeline
+   EZY (2011-04-16): The mini-inliner has been generalized and ported,
+   but the constant folding and other optimizations need to still be
+   ported.
 
  - AsmCodeGen has post-native-cg branch eliminator (shortCutBranches);
    we ultimately want to share this with the Cmm branch eliminator.
@@ -113,7 +116,7 @@ Things to do:
  - See "CAFs" below; we want to totally refactor the way SRTs are calculated
 
  - Pull out Areas into its own module
-   Parameterise AreaMap
+   Parameterise AreaMap (note there are type synonyms in CmmStackLayout!)
    Add ByteWidth = Int
    type SubArea    = (Area, ByteOff, ByteWidth) 
    ByteOff should not be defined in SMRep -- that is too high up the hierarchy
@@ -293,8 +296,8 @@ cpsTop:
        insert spills/reloads across 
 	   LastCalls, and 
 	   Branches to proc-points
-     Now sink those reloads:
-     - CmmSpillReload.insertLateReloads
+     Now sink those reloads (and other instructions):
+     - CmmSpillReload.rewriteAssignments
      - CmmSpillReload.removeDeadAssignmentsAndReloads
 
   * CmmStackLayout.stubSlotsOnDeath
@@ -344,7 +347,7 @@ to J that way. This is an awkward choice.  (We think that we currently
 never pass variables to join points via arguments.)
 
 Furthermore, there is *no way* to pass q to J in a register (other
-than a paramter register).
+than a parameter register).
 
 What we want is to do register allocation across the whole caboodle.
 Then we could drop all the code that deals with the above awkward